@@ -53,6 +53,7 @@ def build_text_graph_dataset(dataset, window_size):
 
 
 def build_edges(doc_list, word_id_map, vocab, word_doc_freq, window_size=20):
+    # build all sliding windows over the documents
     windows = []
     for doc_words in doc_list:
         words = doc_words.split()
@@ -63,13 +64,15 @@ def build_edges(doc_list, word_id_map, vocab, word_doc_freq, window_size=20):
             for i in range(doc_length - window_size + 1):
                 window = words[i: i + window_size]
                 windows.append(window)
+    # count how many windows each word appears in
     word_window_freq = defaultdict(int)
     for window in windows:
         appeared = set()
         for word in window:
             if word not in appeared:
                 word_window_freq[word] += 1
                 appeared.add(word)
+    # count how many windows each word pair co-occurs in
     word_pair_count = defaultdict(int)
     for window in tqdm(windows):
         for i in range(1, len(window)):
@@ -101,7 +104,8 @@ def build_edges(doc_list, word_id_map, vocab, word_doc_freq, window_size=20):
             col.append(num_docs + j)
             weight.append(pmi)
 
-    doc_word_freq = defaultdict(int)  # frequency of document word pair
+    # frequency of each document-word pair
+    doc_word_freq = defaultdict(int)
     for i, doc_words in enumerate(doc_list):
         words = doc_words.split()
         for word in words:
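
For context on where these counts lead: the diff shows word-word edge weights being appended as PMI values and a document-word frequency table being built, which is the usual TextGCN-style construction (PMI over sliding windows for word-word edges, TF-IDF for document-word edges). Below is a minimal sketch of that conversion step, not the repository's exact code. The helper name counts_to_edge_weights, the tuple key (doc_id, word_id) for doc_word_freq, word-string keys for word_pair_count and word_doc_freq, and the variables num_window and num_docs are all assumptions for illustration; the TF-IDF part in particular is the standard choice and is not shown in these hunks.

# Sketch only: assumed helper and key conventions, not the commit's actual code.
from math import log


def counts_to_edge_weights(word_window_freq, word_pair_count, num_window,
                           doc_word_freq, word_doc_freq, word_id_map,
                           doc_list, num_docs):
    row, col, weight = [], [], []

    # word-word edges weighted by positive PMI estimated over sliding windows
    for (word_i, word_j), count in word_pair_count.items():
        p_ij = count / num_window
        p_i = word_window_freq[word_i] / num_window
        p_j = word_window_freq[word_j] / num_window
        pmi = log(p_ij / (p_i * p_j))
        if pmi <= 0:
            continue  # keep only positively associated pairs
        # word nodes are indexed after the document nodes, as in the diff
        row.append(num_docs + word_id_map[word_i])
        col.append(num_docs + word_id_map[word_j])
        weight.append(pmi)

    # document-word edges weighted by TF-IDF (assumed convention)
    for doc_id, doc_words in enumerate(doc_list):
        seen = set()
        for word in doc_words.split():
            if word in seen:
                continue
            word_id = word_id_map[word]
            tf = doc_word_freq[(doc_id, word_id)]
            idf = log(num_docs / word_doc_freq[word])
            row.append(doc_id)
            col.append(num_docs + word_id)
            weight.append(tf * idf)
            seen.add(word)

    return row, col, weight

Dropping non-positive PMI values keeps the word-word part of the graph sparse and connects only words that co-occur more often than chance, which is why the loop skips those pairs instead of adding zero- or negative-weight edges.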