In [None]:
import bz2
import gensim
import logging

# Configure the root logger: show INFO-and-above records, each rendered as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
data_file = "news.crawl.bz2"

# Peek at the first raw line of the compressed corpus to see what a record
# looks like. The file is opened in binary mode, so the line prints as bytes.
# The path comes from `data_file` so this preview and the later full read
# cannot drift apart.
with bz2.open(data_file, 'rb') as f:
    for line in f:
        print(line)
        break

b'\xc2\xbf Robert J. Spagnoletti , attorney general : $ 22,903 * * \n'


In [None]:
def read_input(input_file):
    """Lazily tokenize a bz2-compressed text file, one line at a time.

    Args:
        input_file: Path of the bz2-compressed file to read; every line is
            treated as one document.

    Yields:
        For each line, in file order, whatever
        ``gensim.utils.simple_preprocess`` returns for that line's raw bytes.
    """
    # Lazy %-style arguments: the message is only built if the record is emitted.
    logging.info("reading file %s...this may take a while", input_file)

    # Binary mode: each line reaches simple_preprocess as undecoded bytes.
    with bz2.open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            # Progress heartbeat every 10,000 lines (the first fires at line 0).
            if i % 10000 == 0:
                logging.info("read %d lines", i)
            yield gensim.utils.simple_preprocess(line)


# Pull the whole tokenized corpus into memory. `documents` is handed to
# Word2Vec more than once later in the notebook, so it has to be a reusable
# list rather than the one-shot generator returned by read_input.
documents = [tokens for tokens in read_input(data_file)]
logging.info("Done reading data file")

2021-05-08 16:48:50,194 : INFO : reading file news.crawl.bz2...this may take a while
2021-05-08 16:48:50,232 : INFO : read 0 reviews
2021-05-08 16:48:50,783 : INFO : read 10000 reviews
2021-05-08 16:48:51,239 : INFO : read 20000 reviews
2021-05-08 16:48:51,749 : INFO : read 30000 reviews
2021-05-08 16:48:52,208 : INFO : read 40000 reviews
2021-05-08 16:48:52,695 : INFO : read 50000 reviews
2021-05-08 16:48:53,162 : INFO : read 60000 reviews
2021-05-08 16:48:53,653 : INFO : read 70000 reviews
2021-05-08 16:48:54,111 : INFO : read 80000 reviews
2021-05-08 16:48:54,566 : INFO : read 90000 reviews
2021-05-08 16:48:55,169 : INFO : read 100000 reviews
2021-05-08 16:48:55,645 : INFO : read 110000 reviews
2021-05-08 16:48:56,138 : INFO : read 120000 reviews
2021-05-08 16:48:56,607 : INFO : read 130000 reviews
2021-05-08 16:48:57,101 : INFO : read 140000 reviews
2021-05-08 16:48:57,561 : INFO : read 150000 reviews
2021-05-08 16:48:58,029 : INFO : read 160000 reviews
2021-05-08 16:48:58,527 : IN

In [None]:
# Train the first embedding model on the tokenized corpus.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4.0
# renamed them to `vector_size` and `epochs` -- confirm the installed version.
model = gensim.models.Word2Vec(
    documents,
    size=150,     # dimensionality of each word vector
    window=5,     # max distance between a target word and its context words
    min_count=2,  # ignore words that occur fewer than 2 times
    workers=10,   # number of training threads
    iter=10,      # passes over the corpus
)

2021-05-08 17:00:12,947 : INFO : collecting all words and their counts
2021-05-08 17:00:12,949 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-05-08 17:00:12,999 : INFO : PROGRESS: at sentence #10000, processed 196526 words, keeping 22080 word types
2021-05-08 17:00:13,046 : INFO : PROGRESS: at sentence #20000, processed 391785 words, keeping 31750 word types
2021-05-08 17:00:13,096 : INFO : PROGRESS: at sentence #30000, processed 586214 words, keeping 38689 word types
2021-05-08 17:00:13,148 : INFO : PROGRESS: at sentence #40000, processed 781242 words, keeping 44434 word types
2021-05-08 17:00:13,200 : INFO : PROGRESS: at sentence #50000, processed 976068 words, keeping 49425 word types
2021-05-08 17:00:13,252 : INFO : PROGRESS: at sentence #60000, processed 1172576 words, keeping 53890 word types
2021-05-08 17:00:13,302 : INFO : PROGRESS: at sentence #70000, processed 1368734 words, keeping 57894 word types
2021-05-08 17:00:13,356 : INFO : PROGRESS: a

In [None]:
# Cosine similarity between the vectors `model` learned for an antonym pair.
model.wv.similarity('dirty','clean')

0.36177364

In [None]:
# Cosine similarity in `model` for two adjectives with unrelated meanings.
model.wv.similarity('big','dirty')

0.27424753

In [None]:
# Cosine similarity in `model` for a near-synonym pair.
model.wv.similarity('big','large')

0.4694664

In [None]:
# Cosine similarity in `model` for an antonym pair, to compare against big/large.
model.wv.similarity('big','small')

0.49960312

In [None]:
# Five nearest neighbours of "polite" in `model`'s vector space.
w1 = ["polite"]
model.wv.most_similar(positive=w1, topn=5)

2021-05-08 17:15:18,113 : INFO : precomputing L2-norms of word weight vectors


[('respectful', 0.7270292639732361),
 ('gracious', 0.6867384910583496),
 ('courteous', 0.675764799118042),
 ('timid', 0.6272166967391968),
 ('personable', 0.6251391768455505)]

In [None]:
# Five nearest neighbours of "orange" in `model`'s vector space.
w1 = ["orange"]
model.wv.most_similar(positive=w1, topn=5)

[('emerald', 0.51704341173172),
 ('ventura', 0.5046364068984985),
 ('lackawanna', 0.5046263337135315),
 ('pima', 0.5012227892875671),
 ('cuyahoga', 0.4997793436050415)]

In [None]:
# Train a second, smaller model on the same corpus to compare against `model`:
# lower-dimensional vectors and a narrower context window.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4.0
# renamed them to `vector_size` and `epochs` -- confirm the installed version.
model2 = gensim.models.Word2Vec(
    documents,
    size=50,      # dimensionality of each word vector
    window=2,     # max distance between a target word and its context words
    min_count=2,  # ignore words that occur fewer than 2 times
    workers=10,   # number of training threads
    iter=10,      # passes over the corpus
)

2021-05-08 17:15:58,303 : INFO : collecting all words and their counts
2021-05-08 17:15:58,304 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-05-08 17:15:58,356 : INFO : PROGRESS: at sentence #10000, processed 196526 words, keeping 22080 word types
2021-05-08 17:15:58,403 : INFO : PROGRESS: at sentence #20000, processed 391785 words, keeping 31750 word types
2021-05-08 17:15:58,452 : INFO : PROGRESS: at sentence #30000, processed 586214 words, keeping 38689 word types
2021-05-08 17:15:58,503 : INFO : PROGRESS: at sentence #40000, processed 781242 words, keeping 44434 word types
2021-05-08 17:15:58,563 : INFO : PROGRESS: at sentence #50000, processed 976068 words, keeping 49425 word types
2021-05-08 17:15:58,627 : INFO : PROGRESS: at sentence #60000, processed 1172576 words, keeping 53890 word types
2021-05-08 17:15:58,687 : INFO : PROGRESS: at sentence #70000, processed 1368734 words, keeping 57894 word types
2021-05-08 17:15:58,740 : INFO : PROGRESS: a

In [None]:
# Cosine similarity for the dirty/clean antonym pair, this time in `model2`.
model2.wv.similarity('dirty','clean')

0.46674222

In [None]:
# Cosine similarity in `model2` for two adjectives with unrelated meanings.
model2.wv.similarity('big','dirty')

0.44198802

In [None]:
# Cosine similarity in `model2` for a near-synonym pair.
model2.wv.similarity('big','large')

0.65275484

In [None]:
# Cosine similarity in `model2` for an antonym pair, to compare against big/large.
model2.wv.similarity('big','small')

0.7122918

In [None]:
# Five nearest neighbours of "polite" in `model2`'s vector space.
w1 = ["polite"]
model2.wv.most_similar(positive=w1, topn=5)

2021-05-08 17:27:01,898 : INFO : precomputing L2-norms of word weight vectors


[('courteous', 0.8280003070831299),
 ('timid', 0.7914358973503113),
 ('respectful', 0.7899367809295654),
 ('forthright', 0.7803406715393066),
 ('candid', 0.7640137672424316)]

In [None]:
# Five nearest neighbours of "orange" in `model2`'s vector space.
w1 = ["orange"]
model2.wv.most_similar(positive=w1, topn=5)

[('poinsettia', 0.6837872862815857),
 ('alamo', 0.6731881499290466),
 ('fringed', 0.6637517809867859),
 ('emerald', 0.6446513533592224),
 ('blue', 0.6411835551261902)]