In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the spreadsheet into a DataFrame. The path is relative to the
# notebook's working directory; the 'title' column is what later cells read.
data_file = pd.read_excel('data_f.xlsx')

In [3]:
# Only smart_str is used. The former `smart_unicode` import was dropped: that
# alias is only defined on Python 2 builds of Django < 2.0, so importing it
# raises ImportError on any newer setup.
from django.utils.encoding import smart_str

# Normalise every title to a lower-cased string.
# Skipped entries:
#   * 0            - placeholder value the original filter already excluded
#   * NaN / None   - empty spreadsheet cells; without this check they would be
#                    stringified to the bogus title 'nan' and pollute the corpus
book_names = [
    smart_str(title).lower()
    for title in data_file['title']
    if pd.notnull(title) and title != 0
]
print(book_names[0:20])

['network theory and filter design', 'fundamentals of data structures', 'fundamentals of data structures', 'electronic measurements', 'digital computer electronics:an introduction to microcomputers', 'electrical engineering materials', 'electronic devices and circuits', 'analysis and design of analog integrated circuits', 'lotus 1-2-3 : quick reference handbook', 'lotus 1-2-3 student workbook and instruction guide', 'optical fiber transmission systems', 'solid state electronic devices', 'interactive computer graphics:data structures, algorithms, languages', 'electrical machines and their applications', 'computer architecture and organization', 'fundamentals of programming languages', 'introduction to digital computer design', 'introduction to operating systems', 'computer programming in cobol', 'programming in pascal']


In [4]:
from nltk.tokenize import RegexpTokenizer
import nltk

In [5]:
# Tokenise on runs of word characters (\w+); everything else acts as a
# separator and is discarded.
tokenizer = RegexpTokenizer(r'\w+')

# One token list per title, in the same order as book_names.
tokens = list(map(tokenizer.tokenize, book_names))
print(tokens[:20])

[['network', 'theory', 'and', 'filter', 'design'], ['fundamentals', 'of', 'data', 'structures'], ['fundamentals', 'of', 'data', 'structures'], ['electronic', 'measurements'], ['digital', 'computer', 'electronics', 'an', 'introduction', 'to', 'microcomputers'], ['electrical', 'engineering', 'materials'], ['electronic', 'devices', 'and', 'circuits'], ['analysis', 'and', 'design', 'of', 'analog', 'integrated', 'circuits'], ['lotus', '1', '2', '3', 'quick', 'reference', 'handbook'], ['lotus', '1', '2', '3', 'student', 'workbook', 'and', 'instruction', 'guide'], ['optical', 'fiber', 'transmission', 'systems'], ['solid', 'state', 'electronic', 'devices'], ['interactive', 'computer', 'graphics', 'data', 'structures', 'algorithms', 'languages'], ['electrical', 'machines', 'and', 'their', 'applications'], ['computer', 'architecture', 'and', 'organization'], ['fundamentals', 'of', 'programming', 'languages'], ['introduction', 'to', 'digital', 'computer', 'design'], ['introduction', 'to', 'operat

In [6]:
import logging
# Route INFO-level (and above) log records to the cell output, each prefixed
# with a timestamp and its severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [7]:
# Hyperparameters for the Word2Vec model built in a later cell.

# Dimensionality of each word vector.
num_features = 400
# Words occurring fewer times than this are ignored.
min_word_count = 10
# Number of worker threads used in parallel.
num_workers = 6
# Context window size, in tokens.
context = 200
# Downsampling setting for frequent words.
downsampling = 0.01

In [8]:
from gensim.models import word2vec
# Build the Word2Vec model from the tokenised titles, wiring in the
# hyperparameters defined in the configuration cell.
model = word2vec.Word2Vec(
    tokens,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

2017-12-06 01:17:22,352 : INFO : 'pattern' package not found; tag filters are not available for English
2017-12-06 01:17:22,360 : INFO : collecting all words and their counts
2017-12-06 01:17:22,362 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-12-06 01:17:22,384 : INFO : PROGRESS: at sentence #10000, processed 50115 words, keeping 4233 word types
2017-12-06 01:17:22,426 : INFO : PROGRESS: at sentence #20000, processed 101173 words, keeping 7389 word types
2017-12-06 01:17:22,461 : INFO : PROGRESS: at sentence #30000, processed 153298 words, keeping 8502 word types
2017-12-06 01:17:22,494 : INFO : PROGRESS: at sentence #40000, processed 202797 words, keeping 8971 word types
2017-12-06 01:17:22,518 : INFO : PROGRESS: at sentence #50000, processed 264814 words, keeping 11799 word types
2017-12-06 01:17:22,590 : INFO : PROGRESS: at sentence #60000, processed 377400 words, keeping 16267 word types
2017-12-06 01:17:22,624 : INFO : PROGRESS: at sentence #700

In [9]:
# Precompute the L2 norms of the word vectors (see the log line this emits).
# NOTE(review): with replace=True gensim presumably keeps only the normalised
# vectors to save memory, which would rule out further training of this model
# object — confirm against the docs of the installed gensim version.
model.init_sims(replace=True)

2017-12-06 01:17:29,657 : INFO : precomputing L2-norms of word weight vectors


In [17]:
# Persist the trained model to disk under `model_name` (relative path).
# NOTE(review): the file name advertises 300 features / 40 min words /
# 10 context, but the model above was built with num_features=400,
# min_word_count=10 and context=200. The name is kept as-is because other
# code may load the artifact by this exact path; rename deliberately if not.
model_name = "300features_40minwords_10context"
model.save(model_name)

2017-12-06 01:17:59,741 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2017-12-06 01:17:59,744 : INFO : not storing attribute syn0norm
2017-12-06 01:17:59,746 : INFO : not storing attribute cum_table
2017-12-06 01:17:59,872 : INFO : saved 300features_40minwords_10context


In [20]:
# Sanity check: list the vocabulary words most similar to "human";
# the result is a list of (word, similarity score) pairs.
model.most_similar("human")


[('effective', 0.7903382778167725),
 ('interaction', 0.7681287527084351),
 ('administration', 0.7244318723678589),
 ('financial', 0.6917579770088196),
 ('organisation', 0.6910723447799683),
 ('organizational', 0.6895462274551392),
 ('personnel', 0.6874874830245972),
 ('innovation', 0.6839104294776917),
 ('knowledge', 0.6813844442367554),
 ('vedamrit', 0.6796798706054688)]

In [21]:
# Sanity check: list the vocabulary words most similar to "life";
# the result is a list of (word, similarity score) pairs.
model.most_similar("life")

[('love', 0.7956965565681458),
 ('ph', 0.7767686247825623),
 ('sri', 0.7733452320098877),
 ('peace', 0.7711155414581299),
 ('stories', 0.7608237266540527),
 ('hindu', 0.7562004327774048),
 ('philosophy', 0.7535654306411743),
 ('god', 0.7371869087219238),
 ('nations', 0.7351016998291016),
 ('literature', 0.7350578308105469)]

In [22]:
# Sanity check: ask the model which word in the space-separated list fits
# least well with the others; a single word is returned.
model.doesnt_match("machine business managerial vector engineering cost".split())


'machine'

In [16]:
# Visual end-of-notebook marker; has no effect on the analysis.
print("---------------------END---------------------")

---------------------END---------------------
