In [1]:
from pprint import pprint  # pretty-printer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import gensim;
import os;
import nltk.data;
import time;
import numpy as np;
import matplotlib.pyplot as plt;
from sklearn.manifold import TSNE;

## Similarity Matrix

This function builds a similarity matrix from the provided model for the words passed as arguments.
The similarity matrix is then rendered as a figure and saved for further use.

In [2]:
def simMat(model,array,name):
    """Plot pairwise word similarities and save the figure as '<name>.png'.

    Only the cells above the diagonal are filled; the diagonal and the
    lower triangle stay at zero.

    Parameters
    ----------
    model : object exposing ``wv.similarity(word_a, word_b)``.
    array : sequence of words to compare with each other.
    name : base name of the output image ('.png' is appended).
    """
    size = len(array)
    scores = np.zeros((size, size))
    for row, first in enumerate(array):
        for col in range(row + 1, size):
            try:
                scores[row, col] = model.wv.similarity(first, array[col])
            except KeyError:
                # The lookup raised KeyError for this pair: record "no similarity".
                scores[row, col] = 0
    plt.matshow(scores)
    plt.savefig('%s.png' % name)
    plt.close()

The figure below shows the similarity matrix generated by one of the models created in the preprocessing step. As the matrix shows, there is no clear distinction between the two classes.
Two regions are visible in the upper and lower parts of the plot, but there is no strong similarity between any of the items.

<img src='./sim-mat.png'>

## TSNE Plot

t-SNE is a popular dimensionality-reduction technique used in NLP. We use it to look for patterns in the data. For
better visualization, Gaussian noise is added along both axes of the plot.

In [3]:
def plotTSNE(model,name,n_words=100):
    """Scatter-plot a 2-D t-SNE projection of randomly chosen vocabulary words.

    The figure is saved as '<name>-tsne.png' and left open on the current
    pyplot figure.

    Parameters
    ----------
    model : object exposing ``wv.index2word`` (list of words) and
        ``wv[word]`` (vector lookup).
    name : base name of the output image.
    n_words : number of words to sample and plot (default 100).
        NOTE(review): scikit-learn's TSNE constrains perplexity relative to
        the number of samples, so very small values may be rejected --
        confirm against the installed version.

    Raises
    ------
    ValueError
        If the vocabulary is empty (raised by ``np.random.choice``).
    """
    plt.close('all')

    vocab_words = model.wv.index2word
    vocab_size = len(vocab_words)

    # Draw distinct words whenever the vocabulary is large enough; fall back
    # to sampling with replacement only for vocabularies smaller than n_words.
    picks = np.random.choice(vocab_size, size=n_words, replace=vocab_size < n_words)
    words = [vocab_words[i] for i in picks]

    # Look every vector up by its word. Indexing a matrix built from the
    # vocab dict with an index2word position can pair a point with the wrong
    # label, because the two containers are not guaranteed to share an order.
    X = np.array([model.wv[word] for word in words])
    X_embedded = TSNE(n_components=2).fit_transform(X)

    # Gaussian jitter on both axes so that overlapping labels spread out.
    mean = (0, 0)
    cov = [[5, 0], [0, 5]]
    jitter = np.random.multivariate_normal(mean, cov, n_words)
    X_embedded = X_embedded + jitter

    plt.scatter(X_embedded[:, 0], X_embedded[:, 1])
    for label, x, y in zip(words, X_embedded[:, 0], X_embedded[:, 1]):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.savefig('%s-tsne.png' % (name), dpi=400)

The t-SNE plot below shows 100 randomly selected tokens from the corpus and how they group. The t-SNE result supports the similarity matrix: no explicit groups are found by the model.

<img src='./model-sg-1-size-600-window-10-tsne.png'>

## Intrinsic Testing

This function performs intrinsic testing of the provided model.
Well-known good and bad words were collected from the internet and listed. These words are then used to evaluate the model on different metrics.
- First, the similarity between the words is calculated. This shows us how well the model was able to separate the two classes.
- Some standard accuracy tests are conducted.
- The most similar words for **good** and **bad**, together with their similarity values, are calculated to assess the model.

In [4]:
def findAccuracyOfModel(model,name):
    """Print intrinsic evaluation results for a word-embedding model.

    Runs ``model.wv.accuracy`` on the questions-words analogy file, prints
    the counts and the fraction of correct answers for the 'total' section,
    then prints the nearest neighbours of 'good' and 'bad'.

    Parameters
    ----------
    model : object exposing ``wv.accuracy(path)`` and ``wv.most_similar(word)``.
    name : model name; only needed by the disabled similarity-matrix call below.
    """
    # Word lists for the similarity-matrix check, which is currently disabled.
    good_words = ['powerful','surprising','imaginative','fascinating',
    'dazzling','legendary','clever','intriguing','original']
    bad_words = ['confused','boring','disgusting','senseless',
    'moronic','weak','disappointing','stupid','violent']
    #simMat(model,bad_words + good_words,name);

    tests = model.wv.accuracy('https://raw.githubusercontent.com/RaRe-Technologies/gensim/develop/gensim/test/test_data/questions-words.txt')
    for test in tests:
        if test['section'] == 'total':
            n_correct = len(test['correct'])
            n_incorrect = len(test['incorrect'])
            n_total = n_correct + n_incorrect
            # With no answered questions the ratio is undefined; report 0.0
            # instead of failing with ZeroDivisionError.
            percentage = (1. * n_correct) / n_total if n_total else 0.0
            pprint('test %s :: correct: %i, incorrect: %i percentage %f'% (test['section'],n_correct,n_incorrect,percentage ))

    pprint('Similarity of good words')
    pprint(model.wv.most_similar('good'))

    pprint('Similarity of bad words')
    pprint(model.wv.most_similar('bad'))

These are the models selected from those trained in **preprocessing.ipynb**

In [5]:
# Names of saved models that can be evaluated below.
models = [
    "model-sg-1-size-100",
    "model-sg-1-size-100-mincount-10",
    "model-sg-1-size-200-mincount-10",
    "model-sg-1-size-200-mincount-50",
    "model-sg-2-size-100",
]

# Earlier shortlist; it is overridden by the reassignment right after it.
selectedModels = [
    "model-sg-1-size-100-mincount-10",
    "model-sg-1-size-200-mincount-10",
    "model-sentiment-sg-1-size-600-mincount-50-window-10",
    "model-sentiment-sg-1-size-100-mincount-10",
]

# Effective selection: this is the value the evaluation loop iterates over.
selectedModels = ["model-sg-1-size-600-window-10"]

sentimentModels = [
    "model-sentiment-sg-1-size-100-mincount-10",
    "model-sentiment-sg-1-size-200-mincount-10",
    "model-sentiment-sg-1-size-600-mincount-10-window-10",
    "model-sentiment-sg-1-size-600-mincount-50-window-10",
]

      

After defining all the helper functions, we call them to compute some intrinsic metrics for comparing the different models.

In [6]:
# For every selected model name: load the saved Word2Vec model from
# ./wordToVec, print its intrinsic evaluation, and save its t-SNE plot.
for model in selectedModels:
    model_path = './wordToVec/%s' % (model)
    newModel = gensim.models.Word2Vec.load(model_path)
    findAccuracyOfModel(newModel, model)
    plotTSNE(newModel, model)

  import sys
