In [212]:
import json
import time
from stop_words import get_stop_words
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
from cold_start_d2v.create_d2v import create_doc2vec_model
from textblob import TextBlob
import pandas as pd
import numpy as np
from scipy import spatial,stats
import matplotlib.pyplot as plt

In [82]:
def create_doc2vec_model(articles,name,vector_size=100,epochs=10,lang='en'):
    # import stopwords for specific language of model
    stop_words = get_stop_words(lang)
    ## list of just articles (str)
    #strip stopwords article docs
    nostop = [[i for i in doc.lower().split() if i not in stop_words] for doc in articles]
    #tokenize article docs and convert to doc2vec tagged docs - each article has an index number and list of tokens - taggedoc(['token1','token2',[1]])
    tagged = [TaggedDocument(doc,[i]) for i,doc in enumerate(nostop)]
    # instantiate doc2vec model with parameters - size = # of nums representing each doc (100), min_count - occurences of words in vocab (filter out rare words), iter - passes to create vectors
    model = Doc2Vec(vector_size=vector_size, min_count=2, epochs=epochs)
    ## build vocab from all tagged docs
    model.build_vocab(tagged)
    ## train model on tagged docs - total examples - total # of docs
    model.train(tagged,total_examples=model.corpus_count,epochs=epochs)
    # save model with language - eg esmodel.model for spanish docs
    model_name = name + 'model.model'
    model.save(model_name)
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    print('saved as: ' + model_name)

In [46]:
df_isear = pd.read_csv('ISEAR.csv',index_col=False)

In [47]:
df_isear.head()

Unnamed: 0,emotion,text
0,joy,On days when I feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


In [114]:
emotion_id = []
for x in df_isear['emotion']:
    if x == 'anger':
        emotion_id.append(0)
    if x == 'disgust':
        emotion_id.append(1)
    if x == 'fear':
        emotion_id.append(2)
    if x == 'guilt':
        emotion_id.append(3)
    if x == 'joy':
        emotion_id.append(4)
    if x == 'sadness':
        emotion_id.append(5)
    if x == 'shame':
        emotion_id.append(5)

In [115]:
df_isear['id'] = emotion_id

In [116]:
df_isear.head()

Unnamed: 0,emotion,text,id
0,joy,On days when I feel close to my partner and ot...,4
1,fear,Every time I imagine that someone I love or I ...,2
2,anger,When I had been obviously unjustly treated and...,0
3,sadness,When I think about the short time that we live...,5
4,disgust,At a gathering I found myself involuntarily si...,1


In [48]:
dfgrp = df_isear.groupby('emotion')

In [49]:
dfgrp.count()

Unnamed: 0_level_0,text
emotion,Unnamed: 1_level_1
anger,1079
disgust,1066
fear,1076
guilt,1050
joy,1092
sadness,1082
shame,1071


In [50]:
emotions = list(df_isear.groupby(df_isear['emotion']))

In [67]:
anger = ' '.join(emotions[0][1]['text'])
disgust = ' '.join(emotions[1][1]['text'])
fear = ' '.join(emotions[2][1]['text'])
guilt = ' '.join(emotions[3][1]['text'])
joy = ' '.join(emotions[4][1]['text'])
sadness= ' '.join(emotions[5][1]['text'])
shame = ' '.join(emotions[6][1]['text'])

In [75]:
emotion_docs = [anger,disgust,fear,guilt,joy,sadness,shame]
    

In [84]:
create_doc2vec_model(emotion_docs ,'emotions',vector_size=100,epochs=10,lang='en')

saved as: emotionsmodel.model


In [87]:
d2v_model = Doc2Vec.load('emotionsmodel.model')

In [92]:
emot_vecs = [d2v_model.docvecs[x] for x in range(len(emotion_docs))]

In [93]:
emot_vecs

array([-1.0042328 , -0.44503173,  2.2308838 ,  1.1011362 , -4.495658  ,
        0.4499111 , -0.8607967 ,  0.87012994,  0.88638806,  2.7453468 ,
       -2.4398687 , -1.3112376 ,  3.3581843 ,  2.151211  , -1.6381798 ,
       -0.5854171 , -1.470463  ,  1.4098977 , -1.6593001 ,  0.02148012,
       -1.1506696 , -3.701003  ,  2.1204686 , -0.7789566 , -1.3698486 ,
        0.66586137,  1.7162398 , -2.1807022 ,  2.1536644 ,  4.4830565 ,
        1.7157242 ,  3.434307  ,  2.7295308 ,  3.4942107 ,  0.52982974,
        2.8820612 ,  2.3028617 , -0.6379505 , -1.5715272 ,  0.5249855 ,
       -1.6265086 , -3.0113037 ,  0.9604587 ,  2.673249  ,  2.0294416 ,
        3.4063714 ,  3.8245406 ,  0.88762903,  1.0744036 ,  0.13727093,
       -2.9649303 , -4.1322484 , -3.4471936 , -3.4598262 ,  3.5403645 ,
        2.652924  , -2.9043639 , -0.36093375,  0.98706084,  2.643825  ,
        2.7258067 , -1.0791116 , -0.3616428 ,  1.6160258 ,  2.8349075 ,
       -3.6613836 ,  0.2918576 , -1.2985184 ,  0.8329695 , -1.20

In [128]:
def text2vec(textlist,lang,d2v_model):
    stop_words = get_stop_words(lang)
    histnostop = [[i for i in doc.lower().split() if i not in stop_words] for doc in textlist]
    dlhist_tagged = [TaggedDocument(doc,[i]) for i,doc in enumerate(histnostop)]
    ## infer vectors from current doc2model
    vecs = [d2v_model.infer_vector(doc.words) for doc in dlhist_tagged]
    return vecs

In [98]:
sample_sent = ['I hate you man. You are a no good person.']

In [118]:
sample_vec = text2vec(sample_sent,'en','emotionsmodel.model')

In [119]:
sample_vec

[array([-0.00944549, -0.0112141 ,  0.02253851,  0.01964142, -0.04360299,
         0.00300453, -0.01213788,  0.00636311,  0.00496102,  0.0222595 ,
        -0.02674135, -0.00877507,  0.02575549,  0.01298883, -0.01613949,
        -0.00365392, -0.01875368,  0.01845772, -0.01430837,  0.00336273,
        -0.00872731, -0.03929968,  0.0220968 , -0.00593262, -0.01464294,
         0.00696931,  0.01085654, -0.01676128,  0.01817365,  0.03883264,
         0.02632422,  0.02649714,  0.01396551,  0.04047174,  0.00307501,
         0.02098176,  0.02121128, -0.00820728, -0.02550436,  0.00582506,
        -0.01712055, -0.02797487,  0.00939885,  0.02110559,  0.0143266 ,
         0.02865664,  0.02924674,  0.01180853,  0.00147758, -0.00256207,
        -0.02956363, -0.02963953, -0.02635263, -0.02847429,  0.0339122 ,
         0.02481868, -0.02776203, -0.002392  ,  0.00451061,  0.0153569 ,
         0.03235786, -0.00641033, -0.00364277,  0.00806031,  0.02449626,
        -0.03306255, -0.00480531, -0.01152792,  0.0

In [103]:
emot_scan = [[1 - spatial.distance.cosine(sample_vec,x) for x in emot_vecs]]

In [109]:
emot_pred = [np.argmax(x) for x in emot_scan]


[0]

In [129]:
def predict_emotions(text_list,lang,emotion_vecs,model_name):
    d2v_model = Doc2Vec.load(model_name)
    text_vecs = text2vec(text_list,lang,d2v_model)
    emot_scan = [[1 - spatial.distance.cosine(sample_vec,x) for x in emot_vecs]for sample_vec in text_vecs]
    emot_pred = [np.argmax(x) for x in emot_scan]
    return emot_scan,emot_pred
    

In [125]:
isear_sents = list(df_isear['text'])

In [131]:
valence,pred = predict_emotions(isear_sents,'en',emot_vecs,'emotionsmodel.model')

In [159]:
df_isear['valence:anger,disgust,fear,guilt,joy,sadness,shame'] = valence

In [160]:
emot_eval = df_isear[['id','pred','valence']]

In [161]:
pd.set_option('display.max_colwidth', -1)

In [169]:
len(df_isear[df_isear['id']==df_isear['pred']])/len(df_isear)

0.3728046833422033

In [None]:
anger,disgust,fear,guilt,joy,sadness,shame

In [181]:
list(int(x) for x in stats.rankdata(emot_eval['valence'].iloc[0]))

[5, 6, 1, 3, 2, 4, 7]

In [183]:
ranks = list(stats.rankdata(val) for val in emot_eval['valence'])

In [258]:
ranks

[array([5., 6., 1., 3., 2., 4., 7.]),
 array([2., 3., 6., 1., 4., 7., 5.]),
 array([4., 3., 2., 5., 1., 6., 7.]),
 array([7., 4., 1., 5., 2., 3., 6.]),
 array([3., 6., 1., 2., 4., 5., 7.]),
 array([6., 5., 1., 4., 2., 3., 7.]),
 array([6., 5., 1., 4., 2., 3., 7.]),
 array([7., 4., 1., 5., 2., 3., 6.]),
 array([4., 2., 5., 3., 1., 7., 6.]),
 array([3., 1., 7., 2., 5., 6., 4.]),
 array([5., 6., 1., 2., 3., 4., 7.]),
 array([7., 5., 1., 4., 2., 3., 6.]),
 array([3., 2., 6., 1., 5., 7., 4.]),
 array([6., 3., 2., 5., 1., 4., 7.]),
 array([3., 2., 5., 1., 7., 4., 6.]),
 array([5., 2., 3., 4., 1., 7., 6.]),
 array([6., 4., 1., 7., 2., 3., 5.]),
 array([4., 1., 3., 2., 6., 5., 7.]),
 array([5., 7., 1., 3., 2., 4., 6.]),
 array([4., 3., 2., 6., 1., 5., 7.]),
 array([6., 5., 1., 4., 3., 2., 7.]),
 array([3., 2., 4., 1., 6., 7., 5.]),
 array([2., 3., 7., 1., 5., 6., 4.]),
 array([6., 5., 1., 4., 2., 3., 7.]),
 array([4., 3., 2., 6., 1., 7., 5.]),
 array([3., 4., 2., 1., 5., 7., 6.]),
 array([7., 

In [199]:
corr = list(df_isear['id'])

In [201]:
corr_ranks = [ranks[x] for x in corr]
    

In [205]:
corr_ranks = list(zip(ranks,corr))

In [206]:
corr_ranks[0]

(array([5., 6., 1., 3., 2., 4., 7.]), 4)

In [208]:
corr_ranks[0][0][corr_ranks[0][1]]

2.0

In [209]:
corr_position = [x[0][x[1]] for x in corr_ranks]

In [225]:
post_arr = np.array(corr_position)
df_position = pd.DataFrame(post_arr)

In [245]:
position_grp = list(df_position.groupby([0]))


In [250]:
len(position_grp[0][1])

216

In [259]:
## ranking from lowest to highest so highest cosine score will have highest rank - [.4,.8,.9] = [1,2,3] ranks - highest rank is correct prediction

In [256]:
for x in position_grp:
    print(len(x[1])/len(corr_position))

0.028738690792974985
0.05228845130388504
0.11428951569984035
0.12187333688131985
0.1329164449175093
0.17708887706226717
0.3728046833422033


In [255]:
len(corr_position)

7516

In [257]:
df_position

Unnamed: 0,0
0,2.0
1,6.0
2,4.0
3,3.0
4,6.0
5,3.0
6,4.0
7,2.0
8,5.0
9,3.0
