In [1]:
import json
import time
from stop_words import get_stop_words
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
from cold_start_d2v.create_d2v import create_doc2vec_model
from textblob import TextBlob
import pandas as pd
import numpy as np
from scipy import spatial,stats
import matplotlib.pyplot as plt

In [2]:
def create_doc2vec_model(articles,name,vector_size=100,epochs=10,lang='en'):
    """Train a Doc2Vec model on ``articles`` and save it as ``<name>model.model``.

    Each article is lower-cased, split on whitespace and stripped of the stop
    words for ``lang``. The article's position in ``articles`` becomes its
    document tag, so tag ``i`` refers to the i-th article.

    Note: this definition shadows the ``create_doc2vec_model`` imported from
    ``cold_start_d2v.create_d2v`` in the imports cell.

    Parameters
    ----------
    articles : iterable of str
        Raw documents to train on.
    name : str
        Prefix of the output file; ``'model.model'`` is appended to it.
    vector_size : int, default 100
        Passed to ``Doc2Vec`` as ``vector_size``.
    epochs : int, default 10
        Passed to both the ``Doc2Vec`` constructor and ``train``.
    lang : str, default 'en'
        Language code handed to ``get_stop_words``.

    Returns
    -------
    None
        The model is written to disk and the file name is printed.
    """
    # a set gives O(1) membership tests; the collection is probed once per token
    stop_words = set(get_stop_words(lang))
    # lower-case, whitespace-tokenize and drop stop words for every article
    nostop = [[tok for tok in doc.lower().split() if tok not in stop_words] for doc in articles]
    # wrap each token list with its integer position as the document tag
    tagged = [TaggedDocument(tokens,[idx]) for idx,tokens in enumerate(nostop)]
    # min_count=2 filters out rare words when the vocabulary is built
    model = Doc2Vec(vector_size=vector_size, min_count=2, epochs=epochs)
    model.build_vocab(tagged)
    # total_examples is the number of documents seen by build_vocab
    model.train(tagged,total_examples=model.corpus_count,epochs=epochs)
    # file name is the caller's prefix plus a fixed suffix, e.g. 'esmodel.model'
    model_name = name + 'model.model'
    model.save(model_name)
    # This clean-up hook was removed in gensim 4; call it only where it still
    # exists so the function does not fail after the model has been saved.
    trim_training_state = getattr(model, 'delete_temporary_training_data', None)
    if trim_training_state is not None:
        trim_training_state(keep_doctags_vectors=True, keep_inference=True)
    print('saved as: ' + model_name)

In [32]:
def text2vec(textlist,lang,d2v_model):
    """Infer one Doc2Vec vector per text in ``textlist``.

    Each text is lower-cased, split on whitespace and stripped of the stop
    words for ``lang`` before being handed to ``d2v_model.infer_vector``.

    Parameters
    ----------
    textlist : iterable of str
        Texts to embed.
    lang : str
        Language code handed to ``get_stop_words``.
    d2v_model : object with an ``infer_vector(tokens)`` method
        In this notebook, a loaded ``Doc2Vec`` model.

    Returns
    -------
    list
        One inferred vector per input text, in input order.
    """
    # a set gives O(1) membership tests; the collection is probed once per token
    stop_words = set(get_stop_words(lang))
    token_lists = [[tok for tok in doc.lower().split() if tok not in stop_words] for doc in textlist]
    # infer_vector only needs the token list, so no TaggedDocument wrapper is built
    return [d2v_model.infer_vector(tokens) for tokens in token_lists]

In [33]:
def predict_emotions(text_list,lang,emotion_vecs,model_name):
    """Compare each text with every reference vector by cosine similarity.

    Parameters
    ----------
    text_list : iterable of str
        Texts to score.
    lang : str
        Language code forwarded to ``text2vec``.
    emotion_vecs : sequence of vectors
        Reference vectors to compare each text against.
    model_name : str
        Path passed to ``Doc2Vec.load``.

    Returns
    -------
    emot_scan : list of list of float
        ``emot_scan[i][j]`` is the cosine similarity between text ``i`` and
        ``emotion_vecs[j]``.
    emot_pred : list of int
        For each text, the index in ``emotion_vecs`` with the highest
        similarity.
    """
    d2v_model = Doc2Vec.load(model_name)
    text_vecs = text2vec(text_list,lang,d2v_model)
    # compare against the emotion_vecs argument, not a notebook-level global
    emot_scan = [[1 - spatial.distance.cosine(sample_vec,ref_vec) for ref_vec in emotion_vecs] for sample_vec in text_vecs]
    emot_pred = [np.argmax(row) for row in emot_scan]
    return emot_scan,emot_pred

In [3]:
# Load the ISEAR emotion dataset. index_col=False tells pandas not to use the
# first CSV column as the row index.
df_isear = pd.read_csv('ISEAR.csv',index_col=False)

In [5]:
# One distinct integer id per emotion label, in alphabetical order.
# 'shame' is 6 so that it does not share id 5 with 'sadness'.
EMOTION_IDS = {
    'anger': 0,
    'disgust': 1,
    'fear': 2,
    'guilt': 3,
    'joy': 4,
    'sadness': 5,
    'shame': 6,
}
# Exactly one id per row. A label missing from the mapping raises KeyError
# here instead of being skipped, which would leave the list shorter than
# the frame.
emotion_id = [EMOTION_IDS[label] for label in df_isear['emotion']]

In [6]:
# attach the numeric ids as a new column; emotion_id needs one entry per row
df_isear['id'] = emotion_id

In [7]:
df_isear.head()

Unnamed: 0,emotion,text,id
0,joy,On days when I feel close to my partner and ot...,4
1,fear,Every time I imagine that someone I love or I ...,2
2,anger,When I had been obviously unjustly treated and...,0
3,sadness,When I think about the short time that we live...,5
4,disgust,At a gathering I found myself involuntarily si...,1


In [8]:
# the raw text column is the training corpus for the Doc2Vec model
emot_text = df_isear['text']

In [10]:
# train on the ISEAR texts; the 'emotions' prefix yields 'emotionsmodel.model'
create_doc2vec_model(emot_text ,'emotions',vector_size=100,epochs=10,lang='en')

saved as: emotionsmodel.model


In [11]:
# reload the model file written by the training cell
d2v_model = Doc2Vec.load('emotionsmodel.model')

In [12]:
# trained document vector for every ISEAR row; documents were tagged with
# their 0-based position when the model was built
emot_vecs = [d2v_model.docvecs[row_idx] for row_idx in range(len(emot_text))]

In [15]:
# store each row's document vector next to its text and emotion label
df_isear['emot_vecs'] = emot_vecs

In [17]:
# list of (emotion label, sub-DataFrame) pairs, one pair per distinct emotion
emotions_grp = list(df_isear.groupby(df_isear['emotion']))

In [24]:
len(emotions_grp)

7

In [26]:
# average the document vectors of each emotion group into one reference vector
emot_vec_mean = [
    {'emotion': label, 'vec': group['emot_vecs'].mean()}
    for label, group in emotions_grp
]

In [31]:
emot_vec_mean[0]['vec']

array([-0.03394571,  0.01575613, -0.02299005,  0.00400032, -0.00243837,
       -0.02370932,  0.02524502,  0.01001316,  0.03191565,  0.03928171,
       -0.020799  ,  0.00927222, -0.02194257, -0.03660933,  0.02896129,
        0.00516501,  0.02101598,  0.04501959, -0.02352156,  0.0122413 ,
       -0.01742102,  0.00820629, -0.00634036, -0.02798223, -0.01981252,
        0.00572883, -0.00220514, -0.00909239,  0.00695971, -0.0202946 ,
       -0.01773105,  0.0097519 ,  0.01678837, -0.00122372, -0.03114676,
        0.02916909,  0.01618617,  0.01141875,  0.01230535, -0.01458297,
       -0.00460585,  0.00677788, -0.00685136, -0.01285849, -0.01681715,
       -0.01826535, -0.01752163,  0.01298876, -0.02389124, -0.02056606,
       -0.003208  , -0.00175442, -0.02779999, -0.04809429,  0.01049049,
        0.01465398,  0.00538773,  0.01597845,  0.03355633, -0.01048584,
       -0.03197055, -0.02174224, -0.04325168, -0.00625255, -0.00358859,
        0.02023767,  0.01100595, -0.03110011, -0.03332488, -0.00

In [124]:
def predict_emotions(text_list,lang,emot_vec_mean,model_name):
    """Score texts against per-emotion reference vectors by cosine similarity.

    Parameters
    ----------
    text_list : iterable of str
        Texts to score.
    lang : str
        Language code forwarded to ``text2vec``.
    emot_vec_mean : list of dict
        Each dict has the keys ``'emotion'`` (label) and ``'vec'``
        (reference vector).
    model_name : str
        Path passed to ``Doc2Vec.load``.

    Returns
    -------
    emot_scan : list of dict
        Per text, ``{emotion label: cosine similarity}``.
    emot_scan1 : list of list of float
        Per text, the same similarities in ``emot_vec_mean`` order.
    emot_pred : list of int
        Per text, the index in ``emot_vec_mean`` with the highest similarity.
    ranks : list of ndarray
        Per text, ``stats.rankdata`` of its similarities (rank 1 is the
        lowest similarity).
    """
    d2v_model = Doc2Vec.load(model_name)
    text_vecs = text2vec(text_list,lang,d2v_model)
    labels = [entry['emotion'] for entry in emot_vec_mean]
    # compute every similarity once; the labelled view below reuses these numbers
    emot_scan1 = [[1 - spatial.distance.cosine(sample_vec,entry['vec']) for entry in emot_vec_mean] for sample_vec in text_vecs]
    emot_scan = [dict(zip(labels,row)) for row in emot_scan1]
    emot_pred = [np.argmax(row) for row in emot_scan1]
    ranks = [stats.rankdata(row) for row in emot_scan1]
    return emot_scan,emot_scan1,emot_pred,ranks

In [125]:
# three short probe sentences to run through the emotion scorer
sample_sent = [
    'I hate you. You make me so upset.',
    'I love you man. You make me so happy.',
    'I am so unhappy and I cannot go on.',
]

In [126]:
# score the sample sentences against the per-emotion mean vectors
emot_scan,emot_scan1,emot_pred,ranks = predict_emotions(sample_sent,'en',emot_vec_mean,'emotionsmodel.model')

In [127]:
emot_pred,ranks

([0, 0, 3],
 [array([7., 5., 4., 6., 1., 2., 3.]),
  array([7., 5., 1., 3., 6., 2., 4.]),
  array([1., 2., 3., 7., 5., 6., 4.])])

In [128]:
from sklearn import preprocessing

In [129]:
# standardise each sentence's row of similarities with sklearn's scale();
# presumably to make the very small gaps between emotions easier to compare
[preprocessing.scale(x) for x in emot_scan1]

[array([ 1.26289281,  0.3773383 ,  0.29294954,  0.92111834, -1.59419402,
        -1.3346986 ,  0.07459363]),
 array([ 1.9272342 ,  0.03010905, -1.64427464, -0.08967182,  0.40652682,
        -0.63862501,  0.00870141]),
 array([-1.58932613, -0.87037189, -0.35642639,  1.75559065,  0.41378003,
         0.57536677,  0.07138697])]

In [146]:
# show each sentence's similarity scores in ascending order
for scores in emot_scan:
    print(sorted(scores.values()))

[0.993009090423584, 0.9930384159088135, 0.9931976795196533, 0.9932223558425903, 0.9932318925857544, 0.9932933449745178, 0.9933319687843323]
[0.9309930205345154, 0.9312282204627991, 0.9313566088676453, 0.9313796162605286, 0.9313846230506897, 0.931472659111023, 0.9318283200263977]
[0.9858265519142151, 0.9858867526054382, 0.9859297871589661, 0.9859656095504761, 0.985994279384613, 0.986007809638977, 0.9861066341400146]


In [162]:
# per sentence: [label, similarity] pairs ordered from most to least similar
sort_emot = []
for scores in emot_scan:
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    sort_emot.append([[label, score] for label, score in ranked])

In [163]:
sort_emot

[[['anger', 0.9933319687843323],
  ['guilt', 0.9932933449745178],
  ['disgust', 0.9932318925857544],
  ['fear', 0.9932223558425903],
  ['shame', 0.9931976795196533],
  ['sadness', 0.9930384159088135],
  ['joy', 0.993009090423584]],
 [['anger', 0.9318283200263977],
  ['joy', 0.931472659111023],
  ['disgust', 0.9313846230506897],
  ['shame', 0.9313796162605286],
  ['guilt', 0.9313566088676453],
  ['sadness', 0.9312282204627991],
  ['fear', 0.9309930205345154]],
 [['guilt', 0.9861066341400146],
  ['sadness', 0.986007809638977],
  ['joy', 0.985994279384613],
  ['shame', 0.9859656095504761],
  ['fear', 0.9859297871589661],
  ['disgust', 0.9858867526054382],
  ['anger', 0.9858265519142151]]]

In [153]:
type(emot_scan[0])

dict