In [52]:
import json
import time
from stop_words import get_stop_words
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
from textblob import TextBlob
import pandas as pd
import numpy as np
from scipy import spatial,stats
import matplotlib.pyplot as plt
import csv

In [53]:
def create_doc2vec_model(articles,name,vector_size=100,epochs=10,lang='en'):
    """Train a Doc2Vec model on ``articles`` and save it to ``<name>model.model``.

    Parameters
    ----------
    articles : iterable of str
        Raw documents. Each one is lower-cased, split on whitespace and
        stripped of stop words before training.
    name : str
        Prefix of the output file; ``'model.model'`` is appended to it.
    vector_size : int, optional
        Dimensionality of the learned document vectors.
    epochs : int, optional
        Number of training passes over the corpus.
    lang : str, optional
        Language code handed to ``get_stop_words``.

    Returns
    -------
    None
        The model is written to disk and the file name is printed.
    """
    # A set gives constant-time membership tests for every token, whatever
    # container get_stop_words returns.
    stop_words = set(get_stop_words(lang))
    tokenized = [[tok for tok in doc.lower().split() if tok not in stop_words] for doc in articles]
    # Tag every document with its position in `articles`; that integer is the
    # key under which its trained vector can be looked up afterwards.
    tagged = [TaggedDocument(tokens,[idx]) for idx,tokens in enumerate(tokenized)]
    # min_count=2 drops words that occur only once in the corpus.
    model = Doc2Vec(vector_size=vector_size, min_count=2, epochs=epochs)
    model.build_vocab(tagged)
    model.train(tagged,total_examples=model.corpus_count,epochs=epochs)
    model_name = name + 'model.model'
    model.save(model_name)
    # Runs after save(), so it only trims the in-memory model. The method was
    # removed in gensim 4, hence the guard: without it the function would
    # raise after the file was already written and never print the name.
    if hasattr(model, 'delete_temporary_training_data'):
        model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    print('saved as: ' + model_name)

In [54]:
def text2vec(textlist,lang,d2v_model):
    """Infer one Doc2Vec vector per text in ``textlist``.

    Parameters
    ----------
    textlist : iterable of str
        Raw texts; each is lower-cased, split on whitespace and stripped of
        the stop words for ``lang``.
    lang : str
        Language code handed to ``get_stop_words``.
    d2v_model
        Object exposing ``infer_vector(tokens)`` (a loaded Doc2Vec model).

    Returns
    -------
    list
        One inferred vector per input text, in input order.
    """
    # Set lookup instead of scanning the stop-word container for every token.
    stop_words = set(get_stop_words(lang))
    token_lists = [[tok for tok in doc.lower().split() if tok not in stop_words] for doc in textlist]
    # infer_vector only needs the token list, so no TaggedDocument wrapper is built.
    return [d2v_model.infer_vector(tokens) for tokens in token_lists]

In [55]:
def predict_emotions(text_list,lang,emotion_vecs,model_name):
    """Score each text against a set of reference vectors by cosine similarity.

    Note: a later cell of this notebook defines ``predict_emotions`` again
    with a different third argument and return tuple; whichever cell ran last
    is the one in effect.

    Parameters
    ----------
    text_list : iterable of str
        Texts to score.
    lang : str
        Language code used for stop-word removal in ``text2vec``.
    emotion_vecs : iterable of vectors
        Reference vectors to compare against.
    model_name : str
        Path of the saved Doc2Vec model to load.

    Returns
    -------
    tuple
        ``(emot_scan, emot_pred, text_vecs)``: per-text lists of similarities,
        the index of the most similar reference vector per text, and the
        inferred text vectors.
    """
    d2v_model = Doc2Vec.load(model_name)
    text_vecs = text2vec(text_list,lang,d2v_model)
    # Use the argument that was passed in. The previous body read the global
    # `emot_vecs`, silently ignoring `emotion_vecs`.
    reference_vecs = list(emotion_vecs)
    emot_scan = [[1 - spatial.distance.cosine(sample_vec,ref_vec) for ref_vec in reference_vecs] for sample_vec in text_vecs]
    emot_pred = [np.argmax(row) for row in emot_scan]
    return emot_scan,emot_pred,text_vecs

In [56]:
# Load the ISEAR data; the columns used below are 'emotion' (label) and 'text'.
df_isear = pd.read_csv('ISEAR.csv',index_col=False)

In [57]:
# Integer id per emotion label. 'shame' gets its own id (6); it previously
# shared 5 with 'sadness', which merged two of the seven classes.
EMOTION_IDS = {'anger': 0, 'disgust': 1, 'fear': 2, 'guilt': 3, 'joy': 4, 'sadness': 5, 'shame': 6}
# A label missing from the mapping raises KeyError here, instead of being
# skipped and leaving emotion_id shorter than the frame.
emotion_id = [EMOTION_IDS[x] for x in df_isear['emotion']]

In [58]:
# Attach the numeric emotion ids as a new 'id' column (one entry per row).
df_isear['id'] = emotion_id

In [59]:
df_isear.head()

Unnamed: 0,emotion,text,id
0,joy,On days when I feel close to my partner and ot...,4
1,fear,Every time I imagine that someone I love or I ...,2
2,anger,When I had been obviously unjustly treated and...,0
3,sadness,When I think about the short time that we live...,5
4,disgust,At a gathering I found myself involuntarily si...,1


In [60]:
# Raw texts; used below as the Doc2Vec training corpus.
emot_text = df_isear['text']

In [61]:
# Train on the ISEAR texts and save the model under the 'emotions1' prefix.
create_doc2vec_model(emot_text ,'emotions1',vector_size=100,epochs=10,lang='en')

saved as: emotions1model.model


In [62]:
# Reload the model file written by the training cell above.
d2v_model = Doc2Vec.load('emotions1model.model')

In [63]:
# Training tagged each document with its enumerate() position, so tag i is the
# trained vector of the i-th text in emot_text.
emot_vecs = [d2v_model.docvecs[x] for x in range(len(emot_text))]

In [64]:
len(emot_vecs)

7516

In [34]:
# Build the column on a copy so this export works on a fresh top-to-bottom
# run; the 'emot_vecs' column is only added to df_isear by a later cell.
df_isear.assign(emot_vecs=emot_vecs)['emot_vecs'].to_csv('output.tsv',sep='\t',index=False,header=False)

In [65]:
# Store each row's trained document vector next to its text and label.
df_isear['emot_vecs'] = emot_vecs

In [66]:
# List of (emotion label, sub-frame) pairs, one per distinct emotion.
emotions_grp = list(df_isear.groupby('emotion'))

In [67]:
len(emotions_grp)

7

In [79]:
# One centroid per emotion: the mean of that group's document vectors.
emot_vec_mean = [
    {'emotion': label, 'vec': group['emot_vecs'].mean()}
    for label, group in emotions_grp
]

In [77]:
# Centroid vectors only, in the same order as emot_vec_mean.
# NOTE(review): the KeyError recorded under this cell (In [77]) appears to
# predate the emot_vec_mean definition executed as In [79] - re-run to confirm.
emot_vec_mean1 = [entry['vec'] for entry in emot_vec_mean]

KeyError: 0

In [87]:
# Split the centroid records into parallel lists: vectors and their labels.
emot_vecs_ctr = [entry['vec'] for entry in emot_vec_mean]
emot_lablel_ctr = [entry['emotion'] for entry in emot_vec_mean]


In [93]:
def output_tab_vecs(vecs,filename):
    """Write ``vecs`` to ``filename`` as tab-separated rows.

    Parameters
    ----------
    vecs : iterable of iterables
        One row per element; every item of a row becomes one tab-separated field.
    filename : str
        Path of the file to create or overwrite.

    Raises
    ------
    csv.Error
        If a field contains a character that would need quoting; the dialect
        uses QUOTE_NONE and defines no escapechar.
    """
    # (Re)register under the shared name on every call so this function always
    # gets its own settings, whatever another helper registered before.
    csv.register_dialect('tabDialect', delimiter='\t', quoting=csv.QUOTE_NONE)
    # newline='' hands line-ending control to the csv writer; without it,
    # platforms that translate '\n' emit an extra '\r' on every row.
    with open(filename, 'w', newline='') as tab_file:
        csv.writer(tab_file, dialect='tabDialect').writerows(vecs)
    print('saved tab file as',filename)

In [97]:
def output_tab_meta(meta,filename):
    """Write metadata rows to ``filename`` as tab-separated text.

    Unlike ``output_tab_vecs``, the dialect defines a backslash escapechar, so
    fields that contain a tab are escaped instead of raising.

    Parameters
    ----------
    meta : iterable of iterables
        One row per element; every item of a row becomes one field.
    filename : str
        Path of the file to create or overwrite.
    """
    # (Re)register on every call: the same dialect name is also registered,
    # without an escapechar, by output_tab_vecs.
    csv.register_dialect('tabDialect', delimiter='\t', quoting=csv.QUOTE_NONE,escapechar='\\')
    # newline='' hands line-ending control to the csv writer; without it,
    # platforms that translate '\n' emit an extra '\r' on every row.
    with open(filename, 'w', newline='') as tab_file:
        csv.writer(tab_file, dialect='tabDialect').writerows(meta)
    print('saved tab file as',filename)

In [None]:
def output_tab_meta(meta,filename):
    """Write metadata rows to ``filename`` as tab-separated text.

    NOTE: this notebook defines ``output_tab_meta`` in more than one cell; the
    definitions should be kept identical or reduced to one.

    Parameters
    ----------
    meta : iterable of iterables
        One row per element; every item of a row becomes one field. Fields
        containing a tab are escaped with a backslash.
    filename : str
        Path of the file to create or overwrite.
    """
    csv.register_dialect('tabDialect', delimiter='\t', quoting=csv.QUOTE_NONE,escapechar='\\')
    # newline='' hands line-ending control to the csv writer; without it,
    # platforms that translate '\n' emit an extra '\r' on every row.
    with open(filename, 'w', newline='') as tab_file:
        csv.writer(tab_file, dialect='tabDialect').writerows(meta)
    print('saved tab file as',filename)

In [98]:
# Export the per-emotion centroid vectors, one tab-separated row per emotion.
output_tab_vecs(emot_vecs_ctr,'emot_vecs_ctr.tsv')

saved tab file as emot_vecs_ctr.tsv


['anger', 'disgust', 'fear', 'guilt', 'joy', 'sadness', 'shame']

In [103]:
def output_single_meta(metalist,filename):
    """Write every item of ``metalist`` on its own line of ``filename``.

    Each item is converted with ``str()`` and followed by a newline; the file
    is created or overwritten, and a confirmation message is printed.
    """
    with open(filename, 'w') as handle:
        # Lazily build "<item>\n" strings and let writelines emit them in order.
        handle.writelines(str(entry) + '\n' for entry in metalist)
    print('saved tab file as',filename)

In [104]:
# Export the emotion labels, one per line, in the same order as the centroid vectors.
output_single_meta(emot_lablel_ctr,'emot_lablel_ctr.tsv')

saved tab file as emot_lablel_ctr.tsv


In [42]:
def predict_emotions(text_list,lang,emot_vec_mean,model_name):
    """Score texts against per-emotion centroid vectors by cosine similarity.

    Parameters
    ----------
    text_list : iterable of str
        Texts to score.
    lang : str
        Language code used for stop-word removal in ``text2vec``.
    emot_vec_mean : iterable of dict
        Records with an ``'emotion'`` label and a ``'vec'`` centroid vector.
    model_name : str
        Path of the saved Doc2Vec model to load.

    Returns
    -------
    tuple
        ``(emot_scan, emot_scan1, emot_pred, ranks, text_vecs)`` where, per text,
        ``emot_scan`` is a dict of label -> similarity, ``emot_scan1`` is the same
        similarities as a list in ``emot_vec_mean`` order, ``emot_pred`` is the
        index of the highest similarity, ``ranks`` is ``stats.rankdata`` of the
        similarities (the smallest value gets rank 1), and ``text_vecs`` holds the
        inferred vectors.
    """
    d2v_model = Doc2Vec.load(model_name)
    text_vecs = text2vec(text_list,lang,d2v_model)
    # Materialise labels and centroids once so the records are not re-read per text.
    records = list(emot_vec_mean)
    labels = [rec['emotion'] for rec in records]
    centroids = [rec['vec'] for rec in records]
    # Each similarity is computed a single time; the dict view is then derived
    # from the list view rather than calling cosine() again.
    emot_scan1 = [[1 - spatial.distance.cosine(sample_vec,centroid) for centroid in centroids] for sample_vec in text_vecs]
    emot_scan = [dict(zip(labels,row)) for row in emot_scan1]
    emot_pred = [np.argmax(row) for row in emot_scan1]
    ranks = [stats.rankdata(row) for row in emot_scan1]
    return emot_scan,emot_scan1,emot_pred,ranks,text_vecs

In [44]:
# Three hand-written sentences used to try out predict_emotions.
sample_sent = ['I hate you. You make me so upset.','I love you man. You make me so happy.','I am so unhappy and I cannot go on.']

In [45]:
# Score the sample sentences against the per-emotion centroids using the saved model.
emot_scan,emot_scan1,emot_pred,ranks,text_vecs = predict_emotions(sample_sent,'en',emot_vec_mean,'emotions1model.model')

In [47]:
text_vecs

array([-1.73958484e-02, -4.73076291e-02,  2.44500283e-02,  1.18435808e-02,
        5.73316915e-03, -9.49813332e-03, -1.14797102e-02,  1.42994467e-02,
        2.32999083e-02,  2.07103174e-02,  4.00470616e-03, -1.66813973e-02,
       -3.19475010e-02, -1.60261635e-02, -8.92755948e-03, -3.88043337e-02,
        7.72011513e-03,  4.28962894e-03, -3.63979265e-02,  9.55944136e-03,
        2.72409450e-02, -1.91568006e-02, -4.82461974e-02,  2.43208394e-03,
        5.93350502e-03, -4.11963426e-02, -2.72584595e-02,  7.64230965e-03,
       -1.34155359e-02,  1.88325197e-02, -2.51454189e-02,  4.58628759e-02,
       -9.59073193e-03,  3.14019360e-02, -4.01573442e-02, -1.92378424e-02,
       -2.12946255e-02,  7.19871977e-03, -6.12819521e-03, -4.49165329e-03,
       -9.75790899e-03,  1.26261115e-02,  4.73949611e-02, -1.71859004e-02,
        1.86703242e-02, -2.43627802e-02, -1.54108563e-02, -1.48116974e-02,
       -9.29673624e-05, -4.66659851e-03, -3.38539295e-02,  3.07363719e-02,
        7.83427339e-03, -

In [48]:
def output_tab_vecs(vecs,filename):
    """Write ``vecs`` to ``filename`` as tab-separated rows.

    NOTE: this notebook defines ``output_tab_vecs`` in more than one cell; the
    definitions should be kept identical or reduced to one.

    Parameters
    ----------
    vecs : iterable of iterables
        One row per element; every item of a row becomes one tab-separated field.
    filename : str
        Path of the file to create or overwrite.

    Raises
    ------
    csv.Error
        If a field contains a character that would need quoting; the dialect
        uses QUOTE_NONE and defines no escapechar.
    """
    csv.register_dialect('tabDialect', delimiter='\t', quoting=csv.QUOTE_NONE)
    # newline='' hands line-ending control to the csv writer; without it,
    # platforms that translate '\n' emit an extra '\r' on every row.
    with open(filename, 'w', newline='') as tab_file:
        csv.writer(tab_file, dialect='tabDialect').writerows(vecs)
    print('saved tab file as',filename)
 

In [51]:
# NOTE(review): text_vecs holds the vectors inferred for sample_sent, yet the
# file name reads like the centroid export ('emot_vecs_ctr.tsv') written
# earlier - confirm the intended name.
output_tab_vecs(text_vecs,'emot_ctr_vecs.tsv')

saved tab file as emot_ctr_vecs.tsv


In [128]:
from sklearn import preprocessing

In [129]:
# Standardize each sample's row of similarities so the emotions can be compared
# within a sample; the raw scores differ only in the later decimals.
[preprocessing.scale(x) for x in emot_scan1]

[array([ 1.26289281,  0.3773383 ,  0.29294954,  0.92111834, -1.59419402,
        -1.3346986 ,  0.07459363]),
 array([ 1.9272342 ,  0.03010905, -1.64427464, -0.08967182,  0.40652682,
        -0.63862501,  0.00870141]),
 array([-1.58932613, -0.87037189, -0.35642639,  1.75559065,  0.41378003,
         0.57536677,  0.07138697])]

In [146]:
# Print each sample's similarity scores in ascending order (.values() drops the labels).
for x in emot_scan:
    print(sorted(x.values()))

[0.993009090423584, 0.9930384159088135, 0.9931976795196533, 0.9932223558425903, 0.9932318925857544, 0.9932933449745178, 0.9933319687843323]
[0.9309930205345154, 0.9312282204627991, 0.9313566088676453, 0.9313796162605286, 0.9313846230506897, 0.931472659111023, 0.9318283200263977]
[0.9858265519142151, 0.9858867526054382, 0.9859297871589661, 0.9859656095504761, 0.985994279384613, 0.986007809638977, 0.9861066341400146]


In [162]:
# For every sample, list [emotion, similarity] pairs from most to least similar.
sort_emot = [
    [[label, score] for label, score in sorted(scores.items(), key=lambda pair: pair[1], reverse=True)]
    for scores in emot_scan
]

In [163]:
sort_emot

[[['anger', 0.9933319687843323],
  ['guilt', 0.9932933449745178],
  ['disgust', 0.9932318925857544],
  ['fear', 0.9932223558425903],
  ['shame', 0.9931976795196533],
  ['sadness', 0.9930384159088135],
  ['joy', 0.993009090423584]],
 [['anger', 0.9318283200263977],
  ['joy', 0.931472659111023],
  ['disgust', 0.9313846230506897],
  ['shame', 0.9313796162605286],
  ['guilt', 0.9313566088676453],
  ['sadness', 0.9312282204627991],
  ['fear', 0.9309930205345154]],
 [['guilt', 0.9861066341400146],
  ['sadness', 0.986007809638977],
  ['joy', 0.985994279384613],
  ['shame', 0.9859656095504761],
  ['fear', 0.9859297871589661],
  ['disgust', 0.9858867526054382],
  ['anger', 0.9858265519142151]]]

In [153]:
type(emot_scan[0])

dict