In [98]:
import pandas as pd
import re
import spacy
from nltk.stem import WordNetLemmatizer
import gensim
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [30]:
def remove_link_menzione(doc):
    pattern_link="https\S+"
    pattern_menzione="@\S+"
    doc=re.sub(pattern_link, '',doc)
    doc=re.sub(pattern_menzione," ",doc)
    return doc

In [31]:
def find_and_merge_not(lista,all_stopwords):
    tmp=''
    parole_esaminate=[]
    for parole in lista:
        if parole == 'not':
            tmp='not_'
        elif tmp=='not_' and parole not in all_stopwords:
            parole=tmp+parole
            tmp=''
        if parole != 'not' and parole not in all_stopwords:
            parole_esaminate.append(parole)
    return parole_esaminate

In [32]:
def clear_corpus(tweets):                
    tokens_ = [list(gensim.utils.tokenize(remove_link_menzione(tweet), lower=True)) for tweet in tweets]
    
    sp = spacy.load('en_core_web_sm')
    all_stopwords = sp.Defaults.stop_words

    wnl = WordNetLemmatizer()
    lemmatized_string=[[wnl.lemmatize(words) for words in find_and_merge_not(lista,all_stopwords)] for lista in tokens_]
    bigram_mdl = gensim.models.phrases.Phrases(lemmatized_string, min_count=1, threshold=2)##farglielo fare su tutto il corpus tokenizzato altrimenti non funziona se lo fai su una sola frase
    
    post=[bigram_mdl[l] for l in lemmatized_string]
    
    #lemmatized_bigram_string=[' '.join(word) for word in post]
    return post,bigram_mdl

In [33]:
df=pd.read_csv("text_emotion_esercizio.csv")
df=df[["sentiment","content"]]
bigramLS_post,bigram_mdl=clear_corpus(df["content"])
df["content"]=[Counter(tmp) for tmp in bigramLS_post]

In [34]:
vectorizer = DictVectorizer(sparse = True)

In [35]:
X_all=vectorizer.fit_transform(list(df["content"]))
y_all=list(df["sentiment"])

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, random_state = 123)

In [37]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

MultinomialNB()

In [38]:
print("Accuracy Train",accuracy_score(y_train, mnb.predict(X_train)))
print(classification_report(y_train, mnb.predict(X_train),zero_division=0))

Accuracy Train 0.61659375
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        83
     boredom       1.00      0.01      0.01       151
       empty       0.98      0.06      0.11       672
  enthusiasm       1.00      0.04      0.08       629
         fun       0.98      0.20      0.33      1425
   happiness       0.66      0.72      0.69      4148
        hate       0.99      0.19      0.32      1093
        love       0.77      0.61      0.68      3059
     neutral       0.60      0.78      0.68      6861
      relief       0.99      0.13      0.23      1224
     sadness       0.76      0.59      0.66      4157
    surprise       0.99      0.19      0.32      1758
       worry       0.51      0.89      0.65      6740

    accuracy                           0.62     32000
   macro avg       0.79      0.34      0.37     32000
weighted avg       0.71      0.62      0.58     32000



In [39]:
print("Accuracy Test",accuracy_score(y_test, mnb.predict(X_test)))
print(classification_report(y_test, mnb.predict(X_test),zero_division=0))

Accuracy Test 0.316375
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        27
     boredom       0.00      0.00      0.00        28
       empty       0.00      0.00      0.00       155
  enthusiasm       0.00      0.00      0.00       130
         fun       0.00      0.00      0.00       351
   happiness       0.31      0.31      0.31      1061
        hate       0.27      0.01      0.02       230
        love       0.45      0.30      0.36       783
     neutral       0.33      0.44      0.38      1777
      relief       0.00      0.00      0.00       302
     sadness       0.28      0.18      0.22      1008
    surprise       0.09      0.00      0.01       429
       worry       0.30      0.58      0.40      1719

    accuracy                           0.32      8000
   macro avg       0.16      0.14      0.13      8000
weighted avg       0.27      0.32      0.27      8000



In [57]:
def tweet_adapter(tweet,vectorizer,bigram_mdl):
    lista_token = gensim.utils.tokenize(remove_link_menzione(tweet), lower=True)
    
    sp = spacy.load('en_core_web_sm')
    all_stopwords = sp.Defaults.stop_words

    wnl = WordNetLemmatizer()
    lemmatized_string=[wnl.lemmatize(words) for words in find_and_merge_not(lista_token,all_stopwords)]
        
    post=bigram_mdl[lemmatized_string]
    return vectorizer.transform([Counter(post)])

In [94]:
def list_prob(tweet,vectorizer,bigram_mdl,model):
    tmp = model.predict_proba(tweet_adapter(tweet,vectorizer,bigram_mdl))
    tmp2=list(tmp[0])
    class2=list(model.classes_)
    return sorted(dict(zip(class2,tmp2)).items(), key=lambda x: x[1], reverse=True)


In [96]:
a=list_prob("I am so happy",vectorizer,bigram_mdl,mnb)