In [1]:
import pandas as pd
import numpy as np

import re
import string
import pickle


from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer


import spacy
from es_lemmatizer import lemmatize

import gensim.parsing.preprocessing as gsp
from gensim import utils

from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
tweets_corpus = pd.read_csv("AnnotatedTweets.csv", index_col=0)

In [3]:
tweets_corpus.head()

Unnamed: 0,text,polarity,favourite_count,retweet_count,created_at
1,Another class show tonight #TommyTiernanShow U...,positive,5,0,2022-02-12 23:57:06+00:00
2,The county needs more people like Tommy and Br...,positive,6,0,2022-02-12 23:55:45+00:00
3,What age are ya????\nHave you ever been examin...,positive,1,0,2022-02-12 23:47:26+00:00
4,Tommy was almost as good as Croker earlier #Go...,positive,1,0,2022-02-12 23:46:54+00:00
5,#TommyTiernanShow Brush Shields walks out and ...,neutral,2,0,2022-02-12 23:46:12+00:00


In [4]:
def create_cat_columns(tweets_corpus, choices, name_res_column):
    conditions = [
        (tweets_corpus["polarity"]  == 'positive'),
        (tweets_corpus["polarity"]  == 'neutral'),
        (tweets_corpus["polarity"]  == 'negative')]
  #  tweets_corpus[name_res_column] = np.select(conditions, choices, default='NONE')
 #   tweets_corpus[name_res_column] = tweets_corpus[name_res_column].astype(int)
    return tweets_corpus


tweets_corpus = create_cat_columns(tweets_corpus=tweets_corpus,
                                   choices=[1, 0, -1],
                                   name_res_column= "polarity")

Functions to clean tweets by removing stopwords etc

In [5]:
def create_custom_stop_words(list_new_stopwords, remove_words):
    custom_stop_words = list(set(stopwords.words('english')))
    custom_stop_words.extend(list_new_stopwords)
    custom_stop_words = [word for word in custom_stop_words if word not in remove_words]
    return custom_stop_words

def lemmatizer(text):
    sent = []
    doc = nlp(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)

def clean_text(s):
    filters = [gsp.strip_tags,
               gsp.strip_punctuation,
               gsp.strip_multiple_whitespaces,
               gsp.strip_numeric]
    s = re.sub(r'http\S+', '', s)
    s = s.lower()
    s = utils.to_unicode(s)
    s = utils.deaccent(s)
    for f in filters:
        s = f(s)
    return s

Clean Tweets

In [6]:
stopwords_nltk = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")
#nlp.add_pipe(lemmatize, after="tagger")


list_new_stopwords = ["yet", "be", "watch", "today", "your", "here",
                              "we go", "to have", "do", "have", "to go",
                              "say", "eat","so", "well", "latelateshow","https", "tommy"]
remove_words = ["no", "yes", "tommytiernanshow"]

custom_stop_words = create_custom_stop_words(list_new_stopwords, remove_words)

tweets_corpus["content_clean"] =  tweets_corpus["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in (custom_stop_words)]))
tweets_corpus["content_clean"] = tweets_corpus["content_clean"].apply(lambda x: clean_text(x))
tweets_corpus = tweets_corpus.dropna(subset=["content_clean"])
tweets_corpus["content_clean"] = tweets_corpus["content_clean"].replace('\s+', ' ', regex=True)
tweets_corpus["content_clean"] = tweets_corpus["content_clean"].replace("", np.nan)
tweets_corpus["content_clean"] = tweets_corpus["content_clean"].replace(" ", np.nan)
tweets_corpus = tweets_corpus.dropna(subset=["content_clean"])

tweets_corpus = tweets_corpus[tweets_corpus['content_clean'].apply(lambda x: len(x) > 3)]
tweets_corpus.dropna(subset=['content_clean'], inplace=True)
tweets_corpus.reset_index(drop=True, inplace=True)

tweets_corpus["lemma_clean_text"] = tweets_corpus["content_clean"].apply(lambda x: lemmatizer(x))

In [7]:
tweets_corpus.head()

Unnamed: 0,text,polarity,favourite_count,retweet_count,created_at,content_clean,lemma_clean_text
0,Another class show tonight #TommyTiernanShow U...,positive,5,0,2022-02-12 23:57:06+00:00,another class show tonight tommytiernanshow un...,another class show tonight tommytiernanshow un...
1,The county needs more people like Tommy and Br...,positive,6,0,2022-02-12 23:55:45+00:00,the county needs people like tommy brush tommy...,the county need people like tommy brush tommyt...
2,What age are ya????\nHave you ever been examin...,positive,1,0,2022-02-12 23:47:26+00:00,what age ya have ever examined simple question...,what age ya have ever examine simple question ...
3,Tommy was almost as good as Croker earlier #Go...,positive,1,0,2022-02-12 23:46:54+00:00,tommy almost good croker earlier goodsaturday ...,tommy almost good croker early goodsaturday to...
4,#TommyTiernanShow Brush Shields walks out and ...,neutral,2,0,2022-02-12 23:46:12+00:00,tommytiernanshow brush shields walks i self c...,tommytiernanshow brush shields walk i self c...


Show counts of positive, negative and neutral tweets

In [8]:
tweets_corpus["polarity"].value_counts()

positive    761
negative    356
neutral     347
Name: polarity, dtype: int64

Separate 5% of tweets

In [9]:
#separate 5% of posts for final checkup
msk = np.random.rand(len(tweets_corpus)) < 0.9
test_corpus = tweets_corpus[~msk]
tweets_corpus = tweets_corpus[msk]
tweets_corpus.reset_index(drop=True, inplace = True)

functions for training and testing model

In [10]:
def get_feature_vector(train_fit, custom_stop_words):
    vector = TfidfVectorizer(sublinear_tf=True, max_df=.9, stop_words=custom_stop_words)
    vector.fit(train_fit)
    return vector

def create_train_test(tweets_corpus, col_name_label, col_name_text, custom_stop_words):
    # Same tf vector will be used for Testing sentiments on unseen trending data
    tf_vector = get_feature_vector(np.array(tweets_corpus.loc[:, col_name_text]).ravel(),
                                   custom_stop_words)
    X = tf_vector.transform(np.array(tweets_corpus.loc[:, col_name_text]).ravel())
    y = np.array(tweets_corpus.loc[:, col_name_label]).ravel()
    indices = list(tweets_corpus.index)
    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, indices, test_size=0.2, random_state=125)
    return X_train, X_test, y_train, y_test, indices_train, indices_test, tf_vector

def train_model(model_selected, X_train, X_test, y_train, y_test):
    model = model_selected
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    print()
    print(confusion_matrix(y_test, y_predict))
    print()
    print(classification_report(y_test, y_predict))
    return model, y_predict

In [11]:
tweets_corpus.head()

Unnamed: 0,text,polarity,favourite_count,retweet_count,created_at,content_clean,lemma_clean_text
0,Another class show tonight #TommyTiernanShow U...,positive,5,0,2022-02-12 23:57:06+00:00,another class show tonight tommytiernanshow un...,another class show tonight tommytiernanshow un...
1,The county needs more people like Tommy and Br...,positive,6,0,2022-02-12 23:55:45+00:00,the county needs people like tommy brush tommy...,the county need people like tommy brush tommyt...
2,What age are ya????\nHave you ever been examin...,positive,1,0,2022-02-12 23:47:26+00:00,what age ya have ever examined simple question...,what age ya have ever examine simple question ...
3,Tommy was almost as good as Croker earlier #Go...,positive,1,0,2022-02-12 23:46:54+00:00,tommy almost good croker earlier goodsaturday ...,tommy almost good croker early goodsaturday to...
4,#TommyTiernanShow Brush Shields walks out and ...,neutral,2,0,2022-02-12 23:46:12+00:00,tommytiernanshow brush shields walks i self c...,tommytiernanshow brush shields walk i self c...


Apply different models

In [12]:
X_train, X_test, y_train, y_test, indices_train, indices_test, tf_vector = create_train_test(tweets_corpus, "polarity", 'lemma_clean_text', custom_stop_words)

model_svc, y_predict_svc= train_model(SVC(kernel="linear", tol=1e-7, C=.9),
                                       X_train, X_test, y_train, y_test)

model_rbf, y_predict_rbf = train_model(SVC(kernel="rbf", tol=1e-7, C=.9),
                                       X_train, X_test, y_train, y_test)

model_nb, y_predict_nb = train_model(MultinomialNB(alpha= 0.5),
                                       X_train, X_test, y_train, y_test)

model_LR, y_predict_LR = train_model(LogisticRegression(solver='lbfgs'),
                                       X_train, X_test, y_train, y_test)

model_RF, y_predict_RF = train_model(RandomForestClassifier(max_depth= 14, max_features= 'sqrt',n_estimators= 10),
                                       X_train, X_test, y_train, y_test)

model_GB, y_predict_GB = train_model(GradientBoostingClassifier(max_depth= 14, n_estimators= 500),
                                       X_train, X_test, y_train, y_test)

model_XG, y_predict_XG = train_model(XGBClassifier(eta= 1, gamma= 1, reg_lambda = 5, 
                                                           max_depth= 12, n_estimators= 500),
                                       X_train, X_test, y_train, y_test)




[[ 37  10  14]
 [ 14  25  25]
 [ 10  11 118]]

              precision    recall  f1-score   support

    negative       0.61      0.61      0.61        61
     neutral       0.54      0.39      0.45        64
    positive       0.75      0.85      0.80       139

    accuracy                           0.68       264
   macro avg       0.63      0.62      0.62       264
weighted avg       0.67      0.68      0.67       264


[[ 22   3  36]
 [  6  10  48]
 [  1   3 135]]

              precision    recall  f1-score   support

    negative       0.76      0.36      0.49        61
     neutral       0.62      0.16      0.25        64
    positive       0.62      0.97      0.75       139

    accuracy                           0.63       264
   macro avg       0.67      0.50      0.50       264
weighted avg       0.65      0.63      0.57       264


[[ 27   7  27]
 [  9  13  42]
 [  3   4 132]]

              precision    recall  f1-score   support

    negative       0.69      0.44      




[[ 30  11  20]
 [ 15  24  25]
 [ 12  18 109]]

              precision    recall  f1-score   support

    negative       0.53      0.49      0.51        61
     neutral       0.45      0.38      0.41        64
    positive       0.71      0.78      0.74       139

    accuracy                           0.62       264
   macro avg       0.56      0.55      0.55       264
weighted avg       0.60      0.62      0.61       264



In [13]:
# save the models.
path_model = ""

with open(path_model + 'model_svc', 'wb') as picklefile:
    pickle.dump(model_svc, picklefile)

with open(path_model + 'model_rbf', 'wb') as picklefile:
    pickle.dump(model_rbf,picklefile)

with open(path_model + 'model_nb', 'wb') as picklefile:
    pickle.dump(model_nb,picklefile)

with open(path_model + 'model_LR', 'wb') as picklefile:
    pickle.dump(model_LR,picklefile)

with open(path_model + 'model_RF', 'wb') as picklefile:
    pickle.dump(model_RF,picklefile)
    
with open(path_model + 'model_GB', 'wb') as picklefile:
    pickle.dump(model_GB,picklefile)

with open(path_model + 'model_XG', 'wb') as picklefile:
    pickle.dump(model_XG,picklefile)

with open(path_model + 'vectorizer.pk', 'wb') as fin:
     pickle.dump(tf_vector, fin)

In [22]:
class SentimentAnalysis:
    
    #path_model = "/Users/fergalflattery/OneDrive - Athlone Institute Of Technology/ML & NN/"
    with open('model_rbf', 'rb') as training_model:
        model_rbf = pickle.load(training_model)

#    with open(path_model + 'model_rbf', 'rb') as training_model:
#        model_rbf_pos = pickle.load(training_model)
#
    with open('vectorizer.pk', 'rb') as fin:
        tf_vector = pickle.load(fin)
    
    def create_custom_stop_words(self, list_new_stopwords, remove_words):
        custom_stop_words = list(set(stopwords.words('english')))
        custom_stop_words.extend(list_new_stopwords)
        custom_stop_words = [word for word in custom_stop_words if word not in remove_words]
        return custom_stop_words

    def lemmatizer(self, text,nlp):
        sent = []
        doc = nlp(text)
        for word in doc:
            sent.append(word.lemma_)
        return " ".join(sent)

    def clean_text(self, s):
        filters = [gsp.strip_tags,
                   gsp.strip_punctuation,
                   gsp.strip_multiple_whitespaces,
                   gsp.strip_numeric]
        s = re.sub(r'http\S+', '', s)
        s = s.lower()
        s = utils.to_unicode(s)
        s = utils.deaccent(s)
        for f in filters:
            s = f(s)
        return s
    
    def preprocess(self, text, list_new_stopwords, remove_words, nlp):
        custom_stop_words = self.create_custom_stop_words(list_new_stopwords, remove_words)
        text = ' '.join([word for word in text.split() if word not in (custom_stop_words)])
        text = self.clean_text(text)
        text = self.lemmatizer(text, nlp)
        return text
    
    def get_sentiment(self, text, model= model_rbf, vector= tf_vector):
        stopwords_nltk = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WordNetLemmatizer()
        nlp = spacy.load("en_core_web_sm")
        #nlp.add_pipe(lemmatize, after="tagger")

        list_new_stopwords = ["yet", "be", "watch", "today", "your", "here",
                              "we go", "to have", "do", "have", "to go",
                                      "say", "eat","so", "well"]
        remove_words = ["no", "yes"]
        text = self.preprocess(text, list_new_stopwords, remove_words, nlp)
        
        X_new = vector.transform([text])

        predict_rbf = model.predict(X_new)
#       predict_rbf_pos = model_pos.predict(X_new)

        pred = ""
        if (predict_rbf[0]== 'positive'): 
            pred = "positive"
        elif (predict_rbf[0]== 'negative'): 
            pred = "negative"
        elif (predict_rbf[0]== 'neutral'):
            pred = "neutral"
        return pred
    


In [23]:
tweets_corpus

Unnamed: 0,text,polarity,favourite_count,retweet_count,created_at,content_clean,lemma_clean_text
0,Another class show tonight #TommyTiernanShow U...,positive,5,0,2022-02-12 23:57:06+00:00,another class show tonight tommytiernanshow un...,another class show tonight tommytiernanshow un...
1,The county needs more people like Tommy and Br...,positive,6,0,2022-02-12 23:55:45+00:00,the county needs people like tommy brush tommy...,the county need people like tommy brush tommyt...
2,What age are ya????\nHave you ever been examin...,positive,1,0,2022-02-12 23:47:26+00:00,what age ya have ever examined simple question...,what age ya have ever examine simple question ...
3,Tommy was almost as good as Croker earlier #Go...,positive,1,0,2022-02-12 23:46:54+00:00,tommy almost good croker earlier goodsaturday ...,tommy almost good croker early goodsaturday to...
4,#TommyTiernanShow Brush Shields walks out and ...,neutral,2,0,2022-02-12 23:46:12+00:00,tommytiernanshow brush shields walks i self c...,tommytiernanshow brush shields walk i self c...
...,...,...,...,...,...,...,...
1312,The line-up for this Friday's episode of the L...,neutral,0,0,2022-02-24 13:00:01+00:00,the line up friday s episode late late show re...,the line up friday s episode late late show re...
1313,Friday night's show will be jam-packed with gu...,positive,2,0,2022-02-24 12:00:01+00:00,friday night s show jam packed guests host rya...,friday night s show jam pack guest host ryan t...
1314,Comedian Patrick Kielty leads lineup for this ...,positive,0,0,2022-02-24 10:47:10+00:00,comedian patrick kielty leads lineup week s la...,comedian patrick kielty lead lineup week s lat...
1315,Will you be tuning in? #latelate #latelateshow...,neutral,2,0,2022-02-24 09:46:08+00:00,will tuning in latelate latelateshow,will tune in latelate latelateshow


New tweets - test 

In [21]:
sentiment = SentimentAnalysis()
text = "that sports game was awful"
sentiment.get_sentiment(text)

'negative'

In [22]:
sentiment = SentimentAnalysis()
text = "the late late show was so bad tonight i hate the presenter "
sentiment.get_sentiment(text)

'positive'

In [23]:
sentiment = SentimentAnalysis()
text = "what a dreadful show that was."
sentiment.get_sentiment(text)

'positive'

In [24]:
sentiment = SentimentAnalysis()
text = "that was great"
sentiment.get_sentiment(text)

'positive'

In [25]:
sentiment = SentimentAnalysis()
text = "i dont know whether to be happy or sad"
sentiment.get_sentiment(text)

'positive'

In [26]:
sentiment = SentimentAnalysis()
text = "load of rubbush"
sentiment.get_sentiment(text)

'negative'

In [28]:
sentiment = SentimentAnalysis()
text = "i dont understand the person talking"
sentiment.get_sentiment(text)

'positive'

In [None]:
sentiment = SentimentAnalysis()
text = "tl"
sentiment.get_sentiment(text)