In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np

import nltk
# if it is the first time using nltk uncomment line below and run it and download it
# nltk.download()
from nltk import word_tokenize
from nltk.corpus import stopwords
import string

In [2]:
# Import csv training dataset. Please refer to the readme; due to size this was not included in the repository.
# on_bad_lines='warn' skips rows with the wrong number of fields and reports
# each one. It replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv('Sentiment_Analysis_Dataset.csv', on_bad_lines='warn')
df.head()

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


In [3]:
# Keep only the label and text columns, and only the rows that have text
has_text = df['SentimentText'].notna()
df_cleaned = df.loc[has_text, ['Sentiment', 'SentimentText']]
df_cleaned.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [4]:
# Shape of the filtered dataset as (rows, columns)
df_cleaned.shape

(1578612, 2)

In [5]:
# Class balance of the labels: 0 is Negative, 1 is Positive
df_cleaned['Sentiment'].value_counts()

1    790177
0    788435
Name: Sentiment, dtype: int64

In [6]:
# Renumber the rows 0..n-1 and discard the old index (drop=True).
# NOTE(review): the only row-dropping step visible in this notebook is the
# null-text filter above, so any gaps in the index presumably come from there
# rather than from lines skipped while loading the csv - confirm.
df_cleaned = df_cleaned.reset_index(drop=True)

In [7]:
def text_cleaning(df):
    """Add lower-cased, punctuation-free text and its tokens to ``df``.

    Two columns are written:

    * ``cleaned_text`` - ``SentimentText`` lower-cased, with every character
      found in ``string.punctuation`` removed.
    * ``tokenized`` - the result of ``word_tokenize`` applied to
      ``cleaned_text``.

    Uses the notebook-level imports of ``string`` and ``word_tokenize``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``SentimentText``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    # A translation table deletes all punctuation in one pass per string,
    # replacing the per-character Python filter.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Column-wise operations replace the iterrows() + df.at loop, which is
    # extremely slow on large frames and relies on the index labels being
    # unique to address a single row.
    df['cleaned_text'] = df['SentimentText'].str.lower().str.translate(strip_punctuation)
    df['tokenized'] = df['cleaned_text'].apply(word_tokenize)

    return df

In [8]:
def text_pred_cleaning(data, tfidf_vect, model):
    """Clean one raw text string and return its predicted sentiment label.

    The text is lower-cased and stripped of every character in
    ``string.punctuation`` (the same normalisation ``text_cleaning`` applies
    to the training text), wrapped in a one-element NumPy array and passed to
    ``text_pred_features``.

    Uses the notebook-level imports of ``string`` and ``numpy``; nltk is not
    needed at prediction time.

    Parameters
    ----------
    data : str
        Raw text to classify.
    tfidf_vect
        Fitted vectorizer, forwarded unchanged to ``text_pred_features``.
    model
        Fitted classifier, forwarded unchanged to ``text_pred_features``.

    Returns
    -------
    The value returned by ``text_pred_features`` for the cleaned text.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = data.lower().translate(strip_punctuation)

    return text_pred_features(np.array([cleaned]), tfidf_vect, model)

In [9]:
def text_features(df, test_size):
    """Train a TF-IDF + multinomial naive Bayes classifier and save it to disk.

    The ``cleaned_text`` column is used as input and ``Sentiment`` as the
    label. The rows are split with ``train_test_split`` (``random_state=0``),
    the vectorizer is fitted on the training split only, and the classifier is
    trained on the resulting TF-IDF matrix.

    Side effects: writes the pickled classifier to ``text_emotions_model.sav``
    and the pickled vectorizer to ``tfidf_vect.pk`` in the current working
    directory, overwriting any existing files.

    Uses the notebook-level imports of ``train_test_split``,
    ``TfidfVectorizer`` and ``MultinomialNB``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``cleaned_text`` and ``Sentiment``.
    test_size
        Forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(cls, tfidf_vect, X_train, X_test, y_train, y_test)``: the fitted
        classifier, the fitted vectorizer, and the four split pieces.
    """
    import pickle  # not imported at notebook level, so it stays local

    X = df['cleaned_text']
    y = df['Sentiment']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0)

    tfidf_vect = TfidfVectorizer(min_df=1, stop_words='english')

    # fit_transform learns the vocabulary and builds the training matrix in
    # one pass, instead of tokenising the training texts twice.
    X_train_vect = tfidf_vect.fit_transform(X_train)

    cls = MultinomialNB()
    cls.fit(X_train_vect, y_train)

    # Saving Model (context manager so the file handle is always closed)
    filename = "text_emotions_model.sav"
    with open(filename, 'wb') as model_file:
        pickle.dump(cls, model_file)

    # Saving Vectorizer
    with open('tfidf_vect.pk', 'wb') as fin:
        pickle.dump(tfidf_vect, fin)

    return cls, tfidf_vect, X_train, X_test, y_train, y_test

In [10]:
def text_pred_features(text, tfidf_vect, model):
    """Vectorize already-cleaned text and return its sentiment label.

    Parameters
    ----------
    text
        Iterable of cleaned text strings, as accepted by
        ``tfidf_vect.transform``. Only the prediction for the first element
        is returned.
    tfidf_vect
        Fitted vectorizer exposing ``transform``.
    model
        Fitted classifier exposing ``predict``; it must return 0 or 1.

    Returns
    -------
    str
        ``'Negative'`` for class 0, ``'Positive'`` for class 1.

    Raises
    ------
    KeyError
        If the model predicts a class other than 0 or 1.
    """
    # Predict on the vectorizer output as-is, exactly as the evaluation cell
    # does. The former .toarray().reshape(1, -1) built a dense copy of the row
    # and, for more than one input, flattened all rows into a single vector of
    # the wrong width.
    text_vect = tfidf_vect.transform(text)

    emotion = model.predict(text_vect)[0]

    emotions = {0: 'Negative',
                1: 'Positive'}

    return emotions[emotion]


In [11]:
# Add the 'cleaned_text' and 'tokenized' columns. text_cleaning writes the new
# columns onto the frame it receives and returns that same frame, so df2 and
# df_cleaned refer to the same object after this cell.
df2 = text_cleaning(df_cleaned)
df2.head(15)

Unnamed: 0,Sentiment,SentimentText,cleaned_text,tokenized
0,0,is so sad for my APL frie...,is so sad for my apl friend,"[is, so, sad, for, my, apl, friend]"
1,0,I missed the New Moon trail...,i missed the new moon trailer,"[i, missed, the, new, moon, trailer]"
2,1,omg its already 7:30 :O,omg its already 730 o,"[omg, its, already, 730, o]"
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,omgaga im sooo im gunna cry ive be...,"[omgaga, im, sooo, im, gunna, cry, ive, been, ..."
4,0,i think mi bf is cheating on me!!! ...,i think mi bf is cheating on me tt,"[i, think, mi, bf, is, cheating, on, me, tt]"
5,0,or i just worry too much?,or i just worry too much,"[or, i, just, worry, too, much]"
6,1,Juuuuuuuuuuuuuuuuussssst Chillin!!,juuuuuuuuuuuuuuuuussssst chillin,"[juuuuuuuuuuuuuuuuussssst, chillin]"
7,0,Sunny Again Work Tomorrow :-| ...,sunny again work tomorrow ...,"[sunny, again, work, tomorrow, tv, tonight]"
8,1,handed in my uniform today . i miss you ...,handed in my uniform today i miss you a...,"[handed, in, my, uniform, today, i, miss, you,..."
9,1,hmmmm.... i wonder how she my number @-),hmmmm i wonder how she my number,"[hmmmm, i, wonder, how, she, my, number]"


In [12]:
# Train with 20% of the rows held out as the test split. text_features also
# writes the fitted model to 'text_emotions_model.sav' and the fitted
# vectorizer to 'tfidf_vect.pk'.
model, tfidf_vect,X_train,X_test,y_train,y_test = text_features(df2, test_size=0.2)

In [13]:
from sklearn.metrics import classification_report, accuracy_score

# Vectorize the held-out texts with the fitted TF-IDF vocabulary, predict
# their labels, and report the fraction predicted correctly.
X_test_vect = tfidf_vect.transform(X_test)
y_pred = model.predict(X_test_vect)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.7593396743347808


In [14]:
# Spot-check the trained model on a hand-written sentence
txt = 'This is crazy guys'
text_pred_cleaning(txt, tfidf_vect, model)

'Positive'

In [15]:
# Spot-check: hostile phrasing
txt2 = 'Fuck all of you'
text_pred_cleaning(txt2, tfidf_vect, model)

'Negative'

In [16]:
# Spot-check: explicitly positive statement
txt3 = 'I am happy with my life'
text_pred_cleaning(txt3, tfidf_vect, model)

'Positive'

In [17]:
# Spot-check: explicitly negative statement
txt4 = 'I am feeling depressed'
text_pred_cleaning(txt4, tfidf_vect, model)

'Negative'