## Import Libraries

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import re
import string
import nltk
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled tweet dataset from the project's artifacts folder.
data = pd.read_csv(filepath_or_buffer='../artifacts/sentiment_analysis.csv')

In [3]:
# Show the first five rows as a quick sanity check of the loaded columns.
data.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


## Data Preprocessing

In [4]:
# Lower-case every token. Splitting and re-joining also collapses any run of
# whitespace into a single space.
data['tweet'] = data['tweet'].apply(
    lambda tweet: " ".join(word.lower() for word in tweet.split())
)

In [5]:
# Blank out every whitespace-separated token that starts with http:// or https://.
# A removed URL leaves an empty token behind, so the joined text may contain
# doubled spaces; the later split() calls absorb them.
data['tweet'] = data['tweet'].apply(
    lambda tweet: " ".join(
        re.sub(r'^https?:\/\/.*[\r\n]*', '', token, flags=re.MULTILINE)
        for token in tweet.split()
    )
)

In [6]:
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without any ASCII punctuation characters.
    """
    # A deletion-only translation table drops all punctuation in a single pass.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)
# Strip punctuation from every tweet, element by element.
data['tweet'] = data['tweet'].map(remove_punctuations)

In [7]:
# Remove every run of digits. The pattern is a raw string so that `\d` reaches
# the regex engine untouched instead of being an invalid string escape
# (which triggers a warning on recent Python versions).
data['tweet'] = data['tweet'].str.replace(r'\d+', '', regex=True)

In [8]:
# Download the NLTK stopwords corpus into ../static/model (via download_dir);
# the next cell reads the English list from that directory.
nltk.download('stopwords', download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Read the English stopword list, one word per line.
# The encoding is explicit so decoding does not depend on the platform's
# locale default, matching the utf-8 used when the vocabulary file is written.
with open('../static/model/corpora/stopwords/english', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

In [10]:
# Drop stopwords from every tweet.
# The list is turned into a set once so each membership test is a hash lookup
# rather than a scan of the whole list for every word of every tweet.
# NOTE(review): punctuation was stripped in an earlier cell, so any stopword
# that contains an apostrophe can no longer match here - confirm this is intended.
stopword_set = set(stopwords)
data['tweet'] = data['tweet'].apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in stopword_set)
)

In [11]:
# Porter stemmer instance; the following cell calls ps.stem on every word.
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [12]:
# Replace each word with its Porter stem and re-join with single spaces.
data['tweet'] = data['tweet'].apply(
    lambda tweet: " ".join(map(ps.stem, tweet.split()))
)

## Vocabulary

In [13]:
# Empty token-frequency counter; filled from the cleaned tweets in the next cell.
from collections import Counter
counter = Counter()

In [14]:
# Count how often each token occurs across all tweets.
# The counter is cleared first so that re-running this cell does not add every
# count a second time (which would silently change the vocabulary threshold).
# NOTE(review): counts are taken over every row of data['tweet'], i.e. before
# the train/test split performed later in the notebook.
counter.clear()
for sentence in data['tweet']:
    counter.update(sentence.split())

In [15]:
# Vocabulary = tokens seen more than 20 times; order follows the counter's
# insertion order (first appearance in the corpus).
tokens = [token for token, occurrences in counter.items() if occurrences > 20]

In [16]:
def save_vocab(sentence, filename):
    """Write the vocabulary to ``filename``, one token per line.

    Parameters
    ----------
    sentence : iterable of str
        The tokens to persist, in the order they should appear in the file.
    filename : str or path-like
        Destination path; an existing file is overwritten.

    Notes
    -----
    Tokens are joined with ``'\\n'`` and no trailing newline is written.
    The file is encoded as UTF-8.
    """
    contents = '\n'.join(sentence)
    # The context manager closes the handle even if the write fails.
    with open(filename, 'w', encoding='utf-8') as vocab_file:
        vocab_file.write(contents)

# Persist the vocabulary next to the model artefacts.
save_vocab(sentence=tokens, filename='../static/model/vocabulary.txt')

In [17]:
# Features are the cleaned tweet text; the target is the label column.
X, y = data['tweet'], data['label']

In [18]:
# Hold out 20% of the rows for testing.
# A fixed random_state makes the split (and every score computed from it)
# reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Vectorization

In [19]:
def vectorization(data, vocab):
    """Encode each sentence as a binary bag-of-words vector over ``vocab``.

    Parameters
    ----------
    data : iterable of str
        Sentences whose words are separated by whitespace.
    vocab : sequence of str
        Vocabulary tokens; position ``i`` of every output row corresponds
        to ``vocab[i]``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with one row per sentence. Entry ``[r, i]`` is 1
        when ``vocab[i]`` occurs as a whole word in sentence ``r``, else 0.
    """
    vectorized_list = []

    for sentence in data:
        # Split once per sentence; the set makes each vocabulary lookup O(1)
        # instead of re-splitting the sentence for every vocabulary entry.
        words = set(sentence.split())
        vectorized_list.append([1 if token in words else 0 for token in vocab])

    return np.asarray(vectorized_list, dtype=np.float32)


In [20]:
# Binary bag-of-words matrix for the training tweets.
vectorized_X_train = vectorization(data=X_train, vocab=tokens)

In [21]:
# Binary bag-of-words matrix for the held-out tweets, using the same vocabulary.
vectorized_X_test = vectorization(data=X_test, vocab=tokens)

In [23]:
from imblearn.over_sampling import SMOTE
# Oversample the training set only; the test set is left untouched.
# A fixed random_state makes the synthetic samples reproducible between runs.
smote = SMOTE(random_state=42)
vectorized_X_train, y_train = smote.fit_resample(vectorized_X_train, y_train)

## Model Building and Validation

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [28]:
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score

def _print_scores(title, y_true, y_pred):
    """Print accuracy, precision, recall and F1 (rounded to 3 places) under ``title``.

    Parameters
    ----------
    title : str
        Header line printed above the four metrics.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    """
    accuracy = round(accuracy_score(y_true, y_pred), 3)
    precision = round(precision_score(y_true, y_pred), 3)
    recall = round(recall_score(y_true, y_pred), 3)
    f1 = round(f1_score(y_true, y_pred), 3)
    # Label spellings are kept exactly as they were so that fresh output
    # still matches the output already recorded in the notebook.
    print(f"{title}\n    Accuracy: {accuracy}\n    Precission: {precision}\n    Recall: {recall}\n    F1_score: {f1}")


def training_test(y_true, y_pred):
    """Print the classification metrics for predictions on the training set.

    Parameters
    ----------
    y_true : array-like
        Ground-truth training labels.
    y_pred : array-like
        Model predictions for the same rows.
    """
    _print_scores("Traning Score", y_true, y_pred)


def testing_test(y_true, y_pred):
    """Print the classification metrics for predictions on the test set.

    Parameters
    ----------
    y_true : array-like
        Ground-truth test labels.
    y_pred : array-like
        Model predictions for the same rows.
    """
    _print_scores("Testing Score", y_true, y_pred)

### Logistic Regression model

In [31]:
# Fit on the resampled training matrix; fit() returns the estimator itself.
logisticRegression = LogisticRegression().fit(vectorized_X_train, y_train)

# Predict both splits first, then report training followed by testing scores.
y_train_pred = logisticRegression.predict(vectorized_X_train)
y_test_pred = logisticRegression.predict(vectorized_X_test)

training_test(y_train, y_train_pred)
testing_test(y_test, y_test_pred)

Traning Score
    Accuracy: 0.922
    Precission: 0.898
    Recall: 0.953
    F1_score: 0.925
Testing Score
    Accuracy: 0.859
    Precission: 0.686
    Recall: 0.856
    F1_score: 0.761


### RandomForestClassifier model

In [32]:
# A fixed random_state pins the bootstrap samples and feature subsets, so the
# reported scores can be reproduced on a fresh run.
randomForestClassifier = RandomForestClassifier(random_state=42)
randomForestClassifier.fit(vectorized_X_train, y_train)

# Scores on the (resampled) training data.
y_train_pred = randomForestClassifier.predict(vectorized_X_train)
training_test(y_train, y_train_pred)

# Scores on the held-out test data.
y_test_pred = randomForestClassifier.predict(vectorized_X_test)
testing_test(y_test, y_test_pred)

Traning Score
    Accuracy: 0.999
    Precission: 0.999
    Recall: 0.999
    F1_score: 0.999
Testing Score
    Accuracy: 0.852
    Precission: 0.721
    Recall: 0.714
    F1_score: 0.717


### MultinomialNB Model

In [33]:
# Fit on the resampled training matrix; fit() returns the estimator itself.
multinomialNB = MultinomialNB().fit(vectorized_X_train, y_train)

# Predict both splits first, then report training followed by testing scores.
y_train_pred = multinomialNB.predict(vectorized_X_train)
y_test_pred = multinomialNB.predict(vectorized_X_test)

training_test(y_train, y_train_pred)
testing_test(y_test, y_test_pred)

Traning Score
    Accuracy: 0.888
    Precission: 0.858
    Recall: 0.931
    F1_score: 0.893
Testing Score
    Accuracy: 0.864
    Precission: 0.674
    Recall: 0.93
    F1_score: 0.782


### DecisionTreeClassifier model

In [34]:
# A fixed random_state makes tie-breaking between equally good splits
# deterministic, so the reported scores can be reproduced on a fresh run.
decisionTreeClassifier = DecisionTreeClassifier(random_state=42)
decisionTreeClassifier.fit(vectorized_X_train, y_train)

# Scores on the (resampled) training data.
y_train_pred = decisionTreeClassifier.predict(vectorized_X_train)
training_test(y_train, y_train_pred)

# Scores on the held-out test data.
y_test_pred = decisionTreeClassifier.predict(vectorized_X_test)
testing_test(y_test, y_test_pred)

Traning Score
    Accuracy: 0.999
    Precission: 1.0
    Recall: 0.999
    F1_score: 0.999
Testing Score
    Accuracy: 0.816
    Precission: 0.642
    Recall: 0.675
    F1_score: 0.658


### Support Vector Classifier model

In [35]:
# Fit on the resampled training matrix; fit() returns the estimator itself.
supportVector = SVC().fit(vectorized_X_train, y_train)

# Predict both splits first, then report training followed by testing scores.
y_train_pred = supportVector.predict(vectorized_X_train)
y_test_pred = supportVector.predict(vectorized_X_test)

training_test(y_train, y_train_pred)
testing_test(y_test, y_test_pred)

Traning Score
    Accuracy: 0.968
    Precission: 0.948
    Recall: 0.991
    F1_score: 0.969
Testing Score
    Accuracy: 0.876
    Precission: 0.728
    Recall: 0.841
    F1_score: 0.78


In [37]:
import pickle

# Serialize the fitted SVC next to the saved vocabulary.
# The model was trained on vectors whose columns follow the order of `tokens`,
# so anything that loads it must vectorize with the same vocabulary order.
with open("../static/model/supportVector.pickle", "wb") as file:
    pickle.dump(supportVector, file)