In [None]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv(r"/content/Corona_NLP.csv", encoding='latin1')

In [None]:
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [None]:
df1 = df.copy()

In [None]:
df1.dropna(inplace=True)

In [None]:
df1.shape

(32567, 6)

In [None]:
df1.drop(['UserName','ScreenName','Location','TweetAt'],axis=1,inplace=True)

In [None]:
df1.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
5,As news of the regionÂs first confirmed COVID...,Positive
6,Cashier at grocery store was sharing his insig...,Positive


In [None]:
df1['Sentiment'].nunique()

5

In [None]:
df1['Sentiment'].value_counts()

Positive              9110
Negative              7763
Neutral               6172
Extremely Positive    5273
Extremely Negative    4249
Name: Sentiment, dtype: int64

In [None]:
df1.loc[df['Sentiment']=='Extremely Positive','Sentiment'] = 'Positive'
df1.loc[df['Sentiment']=='Extremely Negative','Sentiment'] = 'Negative'

In [None]:
df1['Sentiment'].value_counts()

Positive    14383
Negative    12012
Neutral      6172
Name: Sentiment, dtype: int64

In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [None]:
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

In [None]:
df1['OriginalTweet'] = df1['OriginalTweet'].apply(lambda x: remove_punctuation(x))
df1['OriginalTweet'].head()

0    MeNyrbie PhilGahan Chrisitv httpstcoiFz9FAn2Pa...
1    advice Talk to your neighbours family to excha...
2    Coronavirus Australia Woolworths to give elder...
5    As news of the regionÂs first confirmed COVID...
6    Cashier at grocery store was sharing his insig...
Name: OriginalTweet, dtype: object

In [None]:
tokenizer = RegexpTokenizer(r'\w+')
df1['OriginalTweet'] = df1['OriginalTweet'].apply(lambda x: tokenizer.tokenize(x.lower()))
df1['OriginalTweet'].head()

0    [menyrbie, philgahan, chrisitv, httpstcoifz9fa...
1    [advice, talk, to, your, neighbours, family, t...
2    [coronavirus, australia, woolworths, to, give,...
5    [as, news, of, the, regionâ, s, first, confirm...
6    [cashier, at, grocery, store, was, sharing, hi...
Name: OriginalTweet, dtype: object

In [None]:
lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    lem_text = [lemmatizer.lemmatize(i) for i in text]
    return lem_text

In [None]:
df1['OriginalTweet'] = df1['OriginalTweet'].apply(lambda x: word_lemmatizer(x))
df1['OriginalTweet'].head()

0    [menyrbie, philgahan, chrisitv, httpstcoifz9fa...
1    [advice, talk, to, your, neighbour, family, to...
2    [coronavirus, australia, woolworth, to, give, ...
5    [a, news, of, the, regionâ, s, first, confirme...
6    [cashier, at, grocery, store, wa, sharing, his...
Name: OriginalTweet, dtype: object

In [None]:
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
stemmer = PorterStemmer()

def word_stemmer(text):
    stem_text = " ".join([stemmer.stem(i) for i in text])
    return stem_text

In [None]:
df1['OriginalTweet'] = df1['OriginalTweet'].apply(lambda x: word_stemmer(x))
df1['OriginalTweet']

0        menyrbi philgahan chrisitv httpstcoifz9fan2pa ...
1        advic talk to your neighbour famili to exchang...
2        coronaviru australia woolworth to give elderli...
5        a news of the regionâ s first confirm covid19 ...
6        cashier at groceri store wa share hi insight o...
                               ...                        
41147    yâ all realli shit that much more at home covi...
41149    still shock by the number of toronto supermark...
41150    i never that weâ d be in a situat amp world th...
41152    airlin pilot offer to stock supermarket shelf ...
41156    tartiicat well newus rift s are go for 70000 o...
Name: OriginalTweet, Length: 32567, dtype: object

In [None]:
x = df1['OriginalTweet']
y = df1['Sentiment']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=101)

In [None]:
X_train.shape

(22796,)

In [None]:
y_train.shape

(22796,)

In [None]:
X_test.shape

(9771,)

In [None]:
y_test.shape

(9771,)

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
count_vectorizer = CountVectorizer(stop_words='english')

In [None]:
my_pipeline = Pipeline(steps=[('CountVectorizer', CountVectorizer()),
                        ('Model', SVC())])

In [None]:
my_pipeline.fit(X_train,y_train)

Pipeline(steps=[('CountVectorizer', CountVectorizer()), ('Model', SVC())])

In [None]:
pred1 = my_pipeline.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print("classification_report",classification_report(pred1,y_test))
print("Confusion Matrix", confusion_matrix(pred1,y_test))
print("Accuracy_score", accuracy_score(pred1,y_test))

classification_report               precision    recall  f1-score   support

    Negative       0.74      0.76      0.75      3502
     Neutral       0.59      0.64      0.62      1698
    Positive       0.80      0.76      0.78      4571

    accuracy                           0.74      9771
   macro avg       0.71      0.72      0.71      9771
weighted avg       0.74      0.74      0.74      9771

Confusion Matrix [[2652  289  561]
 [ 311 1087  300]
 [ 634  460 3477]]
Accuracy_score 0.7385119230375601


In [None]:
my_pipeline2 = Pipeline(steps=[('Countvectorizer', CountVectorizer()),
                               ('Model', MultinomialNB())])

In [None]:
my_pipeline2.fit(X_train,y_train)

Pipeline(steps=[('Countvectorizer', CountVectorizer()),
                ('Model', MultinomialNB())])

In [None]:
pred2 = my_pipeline2.predict(X_test)

In [None]:

print("classification_report",classification_report(pred2,y_test))
print("Confusion Matrix", confusion_matrix(pred2,y_test))
print("Accuracy_score", accuracy_score(pred2,y_test))

classification_report               precision    recall  f1-score   support

    Negative       0.73      0.67      0.70      3906
     Neutral       0.11      0.77      0.20       270
    Positive       0.82      0.64      0.72      5595

    accuracy                           0.66      9771
   macro avg       0.56      0.69      0.54      9771
weighted avg       0.77      0.66      0.70      9771

Confusion Matrix [[2624  550  732]
 [  28  208   34]
 [ 945 1078 3572]]
Accuracy_score 0.6554088629618258


In [None]:
my_pipeline3 = Pipeline(steps=[('Countvectorizer', CountVectorizer()),
                               ('Model', KNeighborsClassifier())])

In [None]:
my_pipeline.fit(X_train,y_train)

Pipeline(steps=[('CountVectorizer', CountVectorizer()), ('Model', SVC())])

In [None]:
pred3 = my_pipeline.predict(X_test)

In [None]:

print("classification_report",classification_report(pred3,y_test))
print("Confusion Matrix", confusion_matrix(pred3,y_test))
print("Accuracy_score", accuracy_score(pred3,y_test))

classification_report               precision    recall  f1-score   support

    Negative       0.74      0.76      0.75      3502
     Neutral       0.59      0.64      0.62      1698
    Positive       0.80      0.76      0.78      4571

    accuracy                           0.74      9771
   macro avg       0.71      0.72      0.71      9771
weighted avg       0.74      0.74      0.74      9771

Confusion Matrix [[2652  289  561]
 [ 311 1087  300]
 [ 634  460 3477]]
Accuracy_score 0.7385119230375601


In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
my_pipeline4 = Pipeline(steps=[('CountVectorizer', CountVectorizer()),
                               ('Model', LogisticRegression())])

In [None]:
my_pipeline4.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('CountVectorizer', CountVectorizer()),
                ('Model', LogisticRegression())])

In [None]:
pred4 = my_pipeline4.predict(X_test)

In [None]:
print("classification_report",classification_report(pred4,y_test))
print("Confusion Matrix", confusion_matrix(pred4,y_test))
print("Accuracy_score", accuracy_score(pred4,y_test))

classification_report               precision    recall  f1-score   support

    Negative       0.80      0.80      0.80      3578
     Neutral       0.68      0.68      0.68      1818
    Positive       0.83      0.82      0.82      4375

    accuracy                           0.79      9771
   macro avg       0.77      0.77      0.77      9771
weighted avg       0.79      0.79      0.79      9771

Confusion Matrix [[2861  261  456]
 [ 285 1241  292]
 [ 451  334 3590]]
Accuracy_score 0.7872275099785079


In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
my_pipeline5 = Pipeline(steps=[('Countvectorizer', CountVectorizer()),
                               ('Model', RandomForestClassifier())])

In [None]:
my_pipeline5.fit(X_train,y_train)

Pipeline(steps=[('Countvectorizer', CountVectorizer()),
                ('Model', RandomForestClassifier())])

In [None]:
pred5 = my_pipeline5.predict(X_test)

In [None]:
print("classification_report",classification_report(pred5,y_test))
print("Confusion Matrix", confusion_matrix(pred5,y_test))
print("Accuracy_score", accuracy_score(pred5,y_test))

classification_report               precision    recall  f1-score   support

    Negative       0.67      0.75      0.71      3187
     Neutral       0.49      0.70      0.58      1279
    Positive       0.83      0.68      0.75      5305

    accuracy                           0.71      9771
   macro avg       0.66      0.71      0.68      9771
weighted avg       0.73      0.71      0.71      9771

Confusion Matrix [[2402  251  534]
 [ 172  897  210]
 [1023  688 3594]]
Accuracy_score 0.7054549176133457


SVC : 73%
Multinominal Naive Bayes : 65%
KNN : 73%
Higher Accuracy was notched up SVC and KNN by 73%.

In [None]:
import pickle

In [None]:
import joblib

In [None]:
joblib.dump(my_pipeline,'Best Model')

['Best Model']

In [None]:
test = joblib.load("/content/Best Model")

In [None]:
test.predict(['I have good immune'])

array(['Neutral'], dtype=object)

In [None]:
with open ('Best Model','wb') as f:
  pickle.dump(my_pipeline,f)

In [None]:
with open('Best Model','rb') as f:
  a=pickle.load (f)