In [None]:
#  Importing the libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import re

In [None]:
# downloading the stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# making dataframe object from the dataset and renaming the columns of the dataframe

columns = ['id', 'country', 'sentiment', 'text']
df = pd.read_csv('twitter_training.csv', names=columns)
df

Unnamed: 0,id,country,sentiment,text
0,target,word,label,tweet
1,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
2,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
3,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
4,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
...,...,...,...,...
74678,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74679,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74680,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74681,9200,Nvidia,Positive,Just realized between the windows partition of...


In [None]:
# dropping the first row containing previous column names 

df.drop(0, inplace=True)
df

Unnamed: 0,id,country,sentiment,text
1,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
2,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
3,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
4,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
5,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74678,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74679,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74680,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74681,9200,Nvidia,Positive,Just realized between the windows partition of...


In [None]:
# checking the dataset for overall info about column data types, null values

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 1 to 74682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74682 non-null  object
 1   country    74682 non-null  object
 2   sentiment  74682 non-null  object
 3   text       73996 non-null  object
dtypes: object(4)
memory usage: 2.3+ MB


In [None]:
# getting info about the classes of sentiments  
df['sentiment'].value_counts()

sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [None]:
# null value count in each column

df.isnull().sum()

id             0
country        0
sentiment      0
text         686
dtype: int64

In [None]:
# only 686 rows in 'text' column contain null values out of 74682 values which is less than 1% of the total, 
# hence dropping the rows that contain null values 

df.dropna(inplace=True)

In [None]:
# reducing the number of columns which are relevant

df = df[['sentiment', 'text']]

In [12]:
df

Unnamed: 0,sentiment,text
1,Positive,im getting on borderlands and i will murder yo...
2,Positive,I am coming to the borders and I will kill you...
3,Positive,im getting on borderlands and i will kill you ...
4,Positive,im coming on borderlands and i will murder you...
5,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
74678,Positive,Just realized that the Windows partition of my...
74679,Positive,Just realized that my Mac window partition is ...
74680,Positive,Just realized the windows partition of my Mac ...
74681,Positive,Just realized between the windows partition of...


In [None]:
# cleaning the dataset

from nltk.stem.porter import PorterStemmer

stop_words = set(stopwords.words('english')) # storing all the stopwords in variable 'stop_words'
stop_words.remove('not') # removing 'not' from the set of stopwords 

def clean_text(text):
    text = re.sub(r'https\S+','', text) # remove urls
    text = re.sub(r'@\w+|#\w+','',text) # remove mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]','',text) # removes punctuation/numbres
    text = text.lower() # turns text into lowercase
    words = text.split()
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words if w not in stop_words] # removes stopwords
    return ' '.join(words)

In [14]:
# applying the function to clean texts in df['text'] row wise

df['clean_text'] = df['text'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['text'].apply(clean_text)


In [15]:
# load english language model and create nlp object from it
# nlp = spacy.load("en_core_web_sm")

In [None]:
# Another way to preprocess the text data using inbuilt "en_core_web_sm" model of spacy module

# def preprocess(text):
#     # remove stop words and lemmatize the text
#     doc = nlp(text)
#     filtered_tokens = [token.lemma_ for token in doc if token.is_stop or token.is_punct]    
#     return " ".join(filtered_tokens)

# df['clean_text'] = df['text'].apply(preprocess)

In [None]:
# label encoding the target column

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['sentiment'] = encoder.fit_transform(df['sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = encoder.fit_transform(df['sentiment'])


In [None]:
# checking the dataframe for encoded target column
df.head()

Unnamed: 0,sentiment,text,clean_text
1,3,im getting on borderlands and i will murder yo...,im get borderland murder
2,3,I am coming to the borders and I will kill you...,come border kill
3,3,im getting on borderlands and i will kill you ...,im get borderland kill
4,3,im coming on borderlands and i will murder you...,im come borderland murder
5,3,im getting on borderlands 2 and i will murder ...,im get borderland murder


In [None]:
# Vectorization of the text column

vectorizer = CountVectorizer()
x = vectorizer.fit_transform(df['clean_text'])
y = df['sentiment']

In [22]:
print(x)

  (0, 11930)	1
  (0, 9654)	1
  (0, 2966)	1
  (0, 16204)	1
  (1, 4694)	1
  (1, 2964)	1
  (1, 13451)	1
  (2, 11930)	1
  (2, 9654)	1
  (2, 2966)	1
  (2, 13451)	1
  (3, 11930)	1
  (3, 2966)	1
  (3, 16204)	1
  (3, 4694)	1
  (4, 11930)	1
  (4, 9654)	1
  (4, 2966)	1
  (4, 16204)	1
  (5, 11930)	1
  (5, 9654)	1
  (5, 2966)	1
  (5, 16204)	1
  (6, 23789)	1
  (6, 11428)	1
  :	:
  (73993, 18169)	1
  (73994, 14171)	1
  (73994, 29394)	1
  (73994, 9139)	1
  (73994, 7740)	1
  (73994, 20941)	1
  (73994, 11719)	1
  (73994, 2140)	1
  (73994, 17012)	1
  (73994, 14746)	1
  (73994, 3744)	1
  (73994, 28657)	1
  (73994, 6829)	1
  (73994, 17148)	1
  (73994, 18169)	1
  (73995, 14171)	2
  (73995, 29394)	1
  (73995, 11719)	1
  (73995, 6192)	1
  (73995, 2140)	1
  (73995, 17012)	1
  (73995, 14746)	1
  (73995, 28657)	1
  (73995, 6829)	1
  (73995, 18169)	1


In [None]:
# splitting the dataset into the train and the test set

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# applying the multinomial naive bayes model to the training set

model = MultinomialNB()
model.fit(x_train, y_train)

In [None]:
# Above code can be run using a pipeline that makes vectorizer and model one after another:

# Create classifier
# clf = Pipeline([
#     ('vectorizer_tri_grams', CountVectorizer()),
#     ('naive_bayes', (MultinomialNB()))         
# ])

In [None]:
# prdicting the results using the test set

y_pred = model.predict(x_test)

In [None]:
# Evaluating the results based on a number of parameters

print('Accuracy: \n', accuracy_score(y_test, y_pred))

Accuracy: 
 0.7114864864864865


In [None]:
# getting the classification report

print('Classification_report: \n', classification_report(y_test, y_pred))

Classification_report: 
               precision    recall  f1-score   support

           0       0.79      0.54      0.64      2575
           1       0.68      0.83      0.75      4472
           2       0.79      0.59      0.67      3622
           3       0.68      0.79      0.73      4131

    accuracy                           0.71     14800
   macro avg       0.73      0.69      0.70     14800
weighted avg       0.72      0.71      0.71     14800



### Prediting the results using a random text string

In [29]:
test_string = 'we are sad'

In [30]:
# test_string = input('enter the string for prediction: ')

In [31]:
cleaned_text = clean_text(test_string)

In [32]:
data = vectorizer.transform([cleaned_text])
res = encoder.inverse_transform(model.predict(data))

In [33]:
print(res)

['Negative']


### Saving the model as trhe pickle file

In [None]:
import joblib

joblib.dump(model, 'twitter_sentiment_analysis.pkl')

['twitter_sentiment_analysis.pkl']

In [None]:
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']