# NLP Tutorial - Sentiment Analysis using Scikit Sklearn Python on IMDB Dataset

# Install Dependencies
- pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall
- pip install spacy
- from spacy.cli import download  # line 1
- download("en_core_web_sm")  - # line 2
- pip install beautifulsoup4==4.9.1
-  import spacy
- from spacy.lang.en.examples import sentences 
- pip install textblob==0.15.3

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import dython
from datetime import datetime
from dateutil import parser
import openpyxl
import spacy

In [6]:
df = pd.read_excel('train.xlsx')
df

Unnamed: 0,Reviews,Sentiment
0,"When I first tuned in on this morning news, I ...",neg
1,"Mere thoughts of ""Going Overboard"" (aka ""Babes...",neg
2,Why does this movie fall WELL below standards?...,neg
3,Wow and I thought that any Steven Segal movie ...,neg
4,"The story is seen before, but that does'n matt...",neg
...,...,...
24995,Everyone plays their part pretty well in this ...,pos
24996,It happened with Assault on Prescient 13 in 20...,neg
24997,My God. This movie was awful. I can't complain...,neg
24998,"When I first popped in Happy Birthday to Me, I...",neg


### TFIDF  = term frequency–inverse document frequency

# Preprocessing

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import preprocess_kgptalkie as ps  # import the cleaning package

## Removing stopwords

In [None]:
# loading spacy for stop word removal
nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words

In [None]:
# removing stopword
stopwords = stopwords
df.Reviews = df.Reviews.apply(lambda x: [item for item in x if item not in stopwords])

TypeError: 'int' object is not iterable

In [9]:
df.Reviews

# Cleaning the texts of special characters

In [11]:
import preprocess_kgptalkie as ps  # import the cleaning package
import re # regular expression

def get_clean(x):
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    x = ps.cont_exp(x)
    x = ps.remove_emails(x)
    x = ps.remove_urls(x)
    x = ps.remove_html_tags(x)
    x = ps.remove_rt(x)
    x = ps.remove_accented_chars(x)
    x = ps.remove_special_chars(x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    return x

In [13]:
df.Reviews = df.Reviews.apply(lambda x: get_clean(x))

In [14]:
df.Reviews

# Converting the text into numeric

In [22]:
tfidf = TfidfVectorizer(max_features = 5000, ngram_range =(1,2))

In [23]:
X = df['Reviews']
y = df['Sentiment']

In [24]:
# tranforming data to sparse matrix
X = tfidf.fit_transform(X)

In [25]:
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42 )

# Instantiating or fitting the model

In [27]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train, y_train)
print(clf.get_params())

# Testing the Model

In [28]:
y_pred = clf.predict(X_test)

In [29]:
y_pred

# Classification Report

In [30]:
print( classification_report (y_test, y_pred))

# Confusion matrix

In [31]:
confusion = confusion_matrix(y_test, y_pred)
FN = confusion[1][0]
TN = confusion[0][0]
TP = confusion[1][1]
FP = confusion[0][1]

In [32]:
plt.bar(['False Negative' , 'True Negative' , 'True Positive' , 'False Positive'],[FN,TN,TP,FP])

# Accuracy Score

In [33]:
clf.score(X_test, y_test)

In [34]:
accuracy_score(y_test, y_pred)

# Now lets predict on the custom data set

In [35]:
x = 'this movie was bad'
x = get_clean(x)
vec = tfidf.transform([x])

In [36]:
vec

In [37]:
# now to test

clf.predict(vec)

# Saving the model

In [38]:
import pickle

In [39]:
pickle.dump(clf, open('prediction.pkl', 'wb'))

# Loading the model

In [40]:
loaded_model = pickle.load(open('prediction.pkl', 'rb'))

In [41]:
df_test = pd.read_excel('test.xlsx')

# Testing the model on a new dataset

In [42]:
df_test[:5]

# Preprocesisng to clean the model

In [43]:
# lets clean thew review column and appy get_clean
df_test['Reviews'] = df_test['Reviews'].apply(lambda x: get_clean(x))


In [44]:
reviews = df_test['Reviews']

# Transforming the model

In [45]:
testing = tfidf.fit_transform(reviews)

In [46]:
testing

In [47]:
testing_pred = loaded_model.predict(testing)

In [48]:
df_testing = pd.DataFrame(testing_pred, columns = ['Predicted_Sentiment'])
df_testing.head()   

# Concat the dataset with predicted model

In [49]:
testing_dataset = pd.concat([df_test, df_testing], ignore_index = False, axis =1)

In [50]:
testing_dataset.head()

In [51]:
original = testing_dataset.Sentiment.value_counts()

In [52]:
predicted = testing_dataset.Predicted_Sentiment.value_counts()

In [53]:
conc = pd.concat ([original,predicted], axis = 1)

In [54]:
conc

In [55]:
fig,ax = plt.subplots(figsize = (10,4))
conc.plot(kind ='bar', ax= ax)
