# NLP Tutorial - Sentiment Analysis with scikit-learn (Python) on the IMDB Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import dython
from datetime import datetime
from dateutil import parser
import openpyxl


In [2]:
# Load the labelled reviews from the Excel workbook in the working directory.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, presumably
# an index saved with the file — consider index_col=0; confirm against the data.
df_train = pd.read_excel('train.xlsx')


## TF-IDF - sklearn.feature_extraction.text.TfidfVectorizer
### TF-IDF = term frequency–inverse document frequency

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

In [4]:
# Preview the first rows of the raw training frame.
df_train.head()

Unnamed: 0,Reviews,Sentiment
0,"When I first tuned in on this morning news, I ...",neg
1,"Mere thoughts of ""Going Overboard"" (aka ""Babes...",neg
2,Why does this movie fall WELL below standards?...,neg
3,Wow and I thought that any Steven Segal movie ...,neg
4,"The story is seen before, but that does'n matt...",neg


# Install

pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

# Uninstall

pip uninstall preprocess_kgptalkie

# Install Dependencies
- pip install spacy
- from spacy.cli import download  # line 1
- download("en_core_web_sm")  # line 2
- pip install beautifulsoup4==4.9.1
- import spacy
- from spacy.lang.en.examples import sentences
- pip install textblob==0.15.3

In [5]:
import preprocess_kgptalkie as ps  # import the cleaning package
import re # regular expression

In [6]:

def get_clean(x):
    """Normalise one raw review into lower-case, cleaned text.

    The input is coerced to ``str``, lower-cased, stripped of backslashes and
    has underscores turned into spaces. It is then passed, in order, through
    the ``preprocess_kgptalkie`` helpers listed below. Finally, any character
    repeated three or more times in a row is collapsed to a single occurrence.

    NOTE(review): what each ``ps.*`` helper does is not visible in this
    notebook; going by their names they expand contractions and remove e-mail
    addresses, URLs, HTML tags, retweet markers, accents and special
    characters — confirm against the package.

    Parameters
    ----------
    x : any
        Raw review; converted with ``str(x)``.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(x).lower()
    text = text.replace('\\', '').replace('_', ' ')

    # Order matters: each helper receives the output of the previous one.
    cleaning_steps = (
        ps.cont_exp,
        ps.remove_emails,
        ps.remove_urls,
        ps.remove_html_tags,
        ps.remove_rt,
        ps.remove_accented_chars,
        ps.remove_special_chars,
    )
    for step in cleaning_steps:
        text = step(text)

    # Collapse runs of 3+ identical characters (e.g. "soooo" -> "so").
    return re.sub("(.)\\1{2,}", "\\1", text)

In [7]:
# Clean every review. The function is passed to apply() directly (a lambda
# wrapper adds nothing), and bracket indexing is used because it is the
# documented way to assign a DataFrame column.
df_train['Reviews'] = df_train['Reviews'].apply(get_clean)

In [24]:
# Display the frame after cleaning to inspect the normalised review text.
df_train

Unnamed: 0,Reviews,Sentiment
0,when i first tuned in on this morning news i t...,neg
1,mere thoughts of going overboard aka babes aho...,neg
2,why does this movie fall well below standards ...,neg
3,wow and i thought that any steven segal movie ...,neg
4,the story is seen before but that doesand matt...,neg
...,...,...
24995,everyone plays their part pretty well in this ...,pos
24996,it happened with assault on prescient 13 in 20...,neg
24997,my god this movie was awful i cannot complain ...,neg
24998,when i first popped in happy birthday to me i ...,neg


# Preprocessing - Splitting and transforming

In [25]:
# TF-IDF vectoriser with the vocabulary capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000)
y = df_train['Sentiment']

# Learn the vocabulary from the cleaned reviews and transform them into a
# sparse matrix.
# NOTE(review): the vectoriser is fitted on all rows before the train/test
# split, so test-set text influences the learned features — consider fitting
# on the training split only.
X = tfidf.fit_transform(df_train['Reviews'])

In [26]:
# Inspect the TF-IDF feature matrix returned by fit_transform.
X

<25000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 2845641 stored elements in Compressed Sparse Row format>

In [27]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Instantiating and fitting the model

In [34]:
# Build a linear SVM with default settings and train it on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
# NOTE(review): no random_state is passed (get_params reports None), so
# repeated runs may not be exactly reproducible.
clf = LinearSVC().fit(X_train, y_train)

# Print the hyperparameters in effect for the fitted model.
print(clf.get_params())

{'C': 1.0, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None, 'tol': 0.0001, 'verbose': 0}


# Testing the Model

In [35]:
# Predict sentiment labels for the held-out test features.
y_pred = clf.predict(X_test)

In [36]:
# Show the predicted labels.
y_pred

array(['pos', 'pos', 'neg', ..., 'pos', 'neg', 'pos'], dtype=object)

In [40]:
# Per-class precision, recall and F1 of the predictions against the true
# test labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.88      0.87      0.87      2600
         pos       0.86      0.87      0.87      2400

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



# Now let's predict on custom data

In [50]:
# Clean a hand-written review with the same function used on the training
# text, then vectorise it with the already-fitted TF-IDF vectoriser.
x = get_clean('this movie was bad')
vec = tfidf.transform([x])

In [51]:
# Inspect the vectorised custom review.
vec

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [52]:
# Predict the sentiment of the vectorised custom review.

clf.predict(vec)

array(['neg'], dtype=object)

# Saving the model

In [54]:
import pickle

In [59]:
# Persist the trained classifier. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
# NOTE(review): only `clf` is saved; the fitted `tfidf` vectoriser is what
# turns new text into features, so it presumably needs to be persisted too
# before the reloaded model can be used on raw text.
with open('prediction.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

# Loading the model

In [60]:
# pickle.load expects an open binary file object, not a path and a mode, so
# open the file first and let the context manager close it afterwards.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
with open('prediction.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)