In [1]:
#importing Modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import re
import string
import nltk

In [2]:
#Loading the Dataset
dataset=pd.read_csv("labeledTrainData.tsv",sep="\t")
dataset.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
#determining the shape of the dataset
dataset.shape

(25000, 3)

In [4]:
#checking for null values are the datatypes of the values
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [5]:
#Analysing od the Dataset
print("The Number of Unique Id",dataset['id'].nunique())
print("The Number of Unique sentiment",dataset['sentiment'].nunique())
print("The Number of Unique review",dataset['review'].nunique())

The Number of Unique Id 25000
The Number of Unique sentiment 2
The Number of Unique review 24904


In [6]:
#droping the id as the id is unique for all the data
dataset=dataset.drop('id',axis=1)
dataset.head()

Unnamed: 0,sentiment,review
0,1,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,0,The film starts with a manager (Nicholas Bell)...
3,0,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...


In [7]:
#converting review to a proper format for tokeniszation by removing punctuation and stopwords
def remove_punct(text):
    text_nonpunct="".join([char for char in text if char not in string.punctuation])
    return(text_nonpunct)
dataset['review_array_cleaned']=dataset['review'].apply(lambda x:remove_punct(x))
dataset.head()

Unnamed: 0,sentiment,review,review_array_cleaned
0,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",The Classic War of the Worlds by Timothy Hines...
2,0,The film starts with a manager (Nicholas Bell)...,The film starts with a manager Nicholas Bell g...
3,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...


In [8]:
#tokenization
def tokenize(text):
    tokens=re.split('\W+',text)
    return tokens
dataset['review_array_tokenize']=dataset['review_array_cleaned'].apply(lambda x:tokenize(x.lower()))
dataset.head()

Unnamed: 0,sentiment,review,review_array_cleaned,review_array_tokenize
0,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...,"[with, all, this, stuff, going, down, at, the,..."
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",The Classic War of the Worlds by Timothy Hines...,"[the, classic, war, of, the, worlds, by, timot..."
2,0,The film starts with a manager (Nicholas Bell)...,The film starts with a manager Nicholas Bell g...,"[the, film, starts, with, a, manager, nicholas..."
3,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...,"[it, must, be, assumed, that, those, who, prai..."
4,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, and, wondrously, unpretenti..."


In [9]:
stopword=nltk.corpus.stopwords.words('english')
def remove_stopword(tokenized_list):
    text=[word for word in tokenized_list if word not in stopword]
    return text
dataset['review_array_withoutStopword']=dataset['review_array_tokenize'].apply(lambda x:remove_stopword(x))
dataset.head()

Unnamed: 0,sentiment,review,review_array_cleaned,review_array_tokenize,review_array_withoutStopword
0,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...,"[with, all, this, stuff, going, down, at, the,...","[stuff, going, moment, mj, ive, started, liste..."
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",The Classic War of the Worlds by Timothy Hines...,"[the, classic, war, of, the, worlds, by, timot...","[classic, war, worlds, timothy, hines, enterta..."
2,0,The film starts with a manager (Nicholas Bell)...,The film starts with a manager Nicholas Bell g...,"[the, film, starts, with, a, manager, nicholas...","[film, starts, manager, nicholas, bell, giving..."
3,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...,"[it, must, be, assumed, that, those, who, prai...","[must, assumed, praised, film, greatest, filme..."
4,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, and, wondrously, unpretenti...","[superbly, trashy, wondrously, unpretentious, ..."


In [11]:
ps=nltk.PorterStemmer()
def stemming(tokenized_text):
    text=[ps.stem(word) for word in tokenized_text]
    return text
dataset['review_array_stemmer']=dataset['review_array_withoutStopword'].apply(lambda x:stemming(x))
dataset.head()

Unnamed: 0,sentiment,review,review_array_cleaned,review_array_tokenize,review_array_withoutStopword,review_array_stemmer
0,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...,"[with, all, this, stuff, going, down, at, the,...","[stuff, going, moment, mj, ive, started, liste...","[stuff, go, moment, mj, ive, start, listen, mu..."
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",The Classic War of the Worlds by Timothy Hines...,"[the, classic, war, of, the, worlds, by, timot...","[classic, war, worlds, timothy, hines, enterta...","[classic, war, world, timothi, hine, entertain..."
2,0,The film starts with a manager (Nicholas Bell)...,The film starts with a manager Nicholas Bell g...,"[the, film, starts, with, a, manager, nicholas...","[film, starts, manager, nicholas, bell, giving...","[film, start, manag, nichola, bell, give, welc..."
3,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...,"[it, must, be, assumed, that, those, who, prai...","[must, assumed, praised, film, greatest, filme...","[must, assum, prais, film, greatest, film, ope..."
4,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, and, wondrously, unpretenti...","[superbly, trashy, wondrously, unpretentious, ...","[superbl, trashi, wondrous, unpretenti, 80, ex..."


In [12]:
def clean_text(text):
    text="".join([word.lower() for word in text if word not in string.punctuation])
    tokens=re.split('\W+',text)
    text=[ps.stem(word) for word in tokens if word not in stopword]
    return text

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect=CountVectorizer(analyzer=clean_text)
X_counts=count_vect.fit_transform(dataset['review'])
print(X_counts.shape)
print(count_vect.get_feature_names())

(25000, 92247)




In [14]:
X_counts

<25000x92247 sparse matrix of type '<class 'numpy.int64'>'
	with 2450004 stored elements in Compressed Sparse Row format>

In [15]:
X_counts_df=pd.DataFrame(X_counts.toarray())
X_counts_df

MemoryError: Unable to allocate 17.2 GiB for an array with shape (25000, 92247) and data type int64

In [None]:
#as its a huge database taking a part of it

In [16]:
data_sample=dataset[0:100]
data_sample.head()

Unnamed: 0,sentiment,review,review_array_cleaned,review_array_tokenize,review_array_withoutStopword,review_array_stemmer
0,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...,"[with, all, this, stuff, going, down, at, the,...","[stuff, going, moment, mj, ive, started, liste...","[stuff, go, moment, mj, ive, start, listen, mu..."
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",The Classic War of the Worlds by Timothy Hines...,"[the, classic, war, of, the, worlds, by, timot...","[classic, war, worlds, timothy, hines, enterta...","[classic, war, world, timothi, hine, entertain..."
2,0,The film starts with a manager (Nicholas Bell)...,The film starts with a manager Nicholas Bell g...,"[the, film, starts, with, a, manager, nicholas...","[film, starts, manager, nicholas, bell, giving...","[film, start, manag, nichola, bell, give, welc..."
3,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...,"[it, must, be, assumed, that, those, who, prai...","[must, assumed, praised, film, greatest, filme...","[must, assum, prais, film, greatest, film, ope..."
4,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, and, wondrously, unpretenti...","[superbly, trashy, wondrously, unpretentious, ...","[superbl, trashi, wondrous, unpretenti, 80, ex..."


In [17]:
count_vect=CountVectorizer(analyzer=clean_text)
X_counts=count_vect.fit_transform(data_sample['review'])
print(X_counts.shape)
print(count_vect.get_feature_names())

(100, 3743)
['', '0', '089', '1', '10', '10000', '101', '10x10', '12', '145', '147', '147br', '14minut', '15', '1600', '166', '17', '1880', '1930', '1933', '1938', '1939', '1950', '1951', '1960', '1964', '1968', '1971', '1974', '1975', '1978', '1980', '1982br', '1983', '1985', '1986', '1988', '1991', '1996', '2', '20', '2000', '2004', '2007', '2022', '2995', '2d', '3', '30', '300', '3000', '3d', '4', '40', '40orso', '410', '45', '4510', '4th', '5', '50', '6', '60sa', '65', '7', '70', '710', '75', '8', '80', '810', '87', '9', '90', '93', '995', 'abandon', 'abc', 'abil', 'abl', 'abomin', 'absenc', 'absolut', 'absorb', 'absurd', 'abundantli', 'abyss', 'academi', 'accent', 'accentu', 'accept', 'accid', 'accident', 'accompani', 'accomplic', 'accomplish', 'accord', 'accustom', 'achiev', 'ackland', 'acquir', 'across', 'act', 'actingther', 'action', 'activ', 'activist', 'actor', 'actorsthey', 'actress', 'actsbr', 'actual', 'ad', 'adam', 'adamantli', 'adapt', 'add', 'addict', 'addit', 'admir', 



In [19]:
X_counts_df=pd.DataFrame(X_counts.toarray())
X_counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3733,3734,3735,3736,3737,3738,3739,3740,3741,3742
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
X_counts_df['len']=data_sample['review'].apply(lambda x:len(x)-x.count(" "))
X_counts_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3734,3735,3736,3737,3738,3739,3740,3741,3742,len
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1870
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,789
2,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,2072
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1867
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1865


In [24]:
import string
def count_punct(text):
    count=sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")),3)
X_counts_df['punct%']=data_sample['review'].apply(lambda x:count_punct(x))
X_counts_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3735,3736,3737,3738,3739,3740,3741,3742,len,punct%
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1870,0.036
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,789,0.053
2,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,2072,0.034
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1867,0.05
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1865,0.035


In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold,cross_val_score

rf=RandomForestClassifier(n_jobs=-1)
k_fold=KFold(n_splits=5)
cross_val_score(rf,X_counts_df,data_sample['sentiment'],cv=k_fold,scoring='accuracy',n_jobs=-1)

array([0.45, 0.55, 0.6 , 0.65, 0.7 ])

In [None]:
#the maximum accuracy of the model is 7