# Sentiment Analysis

In [1]:
# importing libraries
import os
from string import punctuation

import contractions
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from unidecode import unidecode

In [2]:
# Location of the training CSV. The default is the original author's local
# path; set the TRAIN_CSV_PATH environment variable to run the notebook on
# another machine without editing this cell.
TRAIN_CSV_PATH = os.environ.get(
    "TRAIN_CSV_PATH", r"C:\Users\pjosh\Documents\NLP\22_JAN_2022\Train.csv"
)
data = pd.read_csv(TRAIN_CSV_PATH)
data.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


### Preprocessing

In [3]:
def remove_spaces(data):
    """Replace newline, tab and backslash noise in a text with spaces.

    Args:
        data: The raw text (a ``str``).

    Returns:
        The text with every literal two-character ``\\n`` sequence, every real
        newline, carriage return and tab character, and every remaining
        backslash replaced by a single space each. Runs of spaces are not
        collapsed.
    """
    # The escaped "\n" must be handled before the lone backslash, otherwise
    # the backslash would be consumed first and a stray "n" left behind.
    for noise in ('\\n', '\n', '\r', '\t', '\\'):
        data = data.replace(noise, ' ')
    return data

def contraction_mapping(data):
    """Return ``data`` after passing it through ``contractions.fix``.

    Args:
        data: The text to process.

    Returns:
        Whatever ``contractions.fix`` produces for ``data`` (the text with
        contractions expanded).
    """
    return contractions.fix(data)

def handle_accented(data):
    """Return ``data`` after passing it through ``unidecode``.

    Args:
        data: The text to process.

    Returns:
        Whatever ``unidecode`` produces for ``data`` (its ASCII
        transliteration).
    """
    return unidecode(data)


# English stop words, minus the negations: "no", "not" and "nor" are taken out
# of the list so that clean_data's stop-word filter does not discard them.
stopword_list = stopwords.words("english")
for negation in ("no", "not", "nor"):
    stopword_list.remove(negation)
def clean_data(data):
    """Tokenize a text and keep only its informative lowercase words.

    Args:
        data: The text to tokenize.

    Returns:
        A list of lowercased tokens that are purely alphabetic, longer than
        one character and not present in ``stopword_list``.
    """
    lowered = (token.lower() for token in word_tokenize(data))
    # Punctuation tokens never satisfy isalpha(), so the alphabetic check
    # already discards them; no separate punctuation pass is required.
    return [
        word
        for word in lowered
        if word.isalpha() and len(word) > 1 and word not in stopword_list
    ]

def lemmatization(data):
    """Lemmatize every token of a token sequence.

    Args:
        data: Iterable of word tokens.

    Returns:
        A new list holding ``WordNetLemmatizer().lemmatize(token)`` for each
        token, in the original order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in data]

def join_list(data):
    """Join a sequence of tokens into one space-separated string.

    Args:
        data: Iterable of string tokens, or an already-joined string.

    Returns:
        The tokens separated by single spaces. A plain ``str`` input is
        returned unchanged.
    """
    # A str is itself iterable, so " ".join("abc") would yield "a b c".
    # Passing strings through untouched keeps a second run of the joining
    # cell from silently splitting every review into single characters.
    if isinstance(data, str):
        return data
    return " ".join(data)
    

In [4]:
# Hold out 25% of the reviews for testing; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.25,
    random_state=42,
)

In [5]:
# Push both splits through the same cleaning steps, in order: whitespace and
# backslash cleanup, contraction handling, accent handling, tokenizing and
# filtering, then lemmatization.
clean_train_data, clean_test_data = x_train, x_test
for step in (remove_spaces, contraction_mapping, handle_accented, clean_data, lemmatization):
    clean_train_data = clean_train_data.apply(step)
    clean_test_data = clean_test_data.apply(step)

In [6]:
def filter_2(data):
    """Keep only the purely alphabetic tokens of a token sequence.

    Args:
        data: Iterable of string tokens.

    Returns:
        A new list with the tokens for which ``isalpha()`` is true, in the
        original order.
    """
    kept = []
    for token in data:
        if token.isalpha():
            kept.append(token)
    return kept

In [7]:
# Drop any non-alphabetic tokens that remain after lemmatization, for both splits.
clean_train_data, clean_test_data = (
    split.apply(filter_2) for split in (clean_train_data, clean_test_data)
)

In [8]:
# Inspect the training reviews as token lists (after cleaning, lemmatization and filter_2).
clean_train_data

26898    [fifth, grade, language, art, teacher, read, b...
27635    [low, budget, brit, pop, melodrama, focus, gir...
3036     [well, ok, watched, movie, little, year, ago, ...
5604     [would, almost, give, however, confusing, part...
36111    [full, length, feature, film, world, bridge, f...
                               ...                        
6265     [movie, one, worst, movie, ever, seen, life, w...
11284    [movie, inspiring, anyone, tough, jam, whether...
38158    [east, side, story, documentary, musical, come...
860      [one, boot, one, point, doctor, assistant, ref...
15795    [movie, horrible, lighting, terrible, camera, ...
Name: text, Length: 30000, dtype: object

In [9]:
# Collapse each token list back into one string, the input form passed to the
# vectorizers below.
clean_train_data, clean_test_data = (
    split.apply(join_list) for split in (clean_train_data, clean_test_data)
)

In [10]:
# Inspect the training reviews after join_list turned each token list into a single string.
clean_train_data

26898    fifth grade language art teacher read book stu...
27635    low budget brit pop melodrama focus girl want ...
3036     well ok watched movie little year ago pulled d...
5604     would almost give however confusing part well ...
36111    full length feature film world bridge found fi...
                               ...                        
6265     movie one worst movie ever seen life waste tim...
11284    movie inspiring anyone tough jam whether finan...
38158    east side story documentary musical comedy sta...
860      one boot one point doctor assistant refers br ...
15795    movie horrible lighting terrible camera moveme...
Name: text, Length: 30000, dtype: object

### Count Vectorizer

In [11]:
# Bag-of-words counts: the vocabulary is capped at 1000 terms and terms found
# in more than 95% of the training reviews are ignored. The vocabulary is
# learned from the training split only and then reused for the test split.
count = CountVectorizer(
    max_df=0.95,
    max_features=1000,
)
train_val = count.fit_transform(clean_train_data)
test_val = count.transform(clean_test_data)

In [12]:
# Show the repr of the sparse training matrix produced by the count vectorizer.
train_val

<30000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 1651566 stored elements in Compressed Sparse Row format>

In [13]:
# Dense preview of the count features, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
count_feature_names = (
    count.get_feature_names_out()
    if hasattr(count, "get_feature_names_out")
    else count.get_feature_names()
)
# toarray() replaces the `.A` shorthand, which newer SciPy releases no longer provide.
pd.DataFrame(train_val.toarray(), columns=count_feature_names)



Unnamed: 0,ability,able,absolutely,accent,across,act,acted,acting,action,actor,...,wrong,wrote,yeah,year,yes,yet,york,young,younger,zombie
0,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
29996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
29998,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [16]:
# List the vocabulary learned by the count vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() (converted to a plain list so the display is
# unchanged) and fall back to the old method on older releases.
(
    count.get_feature_names_out().tolist()
    if hasattr(count, "get_feature_names_out")
    else count.get_feature_names()
)

['ability',
 'able',
 'absolutely',
 'accent',
 'across',
 'act',
 'acted',
 'acting',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'adaptation',
 'add',
 'admit',
 'adult',
 'adventure',
 'age',
 'ago',
 'agree',
 'air',
 'alien',
 'alive',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amazing',
 'america',
 'american',
 'among',
 'amount',
 'animal',
 'animation',
 'annoying',
 'another',
 'answer',
 'anyone',
 'anything',
 'anyway',
 'apart',
 'apparently',
 'appeal',
 'appear',
 'appearance',
 'appears',
 'appreciate',
 'around',
 'art',
 'aside',
 'ask',
 'aspect',
 'atmosphere',
 'attack',
 'attempt',
 'attention',
 'audience',
 'average',
 'avoid',
 'award',
 'away',
 'awesome',
 'awful',
 'baby',
 'back',
 'background',
 'bad',
 'badly',
 'band',
 'barely',
 'based',
 'basic',
 'basically',
 'battle',
 'beautiful',
 'beauty',
 'became',
 'become',
 'becomes',
 'begin',
 'beginning',
 'behind',
 'belief',
 'believable',
 'believe',
 '

In [17]:
# Peek at the training labels returned by train_test_split.
y_train

26898    0
27635    1
3036     0
5604     1
36111    1
        ..
6265     0
11284    1
38158    1
860      0
15795    0
Name: label, Length: 30000, dtype: int64

In [18]:
# Naive Bayes on the count features. MultinomialNB accepts SciPy sparse input,
# so the matrix is passed as-is instead of being densified with `.A`
# (a 30000 x 1000 int64 dense array is roughly 240 MB).
# NOTE: train_val is rebound to the TF-IDF features in a later cell, so this
# cell must run before that one.
mnb_count = MultinomialNB()
mnb_count.fit(train_val, y_train)

In [20]:
# Predict on the sparse test matrix directly; MultinomialNB does not need a dense copy.
predict_mnb_count = mnb_count.predict(test_val)

In [21]:
# Display the labels predicted for the test split by the count-feature model.
predict_mnb_count

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)

In [22]:
# Test accuracy of the count-feature model, expressed as a percentage.
mnb_accuracy = 100 * accuracy_score(y_true=y_test, y_pred=predict_mnb_count)
mnb_accuracy

83.17999999999999

### TF-IDF Vectorizer

In [23]:
# TF-IDF features with the same vocabulary limits as the count vectorizer
# (at most 1000 terms, terms in more than 95% of training reviews ignored).
# NOTE: this rebinds train_val / test_val, replacing the count features.
tfidf = TfidfVectorizer(
    max_df=0.95,
    max_features=1000,
)
train_val = tfidf.fit_transform(clean_train_data)
test_val = tfidf.transform(clean_test_data)

In [24]:
# Dense preview of the TF-IDF features, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
tfidf_feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
# toarray() replaces the `.A` shorthand, which newer SciPy releases no longer provide.
pd.DataFrame(train_val.toarray(), columns=tfidf_feature_names)



Unnamed: 0,ability,able,absolutely,accent,across,act,acted,acting,action,actor,...,wrong,wrote,yeah,year,yes,yet,york,young,younger,zombie
0,0.0,0.0,0.00000,0.000000,0.0,0.000000,0.0,0.182468,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.00000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.00000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.055163,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.00000,0.139425,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.29823,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.0,0.0,0.00000,0.000000,0.0,0.132672,0.0,0.086150,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
29996,0.0,0.0,0.00000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
29997,0.0,0.0,0.00000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.00000,0.000000,0.0,0.073111,0.0,0.0,0.0,0.0,0.0,0.0
29998,0.0,0.0,0.00000,0.000000,0.0,0.000000,0.0,0.047364,0.0,0.0,...,0.07032,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Naive Bayes on the TF-IDF features. MultinomialNB accepts SciPy sparse
# input, so the matrix is passed as-is instead of being densified with `.A`
# (a 30000 x 1000 float64 dense array is roughly 240 MB).
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(train_val, y_train)

In [26]:
# Predict on the sparse test matrix directly; MultinomialNB does not need a dense copy.
predict_mnb_tfidf = mnb_tfidf.predict(test_val)

In [27]:
# Display the labels predicted for the test split by the TF-IDF model.
predict_mnb_tfidf

array([1, 1, 0, ..., 0, 1, 0], dtype=int64)

In [28]:
# Test accuracy of the TF-IDF model, expressed as a percentage.
accuracy_tfidf = 100 * accuracy_score(y_true=y_test, y_pred=predict_mnb_tfidf)
accuracy_tfidf

83.88

In [29]:
# Re-display the count-feature accuracy for comparison with the TF-IDF accuracy above.
mnb_accuracy

83.17999999999999