<a href="https://colab.research.google.com/github/chandutr/Natural-Language-Toolkit-Projects/blob/main/imdb_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the IMDB reviews CSV from the hard-coded /content path into a DataFrame.
# NOTE(review): the file is decoded as latin-1; if the CSV is actually UTF-8,
# non-ASCII characters will be mis-decoded -- confirm the file's real encoding.
df=pd.read_csv(r'/content/IMDB Dataset.csv', encoding='latin-1')
# Show the loaded frame as the cell output.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
# Switch to the column names used by the rest of the notebook:
# the review body becomes 'text' and the sentiment becomes 'labels'.
df = df.rename(
    columns={
        'review': 'text',
        'sentiment': 'labels',
    }
)
df.head()

Unnamed: 0,text,labels
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
#Text preprocessing
# Define a normalization function that removes punctuation and converts the text to lower case.
import re
def remove_punctuation_lower(text):
  """Normalize a raw review string before tokenization.

  The steps run in this order:
    1. Replace HTML line-break tags (``<br>``, ``<br/>``, ``<br />``, any
       letter case) with a single space.
    2. Delete every character that is not an ASCII letter or whitespace
       (punctuation, digits, symbols).
    3. Lower-case what is left.

  Args:
    text: The raw review text as a ``str``.

  Returns:
    The normalized string. Runs of whitespace are kept as they are.
  """
  # Without this step "<br />" is reduced to the letters "br", which adds a
  # junk "br" token and glues it onto the neighbouring word
  # ("Great!<br />Loved" -> "greatbr loved").
  text=re.sub(r'<br\s*/?>',' ',text,flags=re.IGNORECASE)
  text=re.sub(r'[^a-zA-Z\s]','',text)
  return text.lower()

# Overwrite the text column with its normalized form (one call to
# remove_punctuation_lower per review).
df['text']=df['text'].apply(remove_punctuation_lower)

In [None]:
# Show the first rows of df to inspect the normalized text column.
df.head()

Unnamed: 0,text,labels
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [None]:
#Function to remove stop words
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize # Import word_tokenize
# Fetch the NLTK data packages used in this cell; when a package is already
# present NLTK only reports that it is up to date.
# 'punkt_tab' is presumably the tokenizer data word_tokenize needs -- confirm
# against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop words held in a set so membership checks in remove_stopwords are fast.
# NOTE(review): the earlier normalization strips apostrophes, so contraction
# stop words (e.g. "don't") presumably no longer match tokens such as "dont" -- verify.
stop_words=set(stopwords.words('english'))

def remove_stopwords(text):
  """Tokenize ``text`` and drop every token found in ``stop_words``.

  Args:
    text: The string to tokenize.

  Returns:
    A list of the remaining tokens, in their original order.
  """
  # Tokenization goes through word_tokenize rather than a plain split().
  return [token for token in word_tokenize(text) if token not in stop_words]

# Overwrite the text column with the filtered token lists.
# NOTE(review): this cell is not idempotent -- once it has run, df['text'] holds
# lists rather than strings, so it should not be re-run without reloading df.
df['text']=df['text'].apply(remove_stopwords)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Show the first rows of df to inspect the token lists after stop-word removal.
df.head()

Unnamed: 0,text,labels
0,"[one, reviewers, mentioned, watching, oz, epis...",positive
1,"[wonderful, little, production, br, br, filmin...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, theres, family, little, boy, jake,...",negative
4,"[petter, matteis, love, time, money, visually,...",positive


In [None]:
# Stemming
from nltk.stem import PorterStemmer

# Single PorterStemmer instance shared by every stem_text call.
stemmer = PorterStemmer()

def stem_text(tokens):
  """Stem each token with the module-level ``stemmer``.

  Args:
    tokens: An iterable of word strings.

  Returns:
    A new list holding the stem of every token, in the same order.
  """
  stems = []
  for token in tokens:
    stems.append(stemmer.stem(token))
  return stems

# Replace each review's token list with the list of stemmed tokens.
df['text'] = df['text'].apply(stem_text)

In [None]:
# Show the first rows of df to inspect the stemmed token lists.
df.head()

Unnamed: 0,text,labels
0,"[one, review, mention, watch, oz, episod, youl...",positive
1,"[wonder, littl, product, br, br, film, techniq...",positive
2,"[thought, wonder, way, spend, time, hot, summe...",positive
3,"[basic, there, famili, littl, boy, jake, think...",negative
4,"[petter, mattei, love, time, money, visual, st...",positive


In [None]:
# Install gensim, which provides the Word2Vec class imported in the next cell.
# NOTE(review): the version is unpinned; consider `%pip install gensim==<version>`
# so that a fresh run installs the same release into the kernel's environment.
!pip install gensim



In [None]:
import numpy as np
from gensim.models import Word2Vec

In [None]:
# Train Word2Vec on the token lists in df['text']: 100-dimensional vectors,
# a context window of 5, and min_count=1 so rare tokens are kept in the vocabulary.
# NOTE(review): the embeddings are fit on every row of df, i.e. before the
# train/test split made later in the notebook, so held-out reviews also shape
# the features -- confirm this is acceptable for the evaluation.
w2v_model=Word2Vec(sentences=df['text'], vector_size=100, window=5, min_count=1)
w2v_model

<gensim.models.word2vec.Word2Vec at 0x7e85213ed110>

In [None]:
# Convert each sentence to an averaged Word2Vec vector; only then can an ML algorithm be applied.
def get_avg_w2v(tokens, model, vector_size):
  """Average the vectors of the tokens that ``model.wv`` knows about.

  Args:
    tokens: An iterable of word strings.
    model: An object whose ``wv`` attribute supports ``word in wv`` and ``wv[word]``.
    vector_size: Length of the vectors stored in ``model.wv``.

  Returns:
    A NumPy array of length ``vector_size``: the mean of the matched word
    vectors, or all zeros when no token is in the vocabulary.
  """
  total = np.zeros(vector_size)  # running sum of the matched word vectors
  # Lazily yields the vector of each token present in the vocabulary.
  known_vectors = (model.wv[token] for token in tokens if token in model.wv)
  matched = 0  # number of tokens that had a vector
  for vector in known_vectors:
    total += vector
    matched += 1
  if matched == 0:
    # Nothing matched: hand back the untouched zero vector.
    return total
  return total / matched

# Build one fixed-length feature vector per review by averaging its word vectors.
# The vector length is read from the trained model instead of repeating the
# literal 100, so changing vector_size where the model is trained cannot leave
# this cell with a mismatched shape.
df['w2v_vector']=df['text'].apply(lambda tokens: get_avg_w2v(tokens, w2v_model, w2v_model.wv.vector_size))
df.head()

Unnamed: 0,text,labels,w2v_vector
0,"[one, review, mention, watch, oz, episod, youl...",positive,"[-0.17848895807093118, 0.24618826412376674, -0..."
1,"[wonder, littl, product, br, br, film, techniq...",positive,"[-0.1594821669989162, -0.053873066309218606, -..."
2,"[thought, wonder, way, spend, time, hot, summe...",positive,"[-0.25528292496683314, 0.43889036793904057, -0..."
3,"[basic, there, famili, littl, boy, jake, think...",negative,"[-0.018025807946521255, 0.20446873399695115, -..."
4,"[petter, mattei, love, time, money, visual, st...",positive,"[-0.20101051629115738, 0.3009760553124719, -0...."


In [None]:
# Shape of the averaged vector stored for the row labelled 0.
df['w2v_vector'][0].shape

(100,)

In [None]:
# Python type of the object stored for the row labelled 0.
type(df['w2v_vector'][0])

numpy.ndarray

In [None]:
# Encode the string labels as integers for the classifier: negative -> 0, positive -> 1.
df['label_encoded']=df['labels'].map({'negative':0,'positive':1})
# Series.map yields NaN for any value missing from the dict; fail here with a
# clear message instead of passing NaN targets on to model fitting.
assert df['label_encoded'].notna().all(), "unexpected value in df['labels']; expected only 'negative' or 'positive'"

In [None]:
from sklearn.model_selection import train_test_split

# Stack the per-review vectors into one feature array and pull out the integer targets.
x = np.array(df['w2v_vector'].tolist())
y = df['label_encoded'].values

# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier, with its default settings, on the training split.
model=LogisticRegression()
model.fit(x_train,y_train)

In [None]:
# Predict an encoded label (0 = negative, 1 = positive) for each held-out review.
y_pred=model.predict(x_test)
y_pred

array([0, 1, 0, ..., 0, 0, 1])

In [None]:
from sklearn.metrics import accuracy_score,classification_report
# Print the overall accuracy on the held-out split, then the per-class
# precision / recall / F1 report.
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.8554
              precision    recall  f1-score   support

           0       0.86      0.85      0.85      4961
           1       0.85      0.86      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

