Importing Required libaries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import spacy
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Loading the Dataset from Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Reviewing the dataset

In [3]:
data = pd.read_csv('/content/drive/MyDrive/Exam/labeledTrainData.tsv', delimiter='\t')

In [4]:
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [6]:
data.dtypes

id           object
sentiment     int64
review       object
dtype: object

In [7]:
data.shape

(25000, 3)

Checking for missing values

In [8]:
data.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

In [9]:
data['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [10]:
data.columns

Index(['id', 'sentiment', 'review'], dtype='object')

Removing punctuations

In [11]:
punctuationfree = ''
def remove_pun(text):
  punctuationfree = ''.join([i for i in text if i not in string.punctuation])
  return punctuationfree
data['review'] = data['review'].apply(lambda x: remove_pun(x))

Lowercase text

In [12]:
data['review'] = data['review'].apply(lambda x:x.lower())

Tokenization

In [13]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
data['review'] = data['review'].apply(lambda x: tokenizer.tokenize(x))

Stopword removal

In [None]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
  output = [i for i in text if i not in stopwords]
  return output
data['review'] = data['review'].apply(lambda x:remove_stopwords(x))

Lemmatization

In [14]:
lemma = WordNetLemmatizer()
def lemmat(text):
   lemm_text = [lemma.lemmatize(x) for x in text]
   return lemm_text
data['review'] = data['review'].apply(lambda x:lemmat(x))

Splitting X & Y

In [15]:
x = data.iloc[:,2:3]
x.head()

Unnamed: 0,review
0,"[with, all, this, stuff, going, down, at, the,..."
1,"[the, classic, war, of, the, world, by, timoth..."
2,"[the, film, start, with, a, manager, nicholas,..."
3,"[it, must, be, assumed, that, those, who, prai..."
4,"[superbly, trashy, and, wondrously, unpretenti..."


In [16]:
y= data['sentiment']
y.head()

0    1
1    1
2    0
3    0
4    1
Name: sentiment, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=42)

In [18]:
x_train['review'] = x_train['review'].apply(lambda x: ' '.join(x))
x_test['review'] = x_test['review'].apply(lambda x: ' '.join(x))

Bag of words

In [19]:
cv = CountVectorizer(stop_words="english", min_df=10, max_df=200, max_features=2000)

# Fit and transform on the training set
x_train_bow = cv.fit_transform(x_train['review']).toarray()

# Transform on the test set
x_test_bow = cv.transform(x_test['review']).toarray()

In [20]:
print(cv.vocabulary_)

{'ralph': 1415, 'mike': 1129, 'hammer': 799, 'exercise': 626, 'solve': 1659, 'surrounding': 1765, 'popcorn': 1326, 'stooge': 1714, 'accidentally': 36, 'continuity': 383, 'tight': 1829, 'lower': 1064, 'leg': 1025, 'pass': 1278, 'grand': 767, 'finale': 675, 'granted': 770, 'beer': 159, 'text': 1804, 'saint': 1539, 'mel': 1113, 'faithful': 655, '2000': 13, 'mountain': 1160, 'cash': 278, 'jackson': 943, 'roman': 1518, 'combination': 351, 'statement': 1699, 'thomas': 1816, 'designed': 491, 'program': 1374, 'hype': 875, 'worthless': 1989, 'universe': 1900, 'river': 1509, 'drew': 543, 'spiritual': 1679, 'frustrated': 723, 'southern': 1664, 'california': 254, 'creation': 420, 'driven': 546, 'montana': 1151, 'landscape': 1008, 'lake': 1007, 'worry': 1988, 'inspiring': 913, 'insight': 910, 'goal': 753, 'colour': 348, 'crowd': 429, 'posse': 1334, 'daniel': 453, 'lewis': 1033, 'brown': 232, 'learned': 1022, 'paint': 1265, 'loving': 1062, '2005': 17, 'russian': 1533, 'offensive': 1233, 'symbolism':

In [21]:
!pip install nltk
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
import gensim

In [28]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
!pip install nltk
import nltk
from nltk import sent_tokenize
from nltk import simple_preprocess



ImportError: cannot import name 'simple_preprocess' from 'nltk' (/usr/local/lib/python3.10/dist-packages/nltk/__init__.py)

In [26]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2

SyntaxError: incomplete input (<ipython-input-26-a320c1cc880f>, line 3)