In [None]:
!pip install kaggle



In [None]:
import os
# javascript object notation
import json
# for extracting data from the zip file returned by the kaggle api
from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
# the basic NN architecture we use
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Embedding , LSTM
# we'll use below dependency to map the words to specific numbers
from tensorflow.keras.preprocessing.text import Tokenizer
# just to ensure all the input data is of same shape
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Data collection and preprocessing -- the dataset is fetched through the Kaggle API.
# Read the API credentials from kaggle.json; the context manager closes the
# file handle as soon as the JSON has been parsed.
with open('kaggle.json') as credentials_file:
  kaggle_dictionary = json.load(credentials_file)

In [None]:
# Show which fields the credentials file provides (field names only).
kaggle_dictionary.keys()

dict_keys(['username', 'key'])

In [None]:
# Confirm the credential fields are populated WITHOUT echoing their values:
# anything displayed here is saved in the notebook and shared with it.
# NOTE(review): an earlier revision of this cell displayed the raw API key;
# treat that key as compromised and rotate it.
{field: '***' for field in kaggle_dictionary}

{'username': '***', 'key': '***'}

In [None]:
# kaggle_dictionary holds the Kaggle credentials read from kaggle.json.
# Export both fields as environment variables for the kaggle command-line
# tool that is invoked later in the notebook.
os.environ.update(
    KAGGLE_USERNAME=kaggle_dictionary['username'],
    KAGGLE_KEY=kaggle_dictionary['key'],
)

In [None]:
# Download the IMDB 50k movie-review dataset archive with the Kaggle CLI.
# Presumably authenticates through the KAGGLE_USERNAME / KAGGLE_KEY variables
# exported earlier -- confirm against the Kaggle API docs.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 35% 9.00M/25.7M [00:00<00:00, 92.9MB/s]
100% 25.7M/25.7M [00:00<00:00, 139MB/s] 


In [None]:
# Extract the dataset from the downloaded archive into the working directory.
# The archive path is relative because the Kaggle CLI saves the download into
# the working directory. The handle is named `archive` so the builtin `zip`
# is not shadowed for the rest of the notebook.
with ZipFile('imdb-dataset-of-50k-movie-reviews.zip') as archive:
  archive.extractall()

In [None]:
# List the working directory to confirm the archive was downloaded and the CSV extracted.
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


In [None]:
# Load the dataset for preprocessing.
# extractall() was called without a target path, so the CSV sits in the working
# directory; read it from there instead of a hard-coded absolute path.
data = pd.read_csv('IMDB Dataset.csv')

In [None]:
# Peek at the first rows: a free-text `review` column and a `sentiment` label.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Peek at the last rows as well, to check the file loaded through to the end.
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [None]:
# (rows, columns) of the loaded frame.
data.shape

(50000, 2)

In [None]:
# Class distribution of the labels, to check for imbalance before training.
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [None]:
# The two classes are balanced.

# Encode the sentiment labels as integers (positive -> 1, negative -> 0) so they
# can be fed to the model.
# An explicit map + int64 cast is used instead of replace(..., inplace=True):
# replace's implicit object -> int downcast is deprecated in recent pandas
# (2.2+), which would eventually leave the labels as object dtype.
# The 1/0 pass-through entries make re-running this cell a no-op, and any
# unexpected label maps to NaN so the int64 cast fails loudly.
label_by_sentiment = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
data['sentiment'] = data['sentiment'].map(label_by_sentiment).astype('int64')

In [None]:
# Re-check the first rows: `sentiment` should now hold 0/1 instead of text.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
# Same check on the last rows of the encoded frame.
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0
49999,No one expects the Star Trek movies to be high...,0


In [None]:
# Labels are encoded correctly, so split the frame into train and test sets.
# 20% of the rows are held out for testing; random_state pins the shuffle so
# the split is the same on every run.
train_data, test_data = train_test_split(data, random_state=2, test_size=0.2)
print(train_data.shape, test_data.shape, sep='\n')

(40000, 2)
(10000, 2)


In [None]:
# Data preprocessing: turn each review into a fixed-length integer sequence.

# Tokenizer (Keras) maps words to integer indices. num_words=5000 restricts the
# encoding to the 5000 most frequent words; rarer words are dropped.
tokenizer = Tokenizer(num_words=5000)
# The word -> index vocabulary is learned from the training reviews only.
tokenizer.fit_on_texts(train_data['review'])

# Replace every review by the list of indices of its (kept) words.
train_sequences = tokenizer.texts_to_sequences(train_data['review'])
test_sequences = tokenizer.texts_to_sequences(test_data['review'])

# Force every sequence to exactly 200 entries: shorter ones are padded with
# zeros, longer ones are truncated, so X_train / X_test are rectangular arrays.
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [None]:
# Inspect the padded training matrix; zeros are padding.
X_train

array([[3474,   13,  847, ...,   78,  547,  166],
       [   0,    0,    0, ...,  105, 3444,  176],
       [ 133,    6,  429, ...,  143,  155, 1198],
       ...,
       [ 195,  117,   32, ...,   27,    4,   91],
       [   0,    0,    0, ...,   19,   30,  125],
       [  38,   88, 2252, ...,   23,   30,    9]], dtype=int32)

In [None]:
# Expect (number of training reviews, 200).
X_train.shape

(40000, 200)

In [None]:
# Inspect the padded test matrix, encoded with the tokenizer fitted on the training reviews.
X_test

array([[   0,    0,    0, ...,   30,   29, 2207],
       [  19,   13,  154, ...,   11, 3764, 2469],
       [   0,    0,    0, ...,  443,  223,   50],
       ...,
       [ 321,   18,  416, ...,  196,    9,   12],
       [   0,    0,    0, ...,  706,    2,   19],
       [   0,    0,    0, ...,    8,   11,   19]], dtype=int32)

In [None]:
# Expect (number of test reviews, 200).
X_test.shape

(10000, 200)

In [None]:
# Pull out the sentiment labels that go with each split (nothing is
# concatenated: the features stay in X_*, the targets live in Y_*).
Y_train, Y_test = train_data['sentiment'], test_data['sentiment']

In [None]:
# Training labels; the index still carries the original row numbers from `data`.
print(Y_train)

5478     0
22132    0
33533    1
42605    1
48740    1
        ..
44566    0
30280    0
6637     1
35343    0
23720    0
Name: sentiment, Length: 40000, dtype: int64


In [None]:
# Test labels; the index still carries the original row numbers from `data`.
print(Y_test)

23656    0
27442    0
40162    1
8459     1
8051     1
        ..
44231    0
18034    0
33856    0
15906    1
40899    1
Name: sentiment, Length: 10000, dtype: int64


In [None]:
# Should match the number of rows in X_train.
Y_train.shape

(40000,)

In [None]:
# Should match the number of rows in X_test.
Y_test.shape

(10000,)

LSTM (Long Short-Term Memory) is a kind of RNN used for sequential data. It models the order of the words in the input instead of treating the input as a single, unordered entity.

In [None]:
# The defining feature of an RNN is its hidden state, which carries information about the part of the sequence seen so far. It is also called the **memory state** because it remembers the previous inputs to the network.

In [None]:
# Building the model: the layer stack is declared in one go.
model = Sequential([
    # Embedding: maps each of the 5000 word indices to a 128-dimensional vector
    # for every one of the 200 positions in a padded review.
    Embedding(input_dim=5000, output_dim=128, input_length=200),
    # LSTM: reads the embedded sequence, with dropout on both its input and
    # recurrent connections.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # Single sigmoid unit: the output is a score between 0 and 1.
    Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          640000    
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 771713 (2.94 MB)
Trainable params: 771713 (2.94 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
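# embedding layer: input_dim * output_dim = 5000 * 128 = 640000 parameters
# lstm is a recurrent layer with 131584 parameters: 4 gates * (128 input weights + 128 recurrent weights + 1 bias) * 128 units = 4 * 257 * 128
# dense layer takes 128 inputs and produces 1 output: 128*1 weights + 1 bias = 129 parameters
# dropout is the fraction of values randomly set to zero during training to reduce overfitting (here 0.2 for the LSTM inputs and 0.2 for its recurrent connections)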
# Dropout is a regularization technique in which some proportion of the neurons in a network are randomly "dropped out" or ignored during training. By dropping out neurons, dropout helps to reduce the co-adaptation between neurons and to prevent overfitting on the training data.
# Dropout is a regularization method where input and recurrent connections to LSTM units are probabilistically excluded from activation and weight updates while training a network. This has the effect of reducing overfitting and improving model performance.


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# as the metric reported during fit/evaluate.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Training the model: 5 epochs, mini-batches of 64, with 20% of the training
# rows held out by Keras for validation.
model.fit(
    X_train,
    Y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=5,
)

In [None]:
# Evaluation on the test split: loss first, accuracy second, one per line.
loss, accuracy = model.evaluate(X_test, Y_test)
print(loss, accuracy, sep='\n')

In [None]:
# Building a predictive System //

def predict_sentiment(review):
  """Classify a single review as 'Positive' or 'Negative'.

  Uses the notebook-level `tokenizer` and `model`, so both must already be
  fitted/trained when this is called.

  Args:
    review: Raw review text (a single string).

  Returns:
    'Positive' if the model's output for the review is above 0.5,
    otherwise 'Negative'.
  """
  # texts_to_sequences expects a list of texts, so wrap the single review.
  sequences = tokenizer.texts_to_sequences([review])
  # maxlen is an argument of pad_sequences, not of texts_to_sequences; 200 is
  # the same length the training and test matrices were padded to.
  padded_sequence = pad_sequences(sequences, maxlen=200)
  prediction = model.predict(padded_sequence)
  # One input row and one output unit -> the score is at [0][0].
  sentiment='Positive' if prediction[0][0]>0.5 else 'Negative'
  return sentiment

In [None]:
# Try the predictor on a review written in a positive tone.
review = 'The movie was very very romantic , I just loved it!'
sentiment = predict_sentiment(review)
print(f'The sentiment of the movie is {sentiment}')

In [None]:
# Try the predictor on a review written in a negative tone.
review = 'The movie was totally my 3 hr time waste , I want my money back !'
sentiment = predict_sentiment(review)
print(f'The sentiment of the movie is {sentiment}')

In [None]:
# Scratch cell: prints a fixed personal note; not part of the sentiment pipeline.
print('This project too is very practical, its good')

This project too is very practical, its good


In [None]:
# Scratch cell: prints a fixed string; not part of the sentiment pipeline.
print('20LPA+plsbg')

20LPA+plsbg
