In [1]:
import pandas as pd

In [2]:
df= pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.sample(5)

Unnamed: 0,review,sentiment
5651,Ugh. Stephen Baldwin. I never noticed until I ...,negative
5308,"This is a thoroughly enjoyable, well-acted fil...",positive
29254,It's very sad that Lucian Pintilie does not st...,negative
7215,I recently watched Caprica again and thought I...,positive
17588,This is beyond stupid. <br /><br />Two high sc...,negative


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
df.shape

(50000, 2)

In [7]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [8]:
## applying the lowecasing
df['review'] = df['review'].str.lower()

In [9]:
df['review'].head()

0    one of the other reviewers has mentioned that ...
1    a wonderful little production. <br /><br />the...
2    i thought this was a wonderful way to spend ti...
3    basically there's a family where a little boy ...
4    petter mattei's "love in the time of money" is...
Name: review, dtype: object

In [10]:
# removing the html tags
import re
def remove_tags(text):
    if isinstance(text, str):
        pattern = re.compile('<.*?>')
        return pattern.sub(r'', text)
    else:
        return text

In [11]:
df['review'] = df['review'].apply(remove_tags)

In [12]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [13]:
# remove urls
import re
def remove_url(text):
    if isinstance(text, str):
        pattern = re.compile(r'https?://\S+|www\.\S+')
        return pattern.sub(r'', text)
    else:
        return text  # Return non-strings as is

In [14]:
df['review'] = df['review'].apply(remove_url)

In [15]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [16]:
## remove punctuation
import string
string.punctuation
exclude = string.punctuation


In [17]:
def remove_punc1(text):
    if isinstance(text, str):
        return text.translate(str.maketrans('', '', exclude))
    else:
        return text  # Return non-strings as is

In [18]:
df['review'] = df['review'].apply(remove_punc1)

In [19]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [20]:
# ## second method using spellcheck library
# from spellchecker import SpellChecker

# def correct_spelling1(text):
#     # Initialize the spell checker
#     spell = SpellChecker()

#     # Tokenize the text into words
#     words = text.split()

#     # Correct the spelling of each word, or keep the original if not found
#     corrected_words = [spell.correction(word) if spell.correction(word) else word for word in words]

#     # Reconstruct the corrected text
#     corrected_text = ' '.join(corrected_words)

#     return corrected_text

In [21]:
#df['review'] = df['review'].apply(correct_spelling1)

In [22]:
#removing stopwords
from nltk.corpus import stopwords

In [23]:
def remove_stopwords1(text):
    if isinstance(text, str):
        stop_words = set(stopwords.words('english'))
        words = text.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return " ".join(filtered_words)
    else:
        return text

In [24]:
df['review'] = df['review'].apply(remove_stopwords1)

In [25]:
df.head(5)

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive


In [26]:
#tokenization
from nltk.tokenize import word_tokenize, sent_tokenize

In [27]:
# Tokenize the 'title' column
df['review'] = df['review'].apply(lambda x: word_tokenize(x) if isinstance(x, str) else x)

In [28]:
df.head(5)

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",positive
1,"[wonderful, little, production, filming, techn...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, theres, family, little, boy, jake,...",negative
4,"[petter, matteis, love, time, money, visually,...",positive


## Find out the number of words in the entire corpus and also the total number of unique words(vocabulary) using just python

In [29]:
# Flatten the list of tokenized words into a single list
all_words = [word for review in df['review'] if isinstance(review, list) for word in review]

# Total number of words
total_words = len(all_words)

# Number of unique words
unique_words = len(set(all_words))

print(f"Total number of words: {total_words}")
print(f"Total number of unique words: {unique_words}")

Total number of words: 5996452
Total number of unique words: 222184


## feature engineering techniques 1. one hot encoding 2. bag of words 3. N gram 4. Tf-IDF

#### BAG OF WORDS

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
cv = CountVectorizer()

In [32]:
# Assuming df['review'] contains lists of lemmas
df['review_joined'] = df['review'].apply(lambda x: ' '.join(x))

# Now use CountVectorizer
bow = cv.fit_transform(df['review_joined'])

In [33]:
print(cv.vocabulary_)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [34]:
print(bow[0].toarray())

[[0 0 0 ... 0 0 0]]


## Apply bag of bi-gram and bag of tri-gram and write down your observation about the dimensionality of the vocabulary

In [35]:
cv1 = CountVectorizer(ngram_range=(2,2))

In [36]:
# Assuming df['review'] contains lists of lemmas
df['review_joined'] = df['review'].apply(lambda x: ' '.join(x))

# Now use CountVectorizer
bow1 = cv1.fit_transform(df['review_joined'])

In [37]:
print(cv1.vocabulary_)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [38]:
print(bow1[0].toarray())
print(bow1[1].toarray())

[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]


In [39]:
cv2 = CountVectorizer(ngram_range=(3,3))

In [40]:
# Assuming df['review'] contains lists of lemmas
df['review_joined'] = df['review'].apply(lambda x: ' '.join(x))

# Now use CountVectorizer
bow2 = cv2.fit_transform(df['review_joined'])

In [41]:
print(bow2[0].toarray())
print(bow2[1].toarray())

[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]


## Apply tf-idf and find out the idf scores of words, also find out the vocabulary.

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
tfidf = TfidfVectorizer()

In [44]:
Tf = tfidf.fit_transform(df['review_joined'])
#similarity = cosine_similarity(Tf)

In [45]:
#similarity[0]

## Applying the machine learning algorithm

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [47]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(Tf, df['sentiment'], test_size=0.2, random_state=42)


In [48]:
X_train.shape

(40000, 221215)

In [49]:
y_train.shape

(40000,)

In [50]:
le = LabelEncoder()

In [51]:
le.fit(y_train)

In [52]:
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [53]:
# Check the encoding
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

print("Label Encoding Mapping:")
print(label_mapping)

Label Encoding Mapping:
{'negative': 0, 'positive': 1}


In [54]:
y_train

array([0, 0, 1, ..., 0, 1, 1])

In [55]:
y_test

array([1, 1, 0, ..., 1, 0, 1])

In [56]:
# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(X_train, y_train)


In [57]:
# Predict the labels for the test set
y_pred = nb_classifier.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8689

Confusion Matrix:
 [[4373  588]
 [ 723 4316]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87      4961
           1       0.88      0.86      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [60]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

# Example classifiers
clf1 = LogisticRegression(random_state=1)


clf1.fit(X_train, y_train)

y_pred = clf1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Accuracy: 0.8961

Confusion Matrix:
 [[4376  585]
 [ 454 4585]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [61]:
# Example classifiers
clf2 = RandomForestClassifier(random_state=1)


clf2.fit(X_train, y_train)

y_pred = clf2.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

KeyboardInterrupt: 

In [None]:
# Example classifiers
clf4 = SVC(probability=True)


clf4.fit(X_train, y_train)

y_pred = clf4.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
clf5 = DecisionTreeClassifier(random_state=1)


clf5.fit(X_train, y_train)

y_pred = clf5.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
logreg = LogisticRegression()
multinb = MultinomialNB()
voting_clf = VotingClassifier(estimators=[('lr', logreg), ('nb', multinb)], voting='hard')
# Assuming X_train and y_train are your data
voting_clf.fit(X_train, y_train)
# Making predictions
predictions = voting_clf.predict(X_test)

# Evaluate the model
# Assuming you have some evaluation metric in place
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Example new reviews
new_reviews = ["This movie was excellent, with great acting and storyline.",
               "I did not like this movie, it was boring and too long."]

# Convert new reviews to TF-IDF representation (using the same tfidf vectorizer)
new_reviews_tfidf = tfidf.transform(new_reviews)

# Predict the classes for the new reviews
predicted_classes = nb_classifier.predict(new_reviews_tfidf)

print(predicted_classes)


In [None]:
import pickle

filename = 'sentiment_model.sav'
pickle.dump(nb_classifier, open(filename, 'wb'))

loaded_model = pickle.load(open('sentiment_model.sav', 'rb'))


In [None]:
loaded_model = pickle.load(open('sentiment_model.sav', 'rb'))

In [None]:
# Example new reviews
new_reviews = ["This movie was excellent, with great acting and storyline.",
               "I did not like this movie, it was boring and too long."]

# Convert new reviews to TF-IDF representation (using the same tfidf vectorizer)
new_reviews_tfidf = tfidf.transform(new_reviews)

# Predict the classes for the new reviews
predicted_classes = loaded_model.predict(new_reviews_tfidf)

print(predicted_classes)