In [29]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

data = pd.read_csv('IMDB Dataset.csv')
data
columns = data.columns
print("Columns in the CSV file:")
for column in columns:
    print(column)



Columns in the CSV file:
review
sentiment


In [30]:
# Split the data into training and test sets (80% train, 20% test).
train_data, test_data, train_labels, test_labels = train_test_split(
    data['review'], data['sentiment'], test_size=0.2, random_state=42
)

In [31]:
# Define preprocessing functions
def preprocessed_text(text):
    # Lowercasing
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Removing Punctuation and Non-Alphanumeric Characters
    tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens]
    # Removing Stop Words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Stemming (Porter Stemmer)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed
train_features = tfidf_vectorizer.fit_transform(train_data)
test_features = tfidf_vectorizer.transform(test_data)

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Model Selection (using Logistic Regression):
model = LogisticRegression()

# Model Training:
model.fit(train_features, train_labels)

# Model Evaluation:
predictions = model.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
report = classification_report(test_labels, predictions)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [34]:
# Fine-Tuning Hyperparameters
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(train_features, train_labels)
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Hyperparameters: {'C': 1}


In [35]:
# Model Training with Best Hyperparameters:
best_C = 1
model = LogisticRegression(C=best_C)
model.fit(train_features, train_labels)

In [36]:
# Error Analysis:
misclassified_samples = test_data[test_labels != predictions]
misclassified_true_labels = test_labels[test_labels != predictions]
misclassified_predictions = predictions[test_labels != predictions]
misclassified_probabilities = model.predict_proba(test_features)[test_labels != predictions]

print("Misclassified Samples:")
for i, (sample, true_label, predicted_label, probs) in enumerate(zip(
        misclassified_samples, misclassified_true_labels, misclassified_predictions, misclassified_probabilities)):
    if i >= 10:
        break
    print(f"Sample: {sample}\nTrue Label: {true_label}\nPredicted Label: {predicted_label}")
    print(f"Probabilities for Classes [0, 1]: {probs}")
    print("\n")


Misclassified Samples:
Sample: I really liked this Summerslam due to the look of the arena, the curtains and just the look overall was interesting to me for some reason. Anyways, this could have been one of the best Summerslam's ever if the WWF didn't have Lex Luger in the main event against Yokozuna, now for it's time it was ok to have a huge fat man vs a strong man but I'm glad times have changed. It was a terrible main event just like every match Luger is in is terrible. Other matches on the card were Razor Ramon vs Ted Dibiase, Steiner Brothers vs Heavenly Bodies, Shawn Michaels vs Curt Hening, this was the event where Shawn named his big monster of a body guard Diesel, IRS vs 1-2-3 Kid, Bret Hart first takes on Doink then takes on Jerry Lawler and stuff with the Harts and Lawler was always very interesting, then Ludvig Borga destroyed Marty Jannetty, Undertaker took on Giant Gonzalez in another terrible match, The Smoking Gunns and Tatanka took on Bam Bam Bigelow and the Headshrin

In [37]:
# Train a logistic regression model
model = LogisticRegression()
model.fit(train_features, train_labels)

# Make predictions on the test data
predictions = model.predict(test_features)

# Display the results
for i in range(len(test_data)):
    review = test_data.iloc[i]
    true_label = test_labels.iloc[i]
    predicted_label = predictions[i]

    print("Review:", review)
    print("True Label:", true_label)
    print("Predicted Label:", predicted_label)
    print("\n")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Review: Even thought I'm not the biggest of Cher fans, this movie was her crowning achievement. Granted, there were long term side-effects and risks of brain damage, memory loss (and) intellectual impairment, upon the screening such a film. A 1989 survey of Moonstruck fans by the UK Advocacy Network revealed that one-third of 300 Moonstruck fans surveyed believed Moonstruck had damaged them and an astounding 80% claimed it had irreparably destroyed their minds.<br /><br />Cher plays someone very un-Cher in this movie, a dowdy young widow named Loretta living in New York with her extended family. They're anti-American, pro-Italian and always at each other in someway. She has been going out with Johnny Camarary for a while, a nice mamma's boy man, and he asks her to marry him. She says yes. I loved her mom's questions: "Do you love him Loretta?", "No.", "Good. If you love him he'll drive you crazy because they know they c

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Review: I shall not waste my time writing anything much further about how every aspect of this film is indescribably bad. That has been done in great detail already, many times over. The 'plot' started out as a very uninspiring cockney wide-boy/gangster-by-numbers bore and very quickly descended into an utter shambles. Anybody who pretends that they can see some hidden masterpiece inside this awful mess is just kidding themselves. It is now 7 or 8 years since I watched it during its 1 week run at the cinema before it was pulled, yet it sticks in my mind for being easily the most terrible film I have ever seen.<br /><br />I am only making these comments, and indeed the only reason I went to see the film, is because of the amusing fact that my brother Eddie appeared in it as the second 'heavy' in the pub scene. It was his hands that thrust a zippo lighter towards Rhys Ifan's face in the bar in 'Russia' (it was actually filmed at the former Butlins holiday camp at Barry Island). My brothe

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




True Label: negative
Predicted Label: negative


Review: Baba - Rajinikanth will never forget this name in his life. This is the movie which caused his downfall. It was released with much hype but crashed badly and laid to severe financial losses for its producers and distributors. Rajinikanth had to personally repay them for the losses incurred. Soon after its release, he tried venturing into politics but failed miserably. Its a very bad movie with horrible acting, bad-quality makeup and pathetic screenplay. Throughout the movie, Rajinikanth looks like a person suffering from some disease. I'm one of the unfortunate souls who saw Baba, first day first show in theatre. The audiences were so bored that most of them left the theatre before the intermission. Sorry, I'll not recommend this one to anyone.
True Label: negative
Predicted Label: negative


Review: This is one of those films that looks so "dated" that being that way is part of the fun. You see and hear things you would NEVER s

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Predicted Label: positive


Review: I have been meaning to see this flick for the past few months. I was actually surprised at how good it was.<br /><br />The plot revolves around a group of high school teenagers who are bullying a boy named Marty. They constantly bully him until one of them makes a horrific mistake which leaves Marty horribly burned.<br /><br />A few years later, the group of reckless teenagers are invited back to their own high school which is now abandoned for a party. Though, not one of the reckless teenagers has organized this party.<br /><br />Later through the film, the teenagers start dying in the most gruesome of ways possible. I can certainly tell you that they are gory as well.<br /><br />At the end of the film, you find out it was all a dream and none of it happened. But, Marty is in the hospital with severe burns. Although the murders didn't happen, the burns and the pranks apparently did hap

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Review: I have to admit I was deceived by the title and the summary on the back of the box. So I popped it in the vcr and kept waiting... and waiting... and waiting for something good to happen. But of course, it never does. The makers of this film should be tied to a chair and made to watch "Saving Private Ryan". Maybe they would learn something.
True Label: negative
Predicted Label: negative


Review: While watching this film recently, I constantly had to remind myself that it was made in 1957..........and in the USSR! That makes it all the more remarkable. Many of the cinematographic effects in the film seem cliched in 2002, but they were quite original in 1957. I first saw this film in 1963, when it was first released in the US, and I was struck by its originality then. Now just having seen it 40 years later, I have no reason to change my mind.
True Label: positive
Predicted Label: positive


Review: this really is an