In [1]:
!pip install kagglehub pandas tabulate



In [2]:
import os
import re

import kagglehub
import numpy as np
import pandas as pd
from tabulate import tabulate

In [3]:
# Download the IMDB 50k movie-review dataset via kagglehub and build the path
# to the CSV file inside the returned directory.
dataset_slug = "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
path = kagglehub.dataset_download(dataset_slug)
file_path = os.path.join(path, "IMDB Dataset.csv")

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.


In [4]:
# Read the downloaded CSV at `file_path` into a DataFrame.
df = pd.read_csv(file_path)

In [None]:
# Text preprocessing: replace HTML line-break tags with a space and lowercase
# every review (thanks to Mavin for the reminder about the HTML tags!).
# Note: the pattern only matches <br>, <br/> and <br /> style tags.
df["review"] = (
    df["review"]
    .str.replace(r"<br\s*/?>", " ", regex=True)
    .str.lower()
)

# Show the dataset dimensions, then preview the first ten rows.
print("Dataset shape:", df.shape)
display(df.head(10))

Dataset shape: (50000, 2)


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming t...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
5,"probably my all-time favorite movie, a story o...",positive
6,i sure would like to see a resurrection of a u...,positive
7,"this show was an amazing, fresh & innovative i...",negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


In [6]:
# Words whose presence in a review is evaluated as evidence of negative sentiment.
keywords = [
    "worst",
    "awful",
    "bad",
]

In [7]:
# Class priors estimated from the label frequencies in `df`.
N = df.shape[0]
negative_count = int((df["sentiment"] == "negative").sum())
p_negative = negative_count / N
# Complement of the negative prior: every label other than "negative"
# is counted towards the positive class.
p_positive = 1 - p_negative

print(f"Total number of reviews: {N}")
print(f"P(Negative) = {p_negative:.4f}")
print(f"P(Positive) = {p_positive:.4f}")

Total number of reviews: 50000
P(Negative) = 0.5000
P(Positive) = 0.5000


In [17]:
# For each keyword, estimate how strongly its presence signals a negative
# review using Bayes' theorem:
#     P(Negative | keyword) = P(keyword | Negative) * P(Negative) / P(keyword)
# Relies on `df`, `keywords` and `p_negative` defined in earlier cells.
results = []

# Loop-invariant pieces: the negative-review mask and the number of negatives.
negative_mask = df["sentiment"] == "negative"
n_negative = int(negative_mask.sum())

for kw in keywords:
    # Reviews containing the keyword as a whole word. re.escape ensures a
    # keyword with regex metacharacters is matched literally.
    contains_kw = df["review"].str.contains(
        rf"\b{re.escape(kw)}\b", regex=True, case=False, na=False
    )

    # Marginal probability P(keyword)
    p_kw = contains_kw.mean()

    # Likelihood P(keyword | Negative); undefined when there are no negative reviews
    if n_negative > 0:
        p_kw_given_neg = int((contains_kw & negative_mask).sum()) / n_negative
    else:
        p_kw_given_neg = np.nan

    # Posterior P(Negative | keyword); undefined when the keyword never occurs
    if p_kw > 0:
        p_neg_given_kw = (p_kw_given_neg * p_negative) / p_kw
    else:
        p_neg_given_kw = np.nan

    results.append({
        "Keyword": kw,
        "P(Negative)": round(p_negative, 4),
        "P(keyword|Negative)": round(p_kw_given_neg, 4),
        "P(keyword)": round(p_kw, 4),
        "P(Negative|keyword)": round(p_neg_given_kw, 4)
    })

# Build the table once, after the loop, so it is not rebuilt on every
# iteration and is still defined when `keywords` is empty.
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Keyword,P(Negative),P(keyword|Negative),P(keyword),P(Negative|keyword)
0,worst,0.5,0.1609,0.0887,0.9073
1,awful,0.5,0.104,0.0577,0.9015
2,bad,0.5,0.3531,0.2358,0.7488


In [18]:
# Assemble the per-keyword result dicts collected in `results` into a DataFrame.
results_df = pd.DataFrame(results)

In [19]:
# Print the results as a GitHub-flavoured Markdown table, using the column
# names as headers and omitting the DataFrame's row index.
print(tabulate(results_df, headers='keys', tablefmt='github', showindex=False))

| Keyword   |   P(Negative) |   P(keyword|Negative) |   P(keyword) |   P(Negative|keyword) |
|-----------|---------------|-----------------------|--------------|-----------------------|
| worst     |           0.5 |                0.1609 |       0.0887 |                0.9073 |
| awful     |           0.5 |                0.104  |       0.0577 |                0.9015 |
| bad       |           0.5 |                0.3531 |       0.2358 |                0.7488 |
