# Cell 0 — Download NLTK Resources

In [1]:
import nltk

# Fetch every NLTK resource this notebook needs (tokenizer models, stopword
# list, WordNet data). Each `nltk.download` call returns True on success, so
# the cell's displayed value is True only when *all* downloads succeeded.
# A list (not a generator) is used on purpose: every download is attempted
# even if an earlier one fails, and the loop variable does not leak into the
# notebook namespace.
all([
    nltk.download(resource)
    for resource in ("punkt", "stopwords", "wordnet", "omw-1.4", "punkt_tab")
])



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\primu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\primu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\primu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\primu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\primu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Cell 1 — Imports & Setup

In [1]:
import sys
# Add the parent of the current working directory to the import search path
# so that the `src` package imported below can be found (presumably the
# project root when the notebook runs from its own folder; verify if moved).
sys.path.append("../")

import pandas as pd
from tqdm import tqdm

# Project-local helpers: data loading / label encoding / splitting / saving,
# and the text-cleaning function applied to each review.
from src.data_loader import load_raw_data, encode_labels, split_data, save_splits
from src.preprocessing import clean_text


# Cell 2 — Load & Encode Data

In [2]:
# Read the raw reviews CSV and feed the result straight into label encoding.
# `df` is the frame that every later cell in this notebook works on.
df = encode_labels(load_raw_data("../data/raw/imdb_reviews.csv"))

# Preview the first rows as a quick check of the load + encode step.
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


# Cell 3 — Apply Text Cleaning

In [3]:
# Register tqdm's pandas integration so Series gain `progress_apply`,
# which behaves like `apply` but shows a progress bar.
tqdm.pandas()

# Run `clean_text` on every raw review and store the result in a new
# `clean_review` column; the original `review` column is left untouched.
df["clean_review"] = df["review"].progress_apply(clean_text)
# Preview the first rows with the raw and cleaned text side by side.
df.head()


100%|██████████| 50000/50000 [01:04<00:00, 772.17it/s] 


Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,1,one reviewer mentioned watching episode youll ...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...


# Cell 4 — Compare Raw vs Cleaned Text

In [4]:
# Show the first review before and after cleaning. A single print call with
# a newline separator emits exactly the same text as four separate prints:
# each heading, a blank line, then the corresponding review.
print(
    "RAW REVIEW:\n",
    df["review"].iloc[0],
    "\nCLEANED REVIEW:\n",
    df["clean_review"].iloc[0],
    sep="\n",
)


RAW REVIEW:

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is d

# Cell 5 — Train / Val / Test Split (80/10/10)

In [5]:
# Split the cleaned reviews and their labels into train / validation / test
# partitions. Ratios and shuffling are decided inside `split_data`.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    df,
    text_col="clean_review",
    label_col="sentiment"
)

print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))

# Sanity checks, placed after the printout so the sizes are visible if one
# fails: the three splits must account for every row of `df`, and each
# feature split must have the same length as its label split.
assert len(X_train) + len(X_val) + len(X_test) == len(df), \
    "train/val/test splits do not add up to the full dataset"
assert (len(X_train), len(X_val), len(X_test)) == (len(y_train), len(y_val), len(y_test)), \
    "feature and label splits have mismatched lengths"


Train size: 40000
Validation size: 5000
Test size: 5000


# Cell 6 — Save Processed Data

In [6]:
# Persist all six split objects via the project helper; the files are
# written under ../data/processed (the next cell reads train.csv from there).
save_splits(
    X_train,
    X_val,
    X_test,
    y_train,
    y_val,
    y_test,
    output_dir="../data/processed",
)


# Cell 7 — Final Sanity Check

In [7]:
# Final sanity check: preview the saved training split. Only the first five
# rows are parsed (`nrows=5`), which shows the same preview as `.head()`
# without loading the whole file back into memory.
pd.read_csv("../data/processed/train.csv", nrows=5)


Unnamed: 0,review,sentiment
0,caught little gem totally accident back reviva...,1
1,cant believe let movie accomplish favor friend...,0
2,spoiler alert get nerve people remake use term...,0
3,there one thing ive learnt watching george rom...,0
4,remember theater review said horrible well did...,0
