In [1]:
import zipfile
import pandas as pd
# Read the training TSV straight from the zip archive. pandas infers the
# compression from the ".zip" suffix and decompresses the single member itself,
# so no separate zipfile.ZipFile handle is needed (the previous one was opened
# but never read from or closed).
train = pd.read_csv('Dataset/movies/train.tsv.zip', delimiter='\t')
train.head(20)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [2]:
# Class-balance check: how many phrases carry each sentiment label
print(train.loc[:, "Sentiment"].value_counts())

Sentiment
2    79582
3    32927
1    27273
4     9206
0     7072
Name: count, dtype: int64


In [3]:
from sklearn.model_selection import train_test_split
import numpy as np
# 60/20/20 split: hold out 40% of the phrases, then cut that hold-out in half
# to obtain the validation and test sets (random_state pinned so the split is
# reproducible)
train_data, test_data = train_test_split(train, random_state=1, test_size=0.4)
val_data, test_data = train_test_split(test_data, random_state=1, test_size=0.5)

# Give every split a fresh 0..n-1 index, discarding the shuffled original one
train_data, val_data, test_data = (
    split.reset_index(drop=True) for split in (train_data, val_data, test_data)
)

print("Train set size is ", len(train_data))
print("Val set size is ", len(val_data))
print("Test set size is ", len(test_data))

Train set size is  93636
Val set size is  31212
Test set size is  31212


In [4]:
from nltk.corpus import stopwords
import re
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Standard English stop-word list from NLTK, held as a set for fast lookups
stop_words = set(stopwords.words('english'))

# Porter stemmer used to reduce every kept word to its stem
porter = PorterStemmer()

# Pretrained spaCy pipeline used for named-entity recognition; entities tagged
# with one of the labels below (people, geopolitical entities, organisations)
# are filtered out of the phrases in addition to the stop words
NER_LABELS_TO_REMOVE = {"PERSON", "GPE", "ORG"}
NER = spacy.load("en_core_web_sm")

def ret_words(Phrase):
    """
    Clean a raw phrase and return its stemmed content words as one string.

    Steps:
    1. Run spaCy NER on the phrase as written (original casing and
       punctuation) and collect the individual words of every entity whose
       label is in NER_LABELS_TO_REMOVE.
    2. Lower-case the phrase and replace the punctuation characters
       hyphen, comma, semicolon, apostrophe, backslash, slash and period
       with spaces.
    3. Drop stop words, entity words and every token that is not made up
       solely of ASCII letters, then Porter-stem what is left.

    :param Phrase: Raw phrase text. Non-string values (e.g. NaN) yield ''.
    :return: Space-separated stemmed words; '' when nothing survives.
    """
    # Missing values would otherwise crash on .lower(); treat them as empty text
    if not isinstance(Phrase, str):
        return ''

    punctuation = r"[-,;'\\/.]"

    # NER must see the original text: the statistical model leans heavily on
    # capitalisation, so lower-casing first makes it miss most names.
    doc = NER(Phrase)

    # Store entities word by word, cleaned exactly like the phrase itself.
    # The filter below compares single tokens, so a multi-word entity kept as
    # one string (e.g. "new york") could never match anything.
    entities_to_remove = set()
    for ent in doc.ents:
        if ent.label_ in NER_LABELS_TO_REMOVE:
            entity_text = re.sub(punctuation, ' ', ent.text.lower())
            entities_to_remove.update(entity_text.split())

    cleaned = re.sub(punctuation, ' ', Phrase.lower())

    words = [
        porter.stem(word) for word in cleaned.split()
        if word not in stop_words
        and word not in entities_to_remove
        and re.fullmatch(r'[a-zA-Z]+', word)  # letters only: drops digits and leftover symbols
    ]
    return ' '.join(words)

In [5]:
def preprocess(df, remove_short_phrases=True):
    """
    Preprocess the given dataframe by:
    1. Recording the raw word count of each phrase in a 'words_num' column
    2. Removing phrases with only one word (optional)
    3. Cleaning and stemming words using `ret_words`

    The input frame is left untouched; all changes are made on a copy, so
    calling this function again on the same input gives the same result.

    :param df: Input DataFrame with a 'Phrase' column
    :param remove_short_phrases: Whether to remove phrases with only one word (default: True)
    :return: New processed DataFrame with 'Phrase' replaced by its cleaned
             form and an added 'words_num' column (word count before cleaning)
    """
    # Copy first: previously 'words_num' was always written into the caller's
    # frame, and with remove_short_phrases falsy 'Phrase' was overwritten in
    # place as well, so re-running the cell re-cleaned already cleaned text.
    processed = df.copy()

    # Word count of the raw phrase (whitespace-separated tokens)
    processed["words_num"] = processed['Phrase'].str.split().str.len()

    # Remove phrases with only one word if the flag is True
    if remove_short_phrases:
        processed = processed[processed["words_num"] > 1].copy()

    # Apply text processing function `ret_words`
    processed['Phrase'] = processed["Phrase"].apply(ret_words)

    return processed

In [6]:
# The training split keeps its single-word phrases; the test and validation
# splits drop them.
# NOTE(review): confirm this asymmetry between train and test/val is intended.
train_preprocessed = preprocess(train_data, remove_short_phrases=False)
test_preprocessed = preprocess(test_data, remove_short_phrases=True)
val_preprocessed = preprocess(val_data, remove_short_phrases=True)
train_preprocessed.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,words_num
0,18674,819,paulin,2,1
1,15525,665,tri go,2,3
2,119919,6413,week live,2,4
3,17938,781,essenti collect bit,2,5
4,98852,5185,director fake backdrop state pace,1,11
5,33703,1582,regret,2,2
6,152547,8324,tremend chemistri,3,3
7,112239,5960,sequel,2,2
8,58522,2948,cushion predict narr rhythm,2,6
9,117390,6263,bad,0,3


In [7]:
# Write each processed split to the 'Dataset' folder, without the index column
for frame, path in (
    (train_preprocessed, "Dataset/train_preprocessed.csv"),
    (test_preprocessed, "Dataset/test_preprocessed.csv"),
    (val_preprocessed, "Dataset/val_preprocessed.csv"),
):
    frame.to_csv(path, index=False)