In [2]:
############################################################################
##################### TASK 0: Data Preparation #############################
############################################################################
import pandas as pd
from datasets import load_dataset

# a. Load the IMDB Movie Reviews dataset
imdb_dataset = load_dataset('imdb')

# b. Create a new dataframe with columns 'review_text' and 'sentiment'
# The original columns are 'text' and 'label'
train_df = pd.DataFrame({'review_text': imdb_dataset['train']['text'],
                         'sentiment': imdb_dataset['train']['label']})

# c. Map sentiment labels (0 -> 'negative', 1 -> 'positive')
sentiment_mapping = {0: 'negative', 1: 'positive'}
train_df['sentiment'] = train_df['sentiment'].map(sentiment_mapping)

# d. Save the processed dataset
train_df.to_csv('movie_reviews.csv', index=False)

print("Data preparation complete")

Data preparation complete


In [12]:
############################################################################
##################### TASK 1: Data Preprocessing ###########################
############################################################################
import re, string, nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')

def preprocess_text(text):
    """
    Performs text preprocessing: lowercasing, punctuation removal, and stop word removal.
    """
    # a. Convert text to lowercase
    text = text.lower()

    # b. Remove punctuation marks
    text = text.translate(str.maketrans('', '', string.punctuation))

    # c. Remove stop words
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    text = ' '.join(filtered_text)

    return text

# Load the dataframes if not already in memory from Task 0
train_df = pd.read_csv('movie_reviews.csv')

# Apply preprocessing to the 'review_text' column
train_df['review_text'].apply(preprocess_text)



[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


0        rented curiousyellow video store controversy s...
1        curious yellow risible pretentious steaming pi...
2        avoid making type film future film interesting...
3        film probably inspired godards masculin fémini...
4        oh brotherafter hearing ridiculous film umptee...
                               ...                        
24995    hit time better categorised australian cult fi...
24996    love movie like another time try explain virtu...
24997    film sequel barry mckenzie holds two greatest ...
24998    adventures barry mckenzie started life satiric...
24999    story centers around barry mckenzie must go en...
Name: review_text, Length: 25000, dtype: object

In [16]:
############################################################################
##################### TASK 2: Text Tokenization ############################
############################################################################
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import pandas as pd # Ensure pandas is imported

# Ensure 'punkt' is downloaded (it should have been downloaded in Task 1,
try:
    nltk.data.find('tokenizers/punkt.zip')
except LookupError:
    print("Downloading 'punkt' NLTK data...")
    nltk.download('punkt')
    print("'punkt' downloaded.")

def tokenize_text_data(df_column):
    """
    Performs sentence and word tokenization on a DataFrame column of text.
    """
    # a. Perform sentence tokenization using NLTK
    sentences = df_column.apply(sent_tokenize)
    # b. Perform word tokenization using NLTK on the processed text
    words = df_column.apply(word_tokenize)
    return sentences, words

# Load the dataframes
train_df = pd.read_csv('movie_reviews.csv')

# Apply tokenization to the 'review_text' column
train_sentences, train_words = tokenize_text_data(train_df['review_text'])

# c. Store tokenized sentences and words in new DataFrame columns
train_df['tokenized_sentences'] = train_sentences
train_df['tokenized_words'] = train_words

print("\nTask 2: Text tokenization complete. Added 'tokenized_sentences' and 'tokenized_words' columns.")
print("\nFirst 5 rows of train_df with tokenized data:")
print(train_df[['review_text', 'tokenized_sentences', 'tokenized_words']].head())



Task 2: Text tokenization complete. Added 'tokenized_sentences' and 'tokenized_words' columns.

First 5 rows of train_df with tokenized data:
                                         review_text  \
0  I rented I AM CURIOUS-YELLOW from my video sto...   
1  "I Am Curious: Yellow" is a risible and preten...   
2  If only to avoid making this type of film in t...   
3  This film was probably inspired by Godard's Ma...   
4  Oh, brother...after hearing about this ridicul...   

                                 tokenized_sentences  \
0  [I rented I AM CURIOUS-YELLOW from my video st...   
1  ["I Am Curious: Yellow" is a risible and prete...   
2  [If only to avoid making this type of film in ...   
3  [This film was probably inspired by Godard's M...   
4  [Oh, brother...after hearing about this ridicu...   

                                     tokenized_words  
0  [I, rented, I, AM, CURIOUS-YELLOW, from, my, v...  
1  [``, I, Am, Curious, :, Yellow, '', is, a, ris...  
2  [If, only, to, 

In [18]:
############################################################################
################## TASK 3: Basic Text Vectorization#########################
############################################################################
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd # Ensure pandas is imported

# Load the dataframes from Task 0
print("DataFrames loaded for feature vectorization.")
train_df = pd.read_csv('movie_reviews.csv')

# a. Create a Bag-of-Words representation using CountVectorizer
# Limit to top 1000 features as per the prompt's implied requirement
vectorizer = CountVectorizer(max_features=1000)

# b. Apply it to the processed text column
# Fit the vectorizer on the training data and transform both train and test
# Fitting only on training data prevents data leakage from the test set.
X_train = vectorizer.fit_transform(train_df['review_text'])

# Get feature names (the words that form the columns of the BoW matrix)
feature_names = vectorizer.get_feature_names_out()

# c. Calculate and analyze word frequencies
# Sum the counts for each feature (word) across all documents in the training set
word_frequencies = X_train.sum(axis=0)

# Convert to a dictionary mapping feature names to their frequencies
frequencies_dict = dict(zip(feature_names, word_frequencies.tolist()[0]))

# Display the top 20 most frequent words
# Sort the dictionary items by frequency in descending order
sorted_frequencies = sorted(frequencies_dict.items(), key=lambda item: item[1], reverse=True)

print("\nTask 3: Basic Feature Vectorization complete.")
print("\nShape of Bag-of-Words matrix for training data (documents x features):", X_train.shape)

print("\nTop 20 most frequent words in the training data:")
for word, freq in sorted_frequencies[:20]:
    print(f"- {word}: {freq}")

DataFrames loaded for feature vectorization.

Task 3: Basic Feature Vectorization complete.

Shape of Bag-of-Words matrix for training data (documents x features): (25000, 1000)

Top 20 most frequent words in the training data:
- the: 336749
- and: 164140
- of: 145864
- to: 135724
- is: 107332
- br: 101871
- it: 96467
- in: 93976
- this: 76007
- that: 73286
- was: 48209
- as: 46935
- for: 44345
- with: 44130
- movie: 44047
- but: 42623
- film: 40159
- you: 34267
- on: 34202
- not: 30632
