# IMDB 

## -------------------------- Load UP --------------------------

In [None]:
# Import
import pandas as pd
import numpy as np
import nltk
import string
import re
import os  

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')            
nltk.download('punkt_tab')        
nltk.download('omw-1.4')                 

# Display settings
pd.set_option('display.max_colwidth', 200)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\frank\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [44]:

df = pd.read_csv('imdb_labelled.txt', sep='\t', header=None, names=['review', 'sentiment'])
print("Dataset loaded successfully!")


Dataset loaded successfully!


In [45]:
print(f"Dataset shape: {df.shape}")


Dataset shape: (748, 2)


In [46]:
print("\nFirst 5 rows:")
print(df.head())


First 5 rows:
                                                                                                                                                                                         review  \
0                                                                                                       A very, very, very slow-moving, aimless movie about a distressed, drifting young man.     
1                                                                                           Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.     
2  Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.     
3                                                                                                                                                  Very little music or anything to speak of.     
4         

In [47]:
# data info
print("Dataset Information:")
print(f"Number of reviews: {len(df)}")
print(f"Number of positive reviews (1): {sum(df['sentiment'] == 1)}")
print(f"Number of negative reviews (0): {sum(df['sentiment'] == 0)}")
print("\nSample reviews:")
for i in range(3):
    print(f"\nReview {i+1} (Sentiment: {df['sentiment'].iloc[i]}):")
    print(f"  {df['review'].iloc[i][:150]}...")

Dataset Information:
Number of reviews: 748
Number of positive reviews (1): 386
Number of negative reviews (0): 362

Sample reviews:

Review 1 (Sentiment: 0):
  A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ...

Review 2 (Sentiment: 0):
  Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ...

Review 3 (Sentiment: 0):
  Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the...


## Question 1 : Preprocess
1) removing punctuation, 
2) removing numbers, 
3) removing stop words, 
4) changing the text to lower/upper case  
5) lemmatising.

Describe in detail, with at least 3 examples,

In [38]:
def preprocess_text(text, steps_to_apply=None):
    """
    Apply various preprocessing steps to text.
    
    Parameters:
    -----------
    text : str
        The input text to preprocess
    steps_to_apply : list or None
        List of preprocessing steps to apply. If None, apply all steps.
        
    Returns:
    --------
    dict : Dictionary containing original text and results of each preprocessing step
    """
    
    # Initialize results dictionary
    results = {
        'original': text,
        'lowercase': None,
        'no_punctuation': None,
        'no_numbers': None,
        'no_stopwords': None,
        'lemmatized': None,
        'final_processed': None
    }
    
    # Define which steps to apply
    if steps_to_apply is None:
        steps_to_apply = ['lowercase', 'no_punctuation', 'no_numbers', 'no_stopwords', 'lemmatized']
    
    # 1: Convert to lowercase
    if 'lowercase' in steps_to_apply:
        text_lower = text.lower()
        results['lowercase'] = text_lower
    else:
        text_lower = text
        results['lowercase'] = text
    
    # 2: Remove punctuation
    if 'no_punctuation' in steps_to_apply:
        # Create a translation table to remove punctuation
        translator = str.maketrans('', '', string.punctuation)
        text_no_punct = text_lower.translate(translator)
        results['no_punctuation'] = text_no_punct
    else:
        text_no_punct = text_lower
        results['no_punctuation'] = text_lower
    
    # 3: Remove numbers
    if 'no_numbers' in steps_to_apply:
        # Remove digits
        text_no_numbers = re.sub(r'\d+', '', text_no_punct)
        results['no_numbers'] = text_no_numbers
    else:
        text_no_numbers = text_no_punct
        results['no_numbers'] = text_no_punct
    
    # 4: Remove stopwords
    if 'no_stopwords' in steps_to_apply:
        # Get English stopwords
        stop_words = set(stopwords.words('english'))
        # Tokenize the text
        tokens = word_tokenize(text_no_numbers)
        # Remove stopwords
        filtered_tokens = [word for word in tokens if word not in stop_words]
        # Reconstruct the text
        text_no_stopwords = ' '.join(filtered_tokens)
        results['no_stopwords'] = text_no_stopwords
    else:
        text_no_stopwords = text_no_numbers
        results['no_stopwords'] = text_no_numbers
    
    # 5: Lemmatization
    if 'lemmatized' in steps_to_apply:
        # Initialize lemmatizer
        lemmatizer = WordNetLemmatizer()
        
        # Function to get POS tag for lemmatization
        def get_wordnet_pos(treebank_tag):
            if treebank_tag.startswith('J'):
                return 'a'  # adjective
            elif treebank_tag.startswith('V'):
                return 'v'  # verb
            elif treebank_tag.startswith('N'):
                return 'n'  # noun
            elif treebank_tag.startswith('R'):
                return 'r'  # adverb
            else:
                return 'n'  # default to noun
        
        # Tokenize text
        tokens = word_tokenize(text_no_stopwords)
        # Get POS tags
        pos_tags = nltk.pos_tag(tokens)
        # Lemmatize each word with appropriate POS tag
        lemmatized_tokens = []
        for word, tag in pos_tags:
            pos = get_wordnet_pos(tag)
            lemmatized_word = lemmatizer.lemmatize(word, pos)
            lemmatized_tokens.append(lemmatized_word)
        
        # Reconstruct the text
        text_lemmatized = ' '.join(lemmatized_tokens)
        results['lemmatized'] = text_lemmatized
    else:
        text_lemmatized = text_no_stopwords
        results['lemmatized'] = text_no_stopwords
    
    # Final processed text
    results['final_processed'] = text_lemmatized
    
    return results



In [39]:
# Select 3 examples 
example_indices = [0, 10, 50]  # You can adjust these indices based on your dataset

print("Selected examples for preprocessing demonstration:")
print("=" * 80)

examples = []
for idx in example_indices:
    if idx < len(df):
        example = {
            'index': idx,
            'original_text': df['review'].iloc[idx],
            'sentiment': df['sentiment'].iloc[idx]
        }
        examples.append(example)
        print(f"\nExample {len(examples)} (Index: {idx}, Sentiment: {example['sentiment']}):")
        print(f"  Original: {example['original_text']}")



Selected examples for preprocessing demonstration:

Example 1 (Index: 0, Sentiment: 0):
  Original: A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  

Example 2 (Index: 10, Sentiment: 1):
  Original: And those baby owls were adorable.  

Example 3 (Index: 50, Sentiment: 0):
  Original: The directing and the cinematography aren't quite as good.  


In [40]:
print("\n" + "=" * 80)
print("PREPROCESSING STEP-BY-STEP DEMONSTRATION")
print("=" * 80)

# Process each example through each step
for i, example in enumerate(examples):
    print(f"\n{'='*40}")
    print(f"EXAMPLE {i+1} PROCESSING")
    print(f"{'='*40}")
    print(f"Original text (Sentiment: {example['sentiment']}):")
    print(f"  \"{example['original_text']}\"")
    
    # Apply each preprocessing step sequentially
    steps = [
        ('1. Lowercase Conversion', ['lowercase']),
        ('2. Remove Punctuation', ['lowercase', 'no_punctuation']),
        ('3. Remove Numbers', ['lowercase', 'no_punctuation', 'no_numbers']),
        ('4. Remove Stopwords', ['lowercase', 'no_punctuation', 'no_numbers', 'no_stopwords']),
        ('5. Lemmatization', ['lowercase', 'no_punctuation', 'no_numbers', 'no_stopwords', 'lemmatized'])
    ]
    
    for step_name, steps_to_apply in steps:
        result = preprocess_text(example['original_text'], steps_to_apply)
        print(f"\n{step_name}:")
        print(f"  \"{result['lemmatized']}\"")
        
        # Show specific changes for certain steps
        if step_name == '4. Remove Stopwords':
            # Show removed stopwords
            original_tokens = word_tokenize(result['no_numbers'])
            filtered_tokens = word_tokenize(result['no_stopwords'])
            removed_words = set(original_tokens) - set(filtered_tokens)
            stopword_set = set(stopwords.words('english'))
            actual_stopwords_removed = [w for w in removed_words if w in stopword_set]
            if actual_stopwords_removed:
                print(f"  Removed stopwords: {', '.join(actual_stopwords_removed)}")
        
        elif step_name == '5. Lemmatization':
            # Show examples of lemmatization
            before_tokens = word_tokenize(result['no_stopwords'])
            after_tokens = word_tokenize(result['lemmatized'])
            
            # Find words that changed
            changed_words = []
            for before, after in zip(before_tokens, after_tokens):
                if before != after:
                    changed_words.append((before, after))
            
            if changed_words:
                print(f"  Examples of lemmatization:")
                for before, after in changed_words[:5]:  # Show first 5 changes
                    print(f"    '{before}' → '{after}'")




PREPROCESSING STEP-BY-STEP DEMONSTRATION

EXAMPLE 1 PROCESSING
Original text (Sentiment: 0):
  "A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  "

1. Lowercase Conversion:
  "a very, very, very slow-moving, aimless movie about a distressed, drifting young man.  "

2. Remove Punctuation:
  "a very very very slowmoving aimless movie about a distressed drifting young man  "

3. Remove Numbers:
  "a very very very slowmoving aimless movie about a distressed drifting young man  "

4. Remove Stopwords:
  "slowmoving aimless movie distressed drifting young man"
  Removed stopwords: about, a, very

5. Lemmatization:
  "slowmoving aimless movie distress drift young man"
  Examples of lemmatization:
    'distressed' → 'distress'
    'drifting' → 'drift'

EXAMPLE 2 PROCESSING
Original text (Sentiment: 1):
  "And those baby owls were adorable.  "

1. Lowercase Conversion:
  "and those baby owls were adorable.  "

2. Remove Punctuation:
  "and those baby owls

In [41]:
#  Original vs Fully Processed Text
print("\n" + "=" * 80)
print("COMPARISON: ORIGINAL vs FULLY PROCESSED TEXT")
print("=" * 80)

for i, example in enumerate(examples):
    # Get fully processed text
    full_result = preprocess_text(example['original_text'])
    
    print(f"\nExample {i+1} (Sentiment: {example['sentiment']}):")
    print(f"\nOriginal text:")
    print(f"  {example['original_text']}")
    
    print(f"\nFully processed text (after all 5 steps):")
    print(f"  {full_result['final_processed']}")
    
    # Calculate statistics
    original_words = len(word_tokenize(example['original_text']))
    processed_words = len(word_tokenize(full_result['final_processed']))
    reduction = original_words - processed_words
    
    print(f"\nStatistics:")
    print(f"  Original word count: {original_words}")
    print(f"  Processed word count: {processed_words}")
    print(f"  Words removed: {reduction} ({reduction/original_words*100:.1f}% reduction)")




COMPARISON: ORIGINAL vs FULLY PROCESSED TEXT

Example 1 (Sentiment: 0):

Original text:
  A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  

Fully processed text (after all 5 steps):
  slowmoving aimless movie distress drift young man

Statistics:
  Original word count: 18
  Processed word count: 7
  Words removed: 11 (61.1% reduction)

Example 2 (Sentiment: 1):

Original text:
  And those baby owls were adorable.  

Fully processed text (after all 5 steps):
  baby owls adorable

Statistics:
  Original word count: 7
  Processed word count: 3
  Words removed: 4 (57.1% reduction)

Example 3 (Sentiment: 0):

Original text:
  The directing and the cinematography aren't quite as good.  

Fully processed text (after all 5 steps):
  direct cinematography arent quite good

Statistics:
  Original word count: 11
  Processed word count: 5
  Words removed: 6 (54.5% reduction)


In [42]:

# ## 1.7 Apply Preprocessing to Entire Dataset
print("\n" + "=" * 80)
print("APPLYING PREPROCESSING TO ENTIRE DATASET")
print("=" * 80)

# Apply preprocessing to all reviews
print("Applying preprocessing to all reviews...")
df['processed_review'] = df['review'].apply(lambda x: preprocess_text(x)['final_processed'])

# Show before/after comparison for a few samples
print("\nSample comparisons (first 5 rows):")
print("-" * 40)

sample_df = df.head().copy()
for idx, row in sample_df.iterrows():
    print(f"\nReview {idx} (Sentiment: {row['sentiment']}):")
    print(f"  Original: {row['review'][:100]}...")
    print(f"  Processed: {row['processed_review'][:100]}...")

# %%
print("\n" + "=" * 80)
print("ANALYSIS OF PREPROCESSING IMPACT")
print("=" * 80)

# Calculate statistics for the entire dataset
original_lengths = df['review'].apply(lambda x: len(word_tokenize(x)))
processed_lengths = df['processed_review'].apply(lambda x: len(word_tokenize(x)))

print(f"\nDataset Statistics:")
print(f"  Total reviews: {len(df)}")
print(f"  Average original length: {original_lengths.mean():.2f} words")
print(f"  Average processed length: {processed_lengths.mean():.2f} words")
print(f"  Average reduction: {original_lengths.mean() - processed_lengths.mean():.2f} words")
print(f"  Percentage reduction: {(1 - processed_lengths.mean()/original_lengths.mean())*100:.2f}%")

# Show most common words before and after
from collections import Counter

# Get all words from original reviews
all_original_words = []
for review in df['review']:
    tokens = word_tokenize(review.lower())
    all_original_words.extend(tokens)

# Get all words from processed reviews
all_processed_words = []
for review in df['processed_review']:
    tokens = word_tokenize(review)
    all_processed_words.extend(tokens)

# Count frequencies
original_word_counts = Counter(all_original_words)
processed_word_counts = Counter(all_processed_words)

print(f"\nMost common words BEFORE preprocessing:")
for word, count in original_word_counts.most_common(10):
    print(f"  '{word}': {count} occurrences")

print(f"\nMost common words AFTER preprocessing:")
for word, count in processed_word_counts.most_common(10):
    print(f"  '{word}': {count} occurrences")

output_filename = 'imdb_reviews_processed.csv'
df.to_csv(output_filename, index=False)
print(f"\nProcessed dataset saved as '{output_filename}'")

# Create a summary dataframe for the report
summary_data = []
for i, example in enumerate(examples):
    result = preprocess_text(example['original_text'])
    summary_data.append({
        'Example': i+1,
        'Sentiment': 'Positive' if example['sentiment'] == 1 else 'Negative',
        'Original_Text': example['original_text'][:100] + '...' if len(example['original_text']) > 100 else example['original_text'],
        'Lowercase': result['lowercase'][:100] + '...' if len(result['lowercase']) > 100 else result['lowercase'],
        'No_Punctuation': result['no_punctuation'][:100] + '...' if len(result['no_punctuation']) > 100 else result['no_punctuation'],
        'No_Numbers': result['no_numbers'][:100] + '...' if len(result['no_numbers']) > 100 else result['no_numbers'],
        'No_Stopwords': result['no_stopwords'][:100] + '...' if len(result['no_stopwords']) > 100 else result['no_stopwords'],
        'Lemmatized': result['lemmatized'][:100] + '...' if len(result['lemmatized']) > 100 else result['lemmatized']
    })

summary_df = pd.DataFrame(summary_data)
print("\nSummary table for report (first 100 characters shown):")
print(summary_df.to_string())

# Also save this summary
summary_df.to_csv('preprocessing_examples_summary.csv', index=False)
print(f"\nPreprocessing examples summary saved as 'preprocessing_examples_summary.csv'")


APPLYING PREPROCESSING TO ENTIRE DATASET
Applying preprocessing to all reviews...

Sample comparisons (first 5 rows):
----------------------------------------

Review 0 (Sentiment: 0):
  Original: A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ...
  Processed: slowmoving aimless movie distress drift young man...

Review 1 (Sentiment: 0):
  Original: Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ...
  Processed: sure lose flat character audience nearly half walk...

Review 2 (Sentiment: 0):
  Original: Attempting artiness with black & white and clever camera angles, the movie disappointed - became eve...
  Processed: attempt artiness black white clever camera angle movie disappoint become even ridiculous act poor pl...

Review 3 (Sentiment: 0):
  Original: Very little music or anything to speak of.  ...
  Processed: little music anything speak...

Review 4 (Sentiment: 1):
  Original: The best s