In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from collections import Counter

# For text processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.util import ngrams
from wordcloud import WordCloud

# For advanced NLP
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

In [2]:
# Download necessary NLTK data
# In a real environment, run these once:

nltk.download('all')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Sample text data - movie reviews with sentiment labels
reviews = [
    "This movie was fantastic! I really enjoyed every moment of it.",
    "Terrible film. Complete waste of time and money.",
    "The acting was great but the plot had too many holes.",
    "Absolutely loved it, would watch again and recommend to friends!",
    "Boring and predictable. The special effects were good though.",
    "A masterpiece of modern cinema with outstanding performances.",
    "Fell asleep halfway through. Very disappointing experience.",
    "Not bad, but not great either. Just an average film."
]

sentiments = [1, 0, 0, 1, 0, 1, 0, 0]  # 1 = positive, 0 = negative

# Create a DataFrame for easy handling
df = pd.DataFrame({
    'review': reviews,
    'sentiment': sentiments
})

print("\nSample Data:")
print(df.head())

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/marwek/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/marwek/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/marwek/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/marwek/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/marwek/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nlt


Sample Data:
                                              review  sentiment
0  This movie was fantastic! I really enjoyed eve...          1
1   Terrible film. Complete waste of time and money.          0
2  The acting was great but the plot had too many...          0
3  Absolutely loved it, would watch again and rec...          1
4  Boring and predictable. The special effects we...          0


[nltk_data]    |   Package wordnet_ic is already up-to-date!
[nltk_data]    | Downloading package words to
[nltk_data]    |     /Users/marwek/nltk_data...
[nltk_data]    |   Package words is already up-to-date!
[nltk_data]    | Downloading package ycoe to
[nltk_data]    |     /Users/marwek/nltk_data...
[nltk_data]    |   Package ycoe is already up-to-date!
[nltk_data]    | 
[nltk_data]  Done downloading collection all


In [None]:
# 1. Basic Text Preprocessing

print(nltk.word_tokenize("Before preprocessing"))


def preprocess_text(text):
    """Basic preprocessing function for text data"""
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])

    # Tokenize (split text into words)
    tokens = word_tokenize(text)

    # Remove stopwords (common words like 'the', 'and', 'is')
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization (reducing words to their base form)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a single string
    processed_text = ' '.join(tokens)
    
    return processed_text


# Apply preprocessing to the reviews
df['processed_review'] = df['review'].apply(preprocess_text)

print("\nAfter preprocessing")
for i in range(3):
    print(f"Original : {df['review'][i]}")
    print(f"Processed : {df['processed_review'][i]}")    

['Before', 'preprocessing']

After preprocessing
Original : This movie was fantastic! I really enjoyed every moment of it.
Processed : movie fantastic really enjoyed every moment
Original : Terrible film. Complete waste of time and money.
Processed : terrible film complete waste time money
Original : The acting was great but the plot had too many holes.
Processed : acting great plot many hole
