---
# PART 1: SETUP & DATA LOADING
---

## 1.1 Import Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from collections import Counter

# Machine learning
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

# Deep learning
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW

# Utilities
import time
import datetime
import warnings
warnings.filterwarnings('ignore')

# Configuration
# Display: show every column, cap long text cells at 100 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)
sns.set_style("whitegrid")

# Reproducibility: seed both global RNGs used by this notebook.
# DataLoader(shuffle=True) draws its permutation from PyTorch's global RNG,
# which np.random.seed() does not touch.
np.random.seed(42)
torch.manual_seed(42)

print("‚úì All libraries imported successfully.")

## 1.2 Load Dataset

In [None]:
# Load dataset from GitHub release
DATASET_URL = 'https://github.com/erlanggadewasakti/Prinsip-Sains-Data/releases/download/prod/sa-psd-dataset.csv'

try:
    df = pd.read_csv(DATASET_URL)
except Exception as e:
    print(f"‚úó Error loading dataset: {e}")
    # Re-raise: every later cell needs `df`, so continuing would only turn
    # this failure into a confusing NameError further down the notebook.
    raise
else:
    print("‚úì Dataset loaded successfully")
    print(f"  Rows: {len(df):,}")
    print(f"  Columns: {len(df.columns)}")

## 1.3 Initial Data Inspection

In [None]:
# High-level look at the raw frame: head, dtypes, summary stats, null counts.
section_rule = "=" * 70
print(section_rule)
print("DATASET OVERVIEW")
print(section_rule)

print("\nüìä First 5 Rows:")
display(df.head())

print("\nüìã Dataset Info:")
df.info()

print("\nüìà Statistical Summary:")
display(df.describe(include='all'))

print("\nüîç Missing Values:")
# Count nulls once and reuse the result for the table and the final check.
null_counts = df.isnull().sum()
missing_info = pd.DataFrame({
    'Column': df.columns,
    'Missing Count': null_counts.values,
    'Percentage': (null_counts.values / len(df) * 100).round(2)
})
has_missing = missing_info['Missing Count'] > 0
display(missing_info[has_missing])

if null_counts.sum() == 0:
    print("‚úì No missing values detected")

---
# PART 2: EXPLORATORY DATA ANALYSIS (EDA)
---

## 2.1 Extract & Normalize Sentiment Labels

In [None]:
# Extract sentiment from output column: drop the leading option letter
# ("A: " ... "E: "), then normalise surrounding whitespace and case so that
# variants such as "Very Positive" or "positive " still match the mapping keys.
df['sentiment'] = (
    df['output']
    .str.strip()
    .str.replace(r'^[A-E]:\s*', '', regex=True)
    .str.strip()
    .str.lower()
)

print("Sentiment Distribution (Before Normalization):")
print(df['sentiment'].value_counts())

# Normalize: merge "very positive/negative" with "positive/negative"
sentiment_mapping_initial = {
    'very positive': 'positive',
    'very negative': 'negative',
    'positive': 'positive',
    'negative': 'negative',
    'neutral': 'neutral'
}

raw_labels = df['sentiment']
df['sentiment'] = raw_labels.map(sentiment_mapping_initial)

# Series.map returns NaN for any label that is not a key of the mapping.
# Report those rows instead of letting them disappear silently.
unmapped_labels = raw_labels[df['sentiment'].isna() & raw_labels.notna()]
if not unmapped_labels.empty:
    print(f"\nWARNING: {len(unmapped_labels):,} rows carry labels outside the mapping (set to NaN):")
    print(unmapped_labels.value_counts())

print("\n" + "=" * 50)
print("Sentiment Distribution (After Normalization):")
print(df['sentiment'].value_counts())
print("\nProportions:")
print(df['sentiment'].value_counts(normalize=True).round(4))

# Display sample
print("\nSample Data:")
display(df[['input', 'output', 'sentiment']].head())

## 2.2 Sentiment Distribution Analysis

In [None]:
# One fixed colour per sentiment class; later plotting cells reuse this palette.
sentiment_colors = {'positive': '#2ecc71', 'negative': '#e74c3c', 'neutral': '#3498db'}

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
count_ax, pie_ax = axes

# Left panel: absolute counts, each bar annotated with its height
sns.countplot(x='sentiment', data=df, hue='sentiment', palette=sentiment_colors, legend=False, ax=count_ax)
count_ax.set_title('Sentiment Distribution (Count)', fontsize=14, fontweight='bold')
count_ax.set_xlabel('Sentiment', fontsize=12)
count_ax.set_ylabel('Count', fontsize=12)
for bars in count_ax.containers:
    count_ax.bar_label(bars, fmt='%d')

# Right panel: class shares, wedge colours taken from the shared palette
sentiment_counts = df['sentiment'].value_counts()
colors = [sentiment_colors[name] for name in sentiment_counts.index]
pie_ax.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%',
           colors=colors, startangle=90, textprops={'fontsize': 12})
pie_ax.set_title('Sentiment Distribution (Proportion)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Imbalance analysis: size of the largest class relative to the smallest
majority_class = sentiment_counts.idxmax()
minority_class = sentiment_counts.idxmin()
largest_count = sentiment_counts.max()
smallest_count = sentiment_counts.min()
imbalance_ratio = largest_count / smallest_count

divider = '=' * 50
print(f"\n{divider}")
print("CLASS BALANCE ANALYSIS")
print(f"{divider}")
print(f"Majority Class: {majority_class} ({largest_count:,} samples)")
print(f"Minority Class: {minority_class} ({smallest_count:,} samples)")
print(f"Imbalance Ratio: {imbalance_ratio:.2f}:1")

# A ratio above 3:1 is treated as imbalanced by this notebook.
print(
    "\n‚ö†Ô∏è  Dataset is IMBALANCED - Oversampling recommended!"
    if imbalance_ratio > 3
    else "\n‚úì Dataset is relatively balanced"
)

## 2.3 Basic Text Cleaning for EDA

In [None]:
# EDA-only cleaning: lowercase the raw text (section 3.2 later overwrites this column).
df['cleaned_input'] = df['input'].str.lower()

print("Sample Text Comparison (Original vs Cleaned):")
# Show the first three rows side by side, truncated to 80 characters.
for position in range(3):
    raw_text = df['input'].iloc[position]
    lowered_text = df['cleaned_input'].iloc[position]
    print(f"\n[{position+1}] Original: {raw_text[:80]}...")
    print(f"    Cleaned:  {lowered_text[:80]}...")

## 2.4 Text Statistics Analysis

In [None]:
# Calculate text statistics
df['char_length'] = df['cleaned_input'].str.len()
df['word_count'] = df['cleaned_input'].str.split().str.len()

# Average word length = characters / words.
# Empty text gives 0/0 = NaN, but whitespace-only text has characters and
# zero words, so the division yields inf, which fillna(0) alone would keep.
# Map both degenerate cases to 0.
df['avg_word_length'] = (
    (df['char_length'] / df['word_count'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

print("=" * 60)
print("TEXT STATISTICS")
print("=" * 60)

print("\nüìè Character Length:")
print(df['char_length'].describe())

print("\nüìù Word Count:")
print(df['word_count'].describe())

print("\nüìä Average Word Length:")
print(df['avg_word_length'].describe())

# Display sample
display(df[['cleaned_input', 'char_length', 'word_count', 'avg_word_length']].head())

## 2.5 Text Statistics Visualization

In [None]:
# Statistics to plot and the human-readable title used for each
text_stats = ['char_length', 'word_count', 'avg_word_length']
stat_titles = ['Character Length', 'Word Count', 'Average Word Length']

# Histograms by sentiment (stacked, with a KDE overlay)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, stat, title in zip(axes, text_stats, stat_titles):
    sns.histplot(data=df, x=stat, hue='sentiment', kde=True, multiple='stack',
                 palette=sentiment_colors, ax=ax, alpha=0.7)
    ax.set_title(f'Distribution of {title}', fontweight='bold')
    ax.set_xlabel(title)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Boxplots by sentiment
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, stat, title in zip(axes, text_stats, stat_titles):
    # Passing `palette` without `hue` is deprecated in seaborn 0.13+.
    # Assign the x variable to hue and hide the redundant legend, as the
    # countplot in section 2.2 already does.
    sns.boxplot(data=df, x='sentiment', y=stat, hue='sentiment',
                palette=sentiment_colors, legend=False, ax=ax)
    ax.set_title(f'{title} by Sentiment', fontweight='bold')
    ax.set_xlabel('Sentiment')
    ax.set_ylabel(title)

plt.tight_layout()
plt.show()

## 2.6 N-Gram Analysis

In [None]:
# Load NLTK's English stopword list, downloading the corpus on first use
try:
    stopwords_english = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords_english = stopwords.words('english')

print("‚úì Stopwords loaded")

# N-gram analysis per sentiment
ngram_ranges = [(1, 1), (2, 2), (3, 3)]
ngram_names = ['Unigrams', 'Bigrams', 'Trigrams']

# dropna(): labels that the normalisation mapping did not recognise are NaN
# (a float), which has no .upper()/.capitalize(). Skip them instead of
# crashing on the first one.
for sentiment in df['sentiment'].dropna().unique():
    print(f"\n{'=' * 60}")
    print(f"ANALYZING SENTIMENT: {sentiment.upper()}")
    print(f"{'=' * 60}")

    sentiment_df = df[df['sentiment'] == sentiment]
    cleaned_text = sentiment_df['cleaned_input'].dropna()

    if cleaned_text.empty:
        print(f"‚ö†Ô∏è  No text available for sentiment: {sentiment}")
        continue

    current_color = sentiment_colors.get(sentiment, 'gray')

    for n_range, n_name in zip(ngram_ranges, ngram_names):
        vectorizer = CountVectorizer(ngram_range=n_range, stop_words=stopwords_english)

        try:
            # Rows = documents, columns = n-grams; the column sums are the
            # corpus-wide frequency of each n-gram.
            X = vectorizer.fit_transform(cleaned_text)
            sum_words = X.sum(axis=0)
            words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
            words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
            top_ngrams = words_freq[:20]

            if not top_ngrams:
                continue

            top_ngrams_df = pd.DataFrame(top_ngrams, columns=['ngram', 'count'])

            plt.figure(figsize=(12, 6))
            sns.barplot(x='count', y='ngram', data=top_ngrams_df, color=current_color, edgecolor='black')
            plt.title(f'Top 20 {n_name} - {sentiment.capitalize()} Sentiment', fontsize=14, fontweight='bold')
            plt.xlabel('Frequency', fontsize=12)
            plt.ylabel(n_name, fontsize=12)
            plt.tight_layout()
            plt.show()

        except ValueError as e:
            # CountVectorizer raises ValueError when nothing survives
            # stopword removal (empty vocabulary).
            print(f"  ‚ö†Ô∏è  Could not analyze {n_name}: {e}")

print("\n‚úì N-gram analysis complete")

## 2.7 Word Cloud Analysis

In [None]:
# Word cloud for all text
all_text = ' '.join(df['cleaned_input'].dropna())
wordcloud_all = WordCloud(width=1200, height=600, background_color='white',
                          stopwords=stopwords_english, colormap='viridis').generate(all_text)

plt.figure(figsize=(14, 7))
plt.imshow(wordcloud_all, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud - All Sentiments (Stopwords Removed)', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Set lookup is O(1) per token; testing every token of the corpus against
# the stopword *list* scans the whole list each time.
stopword_set = set(stopwords_english)

# Get overall most common words (top 50 non-stopword tokens across all classes)
all_words = all_text.split()
all_words_filtered = [word for word in all_words if word not in stopword_set]
overall_word_counts = Counter(all_words_filtered)
most_common_overall = {word for word, count in overall_word_counts.most_common(50)}

# Tokens excluded from the per-class clouds: stopwords plus the globally
# dominant words, so each cloud highlights class-specific vocabulary.
excluded_words = stopword_set | most_common_overall

# Colormap per class; any class other than positive/neutral falls back to 'Reds'
sentiment_colormaps = {'positive': 'RdYlGn', 'neutral': 'Blues'}

# Word clouds per sentiment (excluding overall common words).
# dropna(): unmapped labels are NaN, which has no .capitalize().
for sentiment in df['sentiment'].dropna().unique():
    sentiment_text = ' '.join(df[df['sentiment'] == sentiment]['cleaned_input'].dropna())

    if sentiment_text:
        sentiment_words_filtered = [word for word in sentiment_text.split()
                                    if word not in excluded_words]
        filtered_text = ' '.join(sentiment_words_filtered)

        if filtered_text:
            wordcloud = WordCloud(width=1200, height=600, background_color='white',
                                  colormap=sentiment_colormaps.get(sentiment, 'Reds')).generate(filtered_text)

            plt.figure(figsize=(14, 7))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title(f'Word Cloud - {sentiment.capitalize()} Sentiment (Unique Words)',
                      fontsize=16, fontweight='bold', pad=20)
            plt.tight_layout()
            plt.show()
        else:
            print(f"‚ö†Ô∏è  No unique words for {sentiment} after filtering")

print("\n‚úì Word cloud analysis complete")

---
# PART 3: DATA PREPROCESSING & CLEANING
---

## 3.1 Handle Missing Values & Duplicates

In [None]:
banner = "=" * 60
print(banner)
print("DATA QUALITY CHECKS")
print(banner)

# --- Missing values in the two columns the pipeline cannot work without ---
print("\n1Ô∏è‚É£  Missing Values:")
critical_columns = ['input', 'output']
missing_critical = df[critical_columns].isnull().sum()
print(missing_critical)

if missing_critical.sum() > 0:
    initial_rows = len(df)
    df = df.dropna(subset=critical_columns)
    print(f"   Dropped {initial_rows - len(df)} rows with missing values")
else:
    print("   ‚úì No missing values in critical columns")

# --- Duplicates: whole-row repeats vs. repeats of the text alone ---
print("\n2Ô∏è‚É£  Duplicate Analysis:")
duplicates_all = df.duplicated().sum()
duplicates_input = df.duplicated(subset=['input']).sum()

print(f"   Full duplicates: {duplicates_all}")
print(f"   Input duplicates: {duplicates_input}")

if duplicates_input > 0:
    print("\n   Sample duplicate texts:")
    # keep=False flags every member of a duplicate group, so sorting by text
    # places the repeated rows next to each other.
    in_duplicate_group = df.duplicated(subset=['input'], keep=False)
    duplicate_samples = df[in_duplicate_group].sort_values('input').head(6)
    display(duplicate_samples[['input', 'sentiment']])

    initial_rows = len(df)
    # Keep the first occurrence of each text and renumber the index.
    df = df.drop_duplicates(subset=['input'], keep='first').reset_index(drop=True)
    print(f"\n   ‚úì Removed {initial_rows - len(df)} duplicate rows")
else:
    print("   ‚úì No duplicates found")

print(f"\nüìä Final dataset size: {len(df):,} rows")

## 3.2 Advanced Text Cleaning

In [None]:
def advanced_text_cleaning(text):
    """
    Advanced text cleaning for LLM compatibility:
    - Remove URLs
    - Remove HTML tags
    - Remove special characters
    - Normalize whitespace

    Args:
        text: Value to clean. Strings are cleaned directly; ``None`` and
            float NaN (pandas' missing-value marker) are treated as missing;
            any other value is converted with ``str()`` and then cleaned.

    Returns:
        str: The cleaned text, or ``""`` for missing input.
    """
    # NaN is the only float that is not equal to itself. Without this check
    # a missing value would be stringified to the literal word "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if not isinstance(text, str):
        text = str(text)

    # Remove URLs. Anchor on a word boundary and require "://" or "www." so
    # that ordinary words which merely contain "http"/"www" (e.g. "awwww")
    # are left intact.
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters (keep letters, numbers, spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Replace the EDA-only lowercase column with the fully cleaned text
df['cleaned_input'] = df['input'].apply(advanced_text_cleaning)

rule = "=" * 60
print(rule)
print("TEXT CLEANING RESULTS")
print(rule)

print("\nComparison (Original vs Cleaned):")
# First three rows, each truncated to 100 characters
for position in range(3):
    before = df['input'].iloc[position]
    after = df['cleaned_input'].iloc[position]
    print(f"\n[{position+1}] Original:")
    print(f"    {before[:100]}...")
    print("    Cleaned:")
    print(f"    {after[:100]}...")

# Rows whose text was entirely URLs/markup/symbols are now empty strings
cleaned_lengths = df['cleaned_input'].str.len()
empty_count = (cleaned_lengths == 0).sum()
print(f"\nüìä Empty strings after cleaning: {empty_count}")

if empty_count > 0:
    df = df[cleaned_lengths > 0].reset_index(drop=True)
    print(f"   Removed {empty_count} empty rows")

print(f"\n‚úì Text cleaning complete - {len(df):,} rows remaining")

## 3.3 Sentiment Label Encoding

In [None]:
# Integer class ids for the three sentiment labels
sentiment_encoding = {
    'positive': 0,
    'neutral': 1,
    'negative': 2
}

df['sentiment_encoded'] = df['sentiment'].map(sentiment_encoding)

rule = "=" * 60
print(rule)
print("LABEL ENCODING")
print(rule)

print("\nSentiment Mapping:")
# Tally every code once, then look each class up (0 when a class is absent)
encoded_counts = df['sentiment_encoded'].value_counts()
total_rows = len(df)
for sentiment, code in sentiment_encoding.items():
    count = encoded_counts.get(code, 0)
    percentage = count / total_rows * 100
    print(f"  {sentiment.capitalize():12s} -> {code}  ({count:6,} samples, {percentage:5.2f}%)")

# Verify encoding: a null here means a label was not a key of the mapping
assert df['sentiment_encoded'].notna().all(), "‚ùå Encoding failed - null values found!"
print("\n‚úì Label encoding successful")

# Display sample
print("\nSample Encoded Data:")
display(df[['cleaned_input', 'sentiment', 'sentiment_encoded']].head())

## 3.4 Text Length Analysis

In [None]:
# Length of the cleaned text in characters and in whitespace-separated words
df['text_length_chars'] = df['cleaned_input'].str.len()
df['text_length_words'] = df['cleaned_input'].str.split().str.len()

rule = "=" * 60
print(rule)
print("TEXT LENGTH STATISTICS")
print(rule)

print("\nüìè Character Length:")
print(df['text_length_chars'].describe())

print("\nüìù Word Count:")
print(df['text_length_words'].describe())

# Visualization: one histogram per length measure, median marked by a dashed line.
# Each spec is (column, bar colour, median-line colour, x-axis label, unit in title).
length_panels = [
    ('text_length_chars', 'skyblue', 'red', 'Number of Characters', 'Characters'),
    ('text_length_words', 'lightcoral', 'blue', 'Number of Words', 'Words'),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for ax, (column, bar_color, median_color, x_label, unit) in zip(axes, length_panels):
    values = df[column]
    median_value = values.median()

    ax.hist(values, bins=50, color=bar_color, edgecolor='black', alpha=0.7)
    ax.axvline(median_value, color=median_color, linestyle='--', linewidth=2,
               label=f"Median: {median_value:.0f}")
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_title(f'Distribution of Text Length ({unit})', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

---
# PART 4: DATA SPLITTING & BALANCING
---

## 4.1 Train/Validation/Test Split (70/15/15)

In [None]:
RANDOM_STATE = 42

rule = "=" * 60
print(rule)
print("DATA SPLITTING")
print(rule)

# Stage 1: hold out 30% of the rows; the remaining 70% is the training set
df_train, df_temp = train_test_split(
    df,
    test_size=0.3,
    stratify=df['sentiment_encoded'],
    random_state=RANDOM_STATE,
)

# Stage 2: cut the hold-out in half -> 15% validation, 15% test
df_val, df_test = train_test_split(
    df_temp,
    test_size=0.5,
    stratify=df_temp['sentiment_encoded'],
    random_state=RANDOM_STATE,
)

splits = [('Training', df_train), ('Validation', df_val), ('Test', df_test)]
total_rows = len(df)

print("\nüìä Dataset Split:")
for split_name, split_df in splits:
    print(f"   {split_name:<11}: {len(split_df):6,} samples ({len(split_df)/total_rows*100:5.1f}%)")
print(f"   Total      : {total_rows:6,} samples")

# Verify stratification: class shares should be near-identical in every split
print("\nüîç Sentiment Distribution Across Splits:")
for split_name, split_df in splits:
    print(f"\n{split_name}:")
    print(split_df['sentiment'].value_counts(normalize=True).sort_index())

print("\n‚úì Data split successful with stratification maintained")

## 4.2 Oversampling Training Set

In [None]:
print("=" * 60)
print("OVERSAMPLING TRAINING SET")
print("=" * 60)

# Identify target count (majority class)
majority_count = df_train['sentiment'].value_counts().max()
print(f"\nTarget samples per class: {majority_count:,}")

print("\nBefore Oversampling:")
print(df_train['sentiment'].value_counts().sort_index())

# Oversample minority classes.
# Collect the per-class frames in a list and concatenate once: growing a
# DataFrame with pd.concat inside the loop re-copies all accumulated rows on
# every iteration and starts from an empty frame, which newer pandas warns about.
balanced_parts = []

for sentiment_label in df_train['sentiment'].unique():
    sentiment_df = df_train[df_train['sentiment'] == sentiment_label]
    current_count = len(sentiment_df)

    if current_count < majority_count:
        # Oversample with replacement up to the majority-class size
        oversampled_df = sentiment_df.sample(n=majority_count, replace=True, random_state=RANDOM_STATE)
        balanced_parts.append(oversampled_df)
        print(f"  {sentiment_label.capitalize():12s}: {current_count:6,} -> {len(oversampled_df):6,} (+{len(oversampled_df)-current_count:,})")
    else:
        balanced_parts.append(sentiment_df)
        print(f"  {sentiment_label.capitalize():12s}: {current_count:6,} (unchanged)")

# Concatenate in class order, then shuffle so classes are interleaved
df_train_oversampled = (
    pd.concat(balanced_parts, ignore_index=True)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

print("\nAfter Oversampling:")
print(df_train_oversampled['sentiment'].value_counts().sort_index())

print(f"\nüìä Training set size: {len(df_train):,} -> {len(df_train_oversampled):,} (+{len(df_train_oversampled)-len(df_train):,})")
print("‚úì Oversampling complete - Classes balanced")

## 4.3 Visualize Oversampling Effect

In [None]:
# Class counts of the training set before and after oversampling, side by side
oversampling_panels = [
    (df_train, 'Training Set - BEFORE Oversampling'),
    (df_train_oversampled, 'Training Set - AFTER Oversampling'),
]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for ax, (frame, panel_title) in zip(axes, oversampling_panels):
    class_counts = frame['sentiment'].value_counts().sort_index()

    # Derive bar colours and tick labels from the classes actually present.
    # A hard-coded ['Negative', 'Neutral', 'Positive'] would mislabel the bars
    # whenever a class is missing or the label set changes.
    bar_colors = [sentiment_colors.get(label, 'gray') for label in class_counts.index]
    tick_labels = [str(label).capitalize() for label in class_counts.index]

    class_counts.plot(kind='bar', ax=ax, color=bar_colors, edgecolor='black', alpha=0.8)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Sentiment', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_xticklabels(tick_labels, rotation=0)
    for container in ax.containers:
        ax.bar_label(container, fmt='%d', fontsize=10)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

---
# PART 5: TOKENIZATION & MODEL PREPARATION
---

## 5.1 Load BERT Tokenizer

In [None]:
# Fetch the pretrained tokenizer that matches the bert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

banner = "=" * 60
print(banner)
print("BERT TOKENIZER")
print(banner)

# Report the tokenizer's vocabulary size and its declared maximum length
print("\nModel: bert-base-uncased")
print(f"Vocabulary size: {tokenizer.vocab_size:,}")
print(f"Model max length: {tokenizer.model_max_length:,}")
print("\n‚úì Tokenizer loaded successfully")

## 5.2 Analyze Token Length Distribution

In [None]:
print("=" * 60)
print("TOKEN LENGTH ANALYSIS")
print("=" * 60)

# Tunables for the sequence-length choice (previously inline magic numbers)
MODEL_TOKEN_LIMIT = 512      # hard cap passed to the tokenizer; also the ceiling for MAX_LENGTH
LONG_SEQUENCE_CUTOFF = 400   # if the P95-based length exceeds this ...
PRACTICAL_MAX_LENGTH = 330   # ... fall back to this fixed, cheaper length

# Calculate token lengths for training data (special tokens included)
print("\nTokenizing training data to analyze length distribution...")
token_lengths = [
    len(tokenizer.encode(str(text), add_special_tokens=True, truncation=True,
                         max_length=MODEL_TOKEN_LIMIT))
    for text in df_train_oversampled['cleaned_input']
]

# Compute each summary statistic once and reuse it for printing and plotting
mean_length = np.mean(token_lengths)
p95_length = np.percentile(token_lengths, 95)
p99_length = np.percentile(token_lengths, 99)

# Statistics
print("\nüìä Token Length Statistics:")
print(f"   Min        : {np.min(token_lengths)}")
print(f"   Max        : {np.max(token_lengths)}")
print(f"   Mean       : {mean_length:.2f}")
print(f"   Median     : {np.median(token_lengths):.0f}")
print(f"   P95        : {p95_length:.0f}")
print(f"   P99        : {p99_length:.0f}")

# Visualization
plt.figure(figsize=(14, 6))
sns.histplot(token_lengths, bins=50, kde=True, color='steelblue', edgecolor='black', alpha=0.7)
plt.axvline(mean_length, color='red', linestyle='--', linewidth=2,
            label=f'Mean: {mean_length:.0f}')
plt.axvline(p95_length, color='orange', linestyle='--', linewidth=2,
            label=f'P95: {p95_length:.0f}')
plt.xlabel('Number of Tokens', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Token Length Distribution (Training Set)', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Determine optimal MAX_LENGTH: the 95th percentile, never above the model limit
suggested_max = int(p95_length)
MAX_LENGTH = min(suggested_max, MODEL_TOKEN_LIMIT)

# Use practical value if too large
if MAX_LENGTH > LONG_SEQUENCE_CUTOFF:
    MAX_LENGTH = PRACTICAL_MAX_LENGTH

print(f"\nüéØ Optimal MAX_LENGTH Selection:")
print(f"   Suggested (P95): {suggested_max}")
print(f"   Chosen          : {MAX_LENGTH}")
# Share of training texts that fit in MAX_LENGTH tokens without truncation
coverage = (np.array(token_lengths) <= MAX_LENGTH).sum() / len(token_lengths) * 100
print(f"   Coverage        : {coverage:.2f}%")
print(f"\n‚úì MAX_LENGTH = {MAX_LENGTH} will cover {coverage:.1f}% of the data")

## 5.3 Tokenize All Datasets

In [None]:
def tokenize_data(texts, tokenizer, max_length):
    """
    Encode a collection of texts in one batched tokenizer call.

    Args:
        texts: Iterable of texts; it is materialised into a list before
            being passed to the tokenizer.
        tokenizer: Callable invoked as ``tokenizer(list_of_texts, **options)``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever the tokenizer returns for the batch (PyTorch tensors are
        requested through ``return_tensors='pt'``).
    """
    batch = list(texts)
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(batch, **encoding_options)

banner = "=" * 60
print(banner)
print("TOKENIZATION")
print(banner)

print(f"\nTokenizing datasets with MAX_LENGTH = {MAX_LENGTH}...")

# Encode the three splits in order (train, validation, test) with identical settings
X_train_tokenized, X_val_tokenized, X_test_tokenized = (
    tokenize_data(split_df['cleaned_input'], tokenizer, MAX_LENGTH)
    for split_df in (df_train_oversampled, df_val, df_test)
)

print("\n‚úì Tokenization complete")
print("\nüìä Tokenized Shapes (input_ids):")
for split_name, encoded in (
    ('Training', X_train_tokenized),
    ('Validation', X_val_tokenized),
    ('Test', X_test_tokenized),
):
    print(f"   {split_name:<11}: {encoded['input_ids'].shape}")

## 5.4 Create PyTorch Datasets

In [None]:
print("=" * 60)
print("PYTORCH DATASET CREATION")
print("=" * 60)

# Convert to PyTorch tensors
print("\nConverting tokenized data to PyTorch tensors...")

# Labels are created explicitly as torch.long (int64). `.astype(int)` follows
# the platform's default integer, which is 32-bit on Windows with NumPy < 2,
# while PyTorch's classification losses expect int64 class indices.

# Training data
input_ids_train = X_train_tokenized['input_ids']
attention_mask_train = X_train_tokenized['attention_mask']
token_type_ids_train = X_train_tokenized['token_type_ids']
labels_train = torch.tensor(df_train_oversampled['sentiment_encoded'].to_numpy(), dtype=torch.long)

# Validation data
input_ids_val = X_val_tokenized['input_ids']
attention_mask_val = X_val_tokenized['attention_mask']
token_type_ids_val = X_val_tokenized['token_type_ids']
labels_val = torch.tensor(df_val['sentiment_encoded'].to_numpy(), dtype=torch.long)

# Test data
input_ids_test = X_test_tokenized['input_ids']
attention_mask_test = X_test_tokenized['attention_mask']
token_type_ids_test = X_test_tokenized['token_type_ids']
labels_test = torch.tensor(df_test['sentiment_encoded'].to_numpy(), dtype=torch.long)

print("‚úì Tensors created")

# Create TensorDatasets. Each item is the tuple
# (input_ids, attention_mask, token_type_ids, label).
print("\nCreating TensorDatasets...")
train_dataset = TensorDataset(input_ids_train, attention_mask_train, token_type_ids_train, labels_train)
val_dataset = TensorDataset(input_ids_val, attention_mask_val, token_type_ids_val, labels_val)
test_dataset = TensorDataset(input_ids_test, attention_mask_test, token_type_ids_test, labels_test)

print("‚úì TensorDatasets created")
print(f"\nüìä Dataset Sizes:")
print(f"   Training   : {len(train_dataset):,} samples")
print(f"   Validation : {len(val_dataset):,} samples")
print(f"   Test       : {len(test_dataset):,} samples")

## 5.5 Create DataLoaders

In [None]:
BATCH_SIZE = 32

banner = "=" * 60
print(banner)
print("DATALOADER CREATION")
print(banner)

# Only the training loader shuffles; the evaluation loaders keep dataset order
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader, test_dataloader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

print(f"\nBatch size: {BATCH_SIZE}")
print("\nüìä DataLoader Batches:")
for split_name, loader in (
    ('Training', train_dataloader),
    ('Validation', val_dataloader),
    ('Test', test_dataloader),
):
    print(f"   {split_name:<11}: {len(loader)} batches")

print("\n‚úì DataLoaders ready for model training")

---
# FINAL SUMMARY
---

## Pipeline Summary & Data Dictionary

In [None]:
print("=" * 80)
print("COMPLETE PREPROCESSING PIPELINE SUMMARY")
print("=" * 80)

print("\nüìä DATASET STATISTICS:")
# `df` is filtered in place by the cleaning cells, so by this point len(df)
# is the post-cleaning size only. The raw row count is not retained anywhere,
# so no "Original dataset" figure is printed: it would just repeat this number.
print(f"   After cleaning          : {len(df):,} samples")
print(f"   Training (original)     : {len(df_train):,} samples")
print(f"   Training (oversampled)  : {len(df_train_oversampled):,} samples")
print(f"   Validation              : {len(df_val):,} samples")
print(f"   Test                    : {len(df_test):,} samples")

print("\nüîÑ TRANSFORMATIONS APPLIED:")
print("   ‚úì Missing value removal")
print("   ‚úì Duplicate removal")
print("   ‚úì Advanced text cleaning (URLs, HTML, special chars)")
print("   ‚úì Sentiment label normalization")
print("   ‚úì Label encoding (3 classes: positive=0, neutral=1, negative=2)")
print("   ‚úì Stratified train/val/test split (70/15/15)")
print("   ‚úì Oversampling on training set (balanced 1:1:1 ratio)")
print(f"   ‚úì BERT tokenization (max_length={MAX_LENGTH})")
print("   ‚úì PyTorch DataLoader creation")

print("\nüìà SENTIMENT DISTRIBUTION (Training - Oversampled):")
train_dist = df_train_oversampled['sentiment'].value_counts().sort_index()
for sentiment in ['negative', 'neutral', 'positive']:
    # .get(..., 0): report a class with no samples as 0 instead of raising KeyError
    count = train_dist.get(sentiment, 0)
    pct = count / len(df_train_oversampled) * 100
    print(f"   {sentiment.capitalize():12s}: {count:6,} samples ({pct:5.1f}%)")

print("\nüéØ MODEL CONFIGURATION:")
print(f"   Tokenizer          : bert-base-uncased")
print(f"   Max sequence length: {MAX_LENGTH} tokens")
print(f"   Batch size         : {BATCH_SIZE}")
print(f"   Number of classes  : 3 (positive, neutral, negative)")

print("\n" + "=" * 80)
print("‚úÖ PREPROCESSING COMPLETE - DATASET READY FOR TRAINING")
print("=" * 80)

In [None]:
# Data Dictionary: one record per pipeline column -> (name, dtype, meaning, example)
dictionary_rows = [
    ('input', 'string',
     'Original raw text from dataset',
     'Amazing product! Love it ‚ù§Ô∏è'),
    ('output', 'string',
     'Output label with prefix format (A:, B:, etc.)',
     'A: very positive'),
    ('cleaned_input', 'string',
     'Cleaned text (URLs, HTML, special chars removed)',
     'Amazing product Love it'),
    ('sentiment', 'string',
     'Categorical sentiment label (positive/neutral/negative)',
     'positive'),
    ('sentiment_encoded', 'int64',
     'Numerical sentiment encoding (0=positive, 1=neutral, 2=negative)',
     '0'),
    ('text_length_chars', 'int64',
     'Character count of cleaned text',
     '23'),
    ('text_length_words', 'int64',
     'Word count of cleaned text',
     '4'),
]

data_dictionary = pd.DataFrame(
    dictionary_rows,
    columns=['Column', 'Type', 'Description', 'Example'],
)

print("\n" + "=" * 80)
print("DATA DICTIONARY")
print("=" * 80)
display(data_dictionary)

---
# NEXT STEPS

**Model Training Pipeline:**
1. Load BertForSequenceClassification model
2. Configure optimizer (AdamW) and learning rate scheduler
3. Implement training loop with validation
4. Evaluate on test set
5. Save best model for inference

**Available Variables:**
- `train_dataloader` - Training data ready for model
- `val_dataloader` - Validation data for hyperparameter tuning
- `test_dataloader` - Test data for final evaluation
- `tokenizer` - BERT tokenizer for inference
- `df_train_oversampled` - Oversampled (class-balanced) training DataFrame; `df_train` holds the original, un-resampled training split
- `df_val` - Validation DataFrame
- `df_test` - Test DataFrame

**Model Configuration:**
- Input: Tokenized text (max_length=`MAX_LENGTH`, computed in section 5.2 from the 95th percentile of training token lengths; e.g. 229)
- Output: 3 classes (positive=0, neutral=1, negative=2)
- Architecture: BERT-base-uncased
- Batch size: 32