In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import movie_reviews
import random
import pandas as pd

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

import re

In [None]:
nltk.download('movie_reviews')

# One record per corpus file: the raw review text plus the category it is
# filed under, visiting categories in the order the corpus reports them.
data = [
    {'review': movie_reviews.raw(fileid), 'label': label}
    for label in movie_reviews.categories()
    for fileid in movie_reviews.fileids(label)
]

# Tabulate the records, shuffle every row with a fixed seed (the records
# above are grouped by category), and renumber the index from 0.
df = (
    pd.DataFrame(data)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Show the shuffled DataFrame of reviews and their labels.
df

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(
    df['review'],
    df['label'],
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fetch the NLTK resources used by the text-cleaning cells below:
# WordNet (for the lemmatizer) and the stop-word lists.
nltk.download('wordnet')
nltk.download('omw-1.4')  # for WordNet data
nltk.download('stopwords')

In [None]:
# Shared lemmatizer instance used by the custom analyzer.
lemmatizer = WordNetLemmatizer()
# English stop words held in a set so membership checks are constant time.
stop_words = {*stopwords.words('english')}

In [None]:
def lemmatize_analyzer(text):
    """Turn a raw document into a list of lemmatized, stop-word-free tokens.

    Intended to be passed as the ``analyzer`` callable of a vectorizer.
    Relies on the notebook-level ``stop_words`` set and ``lemmatizer``
    instance.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lowercased tokens made only of the ASCII letters a-z, with stop
        words removed and each remaining token lemmatized.
    """
    # Lowercase, then turn every character that is not a-z or whitespace into
    # a space. Replacing (rather than deleting) keeps word boundaries intact:
    # "don't" becomes "don t" and "well-made" becomes "well made" instead of
    # the fused tokens "dont" / "wellmade", which would slip past the
    # stop-word filter and pollute the vocabulary.
    cleaned = re.sub(r'[^a-z\s]', ' ', text.lower())

    # Drop stop words first, then lemmatize what is left.
    return [
        lemmatizer.lemmatize(word)
        for word in cleaned.split()
        if word not in stop_words
    ]

In [None]:
# Bag-of-words vectorizer driven by the custom analyzer, capped at 5000
# vocabulary entries.
# No `stop_words` argument is passed: scikit-learn only applies it when
# analyzer == 'word', so with a callable analyzer it is ignored (and triggers
# a warning). Stop-word removal already happens inside lemmatize_analyzer.
vectorizer = CountVectorizer(
    analyzer=lemmatize_analyzer,
    max_features=5000
)


In [None]:
# Two-stage model: token counts from the vectorizer feed a 200-tree
# random forest with a fixed seed.
pipeline_steps = [
    ('vectorizer', vectorizer),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=41)),
]
pipeline = Pipeline(steps=pipeline_steps)


In [None]:
# Keep direct handles on both pipeline stages so they can be inspected
# individually in later cells.
fitted_steps = pipeline.named_steps
vectorizer = fitted_steps['vectorizer']
classifier = fitted_steps['classifier']

In [None]:
# Fit both pipeline stages (vectorizer, then classifier) on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Pair every vocabulary entry with the importance the forest assigned to it.
importances = classifier.feature_importances_
feature_names = vectorizer.get_feature_names_out()

df_features = pd.DataFrame(dict(word=feature_names, importance=importances))

In [None]:
# Show the word / importance table.
df_features

In [None]:
# Re-vectorize the training reviews and densify so rows can be masked by class.
X_train_vec = vectorizer.transform(X_train)
X_dense = X_train_vec.toarray()

# 1 for 'pos' reviews, 0 for everything else
y_train_bin = y_train.eq('pos').astype(int)
is_positive = y_train_bin.eq(1)

# Column sums give per-word occurrence counts: within each class, then overall.
count_positive = X_dense[is_positive].sum(axis=0)
count_negative = X_dense[~is_positive].sum(axis=0)
total_count = X_dense.sum(axis=0)

importances = classifier.feature_importances_
feature_names = vectorizer.get_feature_names_out()

# One row per vocabulary word: importance alongside its class-wise counts.
df_features = pd.DataFrame(dict(
    word=feature_names,
    importance=importances,
    count_positive=count_positive,
    count_negative=count_negative,
    total_count=total_count,
))

# Most important words first
df_features_sorted = df_features.sort_values(by='importance', ascending=False)



In [None]:
# The 20 words with the highest feature importance.
df_features_sorted.head(20)

In [None]:
# The 20 words with the lowest feature importance.
df_features_sorted.tail(20)