# -------------- Step 1: Import Data and Libraries --------------

In [1]:
import re
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Read the raw review export; rows pandas cannot parse are skipped instead of raising
reviews_path = 'user_courses_review_09_2023.csv'
data = pd.read_csv(reviews_path, on_bad_lines='skip')
data.head()

Unnamed: 0,course_name,lecture_name,review_rating,review_comment
0,A/B Testing in Python,How to set up the A/B test,5,If she could provide more details about the pr...
1,A/B Testing in Python,How to set up the A/B test,5,nice
2,A/B Testing in Python,How to set up the A/B test,5,excellent course
3,A/B Testing in Python,How to set up the A/B test,5,nice
4,A/B Testing in Python,Conclusion,5,It was an experiential learning process


In [3]:
# Show basic info and a preview
# data.info(), data.head(), data.describe(include='all')


In [4]:
# Record the column dtypes as loaded
dtypes_before = data.dtypes

# Coerce ratings to numbers; entries that cannot be parsed become NaN
rating_numeric = pd.to_numeric(data['review_rating'], errors='coerce')
data['review_rating'] = rating_numeric

# Record the dtypes again so the conversion can be compared side by side
dtypes_after = data.dtypes

print(dtypes_before,'\n\n',dtypes_after)


course_name       object
lecture_name      object
review_rating     object
review_comment    object
dtype: object 

 course_name        object
lecture_name       object
review_rating     float64
review_comment     object
dtype: object


In [5]:
# Count missing entries per column
missing_per_column = data.isna().sum()
print("\nMissing Values:\n", missing_per_column)

# Count rows that exactly repeat an earlier row
duplicates = data.duplicated().sum()
print(f"\nDuplicate Rows: {duplicates}")

# Keep only the rows that have both a rating and a course name
required_columns = ['review_rating', 'course_name']
df_cleaned = data.dropna(subset=required_columns)


Missing Values:
 course_name         0
lecture_name       10
review_rating      18
review_comment    203
dtype: int64

Duplicate Rows: 1453


In [6]:
# Drop exact duplicate rows (returns a new frame; the input is not mutated)
df_cleaned = df_cleaned.drop_duplicates()

# Sanity check: no duplicates should remain
duplicates = df_cleaned.duplicated().sum()
print(f"\nDuplicate Rows: {duplicates}")

# Confirm result
print(f"Shape after removing duplicates: {df_cleaned.shape}")

# Remove reviews without any comment text, THEN renumber the rows.
# Resetting the index before this filter would leave gaps in the final index,
# and avoiding inplace=True keeps the cell a plain "new frame from old frame" step.
df_cleaned = df_cleaned.dropna(subset=['review_comment']).reset_index(drop=True)
print(f"Shape after dropping missing comments: {df_cleaned.shape}")



Duplicate Rows: 0
Shape after removing duplicates: (9371, 4)


In [7]:
# Show the cleaned reviews (the notebook renders a truncated preview of the frame)
df_cleaned

Unnamed: 0,course_name,lecture_name,review_rating,review_comment
0,A/B Testing in Python,How to set up the A/B test,5.0,If she could provide more details about the pr...
1,A/B Testing in Python,How to set up the A/B test,5.0,nice
2,A/B Testing in Python,How to set up the A/B test,5.0,excellent course
3,A/B Testing in Python,Conclusion,5.0,It was an experiential learning process
4,A/B Testing in Python,How to set up the A/B test,5.0,great\n
...,...,...,...,...
9366,Working with Text Files in Python,Working with Text Files - Conclusion,5.0,This course was very informative and had a lot...
9367,Working with Text Files in Python,Principles of Importing Data in Python,5.0,On to the point
9368,Working with Text Files in Python,Principles of Importing Data in Python,5.0,loving it
9369,Working with Text Files in Python,Principles of Importing Data in Python,5.0,Good course


In [8]:
# Derive weak sentiment labels from TextBlob's polarity score of each comment
from textblob import TextBlob


def _comment_polarity(comment):
    """Return TextBlob's sentiment polarity for a single review comment."""
    return TextBlob(comment).sentiment.polarity


def _polarity_to_label(score):
    """Map a polarity score to a binary label: 1 when strictly positive, otherwise 0."""
    if score > 0:
        return 1
    return 0


df_cleaned['polarity'] = df_cleaned['review_comment'].apply(_comment_polarity)
df_cleaned['label'] = df_cleaned['polarity'].apply(_polarity_to_label)

# -------------- Step 2: Build Custom Preprocessor --------------

In [9]:


# Download the NLTK resources the preprocessor relies on.
# Packages that are already present locally are reported as up-to-date and skipped.
nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize (older NLTK releases)
# Newer NLTK releases load the tokenizer from 'punkt_tab' rather than 'punkt';
# requesting both keeps this cell working on a fresh environment with either version.
nltk.download('punkt_tab')
nltk.download('stopwords')  # English stop-word list
nltk.download('wordnet')    # lexical database behind WordNetLemmatizer

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that normalises raw review text.

    Each document is lower-cased, stripped of every non-alphabetic character,
    tokenised with NLTK, filtered against the English stop-word list and
    lemmatised with WordNet. The surviving tokens are re-joined with single
    spaces so the result can be fed to a text vectoriser.

    The transformer learns nothing from the data: ``fit`` simply returns ``self``.
    """

    def __init__(self):
        # Load the lookup resources once per instance rather than once per document.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Return the cleaned, lemmatised form of a single document.

        Parameters
        ----------
        text : str or scalar
            Raw document. Missing values (``None`` / ``NaN`` / ``pd.NA``) are
            treated as an empty document; any other non-string value is
            converted with ``str`` before cleaning.

        Returns
        -------
        str
            Space-separated lemmas with stop words removed (possibly empty).
        """
        if not isinstance(text, str):
            # A stray missing value should not crash the whole transform.
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ''
            text = str(text)

        # Lowercase, then replace everything that is not an ASCII letter with a space.
        letters_only = re.sub(r"[^a-zA-Z]", " ", text.lower())
        tokens = nltk.word_tokenize(letters_only)

        # Tokens are purely alphabetic at this point, so no separate punctuation
        # filter is needed; only stop words have to be removed before lemmatising.
        lemmas = [
            self.lemmatizer.lemmatize(word)
            for word in tokens
            if word not in self.stop_words
        ]
        return ' '.join(lemmas)

    def fit(self, X, y=None):
        """No-op fit, present for scikit-learn pipeline compatibility.

        Returns
        -------
        TextPreprocessor
            ``self``, unchanged.
        """
        return self

    def transform(self, X):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : pandas.Series or 1-D iterable of documents
            A Series is processed as-is (its index is preserved). Lists, tuples
            and 1-D arrays are wrapped in a Series first so that plain Python
            inputs such as ``pipeline.predict(["some text"])`` also work.

        Returns
        -------
        pandas.Series
            The cleaned documents, one per input element.
        """
        if not isinstance(X, pd.Series):
            X = pd.Series(X)
        return X.apply(self.clean_text)


[nltk_data] Downloading package punkt to /Users/nehasoni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nehasoni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nehasoni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# -------------- Step 3: Build and Train the Full Pipeline --------------

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Assemble the pipeline stage by stage: text cleaning -> TF-IDF features -> Naive Bayes
pipeline_steps = [
    ('preprocess', TextPreprocessor()),
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('nb', MultinomialNB()),
]
nb_pipeline = Pipeline(pipeline_steps)

# Fitting is deliberately not done in this cell; the disabled line below would
# train directly on the raw comments and labels of df_cleaned, before any resampling.
# nb_pipeline.fit(df_cleaned['review_comment'], df_cleaned['label'])  # Use original text here


# ------ Resample for balance----------

In [13]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes on the raw text, before any vectorizing takes place
ros = RandomOverSampler(random_state=42)
comment_frame = df_cleaned[['review_comment']]  # single-column frame: the sampler is given 2-D input
sentiment_labels = df_cleaned['label']
X_resampled, y_resampled = ros.fit_resample(comment_frame, sentiment_labels)

# Train the whole pipeline on the resampled comments
nb_pipeline.fit(X_resampled['review_comment'], y_resampled)


Pipeline(steps=[('preprocess', TextPreprocessor()),
                ('tfidf', TfidfVectorizer(max_features=5000)),
                ('nb', MultinomialNB())])

# --------- Save the pipeline trained on balanced data ------------

In [17]:
import pickle

# Serialize the fitted pipeline to disk.
# NOTE: pickle stores classes by reference, so whoever loads this file needs the
# TextPreprocessor class from this notebook to be importable under the same name.
pipeline_path = "nb_pipeline_with_cleaning.pkl"
with open(pipeline_path, "wb") as model_file:
    pickle.dump(nb_pipeline, model_file)

print("✅ Model training complete. Pipeline saved to 'nb_pipeline_with_cleaning.pkl'.")

✅ Model training complete. Pipeline saved to 'nb_pipeline_with_cleaning.pkl'.


# --------- Predict new data------------

In [23]:
# Review text to score
new_review = "This course was great but slow."

# Wrap the text in a Series once and reuse it for both calls
review_input = pd.Series([new_review])

# Hard prediction and per-class probabilities for the single review
predicted_label = nb_pipeline.predict(review_input)[0]
predicted_proba = nb_pipeline.predict_proba(review_input)[0]

# Class ids in the same order as the probability columns (0 = Negative, 1 = Positive)
class_names = nb_pipeline.classes_

# Report the outcome
print(f"Predicted Label: {predicted_label} ({'Positive' if predicted_label == 1 else 'Negative'})")
print("Class Probabilities:")
for class_id, class_prob in zip(class_names, predicted_proba):
    print(f"  Class {class_id}: {class_prob:.4f}")


Predicted Label: 1 (Positive)
Class Probabilities:
  Class 0: 0.4105
  Class 1: 0.5895
