# Environment Setup

In [10]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# For text processing
from sklearn.feature_extraction.text import TfidfVectorizer

# For model building and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Machine Learning models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier  # Add this line

# To handle sparse matrices
from scipy.sparse import hstack

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Load the data
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


Train shape: (1697533, 9)
Test shape: (212192, 2)


# Data Loading and Exploration

In [7]:
# Display first few rows of the training data
train_df.head()

# Check for missing values in training data
print("Missing values in training data:")
print(train_df.isnull().sum())



Missing values in training data:
Id                             0
ProductId                      0
UserId                         0
HelpfulnessNumerator           0
HelpfulnessDenominator         0
Time                           0
Summary                       32
Text                          62
Score                     212192
dtype: int64


# Feature Engineering

In [8]:
def add_features_to(df):
    # Handle missing values in 'HelpfulnessDenominator' to avoid division by zero
    df['HelpfulnessDenominator'] = df['HelpfulnessDenominator'].replace(0, np.nan)
    df['Helpfulness'] = df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']
    df['Helpfulness'] = df['Helpfulness'].fillna(0)

    # Convert 'Time' to datetime and extract components
    df['Time'] = pd.to_datetime(df['Time'], unit='s')
    df['Review_Year'] = df['Time'].dt.year
    df['Review_Month'] = df['Time'].dt.month
    df['Review_Day'] = df['Time'].dt.day

    # Fill NaN in 'Summary' and 'Text'
    df['Summary'] = df['Summary'].fillna('')
    df['Text'] = df['Text'].fillna('')

    # Length of 'Summary' and 'Text'
    df['Summary_length'] = df['Summary'].apply(len)
    df['Text_length'] = df['Text'].apply(len)

    # Word count in 'Summary' and 'Text'
    df['Summary_word_count'] = df['Summary'].apply(lambda x: len(x.split()))
    df['Text_word_count'] = df['Text'].apply(lambda x: len(x.split()))
    
    return df

# Apply the function to the entire train_df
train_df = add_features_to(train_df)

# Training data: Rows where 'Score' is not null
train_data = train_df[train_df['Score'].notnull()]

# Test data: Rows where 'Id' is in test_df['Id'] and 'Score' is null
test_data = train_df[train_df['Id'].isin(test_df['Id']) & train_df['Score'].isnull()]

# Verify that all test IDs are included
missing_ids = set(test_df['Id']) - set(test_data['Id'])
if len(missing_ids) > 0:
    print(f"Missing Ids in test_data: {missing_ids}")
else:
    print("All test Ids are present in test_data.")

# If any Ids are missing, you may need to adjust the approach.

# Merge test_df with train_df to get test_data
test_data = pd.merge(test_df[['Id']], train_df.drop(columns=['Score']), on='Id', how='left')

# Combine 'Summary' and 'Text' in train and test data
train_data['Combined_Text'] = train_data['Summary'] + ' ' + train_data['Text']
test_data['Combined_Text'] = test_data['Summary'] + ' ' + test_data['Text']

# Combine the data for vectorization
combined_data = pd.concat([train_data['Combined_Text'], test_data['Combined_Text']], axis=0)

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

# Fit and transform the combined text data
tfidf_combined_text = tfidf_vectorizer.fit_transform(combined_data)

# Split the vectorized data back into train and test sets
tfidf_train = tfidf_combined_text[:len(train_data)]
tfidf_test = tfidf_combined_text[len(train_data):]

# List of numerical features
numeric_features = ['HelpfulnessNumerator', 'HelpfulnessDenominator', 'Helpfulness',
                    'Summary_length', 'Text_length', 'Summary_word_count', 'Text_word_count',
                    'Review_Year', 'Review_Month', 'Review_Day']

# Prepare numeric features for training and testing data
X_train_numeric = train_data[numeric_features].fillna(0)
X_test_numeric = test_data[numeric_features].fillna(0)

# Combine TF-IDF features with numeric features
from scipy.sparse import csr_matrix

X_train_combined = hstack([tfidf_train, csr_matrix(X_train_numeric.values)])
X_test_combined = hstack([tfidf_test, csr_matrix(X_test_numeric.values)])

# Define the target variable
y_train = train_data['Score']

All test Ids are present in test_data.


# Model Building and Evaluation


In [11]:
# Split the training data into training and validation sets
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train_combined, y_train, test_size=0.25, random_state=42)

print("Training Logistic Regression model...")
logreg = LogisticRegression(max_iter=1000, n_jobs=-1)
logreg.fit(X_train_part, y_train_part)

# Predict and evaluate
y_pred_logreg = logreg.predict(X_valid)
accuracy_logreg = accuracy_score(y_valid, y_pred_logreg)
print("Logistic Regression Accuracy:", accuracy_logreg)

print("Training Multinomial Naive Bayes model...")
mnb = MultinomialNB()
mnb.fit(X_train_part, y_train_part)

# Predict and evaluate
y_pred_mnb = mnb.predict(X_valid)
accuracy_mnb = accuracy_score(y_valid, y_pred_mnb)
print("Multinomial Naive Bayes Accuracy:", accuracy_mnb)

# Optional: Dimensionality reduction before Random Forest
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
X_train_reduced = svd.fit_transform(X_train_combined)
X_valid_reduced = svd.transform(X_valid)

print("Training Random Forest Classifier model...")
rf_classifier = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train_reduced[:len(y_train_part)], y_train_part)

# Predict and evaluate
y_pred_rf = rf_classifier.predict(X_valid_reduced)
accuracy_rf = accuracy_score(y_valid, y_pred_rf)
print("Random Forest Classifier Accuracy:", accuracy_rf)

Training Logistic Regression model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.5480642867914772
Training Multinomial Naive Bayes model...
Multinomial Naive Bayes Accuracy: 0.45190878341986773
Training Random Forest Classifier model...
Random Forest Classifier Accuracy: 0.39996660706206777


# Model Selection


In [12]:
# Compare accuracies
print("Model Accuracies:")
print(f"Logistic Regression: {accuracy_logreg}")
print(f"Multinomial Naive Bayes: {accuracy_mnb}")
print(f"Random Forest Classifier: {accuracy_rf}")


Model Accuracies:
Logistic Regression: 0.5480642867914772
Multinomial Naive Bayes: 0.45190878341986773
Random Forest Classifier: 0.39996660706206777


# Final Model Training and Prediction

In [13]:
# If Logistic Regression is the best model
y_test_pred = logreg.predict(X_test_combined)

# Prepare submission file
submission = test_df[['Id']].copy()
submission['Score'] = y_test_pred

# Save to CSV
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


Submission file created successfully!
