# Modeling

In [2]:
# Imports
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter

In [3]:
# Read the cleaned and normalized Reddit posts into a DataFrame.
# The path is relative, so the notebook must run from the directory that contains `data/`.
df = pd.read_csv(filepath_or_buffer='data/reddit_post_cleaned_normalized.csv')

In [4]:
# Replace missing post bodies with an empty string so the column holds only text
df['body'] = df.body.fillna(value='')

In [5]:
# Initializing TF-IDF Vectorizer (features are unigrams and bigrams)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Combining title and body text.
# String concatenation with a missing value yields NaN, and a NaN document
# cannot be vectorized, so missing values in either column are treated as
# empty strings at the point of concatenation.
df['combined_text'] = df['title'].fillna('') + ' ' + df['body'].fillna('')

# Applying TF-IDF transformation
# NOTE(review): the vectorizer is fit on every row, i.e. before the train/test
# split performed later in this notebook, so vocabulary and IDF statistics are
# learned from the test rows as well. Consider fitting on the training rows only.
X = tfidf_vectorizer.fit_transform(df['combined_text'])

# Checking shape of the transformed matrix (rows = posts, columns = n-gram features)
print(f"TF-IDF Matrix Shape: {X.shape}")

TF-IDF Matrix Shape: (2692, 73457)


In [6]:
from sklearn.model_selection import train_test_split

# Defining the target variable, encoding subreddits (askscience -> 0, explainlikeimfive -> 1)
y = df['subreddit'].map({'askscience': 0, 'explainlikeimfive': 1})

# Series.map leaves NaN for any value that is not a key of the dict; fail here
# with a clear message instead of letting NaN labels reach the models.
if y.isna().any():
    raise ValueError(
        "Column 'subreddit' contains values other than 'askscience' / 'explainlikeimfive'"
    )

# Splitting the data into train and test sets (80% train, 20% test)
# NOTE(review): the split is not stratified on y; consider stratify=y if class balance matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Checking the shape of the splits
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (2153, 73457)
Test set size: (539, 73457)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Baseline Logistic Regression: built with a fixed seed and fitted on the training split
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred_lr = lr_model.predict(X_test)

# Report overall accuracy followed by the per-class classification report
print(
    "Logistic Regression Performance:",
    f"Accuracy: {accuracy_score(y_test, y_pred_lr)}",
    classification_report(y_test, y_pred_lr),
    sep="\n",
)

Logistic Regression Performance:
Accuracy: 0.7179962894248608
              precision    recall  f1-score   support

           0       0.74      0.71      0.72       279
           1       0.70      0.73      0.71       260

    accuracy                           0.72       539
   macro avg       0.72      0.72      0.72       539
weighted avg       0.72      0.72      0.72       539



In [8]:
from sklearn.ensemble import RandomForestClassifier

# Baseline Random Forest: 100 trees, fixed seed, fitted on the training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out test split
y_pred_rf = rf_model.predict(X_test)

# Report overall accuracy followed by the per-class classification report
print(
    "Random Forest Performance:",
    f"Accuracy: {accuracy_score(y_test, y_pred_rf)}",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)

Random Forest Performance:
Accuracy: 0.6901669758812616
              precision    recall  f1-score   support

           0       0.81      0.53      0.64       279
           1       0.63      0.87      0.73       260

    accuracy                           0.69       539
   macro avg       0.72      0.70      0.68       539
weighted avg       0.72      0.69      0.68       539



In [30]:
from sklearn.model_selection import GridSearchCV

# Defining hyperparameter grid for Logistic Regression:
# five values of the regularization parameter C, liblinear solver only
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear']}

# Initializing GridSearchCV (5-fold CV, candidates ranked by recall).
# random_state is fixed to match the baseline Logistic Regression and the
# Random Forest search in this notebook, so reruns give the same result.
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid,
    cv=5,
    scoring='recall',
)

# Training with GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and best score (the score is the mean cross-validated recall)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Making predictions with the best model
best_lr_model = grid_search.best_estimator_
y_pred_best_lr = best_lr_model.predict(X_test)

# Evaluating the tuned model on the held-out test split
print("Tuned Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_lr)}")
print(classification_report(y_test, y_pred_best_lr))

Best Parameters: {'C': 10, 'solver': 'liblinear'}
Best Score: 0.7058858045587904
Tuned Logistic Regression Performance:
Accuracy: 0.7198515769944341
              precision    recall  f1-score   support

           0       0.75      0.68      0.72       279
           1       0.69      0.76      0.72       260

    accuracy                           0.72       539
   macro avg       0.72      0.72      0.72       539
weighted avg       0.72      0.72      0.72       539



In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Search space for the Random Forest
param_grid = dict(
    n_estimators=[50, 100, 200],  # how many trees to grow
    max_depth=[10, 20, 30],       # maximum depth of each tree
    bootstrap=[True, False],      # with / without bootstrap sampling
)

# Class-weighted forest tuned by 5-fold cross-validation on all processors;
# candidates are ranked by recall (ELI5 is label 1). The search is fitted
# on the training split as part of the same expression.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Winning configuration and its mean cross-validated recall
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Recall Score: {grid_search.best_score_}")

# Refit best estimator -> predictions for the held-out test split
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)

# Per-class report; the recall of class 1 (ELI5) is the number of interest
print(
    "Tuned Random Forest Performance:",
    classification_report(y_test, y_pred_best_rf),
    sep="\n",
)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'bootstrap': True, 'max_depth': 10, 'n_estimators': 200}
Best Recall Score: 0.8368223877228615
Tuned Random Forest Performance:
              precision    recall  f1-score   support

           0       0.79      0.53      0.64       279
           1       0.63      0.85      0.72       260

    accuracy                           0.68       539
   macro avg       0.71      0.69      0.68       539
weighted avg       0.71      0.68      0.68       539



In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Initializing the SVM model with a linear kernel.
# No gamma is passed: gamma is the coefficient of the rbf/poly/sigmoid kernels
# and has no effect on a linear kernel, so setting it here would only mislead.
svm_model = SVC(kernel='linear', C=1, random_state=42)

# Training the model
svm_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred_svm = svm_model.predict(X_test)

# Evaluating the model: overall accuracy plus the per-class report
print("SVM Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm)}")
print(classification_report(y_test, y_pred_svm))

SVM Performance:
Accuracy: 0.725417439703154
              precision    recall  f1-score   support

           0       0.77      0.67      0.72       279
           1       0.69      0.78      0.73       260

    accuracy                           0.73       539
   macro avg       0.73      0.73      0.73       539
weighted avg       0.73      0.73      0.72       539



In [11]:
# Shared NLTK resources used by preprocess_text: English stop words and a WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize a raw string into a space-separated sequence of lemmas.

    Steps: lowercase the text, delete every character that is not an ASCII
    letter, a digit or whitespace, tokenize with NLTK's word_tokenize, drop
    English stop words, lemmatize each remaining token and join the result
    with single spaces.
    """
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    kept_tokens = (tok for tok in word_tokenize(alnum_only) if tok not in stop_words)
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept_tokens)

In [60]:
def predict_subreddit(input_text, model, vectorizer):
    """Predict which subreddit a piece of text belongs to.

    Args:
        input_text: Raw text; it is cleaned with preprocess_text before
            being vectorized.
        model: Fitted classifier exposing predict(); label 0 is reported as
            'askscience' and any other label as 'explainlikeimfive'.
        vectorizer: Fitted vectorizer exposing transform(); it is applied to
            a one-element list holding the cleaned text.

    Returns:
        'askscience' or 'explainlikeimfive'.
    """
    input_text_cleaned = preprocess_text(input_text)

    # Applying TF-IDF transformation (a single document, hence a single row)
    input_vectorized = vectorizer.transform([input_text_cleaned])

    # predict() yields one label per input row. Exactly one row was passed in,
    # so take that element explicitly rather than comparing the whole result
    # to 0 and relying on the truthiness of a one-element array.
    predicted_label = model.predict(input_vectorized)[0]

    # Mapping the prediction back to subreddit name
    return 'askscience' if predicted_label == 0 else 'explainlikeimfive'
# Input question (classified here with the baseline Random Forest, rf_model)
input_question = "Calculate the energy levels and wavelengths of light emitted or absorbed by a hydrogen atom, using the Bohr model or the Schrödinger equation."
predicted_subreddit = predict_subreddit(input_question, rf_model, tfidf_vectorizer)
print(f"The question belongs to subreddit: {predicted_subreddit}")

The question belongs to subreddit: askscience
