# Modeling

In [1]:
# Imports
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter

In [2]:
# Loading in the cleaned and normalized data
df = pd.read_csv('data/reddit_post_cleaned_normalized.csv')

In [3]:
df['body'] = df['body'].fillna('')

In [4]:
# Initializing TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Combining title and body text
df['combined_text'] = df['title'] + ' ' + df['body']

# Applying TF-IDF transformation
X = tfidf_vectorizer.fit_transform(df['combined_text'])

# Checking shape of the transformed matrix
print(f"TF-IDF Matrix Shape: {X.shape}")

TF-IDF Matrix Shape: (2692, 1000)


In [5]:
from sklearn.model_selection import train_test_split

# Defining the target variable, encoding subreddits
y = df['subreddit'].map({'askscience': 0, 'explainlikeimfive': 1})

# Splitting the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Checking the shape of the splits
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (2153, 1000)
Test set size: (539, 1000)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initializing the Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Training the model
lr_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred_lr = lr_model.predict(X_test)

# Evaluating the model
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr)}")
print(classification_report(y_test, y_pred_lr))

Logistic Regression Performance:
Accuracy: 0.6901669758812616
              precision    recall  f1-score   support

           0       0.72      0.66      0.69       279
           1       0.66      0.73      0.69       260

    accuracy                           0.69       539
   macro avg       0.69      0.69      0.69       539
weighted avg       0.69      0.69      0.69       539



In [7]:
from sklearn.ensemble import RandomForestClassifier

# Initializing the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Training the model
rf_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluating the model
print("Random Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print(classification_report(y_test, y_pred_rf))

Random Forest Performance:
Accuracy: 0.6883116883116883
              precision    recall  f1-score   support

           0       0.71      0.67      0.69       279
           1       0.67      0.71      0.69       260

    accuracy                           0.69       539
   macro avg       0.69      0.69      0.69       539
weighted avg       0.69      0.69      0.69       539



In [8]:
from sklearn.model_selection import GridSearchCV

# Defining hyperparameter grid for Logistic Regression
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear']}

# Initializing GridSearchCV
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='accuracy')

# Training with GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Making predictions with the best model
best_lr_model = grid_search.best_estimator_
y_pred_best_lr = best_lr_model.predict(X_test)

# Evaluating the tuned model
print("Tuned Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_lr)}")
print(classification_report(y_test, y_pred_best_lr))

Best Parameters: {'C': 1, 'solver': 'liblinear'}
Best Score: 0.6939167970646954
Tuned Logistic Regression Performance:
Accuracy: 0.6901669758812616
              precision    recall  f1-score   support

           0       0.72      0.65      0.69       279
           1       0.66      0.73      0.69       260

    accuracy                           0.69       539
   macro avg       0.69      0.69      0.69       539
weighted avg       0.69      0.69      0.69       539



In [19]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Initializing the SVM model
svm_model = SVC(kernel='linear', C=1, gamma='scale', random_state=42)

# Training the model
svm_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred_svm = svm_model.predict(X_test)

# Evaluating the model
print("SVM Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm)}")
print(classification_report(y_test, y_pred_svm))

SVM Performance:
Accuracy: 0.6679035250463822
              precision    recall  f1-score   support

           0       0.70      0.63      0.66       279
           1       0.64      0.71      0.67       260

    accuracy                           0.67       539
   macro avg       0.67      0.67      0.67       539
weighted avg       0.67      0.67      0.67       539



In [None]:
# Initializing the lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [None]:
def predict_subreddit(input_text, model, vectorizer):
    input_text_cleaned = preprocess_text(input_text)
    
    # Applying TF-IDF transformation
    input_vectorized = vectorizer.transform([input_text_cleaned])
    
    # Predicting the subreddit
    prediction = model.predict(input_vectorized)
    
    # Mapping the prediction back to subreddit name
    subreddit_pred = 'askscience' if prediction == 0 else 'explainlikeimfive'
    
    return subreddit_pred
# Input question
input_question = "how does economics work"
predicted_subreddit = predict_subreddit(input_question, best_lr_model, tfidf_vectorizer)
print(f"The question belongs to subreddit: {predicted_subreddit}")