## Testing

The first step is to load the data and generate the train/test splits.

In [None]:
from sklearn.model_selection import train_test_split

# Load dataset
import pandas as pd
df = pd.read_csv('Datasets/train_cleaned.csv')

# Split dataset into training and testing.
# Features are the text in the 'body' column; targets are the subreddit names
# encoded as integers so every model in this notebook sees the same label ids.
X_df = df['body']
label_to_id = {'Toronto': 0, 'London': 1, 'Paris': 2, 'Montreal': 3}
y_df = df['subreddit'].map(label_to_id)

# Series.map() turns any value that is missing from the dict into NaN instead
# of raising, so fail loudly here rather than in the stratified split below.
unmapped_labels = df.loc[y_df.isna(), 'subreddit'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected subreddit labels in training data: {unmapped_labels.tolist()}"
    )

# Stratify on the label so the split keeps the class proportions;
# the fixed seed makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, stratify=y_df, random_state=42)

In [None]:
# Print a nice confusion matrix between y_pred and y_test
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Create function to plot a confusion matrix
def plot_conf_mat(conf_mat, labels=None):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    Parameters
    ----------
    conf_mat : array-like of shape (n_classes, n_classes)
        Matrix of counts to display. The axes are labelled on the assumption
        that rows are true labels and columns are predicted labels.
    labels : sequence of str, optional
        Class names used as tick labels on both axes, in the same order as
        the rows/columns of ``conf_mat``. When omitted, Seaborn's automatic
        tick labels are used.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    # 'auto' is Seaborn's default, so omitting `labels` keeps the default ticks.
    tick_labels = 'auto' if labels is None else list(labels)
    # Draw explicitly on the axes created above instead of relying on
    # pyplot's implicit "current axes" state.
    sns.heatmap(conf_mat,
                annot=True,  # Annotate the boxes
                cbar=False,
                fmt='g',     # Plain number format for the cell counts
                cmap='Blues',
                xticklabels=tick_labels,
                yticklabels=tick_labels,
                ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    plt.show()

Multinomial Naive Bayes Classifier

In [None]:
# Perform a thorough grid search with TfidfVectorizer and Multinomial Naive Bayes
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

# Define the pipeline with TfidfVectorizer and Multinomial Naive Bayes
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

# Define the hyperparameters to search
parameters = {
    'vectorizer__max_features': [3000, 5000],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],  # Consider different n-gram ranges
    'classifier__alpha': [0.01, 0.5, 1.0],  # Smoothing parameter for Naive Bayes
    'classifier__fit_prior': [True, False],  # Whether to learn class prior probabilities
}

# Initialize GridSearchCV (5-fold cross-validation, all CPU cores)
grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Assign model to a variable
best_mnb = grid_search.best_estimator_

# Print the best parameters and the corresponding accuracies.
# "Best Accuracy" is the mean cross-validated score of the winning candidate;
# "Test Accuracy" is measured on the held-out split.
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy: ", test_accuracy)

# Print distribution of y_pred
y_pred = grid_search.predict(X_test)
print('Class Distribution')
print(pd.Series(y_pred).value_counts(normalize=True))

conf_mat = confusion_matrix(y_test, y_pred)
plot_conf_mat(conf_mat)

Logistic Regression Classifier

In [None]:
# Perform a thorough grid search with CountVectorizer and Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Define the pipeline with CountVectorizer and Logistic Regression
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000))  # Increase max_iter if needed
])

# Define the hyperparameters to search
parameters = {
    'vectorizer__max_features': [3000, 5000],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],  # Consider different n-gram ranges
    'classifier__C': [0.1, 1.0, 10.0],  # Inverse of regularization strength
}

# Initialize GridSearchCV (5-fold cross-validation, all CPU cores)
grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Assign model to a variable
best_lr = grid_search.best_estimator_

# Print the best parameters and the corresponding accuracies.
# "Best Accuracy" is the mean cross-validated score of the winning candidate;
# "Test Accuracy" is measured on the held-out split.
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy: ", test_accuracy)

# Print distribution of y_pred
y_pred = grid_search.predict(X_test)
print('Class Distribution')
print(pd.Series(y_pred).value_counts(normalize=True))

conf_mat = confusion_matrix(y_test, y_pred)
plot_conf_mat(conf_mat)

Multi-Layer Perceptron

In [None]:
# Train a MLP
# This experiment used to be disabled by wrapping it in a triple-quoted string,
# which made the cell echo its own source as output. It is now real code behind
# an explicit switch: leave RUN_MLP as False to skip it on "Run All", or set it
# to True to run the grid search.
RUN_MLP = False

if RUN_MLP:
    from sklearn.neural_network import MLPClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV

    # Define the pipeline with TfidfVectorizer and Multilayer Perceptron.
    # The MLP is seeded so repeated runs give the same weights and scores.
    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(max_features=3000)),
        ('classifier', MLPClassifier(random_state=42))
    ])

    # Define the hyperparameters to search
    parameters = {
        'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],  # Consider different n-gram ranges
        'classifier__hidden_layer_sizes': [(100,), (300, 150), (200, 100), (200, 100, 50), (100, 50, 25)],  # Vary hidden layer sizes
        'classifier__alpha': [0.001],  # L2 regularization strength
    }

    # Initialize GridSearchCV
    grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

    # Perform the grid search
    grid_search.fit(X_train, y_train)

    # Assign model to a variable
    best_mlp = grid_search.best_estimator_

    # Print the best parameters and corresponding accuracy
    print("Best Parameters: ", grid_search.best_params_)
    print("Best Accuracy: ", grid_search.best_score_)
    test_accuracy = grid_search.score(X_test, y_test)
    print("Test Accuracy: ", test_accuracy)

    # Print distribution of y_pred
    y_pred = grid_search.predict(X_test)
    print('Class Distribution')
    print(pd.Series(y_pred).value_counts(normalize=True))

    conf_mat = confusion_matrix(y_test, y_pred)
    plot_conf_mat(conf_mat)

SVM

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer

svm_model = SVC()

# Define the pipeline: TF-IDF features fed into an SVC with its default kernel
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', SVC())
])

# Hyperparameters to search.
# NOTE: 'classifier__degree' was removed from the grid. SVC() uses the default
# RBF kernel here, and `degree` only affects the polynomial kernel, so searching
# over it fitted every candidate three times with identical results.
# To tune the degree, also set/search 'classifier__kernel': ['poly'].
parameters = {
    'vectorizer__max_features': [3000, 5000],
    'vectorizer__ngram_range': [(1, 2), (1, 3), (1, 4)],  # Consider different n-gram ranges
    'classifier__C': [10, 20, 30],  # Regularization parameter
    # Per-class weights keyed by the integer label ids; the last two options
    # up-weight class 2.
    'classifier__class_weight': [{0: 1, 1: 1, 2: 1, 3: 1}, {0: 1, 1: 1, 2: 2, 3: 1}, {0: 1, 1: 1, 2: 10, 3: 1}]
}

# Initialize GridSearchCV (5-fold cross-validation, all CPU cores)
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Assign model to a variable
best_svm = grid_search.best_estimator_

# Print the best parameters and the corresponding accuracies.
# "Best Accuracy" is the mean cross-validated score of the winning candidate;
# "Test Accuracy" is measured on the held-out split.
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy: ", test_accuracy)

# Print distribution of y_pred
y_pred = grid_search.predict(X_test)
print('Class Distribution')
print(pd.Series(y_pred).value_counts(normalize=True))

conf_mat = confusion_matrix(y_test, y_pred)
plot_conf_mat(conf_mat)

Bernoulli Naive Bayes Classifier

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes wrapped one-vs-rest, fed by TF-IDF features
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', OneVsRestClassifier(estimator=BernoulliNB())),
])

# Search space: vocabulary size, n-gram span and the NB smoothing strength
param_grid = dict(
    vectorizer__max_features=[1000, 3000, 5000],
    vectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    classifier__estimator__alpha=[0.001, 0.5, 1.0],
)

# Build the 5-fold accuracy search and run it on the training split
# (fit() returns the search object itself)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Keep the refitted winner around under its own name
best_bnb = grid_search.best_estimator_

# Report cross-validated accuracy next to held-out accuracy
test_accuracy = grid_search.score(X_test, y_test)
print("Best Accuracy: ", grid_search.best_score_)
print("Test Accuracy: ", test_accuracy)

# Share of test predictions falling into each class
y_pred = grid_search.predict(X_test)
predicted_share = pd.Series(y_pred).value_counts(normalize=True)
print('Class Distribution')
print(predicted_share)

# Confusion matrix of true vs. predicted labels on the held-out split
conf_mat = confusion_matrix(y_test, y_pred)
plot_conf_mat(conf_mat)

Random Forest

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Define the pipeline: TF-IDF features fed into one random forest per class
# (one-vs-rest). The forest is seeded so the grid search, the selected model
# and the feature importances are reproducible across kernel restarts.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', OneVsRestClassifier(RandomForestClassifier(random_state=42)))
])

# Hyperparameters to search; 'classifier__estimator__*' reaches through the
# one-vs-rest wrapper to the underlying random forest.
param_grid = {
    'vectorizer__max_features': [500, 1000, 3000],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__estimator__n_estimators': [50, 100, 200],
    'classifier__estimator__max_depth': [10, 20, 30, 50],
}

# Create GridSearchCV instance (5-fold cross-validation, all CPU cores)
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Assign model to a variable
best_rfc = grid_search.best_estimator_

# Print the best parameters and the corresponding accuracies.
# "Best Accuracy" is the mean cross-validated score of the winning candidate;
# "Test Accuracy" is measured on the held-out split.
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy: ", test_accuracy)

# Print distribution of y_pred
y_pred = grid_search.predict(X_test)
print('Class Distribution')
print(pd.Series(y_pred).value_counts(normalize=True))

conf_mat = confusion_matrix(y_test, y_pred)
plot_conf_mat(conf_mat)

In [None]:
# Get the 20 most important features of each one-vs-rest random forest.
# A notebook cell only displays its last expression, so the per-class rankings
# are collected into one side-by-side table instead of four separate
# `.head(20)` calls (of which only the last one would be shown).
rf_vectorizer = best_rfc.named_steps['vectorizer']
rf_one_vs_rest = best_rfc.named_steps['classifier']
feature_names = rf_vectorizer.get_feature_names_out()

# Inverse of the label encoding used when the training data was loaded.
id_to_label = {0: 'Toronto', 1: 'London', 2: 'Paris', 3: 'Montreal'}

top_features_by_class = {}
# estimators_ holds one fitted forest per entry of classes_, in the same order.
for class_id, forest in zip(rf_one_vs_rest.classes_, rf_one_vs_rest.estimators_):
    ranked = (
        pd.DataFrame({'feature': feature_names, 'importance': forest.feature_importances_})
        .sort_values(by='importance', ascending=False)
        .head(20)
        .reset_index(drop=True)  # rank 0..19 so the per-class tables align row-wise
    )
    top_features_by_class[id_to_label.get(int(class_id), class_id)] = ranked

# Columns become (class name, 'feature' / 'importance'); one row per rank.
top_features_df = pd.concat(top_features_by_class, axis=1)
top_features_df

Now we will refit the best-performing model (the SVM) on the full training data and use it to predict the Kaggle test set.

In [86]:
# Load a model.
# clone() gives a fresh, unfitted copy with the same hyperparameters.
# A plain `best_model = best_svm` would only alias the object, so the fit below
# would silently overwrite the model whose test accuracy was reported earlier.
from sklearn.base import clone

best_model = clone(best_svm)

# Fit the model on the full labelled data (train + held-out split)
best_model.fit(X_df, y_df)

# Load the Kaggle test set
kaggle_test = pd.read_csv('Datasets/test_cleaned.csv')

# Make predictions on the Kaggle test set
kaggle_test_pred = best_model.predict(kaggle_test['body'])

kaggle_test_dict = {
    'id': kaggle_test['id'],
    'subreddit': kaggle_test_pred
}

kaggle_test_df = pd.DataFrame(kaggle_test_dict)
# Translate the integer class ids back into subreddit names for the submission
kaggle_test_df['subreddit'] = kaggle_test_df['subreddit'].map({0: 'Toronto', 1: 'London', 2: 'Paris', 3: 'Montreal'})

# Save the predictions to a CSV file
kaggle_test_df.to_csv('Datasets/kaggle_test_predictions.csv', index=False)

# See distribution of predicted classes in the Kaggle test set
kaggle_test_df['subreddit'].value_counts(normalize=True)

subreddit
London      0.275986
Toronto     0.258065
Paris       0.250896
Montreal    0.215054
Name: proportion, dtype: float64