## Testing

The first step is to load the data and split it into training and test sets.

In [2]:
# Load the cleaned dataset; the columns used below are 'body' (input text)
# and 'subreddit' (class label).
import pandas as pd
df = pd.read_csv('Datasets/train_cleaned.csv')

# Split dataset into training and testing
from sklearn.model_selection import train_test_split
X = df['body']
y = df['subreddit']
# Encode the four subreddit names as integer class ids.
y = y.map({'Toronto': 0, 'London': 1, 'Montreal': 2, 'Paris': 3})

# Series.map() turns every label that is missing from the dict into NaN
# instead of raising, so a typo or an unexpected subreddit would only surface
# later as an obscure error inside a classifier. Fail fast and name the
# offending values instead.
unmapped_labels = df.loc[y.isna(), 'subreddit'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unmapped subreddit labels: {list(unmapped_labels)}")

# random_state pins the shuffle so the split is identical on every run;
# the test-set size is left at the scikit-learn default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

We can encode the text in two ways: with `CountVectorizer` (n-gram counts) or with `TfidfVectorizer` (TF-IDF-weighted n-gram counts).

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Count encoding: uni- to tri-grams, vocabulary capped at 5000 terms.
# The vocabulary is learned from the training split only; the test split is
# transformed with that same fitted vocabulary.
vec_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=5000)
vec_x_train = vec_vectorizer.fit_transform(X_train)
vec_x_test = vec_vectorizer.transform(X_test)

# TF-IDF encoding: uni- to tri-grams, vocabulary capped at 3000 terms.
# Same fit-on-train / transform-on-test discipline as above.
tfid_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=3000)
tfid_x_train = tfid_vectorizer.fit_transform(X_train)
tfid_x_test = tfid_vectorizer.transform(X_test)

Naive Bayes Classifier

In [None]:
# Grid search over a CountVectorizer -> Multinomial Naive Bayes pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Two-step pipeline; the step names are the prefixes of the grid keys below,
# so the vectorizer is re-fitted inside every cross-validation fold.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

# Search space: 3 vocabulary sizes x 3 n-gram ranges x 3 smoothing values
parameters = {
    'vectorizer__max_features': [1000, 3000, 5000],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'classifier__alpha': [0.01, 0.5, 1.0],
}

# 5-fold cross-validation on the training split, ranked by accuracy
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

# Accuracy of the selected model on the held-out test split
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy: ", test_accuracy)

Logistic Regression Classifier

In [None]:
# Grid search over a CountVectorizer -> Logistic Regression pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Two-step pipeline; the step names are the prefixes of the grid keys below.
# max_iter is raised to 1000 to give the solver more iterations to converge.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Search space: 3 vocabulary sizes x 3 n-gram ranges x 3 values of C
# (C is the inverse of the regularization strength)
parameters = {
    'vectorizer__max_features': [1000, 3000, 5000],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'classifier__C': [0.1, 1.0, 10.0],
}

# 5-fold cross-validation on the training split, ranked by accuracy
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Winning configuration and its mean cross-validated accuracy
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

# Accuracy of the selected model on the held-out test split
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy: ", test_accuracy)

Multi-Layer Perceptron

In [15]:
# Grid search over a CountVectorizer -> Multi-Layer Perceptron pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Define the pipeline with CountVectorizer and Multilayer Perceptron.
# random_state is fixed because the MLP is stochastic: weight initialisation,
# mini-batch shuffling and the validation split carved out by
# early_stopping=True all draw from it. Without a seed the scores below
# change on every run. 42 matches the seed used for the train/test split.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MLPClassifier(
        solver='adam',
        learning_rate='adaptive',
        early_stopping=True,
        batch_size=32,
        random_state=42,
    ))
])

# Define the hyperparameters to search
parameters = {
    'classifier__hidden_layer_sizes': [(25,), (50,), (75,), (5, 10, 5)],  # Number of neurons in each hidden layer
}

# Initialize GridSearchCV (only 2 folds, which keeps the number of MLP fits low)
grid_search = GridSearchCV(pipeline, parameters, cv=2, scoring='accuracy', verbose=1)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding mean cross-validated accuracy
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

# Evaluate the selected model on the held-out test split
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy: ", test_accuracy)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
Best Parameters:  {'classifier__hidden_layer_sizes': (50,)}
Best Accuracy:  0.5436045711138648
Test Accuracy:  0.6611111111111111
