## Testing

The first step is to import the data and generate the cross-validation splits.

In [9]:
import sys
sys.path.append('../')

from prep import *
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Read the training CSV (Windows-1252 encoded) and pass it straight through
# the project's prep_data cleaning step.
df = prep_data(pd.read_csv('../../Datasets/train.csv', encoding='cp1252'))

# Outer cross-validation splitter: five shuffled folds, fixed seed so the
# fold assignment is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Model under test: TF-IDF features (capped at 3000 terms) feeding a
# multinomial Naive Bayes classifier.
pipeline = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer(max_features=3000)),
        ('classifier', MultinomialNB()),
    ]
)

# Hyperparameter grid for the inner search (3 * 4 * 2 = 24 candidates).
parameters = dict(
    # n-gram ranges: unigrams only, up to bigrams, up to trigrams
    vectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    # additive smoothing strength for Naive Bayes
    classifier__alpha=[0.01, 0.5, 1.0, 15.0],
    # learn class priors from the data, or assume a uniform prior
    classifier__fit_prior=[True, False],
)

# One accuracy value is appended here per outer fold.
test_acc = list()

# Outer K-fold evaluation: for each fold, select features on the training part
# only, tune hyperparameters with an inner grid search, then score the tuned
# model on the held-out part.

# Loop-invariant settings, defined once instead of on every iteration.
subreddits = ['Toronto', 'London', 'Paris', 'Montreal']
# Integer class ids follow the order of `subreddits` (Toronto=0 ... Montreal=3).
label_map = {name: idx for idx, name in enumerate(subreddits)}
# Feature budget passed to mutual_info_transform.
n_mi_features = 4250

for train_index, val_index in kf.split(df):
    # Take explicit copies so the column assignment below (and any in-place
    # edits the prep helpers might perform) act on independent frames rather
    # than on objects derived from `df`, which avoids SettingWithCopyWarning.
    train = df.iloc[train_index].copy()
    val = df.iloc[val_index].copy()

    # Reduce features using the training part of the fold only.
    train = remove_common_words(train, subreddits)
    train, vocab = mutual_info_transform(train, n_mi_features)

    # Restrict validation text to the selected vocabulary. The set gives O(1)
    # membership tests per token.
    # NOTE(review): assumes iterating `vocab` yields the selected words
    # (list / set / dict keyed by word) -- confirm against prep.
    vocab_lookup = set(vocab)
    val['body'] = val['body'].apply(
        lambda text: ' '.join(word for word in text.split() if word in vocab_lookup)
    )

    # Split into features and integer-encoded labels.
    X_train = train['body']
    y_train = train['subreddit'].map(label_map)
    X_val = val['body']
    y_val = val['subreddit'].map(label_map)

    # Inner 5-fold grid search over `parameters`.
    # NOTE(review): feature selection above runs on the whole training part
    # before this search, so the inner CV scores are computed on features
    # chosen with their own validation rows in view; the outer-fold score
    # recorded below does not have that problem.
    grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Accuracy of the best configuration (refit on the full training part)
    # on the held-out fold.
    test_acc.append(grid_search.score(X_val, y_val))

    # Print accuracy
    print(f'Accuracy: {test_acc[-1]}')

# Collapse the per-fold accuracies into their mean. This deliberately keeps
# the original rebinding of `test_acc` (list -> scalar) so that any later cell
# reading `test_acc` sees the same value as before.
test_acc = np.mean(test_acc)
print(f'Average accuracy: {test_acc}')

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Accuracy: 0.6666666666666666
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Accuracy: 0.6527777777777778
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Accuracy: 0.6597222222222222
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Accuracy: 0.6180555555555556
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Accuracy: 0.6783216783216783
Average accuracy: 0.6551087801087802
