## Random Forest Classifier (RFC) Testing

The first step is to import the data and generate the cross-validation splits.

In [2]:
import sys
sys.path.append('../')

from prep import *
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

# Load data
# NOTE: `pd` and `np` are not imported explicitly in this notebook; they are
# expected to be provided by `from prep import *`.
df = pd.read_csv('../../Datasets/train.csv', encoding='cp1252')

# Do some basic cleaning
df = prep_data(df)

# Outer cross-validation: 5 stratified, shuffled folds with a fixed seed
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define the pipeline: TF-IDF features fed to a one-vs-rest random forest
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=3000)),
    ('classifier', OneVsRestClassifier(RandomForestClassifier(random_state=42))),
])

# Hyperparameter grid for the inner search (2 * 2 * 3 = 12 candidates).
# NOTE(review): the saved cell output reports 24 candidates and a
# `vectorizer__max_features` entry, which this grid cannot produce -- the
# output looks stale and the cell should be re-run.
parameters = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__estimator__n_estimators': [50, 100],
    'classifier__estimator__max_depth': [20, 30, 50],
}

# Loop-invariant configuration, defined once instead of once per fold.
subreddits = ['Toronto', 'London', 'Paris', 'Montreal']
# Integer class ids follow the order of `subreddits`:
# Toronto -> 0, London -> 1, Paris -> 2, Montreal -> 3
label_ids = {name: idx for idx, name in enumerate(subreddits)}

# Per-fold validation accuracies. Kept under its own name so that `test_acc`
# is only ever the final mean and re-running the cell is safe.
fold_acc = []

# Iterate through the outer folds
for train_index, val_index in kf.split(df['body'], df['subreddit']):
    # Split data. `val` is copied because its 'body' column is overwritten
    # below; assigning into a slice of `df` raises SettingWithCopyWarning.
    train = df.iloc[train_index]
    val = df.iloc[val_index].copy()

    # Reduce features using the training fold only, so nothing from the
    # validation fold influences the selected vocabulary.
    train, _ = remove_common_words(train, subreddits, 300)
    train = mutual_info_transform(train, 3250)
    train, vocab = remove_common_words(train, subreddits, 25)

    # Remove words not in vocab from val. The vocabulary is turned into a set
    # once per fold so each per-token membership test is a hash lookup.
    # (assumes `vocab` is an iterable of word strings -- confirm in prep.py)
    vocab_lookup = set(vocab)
    val['body'] = val['body'].apply(
        lambda text: ' '.join(word for word in text.split() if word in vocab_lookup)
    )

    # Split into X and y, encoding subreddit names as integer class ids
    X_train = train['body']
    y_train = train['subreddit'].map(label_ids)
    X_val = val['body']
    y_val = val['subreddit'].map(label_ids)

    # Inner 5-fold grid search on the training fold
    grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)

    # Evaluate the refitted best estimator on the held-out validation fold
    fold_acc.append(grid_search.score(X_val, y_val))

    # Print accuracy
    print(f'Accuracy: {fold_acc[-1]}')

# Mean validation accuracy across the outer folds
test_acc = np.mean(fold_acc)
print(f'Average accuracy: {test_acc}')

Fitting 5 folds for each of 24 candidates, totalling 120 fits
{'classifier__estimator__max_depth': 30, 'classifier__estimator__n_estimators': 100, 'vectorizer__max_features': 3000, 'vectorizer__ngram_range': (1, 1)}
Accuracy: 0.6736111111111112
Fitting 5 folds for each of 24 candidates, totalling 120 fits
{'classifier__estimator__max_depth': 50, 'classifier__estimator__n_estimators': 50, 'vectorizer__max_features': 3000, 'vectorizer__ngram_range': (1, 1)}
Accuracy: 0.6944444444444444
Fitting 5 folds for each of 24 candidates, totalling 120 fits
{'classifier__estimator__max_depth': 50, 'classifier__estimator__n_estimators': 100, 'vectorizer__max_features': 3000, 'vectorizer__ngram_range': (1, 1)}
Accuracy: 0.7083333333333334
Fitting 5 folds for each of 24 candidates, totalling 120 fits
{'classifier__estimator__max_depth': 30, 'classifier__estimator__n_estimators': 100, 'vectorizer__max_features': 3000, 'vectorizer__ngram_range': (1, 1)}
Accuracy: 0.6527777777777778
Fitting 5 folds for e

In [None]:
# Retrain on full dataset and test on Kaggle test set.
# Relies on `df`, `pipeline`, and the `prep` helpers from the cell above.
kaggle_test = pd.read_csv('../../Datasets/Kaggle/test.csv', encoding='cp1252')
kaggle_test = prep_data(kaggle_test)
# Keep the cleaned text as it was before vocabulary filtering, for the
# comparison file written at the end of this cell.
test_body = kaggle_test['body'].copy()

# Despite its name, `test_df` holds the full *training* set; it is copied so
# the feature-reduction steps below operate on a separate frame from `df`.
test_df = df.copy()

# Reduce features based on mutual information
subreddits = ['Toronto', 'London', 'Paris', 'Montreal']
test_df, _ = remove_common_words(test_df, subreddits, 300)
test_df = mutual_info_transform(test_df, 3250)
test_df, vocab = remove_common_words(test_df, subreddits, 25)

# Remove words not in vocab from kaggle test set. The vocabulary is turned
# into a set once so each per-token membership test is a hash lookup.
# (assumes `vocab` is an iterable of word strings -- confirm in prep.py)
vocab_lookup = set(vocab)
kaggle_test['body'] = kaggle_test['body'].apply(
    lambda text: ' '.join(word for word in text.split() if word in vocab_lookup)
)

# Split into X and y. Labels are left as subreddit names here (no integer
# mapping), so the predictions below are subreddit names as well.
X_train = test_df['body']
y_train = test_df['subreddit']

# Fit on full training set.
# NOTE(review): this fits `pipeline` with the hyperparameters it was
# constructed with; none of the grid-search `best_params_` are applied here.
# Confirm whether the tuned settings were meant to be used for the final fit.
pipeline.fit(X_train, y_train)

# Make predictions on test set
X_test = kaggle_test['body']
y_pred = pipeline.predict(X_test)

# Make submission file
submission = generate_kaggle_submission(kaggle_test, y_pred)

# Compare result to the best results
best_result = pd.read_csv('../../Datasets/Kaggle/best_score.csv')

# Make dataframe that has the body and the two predictions.
# NOTE(review): `test_body` and `best_result['subreddit']` are Series and are
# aligned on their index labels, while `y_pred` is positional. This presumes
# prep_data() leaves a default 0..n-1 index and best_score.csv is in the same
# row order as test.csv -- verify.
compare = pd.DataFrame({'body': test_body, 'y_pred': y_pred, 'y_best': best_result['subreddit']})

# Remove the rows where the two predictions are the same
compare = compare[compare['y_pred'] != compare['y_best']]

# Output the dataframe to a csv file
compare.to_csv('compare.csv', index=False)