# Train classification model

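Trains and tests a random forest model on page-level data. Pages are split into training and test sets by volume (the `htid` column), so that all pages from a given volume fall on the same side of the split.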

In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, cross_val_score, GridSearchCV


In [19]:
# Read the feature matrix CSV into the `pages` dataframe.
feature_matrix_csv = 'featurematrix.csv'
pages = pd.read_csv(feature_matrix_csv)

In [20]:
# Show the column labels of `pages`. As the cell's last expression, the
# notebook renders the resulting Index as the cell output.
pages.columns

Index(['pagenum', 'pagefrac', 'backnum', 'backfrac', 'nlines', 'nwords',
       'nalpha', 'fracalpha', 'nnumeric', 'fracnumeric', 'npunct', 'fracpunct',
       'nupper', 'fracupper', 'nother', 'fracother', 'meanlinelen',
       'sdlinelen', 'meanwordlength', 'startupper', 'top20words',
       'top2000words', 'paratextwords', 'label', 'nwordsminusmean',
       'wordlengthminusmean', 'linelenminusmean', 'top2000minusmean',
       'nwordsminusprev', 'top2000minusprev', 'centerdist', 'centerdist^2',
       'pagefrac^2', 'backfrac^2', 'htid'],
      dtype='object')

In [21]:
# Split the pages into train and test sets by htid: each htid is assigned
# wholesale to one side, so no htid contributes pages to both sets.

# Fixed seed so the split is identical on every Restart & Run All; with the
# unseeded global RNG the held-out htids change from run to run.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# Get unique htids in pages (in order of first appearance)
unique_htids = pages['htid'].unique().tolist()

# Calculate the number of htids for training (80%) and testing (the rest)
num_train_htids = int(len(unique_htids) * 0.8)
num_test_htids = len(unique_htids) - num_train_htids

# Randomly select htids for training, without replacement
train_htids = split_rng.choice(unique_htids, size=num_train_htids, replace=False)

# Every htid not drawn for training is held out for testing. Built as an
# ordered list (rather than a set difference) so its order is reproducible too.
train_htid_set = set(train_htids)
test_htids = [htid for htid in unique_htids if htid not in train_htid_set]

# Divide the pages dataframe into pages_train and pages_test
pages_train = pages[pages['htid'].isin(train_htids)]
pages_test = pages[pages['htid'].isin(test_htids)]

# Sanity check: every page must land in exactly one of the two sets.
assert len(pages_train) + len(pages_test) == len(pages), (
    "train/test split does not cover every page exactly once"
)

# Delete the original pages dataframe to free memory; note this cell cannot be
# re-run afterwards without re-running the cell that loads `pages`.
del pages

In [22]:
# Pull the target column and the htid column out of the training frame, then
# keep only the remaining columns in `pages_train` as the model features.
htids_train = pages_train['htid']
labels_train = pages_train['label']
pages_train = pages_train.drop(columns=['label', 'htid'])

In [24]:
# Create the RandomForestClassifier.
# random_state is fixed so that bootstrap sampling and per-split feature
# selection are repeatable; without it the best parameters and the
# cross-validation score reported below can differ from run to run.
clf = RandomForestClassifier(random_state=42)

# Define the parameter grid for hyperparameter tuning
# (2 x 2 x 2 = 8 candidate settings, each fitted once per CV fold)
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [None, 40],
    'min_samples_split': [2, 3]
}

# Create the GroupKFold object: folds are formed from whole groups, so pages
# sharing an htid never appear in both the fitting and validation part of a fold
group_kfold = GroupKFold(n_splits=5)

# Create the GridSearchCV object.
# No `scoring` is given, so candidates are ranked by the estimator's own
# `score` method (mean accuracy for a classifier). n_jobs=-1 runs the fits in
# parallel on all available cores.
grid_search = GridSearchCV(clf, param_grid, cv=group_kfold, n_jobs=-1)

# Perform grid search with grouped cross-validation; `groups` is forwarded to
# GroupKFold so the folds respect htid boundaries
grid_search.fit(pages_train, labels_train, groups=htids_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters found: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation score: 0.9850203960968479
