# Train classification model

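Trains and tests a random forest model (and, for comparison, a logistic regression model) on page data, after splitting the pages into training and test sets by volume so that all pages from a given volume fall in the same set.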

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, cross_val_score, GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the page feature matrix from a CSV file in the working directory.
pages = pd.read_csv('newfeaturematrix.csv')

In [3]:
# List the columns: the feature columns plus 'label' (target) and 'htid' (grouping key).
pages.columns

Index(['pagenum', 'pagefrac', 'backnum', 'backfrac', 'nlines', 'nwords',
       'nalpha', 'fracalpha', 'nnumeric', 'fracnumeric', 'npunct', 'fracpunct',
       'nupper', 'fracupper', 'nother', 'fracother', 'meanlinelen',
       'sdlinelen', 'meanwordlength', 'startupper', 'verbs', 'top2000words',
       'paratextwords', 'byofwords', 'fracprice', 'label', 'nwordsminusmean',
       'wordlengthminusmean', 'linelenminusmean', 'top2000minusmean',
       'nwordsminusprev', 'top2000minusprev', 'centerdist', 'centerdist^2',
       'pagefrac^2', 'backfrac^2', 'htid'],
      dtype='object')

In [4]:
# Split the pages into train and test sets by volume (htid), so that every
# page of a given htid lands on the same side of the split.

# Fixed seed: without it every kernel restart draws a different split, and
# the scores and files saved further down cannot be regenerated.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8
split_rng = np.random.default_rng(SPLIT_SEED)

# Get unique htids in pages
unique_htids = pages['htid'].unique().tolist()

# Calculate the number of htids for training and testing
num_train_htids = int(len(unique_htids) * TRAIN_FRACTION)
num_test_htids = len(unique_htids) - num_train_htids

# Randomly select htids for training; every remaining htid goes to test.
# The list comprehension keeps test_htids in a deterministic order.
train_htids = split_rng.choice(unique_htids, size=num_train_htids, replace=False)
train_htid_set = set(train_htids)
test_htids = [htid for htid in unique_htids if htid not in train_htid_set]

# Divide the pages dataframe into pages_train and pages_test
pages_train = pages[pages['htid'].isin(train_htids)]
pages_test = pages[pages['htid'].isin(test_htids)]

# Sanity checks: the htid sets have the intended sizes and every page ended
# up in exactly one of the two frames.
assert len(test_htids) == num_test_htids
assert len(pages_train) + len(pages_test) == len(pages)

# Delete the original pages dataframe to free memory.
# Note: this cell cannot be re-run without first re-running the load cell.
del pages

In [5]:
# Pull the target ('label') and the grouping key ('htid') out of the training
# frame, then keep only the remaining feature columns in pages_train.
labels_train = pages_train['label']
htids_train = pages_train['htid']
pages_train = pages_train.drop(columns=['label', 'htid'])

In [6]:
# Tune the random forest with grouped cross-validation on the training set.

# Create the RandomForestClassifier. The seed is fixed so that the selected
# parameters and the reported cross-validation score are reproducible.
clf = RandomForestClassifier(random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [200, 250],
    'max_depth': [None, 50],
    'min_samples_split': [2, 3]
}

# Create the GroupKFold object; with groups=htids_train below, all pages of
# one htid stay in the same fold.
group_kfold = GroupKFold(n_splits=5)

# Create the GridSearchCV object
grid_search = GridSearchCV(clf, param_grid, cv=group_kfold, n_jobs=-1)

# Perform grid search with grouped cross-validation
grid_search.fit(pages_train, labels_train, groups=htids_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters found: {'max_depth': 50, 'min_samples_split': 2, 'n_estimators': 250}
Best cross-validation score: 0.9705630018644446


In [7]:
# Pull the target ('label') and the grouping key ('htid') out of the test
# frame, then keep only the remaining feature columns in pages_test.
labels_test = pages_test['label']
htids_test = pages_test['htid']
pages_test = pages_test.drop(columns=['label', 'htid'])

In [8]:
# Now we train a model using the best parameters found by grid search
# on the train dataset, and evaluate it on the test dataset.

import os
import joblib

# Fixed seed so the saved model and its test score can be regenerated.
clf = RandomForestClassifier(random_state=42, **grid_search.best_params_)
clf.fit(pages_train, labels_train)
test_score = clf.score(pages_test, labels_test)
print("Test score:", test_score)

# Make sure the output directory exists, so a fresh checkout does not fail
# here after the expensive grid search and fit.
os.makedirs('models', exist_ok=True)

# Save the model
joblib.dump(clf, 'models/RF_model4.pkl')

# Save the test score
with open('models/test_score4.txt', 'w') as f:
    f.write(str(test_score))

# Save the best parameters
with open('models/best_params4.txt', 'w') as f:
    f.write(str(grid_search.best_params_))

# Predicted labels and class probabilities for the held-out test pages
predictions = clf.predict(pages_test)
probabilities = clf.predict_proba(pages_test)

# Same accuracy as computed above; reuse it instead of scoring the whole
# test set a second time.
print(test_score)


Test score: 0.9834111141613396
0.9834111141613396


In [9]:
# Now we use grid_search.best_params_ to infer probabilities
# for the training dataset, through cross-validation.

# Use a separate name for the cross-validation estimator so that `clf` keeps
# referring to the fitted model rather than being rebound to an unfitted one.
# The seed is fixed so the out-of-fold probabilities are reproducible.
cv_clf = RandomForestClassifier(random_state=42, **grid_search.best_params_)
train_probabilities = cross_val_predict(
    cv_clf,
    pages_train,
    labels_train,
    groups=htids_train,
    cv=group_kfold,
    method='predict_proba',
)

In [10]:
# Assemble the per-page results for the training set: page number, word
# count, htid, true label, and the cross-validated probability.
# Assumes the second probability column corresponds to 'text' -- TODO confirm
# against the sorted class labels.
df_train_results = (
    pages_train[['pagenum', 'nwords']]
    .rename(columns={'nwords': 'wordcount'})
    .assign(
        htid=htids_train,
        label=labels_train,
        probabilities=train_probabilities[:, 1],
    )
)

df_train_results.head()

Unnamed: 0,pagenum,wordcount,htid,label,probabilities
0,0,0,uc2.ark+=13960=t1mg7h137,para,0.0
1,1,0,uc2.ark+=13960=t1mg7h137,para,0.0
2,2,0,uc2.ark+=13960=t1mg7h137,para,0.0
3,3,0,uc2.ark+=13960=t1mg7h137,para,0.004
4,4,0,uc2.ark+=13960=t1mg7h137,para,0.036


In [11]:
# Write the training-set results to CSV, without the index column.
# NOTE(review): the other files saved for this model use the suffix "4"
# (RF_model4.pkl, test_score4.txt); confirm that "3" is intended here.
df_train_results.to_csv('models/train_results3.csv', index=False)

In [12]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply it to those same
# features to get the standardized training matrix X.
scaler = StandardScaler().fit(pages_train)
X = scaler.transform(pages_train)

In [17]:
# Tune the regularization strength C of a logistic regression on the scaled
# training features, using the same grouped cross-validation folds.
# Note: this rebinds `param_grid` and `grid_search`, which previously held
# the random forest search.
param_grid = {'C': [0.1, 1, 10, 50, 100, 150, 200, 300, 400, 500]}
logreg = LogisticRegression()
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=group_kfold,
    n_jobs=-1,
)
grid_search.fit(X, labels_train, groups=htids_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters found: {'C': 200}
Best cross-validation score: 0.9692771561182033


In [21]:
# Now we train a logistic regression using the best parameters found by
# grid search on the train dataset, and evaluate it on the test dataset.

logreg = LogisticRegression(**grid_search.best_params_)
logreg.fit(X, labels_train)
# Apply the scaler fitted on the training data; it is never refit on test data.
X_test = scaler.transform(pages_test)
test_score = logreg.score(X_test, labels_test)
print("Test score:", test_score)

# Save the model. This must be `logreg`: dumping `clf` here would write the
# random forest estimator into the logistic regression model file.
# NOTE(review): the model expects inputs transformed by `scaler`, which is
# not persisted here -- consider saving it alongside the model.
import joblib
joblib.dump(logreg, 'models/LR_model2.pkl')

# Save the test score
with open('models/test_score2.txt', 'w') as f:
    f.write(str(test_score))

# Save the best parameters
with open('models/best_params2.txt', 'w') as f:
    f.write(str(grid_search.best_params_))

# Predicted labels and class probabilities from logreg for the test pages
LRpredictions = logreg.predict(X_test)
LRprobabilities = logreg.predict_proba(X_test)


Test score: 0.9469058511919095


In [22]:
# Cross-validated class probabilities for the training pages, using a fresh
# logistic regression with the best parameters and the grouped folds.
logreg = LogisticRegression(**grid_search.best_params_)
train_probabilities = cross_val_predict(
    estimator=logreg,
    X=X,
    y=labels_train,
    groups=htids_train,
    cv=group_kfold,
    method='predict_proba',
)

In [23]:
# Assemble the per-page results for the training set: page number, word
# count, htid, true label, and the cross-validated probability.
# Assumes the second probability column corresponds to 'text' -- TODO confirm
# against the sorted class labels.
df_train_results = (
    pages_train[['pagenum', 'nwords']]
    .rename(columns={'nwords': 'wordcount'})
    .assign(
        htid=htids_train,
        label=labels_train,
        probabilities=train_probabilities[:, 1],
    )
)

df_train_results.head()

Unnamed: 0,pagenum,wordcount,htid,label,probabilities
178,0,5,nyp.33433071387207,para,0.107463
179,1,0,nyp.33433071387207,para,0.003639
180,2,0,nyp.33433071387207,para,0.006158
181,3,0,nyp.33433071387207,para,0.007745
182,4,0,nyp.33433071387207,para,0.056439


In [24]:
# Write the logistic regression training-set results to CSV, without the index column.
df_train_results.to_csv('models/train_results2.csv', index=False)