In [1]:
# Logging is configured before the heavier libraries are imported so that
# only WARNING-and-above messages reach the notebook output.
import logging
logging.basicConfig(level=logging.WARNING)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's dark-grid look for every figure drawn in this notebook.
sns.set(style='darkgrid')

# Read the blog corpus from the working directory and ask pandas to parse
# the 'date' column as datetimes while loading.
df = pd.read_csv(filepath_or_buffer='blogtext.csv', parse_dates=['date'])

In [4]:
%%time
#Here we import my custom cleaner script, clean.py. I parallelize the function to run faster here, to work when scaling. 
#Using 3 cores for the task and 1 core to manage the multiprocessing.
import clean
from joblib import Parallel, delayed
df['tokens'] = Parallel(n_jobs=3)(delayed(clean.clean)(line) for line in df['text'])

CPU times: user 1min 8s, sys: 4.83 s, total: 1min 13s
Wall time: 52min 51s


# Machine Learning pre-processing

In [7]:
# Collapse each row's token sequence back into a single space-separated
# string so it can be handed to a text vectorizer as one document.
# Alternative source column kept from an earlier experiment:
# df['token_str'] = df['trigrams'].map(lambda x: ' '.join(x))
df['token_str'] = df['tokens'].map(' '.join)

# Preview the first few joined documents.
df['token_str'].head()

0    info have find page psf file have wait until t...
1    team member drives van der lang mail ruby die ...
2    nader van kernfusie barde mark waterstofbom ho...
3                                            test test
4    thank yahoo have toolbar capture urls popups m...
Name: token_str, dtype: object

In [8]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Encode the string gender labels as integers. The fitted encoder is kept in
# `le` so le.classes_ / le.inverse_transform can map codes back to labels.
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['gender'])

# Features are the space-joined token strings, one document per row.
X = df['token_str']

# Hold out a test set (scikit-learn's default split size).
# random_state pins the shuffle so the split -- and every score computed from
# it -- is reproducible across kernel restarts; stratify=y keeps the class
# ratio identical in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [9]:
from sklearn.metrics import f1_score, balanced_accuracy_score, accuracy_score, classification_report, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams.
#   min_df=5        -> drop terms that occur in fewer than 5 documents
#   max_df=0.67     -> drop terms that occur in more than 67% of documents
#   max_features    -> cap the vocabulary at the 150,000 most frequent terms
#   ngram_range     -> must be a tuple (min_n, max_n); a list is rejected by
#                      parameter validation in recent scikit-learn releases
tfid = TfidfVectorizer(min_df=5, max_df=0.67, smooth_idf=True,
                       norm='l2', ngram_range=(1, 2), max_features=150000)

# Learn the vocabulary and IDF weights on the training documents only, then
# reuse them unchanged for the test documents to avoid leaking test data.
tfid_train_matrix = tfid.fit_transform(X_train)
tfid_test_matrix = tfid.transform(X_test)

In [11]:
%%time
'''Use n_jobs parameter to your # of CPU cores minus 1
If your computer has a GPU (which mine doesn't), you should set the booster parameter to 'gpu_hist', 
and possibly bring the subsample size up. With a faster-running tool you could increase the number of estimators
and num_parallel_tree as well.'''

from sklearn.metrics import roc_curve
from sklearn import metrics
from xgboost import XGBClassifier
clf = XGBClassifier(n_estimators=500, learning_rate=0.0875, max_depth=6, n_jobs=3, 
                    num_parallel_tree=4, subsample=0.85)
clf.fit(tfid_train_matrix, y_train)
y_preds=clf.predict(tfid_test_matrix)
print('ROC score: ', roc_auc_score(y_test, y_preds))
print("F1 score: ", metrics.f1_score(y_test , y_preds))
print(metrics.confusion_matrix(y_test , y_preds))

ROC score:  0.6660686531189777
F1 score:  0.7001487110824156
[[47513 36576]
 [20083 66149]]
CPU times: user 13h 35min 8s, sys: 30.3 s, total: 13h 35min 38s
Wall time: 4h 32min 17s


In [15]:
# Optional: uncomment to persist the fitted classifier and vectorizer to disk with joblib.
# from joblib import dump 
# dump(clf, "xgbooster")
# dump(tfid, 'vectorizer')

## The best results came from TF-IDF vectorization combined with XGBoost's boosted random forest implementation.