In [1]:
import pandas as pd
import numpy as np
import gensim

In [2]:
df = pd.read_csv('blogtext.csv')
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [3]:
%%time
#Here we import my custom cleaner script, clean.py. I parallelize the function to run faster here, to work when scaling. 
#Using 11 threads to run the script and 1 to manage the multiprocessing.
import clean
from joblib import Parallel, delayed
df['tokens'] = Parallel(n_jobs=11)(delayed(clean.clean)(line) for line in df['text'])

CPU times: user 1min 26s, sys: 4.85 s, total: 1min 30s
Wall time: 18min 36s


In [4]:
#Some packages only want the tokens fed as a string. Others want a list of strings.
df['token_str'] = df['tokens'].map(lambda x: ' '.join(x))
df['token_str'].head()

0    info have find page psf file have wait until t...
1    team member drives van der lang mail ruby die ...
2    nader van kernfusie barde mark waterstofbom ho...
3                                            test test
4    thank yahoo have toolbar capture urls popups m...
Name: token_str, dtype: object

# Machine Learning Pre-Processing
Here we convert string labels into numbers (label-encoding). Next we vectorize all words into tf-idf vectors.
Note that I dropped all words which appear in over 67% of documents and those that appear in less than 5 documents. This step helps filter out noisy words and only keep the significant ones. 
max_features determines the max # of words to keep in the vocabulary.

In [5]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y=le.fit_transform(df['gender'])
X=df['token_str']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [6]:
from sklearn.metrics import f1_score, balanced_accuracy_score, accuracy_score, classification_report, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer(min_df=5, max_df=0.67, smooth_idf=True,
                       norm = 'l2', ngram_range=[1,2], max_features=150000)
tfid_train_matrix = tfid.fit_transform(X_train)
tfid_test_matrix = tfid.transform(X_test)

# Building the Machine Learning Model
Here I'm using XGBoost's "boosted random forest" implementation, which is, so far, the only algorithm that delivers an F1 score over .69. 

Boosted random forest is simply the normal boosted trees algorithm, but at each cycle, it creates num_parallel_tree mini-decision trees. On this dataset, it's resulted in drastically increased accuracy, but be warned, it's much more computationally intensive. Increased runtimes can be drastically reduced by selecting the 'gpu_hist' method, but currently support for CUDA drivers is mixed.

Use the nthread parameter to determine how many CPU cores you want to run the algorithm simultaneously. Currently, XGBoost defaults to (# of cores -1).

learning_rate is how much XGBoost learns from the mini decision trees at each cycle.

In [10]:
%%time
# If your computer has a GPU, you should set the booster parameter to 'gpu_hist', 
# and possibly bring the subsample size up. With a faster-running tool you could increase the number of estimators
# and num_parallel_tree as well.'''

from sklearn.metrics import roc_curve
from sklearn import metrics
from xgboost import XGBClassifier
clf = XGBClassifier(n_estimators=500, learning_rate=0.0875, max_depth=6, nthread=11, 
                    num_parallel_tree=3, subsample=0.85)
clf.fit(tfid_train_matrix, y_train)
y_preds=clf.predict(tfid_test_matrix)
print('ROC score: ', roc_auc_score(y_test, y_preds))
print("F1 score: ", metrics.f1_score(y_test , y_preds))
print(metrics.confusion_matrix(y_test , y_preds))

ROC score:  0.6663247762121159
F1 score:  0.6989776398571671
[[47900 36329]
 [20321 65771]]
CPU times: user 8h 36min 26s, sys: 44.3 s, total: 8h 37min 11s
Wall time: 47min 23s


#### Saving the models here. Do not use built-in .save() methods for machine learning tools because they don't save many important attributes. Use pickle or joblib.
#### Saving the entire tf-idf vectorizer would be too large a filesize. So instead save the .vocabulary__, and in the app, load a tf-idf vectorizer with the same settings as above.

In [8]:
#Saving the models here
from joblib import dump
dump(clf, 'xgbooster')
#Saving the entire tf-idf vectorizer would make a file too large. So just save the vocabulary, load another vectorizer

dump(tfid.vocabulary_, 'vocab')

['vocab']