In [1]:
import pandas as pd
import numpy as np
import gensim

In [2]:
# Draw a fixed-size random subset of the blog corpus to keep runtimes manageable.
# The sample is seeded (same seed as the train/test split further down) so that
# Restart & Run All reproduces the same rows and therefore the same scores.
df = pd.read_csv('blogtext.csv').sample(50000, random_state=42)
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
602153,2339326,male,34,Non-Profit,Taurus,"08,June,2004","Hello all, I got an email from Bruc..."
70739,3368101,male,17,Student,Scorpio,"16,July,2004","Well ladies and gents, Im back. Its bee..."
122014,3776941,male,39,indUnk,Libra,"08,July,2004",Came across this great article in this ...
245431,2869197,male,17,Student,Aries,"02,July,2004",Today was a high as well as a low. ...
98871,2473007,male,27,Education,Aries,"08,August,2004","A talkative brook, 20 feet wide and..."


In [3]:
%%time
# df = df.sample(50000)
#Here we import my custom cleaner script, clean.py. I parallelize the function to run faster here, to work when scaling. 
#Using 11 threads to run the script and 1 to manage the multiprocessing.
import clean
from joblib import Parallel, delayed
df['tokens'] = Parallel(n_jobs=11)(delayed(clean.clean)(line) for line in df['text'])

CPU times: user 7.14 s, sys: 479 ms, total: 7.62 s
Wall time: 1min 11s


In [4]:
# Keep both representations around: some libraries expect each document as one
# space-separated string, others expect the list of tokens itself.
df['token_str'] = df['tokens'].map(' '.join)
df['token_str'].head()

602153    hello all email bruce today email him ask how ...
70739     good lady gent back age last blogged reason bl...
122014    come great article week have edition spectator...
245431    today high good low rise fly land waddle muck ...
98871     talkative brook foot wide colour burn midnight...
Name: token_str, dtype: object

# Machine Learning Pre-Processing
Here we convert the string labels into integers (label encoding) and hold out a test set. Next we vectorize each document's tokens into tf-idf vectors.
Note that I drop all terms that appear in more than 67% of documents and those that appear in fewer than 5 documents. This step helps filter out noisy terms and keep only the significant ones.
max_features caps the number of terms kept in the vocabulary.

In [5]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Features are the space-joined token strings; the target is the gender column
# mapped to integer class ids by the label encoder.
le = preprocessing.LabelEncoder()
X = df['token_str']
y = le.fit_transform(df['gender'])

# Seeded split so the held-out set is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
from sklearn.metrics import f1_score, balanced_accuracy_score, accuracy_score, classification_report, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram tf-idf features.
#   min_df=5       -> drop terms seen in fewer than 5 documents
#   max_df=0.67    -> drop terms seen in more than 67% of documents
#   max_features   -> upper bound on the vocabulary size
# ngram_range must be a tuple: recent scikit-learn releases validate the
# parameter type and reject a list such as [1, 2] when fitting.
tfid = TfidfVectorizer(min_df=5, max_df=0.67, smooth_idf=True,
                       norm='l2', ngram_range=(1, 2), max_features=150000)

# Learn the vocabulary and idf weights on the training split only, then reuse
# them unchanged for the test split so no test information leaks into the fit.
tfid_train_matrix = tfid.fit_transform(X_train)
tfid_test_matrix = tfid.transform(X_test)

# Building the Machine Learning Model
Here I'm using XGBoost's "boosted random forest" implementation, which is, so far, the only algorithm that delivers an F1 score over .69. 

A boosted random forest is the usual boosted-trees algorithm, except that each boosting cycle builds num_parallel_tree mini decision trees instead of one. On this dataset it has drastically increased accuracy, but be warned: it is much more computationally intensive. The longer runtimes can be cut drastically by selecting the 'gpu_hist' tree method, but support for CUDA drivers is currently mixed.

Use the nthread parameter to set how many CPU cores the algorithm may use in parallel. Currently, XGBoost defaults to (# of cores - 1).

learning_rate controls how much the mini decision trees built in each cycle contribute to the overall model.

In [7]:
%%time
# If your computer has a GPU, you should set the booster parameter to 'gpu_hist', 
# and possibly bring the subsample size up. With a faster-running tool you could increase the number of estimators
# and num_parallel_tree as well.'''

from sklearn.metrics import roc_curve
from sklearn import metrics
from xgboost import XGBClassifier
clf = XGBClassifier(n_estimators=500, learning_rate=0.0875, max_depth=6, nthread=11, 
                    num_parallel_tree=3, subsample=0.85)
clf.fit(tfid_train_matrix, y_train)
y_preds=clf.predict(tfid_test_matrix)
print('ROC score: ', roc_auc_score(y_test, y_preds))
print("F1 score: ", metrics.f1_score(y_test , y_preds))
print(metrics.confusion_matrix(y_test , y_preds))

ROC score:  0.593228340518287
F1 score:  0.6458042932812936
[[2785 3456]
 [1626 4633]]


In [None]:
# Optional persistence step, currently disabled: uncomment the lines below to
# save the trained classifier, the fitted tf-idf vocabulary and the test data
# to the working directory.
from joblib import dump
# dump(clf, 'xgbooster')
# dump(tfid.vocabulary_, 'feature')
# Save the tfid_test_matrix and the raw test documents for SHAP.
# NOTE(review): tfid_test_matrix is a sparse matrix; wrapping it directly in
# pd.DataFrame may not yield a usable dense table - confirm before enabling.
# pd.DataFrame(tfid_test_matrix).to_csv('tfid_test.csv')
# X_test.to_csv('X_test.csv')

In [10]:
# Sanity check: show the tf-idf row of one training document (row 20) to see
# how many vocabulary terms it actually contains.
tfid_train_matrix[20, :]

<1x106228 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>