In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [2]:
# Load the blog-authorship corpus, reading only the four columns listed in `usecols`.
init_df = pd.read_csv(
    '../../datasets/tokenized/blog_authorship_tokenized.csv',
    usecols=['text', 'tokenized', 'n_words', 'gender'],
)
# Preview a handful of random rows.
init_df.sample(n=15)

Unnamed: 0,text,gender,n_words,tokenized
610399,every latter-day saint should read eugene engl...,male,17,every latter - day saint should read eugene en...
450653,today was bored at school .. mr liew and miss ...,male,126,today was bored at school .. mr liew and miss ...
82433,sorry about that last outburst...lady green's ...,male,54,sorry about that last outburst ... lady green ...
406991,In the words of Tenacious D...we are but men! ...,male,17,in the words of tenacious d ... we are but men...
383245,Its Friday the 13th. Is anyone out there super...,female,373,its friday the 13th . is anyone out there supe...
624995,I am a girly girl. Even if I'm 41. I never car...,female,93,i am a girly girl . even if i 'm 41 . i never ...
462692,The Pros of Being Me 1. My rapier wit 2. My za...,female,106,the pros of being me 1 . my rapier wit 2 . my ...
255647,HIIII!! QUESTION what is the debate on 2morow ...,male,40,hiiii ! ! question what is the debate on 2moro...
627207,my fiends oh what can I say one of them thinks...,male,154,my fiends oh what can i say one of them thinks...
523994,Via Mark Kilmer comes this story about Michael...,male,269,via mark kilmer comes this story about michael...


In [3]:
# Count missing values per column.
init_df.isna().sum()

text         0
gender       0
n_words      0
tokenized    0
dtype: int64

In [4]:
# Draw a 3.5% subsample stratified by gender: the same fraction is taken from
# every gender group, so the class balance of the full corpus is preserved.
# A fixed seed keeps the subsample (and every score computed from it)
# reproducible across kernel restarts.
df = init_df.groupby('gender').sample(frac=.035, random_state=123)
df.shape

(22245, 4)

In [5]:
# Number of sampled rows per gender.
df.gender.value_counts()

male      11282
female    10963
Name: gender, dtype: int64

In [6]:
# Fit the encoder on the gender labels of the subsample.
enc = LabelEncoder().fit(df['gender'])

# Class name -> integer code lookup; reused later when labelling the confusion matrices.
encoded_labels = {
    name: code
    for name, code in zip(enc.classes_, enc.transform(enc.classes_))
}

# Replace the string labels with their integer codes.
df['gender'] = enc.transform(df['gender'])
encoded_labels

{'female': 0, 'male': 1}

## Baseline

In [7]:
# Baseline features: the 'text' column (kept as a one-column frame so the
# ColumnTransformer can select it by name); target: the encoded gender.
X, y = df[['text']], df['gender']

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=.2,
    shuffle=True,
    random_state=123,
    stratify=y,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((17796, 1), (4449, 1), (17796,), (4449,))

In [8]:
# Baseline model: default word counts on the 'text' column fed to a linear-kernel SVM.
base_vect = CountVectorizer()
base_svm = SVC(kernel='linear')

# Vectorize 'text' and drop any other column.
base_ct = ColumnTransformer(
    transformers=[('vect', base_vect, 'text')],
    remainder='drop',
    n_jobs=-1,
)

base_pipe = make_pipeline(base_ct, base_svm)

In [9]:
# Search space for the baseline SVM. Keys follow the `<step>__<param>`
# convention; `svc` is the step name make_pipeline derives from the SVC class.
# `verbose` is not a hyper-parameter and must not appear here: logging is
# controlled by GridSearchCV's own `verbose` argument.
# NOTE(review): the recorded run shows 3 candidates starting with C=1 and C=10;
# the third value (100) is an assumption -- confirm.
base_params = {
    'svc__C': [1, 10, 100]
}

In [10]:
# 5-fold cross-validated grid search over `base_params`, scored by macro-F1.
# `verbose=5` is passed to GridSearchCV itself so that per-fold scores and
# timings are logged while the search runs; `refit=True` retrains the best
# configuration on the whole training split so `baseline` can predict directly.
baseline = GridSearchCV(
    base_pipe,
    base_params,
    cv=5,
    scoring='f1_macro',
    refit=True,
    n_jobs=12,
    verbose=5,
)

baseline.fit(X_train, y_train)
print(baseline.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ..........................svc__C=1;, score=0.626 total time=60.6min
[CV 2/5] END ..........................svc__C=1;, score=0.610 total time=62.8min
[CV 3/5] END ..........................svc__C=1;, score=0.598 total time=66.6min
[CV 4/5] END ..........................svc__C=1;, score=0.608 total time=59.7min
[CV 5/5] END ..........................svc__C=1;, score=0.590 total time=64.1min
[CV 1/5] END .........................svc__C=10;, score=0.623 total time=80.4min


In [None]:
# Score the refit baseline on the held-out test split.
y_pred = baseline.predict(X_test)
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

In [None]:
# Confusion matrix of the baseline on the held-out split; rows/columns follow
# the order of `baseline.classes_`.
cm = confusion_matrix(y_test, y_pred, labels=baseline.classes_)

# Translate the integer class codes back to their names in that same order, so
# each tick label is guaranteed to match its matrix row/column (instead of
# relying on the iteration order of a separately built dict).
class_names = enc.inverse_transform(baseline.classes_)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues')
disp.ax_.tick_params(axis='x', rotation=0);

## On tokenized data

In [None]:
# Same experiment on the 'tokenized' column (one-column frame so the
# ColumnTransformer can select it by name); target is unchanged.
X, y = df[['tokenized']], df['gender']

# Identical 80/20 stratified split and seed as used for the baseline.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=.2,
    shuffle=True,
    random_state=123,
    stratify=y,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Character- and word-level n-gram count features over the 'tokenized' column.
# Every vectorizer is capped at 4000 features and applies no stop-word filtering.
ch_2grams_cv = CountVectorizer(analyzer='char', ngram_range=(2, 2), max_features=4000, stop_words=None)
ch_3grams_cv = CountVectorizer(analyzer='char', ngram_range=(3, 3), max_features=4000, stop_words=None)
wd_2grams_cv = CountVectorizer(analyzer='word', ngram_range=(2, 2), max_features=4000, stop_words=None)
# Word trigrams need ngram_range=(3, 3); with (2, 2) this vectorizer would
# extract bigrams and merely duplicate `wd_2grams_cv`.
wd_3grams_cv = CountVectorizer(analyzer='word', ngram_range=(3, 3), max_features=4000, stop_words=None)
# Defined but not wired into the pipeline below.
scl = MinMaxScaler()

# Concatenate the n-gram blocks side by side. Word bigrams (`wd_2grams_cv`)
# are currently left out of the feature set.
ct = ColumnTransformer([
    ('vect_ch_2', ch_2grams_cv, 'tokenized'),
    ('vect_ch_3', ch_3grams_cv, 'tokenized'),
    ('vect_wd_3', wd_3grams_cv, 'tokenized'),
], remainder='passthrough', n_jobs=-1)

# Alternative classifier; not used in `pipe`.
svm = SVC(kernel='linear')
logreg = LogisticRegression(
        solver='liblinear',
        penalty='l2'
    )

# Steps are auto-named 'columntransformer' and 'logisticregression', which is
# what the `logisticregression__*` keys of the parameter grid refer to.
pipe = make_pipeline(ct, logreg)

In [None]:
# Search space for the logistic-regression step of `pipe`:
# regularization strength C crossed with the iteration cap.
param_grid = dict(
    logisticregression__C=[1, 10],
    logisticregression__max_iter=[5000, 2500, 1000],
)

In [None]:
# 10-fold cross-validated grid search over `param_grid`, scored by macro-F1.
# The best configuration is refit on the full training split; any failing fit
# raises immediately instead of being scored as NaN.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=5,
    error_score='raise',
)

gs.fit(X_train, y_train)
print(gs.best_params_)

In [None]:
# Score the refit n-gram model on the held-out test split.
y_pred = gs.predict(X_test)
gs_report = classification_report(y_test, y_pred)
print(gs_report)

In [None]:
# Confusion matrix of the n-gram model on the held-out split; rows/columns
# follow the order of `gs.classes_`.
cm = confusion_matrix(y_test, y_pred, labels=gs.classes_)

# Translate the integer class codes back to their names in that same order, so
# each tick label is guaranteed to match its matrix row/column (instead of
# relying on the iteration order of a separately built dict).
class_names = enc.inverse_transform(gs.classes_)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues')
disp.ax_.tick_params(axis='x', rotation=90)
disp.ax_.set_facecolor('red');