# Predicting gender from blog post

## Working with an 80k subsample. This gives us a large enough sample size to fully train the models, but small enough to work with locally.

In [1]:
import logging
logging.basicConfig(level=logging.WARNING)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')
df = pd.read_csv('blogtext.csv', parse_dates = ['date']).sample(80000)

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/drew/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# df['custom_topic'].value_counts().plot('bar')
import spacy
import gensim
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import strip_short
from symspellpy import SymSpell, Verbosity
import pkg_resources
import re
nlp = spacy.load('en_core_web_sm')
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
stops = [spacy.lang.en.stop_words.STOP_WORDS] + stopwords.words('english')
def clean_jv(doc):
   typo_free = ' '.join([(sym_spell.lookup(i, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True)[0].term) for i in doc])
   twol_free = strip_short(typo_free)
   return twol_free
def clean(lst):
    lst = [token for token in lst if not token.is_stop]
    lst = [token.lemma_ for token in lst if str(token.lemma_) not in stops]
    lst = [re.sub(r'[\W\d\s]', '', string) for string in lst]
    lst = [token for token in lst if token not in stops]
    while '' in lst:
        lst.remove('')
    lst = clean_jv(lst)
    return lst

In [6]:
#Using gensim to create bigrams and trigrams here
df['tokens'] = df['text'].map(lambda x: nlp.tokenizer(x.lower()))
df['tokens'] = df['tokens'].map(lambda x: clean(x).split())
#Creating a single-string version of the cleaned texts, for tools that require it.
df['token_str'] = df['tokens'].map(lambda x: ' '.join(x))
doc_clean = df['tokens']
bigram =gensim.models.Phrases(doc_clean, min_count = 5, threshold = 50)
trigram =gensim.models.Phrases(bigram[doc_clean], min_count = 5, threshold = 50)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
def make_trigrams(texts):
    return trigram_mod[bigram_mod[texts]]
doc_trigram = df['tokens'].map(lambda x: make_trigrams(x))
dct = gensim.corpora.Dictionary(doc_trigram)
dct.filter_extremes(no_below=5, no_above=0.6666, keep_n=90000)
doc_term_matrix = [dct.doc2bow(doc) for doc in doc_trigram]

# Machine Learning Pre-Processing

In [14]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Converting string labels into numbers.
y=le.fit_transform(df['gender'])
X=df['token_str']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)

# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(lowercase=True, analyzer = "word",
#                      max_features=3500,min_df=4, tokenizer=None, 
#                      ngram_range = [3,5], preprocessor=None)
# X_train_cv = cv.fit_transform(X_train)
# X_test_cv = cv.transform(X_test)
#Metrics I'm going to use
from sklearn.metrics import f1_score, balanced_accuracy_score, accuracy_score, classification_report

In [11]:
from gensim.sklearn_api import TfIdfTransformer
from gensim.corpora import Dictionary
X = df['tokens']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)

dictionary = Dictionary(X_train)
model = TfIdfTransformer(dictionary)
train = [dictionary.doc2bow(text) for text in X_train]
test = [dictionary.doc2bow(text) for text in X_test]
X_train_g =  model.fit_transform(train)
X_test_g = model.fit_transform(test)

# Using sklearn's tf-idf:

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer(use_idf=True, min_df=3, analyzer='word', smooth_idf=True,
                       norm = 'l2', ngram_range=[3,5], sublinear_tf=True)
# X=df['form_str']
X=df['token_str']
X_train, X_test, y_train, y_test = train_test_split(X,y)
tfid_train_matrix = tfid.fit_transform(X_train)
tfid_test_matrix = tfid.transform(X_test)

In [19]:
from xgboost import XGBClassifier
clf = XGBClassifier(num_boosting_rounds=300, learning_rate=.12, num_parallel_trees=2)
clf.fit(tfid_train_matrix, y_train)
y_preds=clf.predict(tfid_test_matrix)
print('Accuracy score: ', accuracy_score(y_test, y_preds))
print("F1 score: ", f1_score(y_test , y_preds))

Accuracy score:  0.5048
F1 score:  0.664839255499154
