In [1]:
%pylab inline
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction import text
from sklearn import grid_search, linear_model, naive_bayes, svm, preprocessing, cross_validation
from sklearn.metrics import make_scorer, roc_auc_score
from nltk.corpus import wordnet as wn
from nltk import stem
import re

Populating the interactive namespace from numpy and matplotlib


# Load the train data

In [124]:
train_data = pd.read_csv("train_data.csv.gz", compression="gzip", encoding="utf8")

In [3]:
train_data.head()

Unnamed: 0,author,subreddit,created_utc,body
0,ejchristian86,TwoXChromosomes,1388534000.0,I hadn't ever heard of them before joining thi...
1,Shamus_Aran,mylittlepony,1388534000.0,I don't think we'd get nearly as much fanficti...
2,Riddance,sex,1388534000.0,"Thanks. I made it up, that's how I got over my..."
3,savoytruffle,AskReddit,1388534000.0,bite me
4,Secret_Wizard,DragonsDogma,1388534000.0,Are you sure you aren't confusing Cyclops (the...


In [4]:
target = pd.read_csv("train_target.csv")

In [5]:
target.head()

Unnamed: 0,author,gender
0,RedThunder90,0
1,Lirkmor,1
2,In0chi,0
3,ProjectGrudge,0
4,TehTurtleHermit,0


# Feature Extraction

In [6]:
subreddits = train_data.subreddit.unique()
subreddits_map = pd.Series(index=subreddits, data=arange(subreddits.shape[0]))

In [7]:
pattern = re.compile('(?u)\\b((([a-z])(?!\\3{2,})){4,})\\b')

stemmer = stem.SnowballStemmer('english')
def stemming(doc):
    split = []
    for word in pattern.finditer(doc):
        split.append(word.group())
    l = [stemmer.stem(t) for t in split]
    return [w for w in l if len(w) > 2]


In [8]:
test_data = pd.read_csv("test_data.csv.gz", compression="gzip", encoding="utf8")
test_data = test_data[["author", "body"]]

In [64]:
%%time
                     
def make_corpus(train, test):
    for author, group in train.groupby('author'):
        yield " ".join(group["body"].values.astype('U'))
    for author, group in test.groupby('author'):
        yield " ".join(group["body"].values.astype('U'))


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 17.9 µs


In [66]:
%%time
vectorizer = text.TfidfVectorizer(min_df=10, max_df=0.9, stop_words='english', decode_error='ignore',
                                   ngram_range=(1, 1), sublinear_tf=True, tokenizer = stemming)
vectorizer.fit(make_corpus(train_data, test_data))



CPU times: user 36min 42s, sys: 9.58 s, total: 36min 52s
Wall time: 36min 59s


In [67]:
len(vectorizer.get_feature_names())

37440

In [68]:
%%time

del test_data

words_dict = {}

for author, group in train_data.groupby('author'):
    words_dict[author] = " ".join(group["body"].values)

aut_bodies = [words_dict[author] for author in target.author]

del words_dict

X_words = vectorizer.transform(aut_bodies)

del aut_bodies

CPU times: user 7min 44s, sys: 368 ms, total: 7min 44s
Wall time: 7min 46s


In [70]:
y = target.gender

In [72]:
%%time

word_alphas = np.logspace(-6, -5, 5)

word_gs = grid_search.GridSearchCV(word_model, {"alpha":word_alphas}, cv=7, scoring='roc_auc')
word_gs.fit(X_words, y)
print word_gs.best_params_
print word_gs.best_score_

{'alpha': 5.6234132519034912e-06}
0.84208282212
CPU times: user 16.6 s, sys: 300 ms, total: 16.9 s
Wall time: 9.12 s


In [73]:
del X_words

In [136]:
def extract_features(group):
    group_subreddits = group['subreddit'].values
    idxs = subreddits_map[group_subreddits].values
    v = sparse.dok_matrix((1, subreddits.shape[0]))
    for idx in idxs:
        if not np.isnan(idx):
            v[0, idx] = 1.0
    v3 = vectorizer.transform(group["body"].values.astype('U'))
    return sparse.hstack([v.tocsr(), word_model.predict_proba(v3)[:,1].mean()]) 

print extract_features(train_data[train_data.author=='LadyWhiskers'])

[ 1.          0.          1.          1.          1.          1.          1.
  1.          1.          1.          0.          1.          1.          0.3398721
  1.          0.          0.          1.          1.          1.          1.
  0.          0.          1.          1.          0.          1.
  0.60823849  1.          0.69676993  1.          0.21533796  0.38792112
  1.          1.          0.          1.          0.17612916  1.          0.
  1.          0.          1.          1.          1.          0.
  0.74566946  1.          0.          1.          0.25666091  1.          1.
  0.          1.          0.71338505  0.          0.          1.          1.        ]
  (0, 3)	1.0
  (0, 18)	1.0
  (0, 225)	1.0
  (0, 619)	1.0
  (0, 1399)	1.0
  (0, 2785)	1.0
  (0, 3866)	0.668999736213


In [104]:
%%time
features_dict = {}

for author, group in train_data.groupby('author'):
    features_dict[author] = extract_features(group)

CPU times: user 9min 28s, sys: 1.25 s, total: 9min 29s
Wall time: 9min 31s


In [105]:
X = sparse.vstack([features_dict[author] for author in target.author])

In [106]:
del features_dict
y = target.gender

# Model Selection

In [114]:
%%time 

model = naive_bayes.MultinomialNB()

alphas = np.linspace(0.3, 0.5, 30)

gs = grid_search.GridSearchCV(model, {"alpha":alphas}, cv=9, scoring='roc_auc')
gs.fit(X, y)
print gs.best_params_
print gs.best_score_


{'alpha': 0.4517241379310345}
0.931080908041
CPU times: user 4.7 s, sys: 4 ms, total: 4.71 s
Wall time: 4.77 s


# Prepare the solution

In [116]:
test_data = pd.read_csv("test_data.csv.gz", compression="gzip", encoding="utf8")

In [117]:
del train_data
del target
test_data['body'].fillna('', inplace=True)
aut_uniq = test_data.author.unique()

In [118]:
%%time

features_dict = {}

for author, group in test_data.groupby('author'):
    features_dict[author] = extract_features(group)
    
del test_data

In [119]:
X_test = sparse.vstack([features_dict[author] for author in aut_uniq])
del features_dict

In [120]:
y_pred = gs.predict_proba(X_test)[:,1]

In [121]:
solution = pd.DataFrame({"author":aut_uniq, "gender":y_pred})
solution.head()

Unnamed: 0,author,gender
0,Asks_Politely,1.814081e-09
1,smartphone-redditor,0.01336787
2,Simcom,5.099409e-12
3,ZenDragon,1.011557e-10
4,imgurtranscriber,2.698285e-61


In [122]:
solution.to_csv("words_classifier.csv", index=False)