In [19]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


# Load the training data

In [20]:
# Training comments: a gzip-compressed CSV decoded as UTF-8.
train_data = pd.read_csv(
    "train_data.csv.gz",
    compression="gzip",
    encoding="utf8",
)

In [21]:
# Preview the first rows of the comment table (author, subreddit, created_utc, body).
train_data.head()

Unnamed: 0,author,subreddit,created_utc,body
0,ejchristian86,TwoXChromosomes,1388534000.0,I hadn't ever heard of them before joining thi...
1,Shamus_Aran,mylittlepony,1388534000.0,I don't think we'd get nearly as much fanficti...
2,Riddance,sex,1388534000.0,"Thanks. I made it up, that's how I got over my..."
3,savoytruffle,AskReddit,1388534000.0,bite me
4,Secret_Wizard,DragonsDogma,1388534000.0,Are you sure you aren't confusing Cyclops (the...


In [22]:
# Labels for the training authors; the `author` and `gender` columns are used below.
target = pd.read_csv("train_target.csv")

In [23]:
# Preview the first rows of the label table.
target.head()

Unnamed: 0,author,gender
0,RedThunder90,0
1,Lirkmor,1
2,In0chi,0
3,ProjectGrudge,0
4,TehTurtleHermit,0


# Feature Extraction

In [24]:
# Assign each distinct training subreddit a column position (0..n-1),
# in order of first appearance.
subreddits = train_data['subreddit'].unique()
subreddits_map = pd.Series(np.arange(len(subreddits)), index=subreddits)

In [25]:
from scipy import sparse

In [26]:
def extract_features(group, index_map=None):
    """Encode the subreddits found in ``group`` as a 1 x n sparse indicator row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to encode (the notebook passes one author's comments). Only the
        ``'subreddit'`` column is read.
    index_map : pandas.Series, optional
        Maps subreddit name (index) to column position (value). When omitted,
        the notebook-level ``subreddits_map`` is used.

    Returns
    -------
    scipy.sparse.csr_matrix
        Shape ``(1, len(index_map))``, float64, holding 1.0 in the column of
        every mapped subreddit that occurs in ``group``. Subreddits that are
        absent from the mapping are ignored.
    """
    if index_map is None:
        index_map = subreddits_map

    # reindex() yields NaN for labels missing from the mapping, whereas plain
    # label indexing (`index_map[labels]`) raises KeyError on current pandas.
    seen = group['subreddit'].unique()
    positions = index_map.reindex(seen).dropna()

    # Any NaN forces a float dtype; cast back to integers so the values are
    # valid column indices.
    columns = np.sort(positions.values.astype(np.intp))
    rows = np.zeros(columns.shape[0], dtype=np.intp)
    data = np.ones(columns.shape[0], dtype=np.float64)

    # NOTE(review): the original assigned `v[0, idx] = +1`, i.e. a presence
    # flag rather than a per-subreddit count. That behaviour is preserved
    # here; confirm that `+= 1` (counts) was not what was intended.
    return sparse.csr_matrix((data, (rows, columns)), shape=(1, len(index_map)))

# Quick check of the extractor on the rows of a single author.
extract_features(train_data[train_data.author=='RedThunder90'])

<1x3866 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [27]:
# One sparse feature row per training author, keyed by author name.
features_dict = {
    author: extract_features(group)
    for author, group in train_data.groupby('author')
}

In [28]:
# Stack the per-author rows in the order the authors appear in `target`,
# so that row i of X corresponds to row i of the label table.
train_rows = [features_dict[name] for name in target.author]
X = sparse.vstack(train_rows)
X

<10000x3866 sparse matrix of type '<type 'numpy.float64'>'
	with 96876 stored elements in Compressed Sparse Row format>

In [29]:
# Labels, in the same row order as X.
y = target['gender']

# Model Selection

In [30]:
# Fit a multinomial Naive Bayes classifier on the subreddit features.
from sklearn import metrics, naive_bayes

try:
    from sklearn import cross_validation, grid_search
except ImportError:
    # Both modules were removed in scikit-learn 0.20; `model_selection`
    # provides their replacements.
    from sklearn import model_selection

# MultinomialNB is one kind of Naive Bayes model (the multinomial variant).
model = naive_bayes.MultinomialNB()
model.fit(X, y)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Prepare the solution

In [31]:
# Test comments: a gzip-compressed CSV decoded as UTF-8.
test_data = pd.read_csv(
    "test_data.csv.gz",
    compression="gzip",
    encoding="utf8",
)

In [32]:
# One sparse feature row per test author, keyed by author name.
# Note: this rebinds `features_dict`, replacing the training-set features.
features_dict = {
    author: extract_features(group)
    for author, group in test_data.groupby('author')
}

In [33]:
# Stack the test rows in order of each author's first appearance in `test_data`.
test_rows = [features_dict[name] for name in test_data.author.unique()]
X_test = sparse.vstack(test_rows)
X_test

<34887x3866 sparse matrix of type '<type 'numpy.float64'>'
	with 345539 stored elements in Compressed Sparse Row format>

In [34]:
# Shape of the training matrix, for comparison with the test matrix.
X.shape

(10000, 3866)

In [35]:
# The column count must match X so the fitted model can score the test rows.
X_test.shape

(34887, 3866)

In [37]:
# Inspect a single feature column (index 1) of the test matrix.
X_test[:,1]

<34887x1 sparse matrix of type '<type 'numpy.float64'>'
	with 207 stored elements in Compressed Sparse Row format>

In [40]:
# Keep column 1 of the predicted class probabilities.
# NOTE(review): presumably this is the probability of gender == 1, assuming the
# classes are ordered as [0, 1]; confirm against model.classes_.
y_pred = model.predict_proba(X_test)[:,1]


In [41]:
# Pair every test author with its predicted probability; the author order
# matches the one used to build X_test.
solution = pd.DataFrame({
    "author": test_data.author.unique(),
    "gender": y_pred,
})
solution.head()

Unnamed: 0,author,gender
0,Asks_Politely,5.151342e-08
1,smartphone-redditor,0.01199343
2,Simcom,5.509215e-10
3,ZenDragon,2.388658e-09
4,imgurtranscriber,5.1683029999999997e-26


In [42]:
# Write the submission file without the index column.
# NOTE(review): the filename says "logistic_regression", but the model fitted
# above is MultinomialNB; rename only if nothing downstream expects this path.
solution.to_csv("logistic_regression.csv", index=False)