In [89]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


# Load the train data

In [90]:
train_data = pd.read_csv("train_data.csv.gz", compression="gzip", encoding="utf8")

In [91]:
train_data.head()

Unnamed: 0,author,subreddit,created_utc,body
0,ejchristian86,TwoXChromosomes,1388534000.0,I hadn't ever heard of them before joining thi...
1,Shamus_Aran,mylittlepony,1388534000.0,I don't think we'd get nearly as much fanficti...
2,Riddance,sex,1388534000.0,"Thanks. I made it up, that's how I got over my..."
3,savoytruffle,AskReddit,1388534000.0,bite me
4,Secret_Wizard,DragonsDogma,1388534000.0,Are you sure you aren't confusing Cyclops (the...


In [92]:
len(train_data)

761189

In [93]:
target = pd.read_csv("train_target.csv")

In [94]:
target.head()

Unnamed: 0,author,gender
0,RedThunder90,0
1,Lirkmor,1
2,In0chi,0
3,ProjectGrudge,0
4,TehTurtleHermit,0


# Feature Extraction

In [95]:
subreddits = train_data.subreddit.unique()
subreddits_map = pd.Series(index=subreddits, data=arange(subreddits.shape[0]))

In [96]:
from scipy import sparse

In [97]:
def extract_features(group):
    group_subreddits = group['subreddit'].values
    idxs = subreddits_map[group_subreddits].values
    v = sparse.dok_matrix((1, subreddits.shape[0]))
    for idx in idxs:
        if not np.isnan(idx):
            v[0, idx] = +1
    return v.tocsr()

extract_features(train_data[train_data.author=='RedThunder90'])

<1x3866 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [98]:
features_dict = {}

for author, group in train_data.groupby('author'):
    features_dict[author] = extract_features(group)

In [99]:
features_dict[author]

<1x3866 sparse matrix of type '<type 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [100]:
extract_features(group)

<1x3866 sparse matrix of type '<type 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

# Text Analysis

In [101]:
from sklearn.feature_extraction import text

In [102]:
stop_words=text.ENGLISH_STOP_WORDS  # sono le parole che vengono rimosse a priori

In [103]:
vectorizer = text.CountVectorizer(max_df=0.8, max_features=10000, stop_words=text.ENGLISH_STOP_WORDS)
# non voglio le parole che appaiono più dell'80% di volte (max_df)
# max_featues: mi fermo alle prime 10.000 parole

counts = vectorizer.fit_transform(subreddits)  #trasformo i documents (sono i dati già modificati, non gli originali)
tfidf = text.TfidfTransformer().fit_transform(counts)

In [104]:
counts
# ho una matrice sparsa da 3866 * 3863

<3866x3863 sparse matrix of type '<type 'numpy.int64'>'
	with 3863 stored elements in Compressed Sparse Row format>

In [105]:
counts[0]  # nella prima parola ho 1 conteggio

<1x3863 sparse matrix of type '<type 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [106]:
tfidf

<3866x3863 sparse matrix of type '<type 'numpy.float64'>'
	with 3863 stored elements in Compressed Sparse Row format>

In [107]:
tfidf[0]

<1x3863 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [108]:
tfidf[0].data

array([ 1.])

In [109]:
array(vectorizer.get_feature_names())[counts[0].nonzero()[1]] 
#trasformo il vectorizer che è un dizionario in array e conto i non zero

array([u'twoxchromosomes'], 
      dtype='<U21')

In [110]:
vectorizer = text.CountVectorizer(max_df=0.8,
                                  max_features=10000, 
                                  stop_words=text.ENGLISH_STOP_WORDS,
                                 ngram_range=(1,2))

                                # ngram_range mi dà le coppie di parole 
    # LA DIFFERENZA RISPETTO AL CODICE PRECEDENTE E' CHE QUI HO LE COPPIE DI PAROLE
    
counts = vectorizer.fit_transform(subreddits)  
tfidf = text.TfidfTransformer().fit_transform(counts)

In [111]:
import nltk

In [112]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/bigdive/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [113]:
from sklearn import decomposition

# Fit the NMF model
nmf = decomposition.NMF(n_components=6)   # NMF = Non Negative Matrix Factorization
nmf.fit(tfidf)
W = nmf.transform(tfidf)   # matrice W = peso
H = nmf.components_

In [114]:
W.shape  

(3866, 6)

In [115]:
H.shape

(6, 3863)

In [116]:
feature_names = vectorizer.get_feature_names()

In [117]:
print[feature_names[i] for i in H[0].argsort()[-21:-1]]

[u'simracing', u'gaben', u'beatles', u'elderscrolls', u'bbexchange', u'gamecollecting', u'singapore', u'powerrangers', u'urbanexploration', u'internet_box', u'asksciencediscussion', u'ottawa', u'ska', u'ucalgary', u'patriots', u'shameless', u'epilepsy', u'canes', u'metacanada', u'longhornnation']


In [118]:
print[feature_names[i] for i in H[0].argsort()[:]]

[u'labrats', u'newyorkmets', u'newzealand', u'nextdoorasians', u'nexus4', u'nexus5', u'nfffffffluuuuuuuuuuuu', u'nfl', u'nfl_draft', u'nflcirclejerk', u'nhl', u'nilesy', u'nin', u'nintendo', u'nirvana', u'nissan', u'nl_kripparrian', u'nlsscirclejerk', u'nocontext_wallpapers', u'nocontract', u'nofap', u'noisygifs', u'newtotf2', u'newsoftheweird', u'newsofthestupid', u'news', u'ncsu', u'needafriend', u'needamod', u'negareddit', u'nerdcubed', u'nerdfighters', u'nerdist', u'nes', u'netflix', u'nethack', u'nolibswatch', u'nether', u'netsec', u'networking', u'neutralpolitics', u'neverwinter', u'newcastle', u'newcastleupontyne', u'newcountry', u'newfoundland', u'newjersey', u'newreddits', u'netrunner', u'nbaspurs', u'nonmonogamy', u'nonononoyes', u'nsfw_gif', u'nsfw_wtf', u'nsfwfunny', u'nsfwiama', u'nufc', u'nutrition', u'nvidia', u'nyjets', u'nyyankees', u'nzxt', u'oaklandathletics', u'obama', u'objectivism', u'occult', u'occupywallstreet', u'ocd', u'oculus', u'offbeat', u'offmychest', u'of

In [119]:
print[feature_names[i] for i in H[0].argsort()[-21:-1]]

[u'simracing', u'gaben', u'beatles', u'elderscrolls', u'bbexchange', u'gamecollecting', u'singapore', u'powerrangers', u'urbanexploration', u'internet_box', u'asksciencediscussion', u'ottawa', u'ska', u'ucalgary', u'patriots', u'shameless', u'epilepsy', u'canes', u'metacanada', u'longhornnation']


In [120]:
for topic_idx, topic in enumerate(H):
    print "Topic #%d:" % topic_idx
    print ",".join([feature_names[i] for i in topic.argsort()[:-21:-1]])
    print

Topic #0:
hypothyroidism,longhornnation,metacanada,canes,epilepsy,shameless,patriots,ucalgary,ska,ottawa,asksciencediscussion,internet_box,urbanexploration,powerrangers,singapore,gamecollecting,bbexchange,elderscrolls,beatles,gaben

Topic #1:
averagebattlestations,contagiouslaughter,boltedontits,makeup,bobdylan,kpopslumberparty,rap,geocaching,grumpycats,litecoinmining,php,bleachshirts,baltimore,jaguars,fantasyhockey,nin,malefashionadvice,gonemild,gallifrey,1911

Topic #2:
musicals,beautyboxes,computerforensics,philadelphia,bakersfield,almosthuman,mexico,savannah,hongkong,wouldyoufuckmywife,asksciencefiction,turkey,cosplay,animesketch,mildlyinteresting,truedetective,ios7,curvy,coloradoavalanche,thanksobama

Topic #3:
exposureporn,bannersaga,plex,marvelstudios,roomporn,piercing,rance,dreadlocks,sbu,gmod,cancer,getdisciplined,gamedeals,brogress,floridaman,triphop,mercedes_benz,worldoftanks,daddit,fairytail

Topic #4:
crusaderkings,emulation,raspberry_pi,hawaii,nbaspurs,woahdude,simplerock

In [121]:
X = sparse.vstack([features_dict[author] for author in target.author])
X

<10000x3866 sparse matrix of type '<type 'numpy.float64'>'
	with 96876 stored elements in Compressed Sparse Row format>

In [122]:
y = target.gender

# Model Selection

In [123]:
# YOUR CODE HERE
from sklearn import metrics, cross_validation
from sklearn import grid_search

from sklearn import naive_bayes
naive_bayes.MultinomialNB()   # è un tipo di modello (il Multinomial NB, Naive Bayes)

model = naive_bayes.MultinomialNB(alpha=0.5)
model.fit(X, y)


MultinomialNB(alpha=0.25, class_prior=None, fit_prior=True)

# Prepare the solution

In [124]:
test_data = pd.read_csv("test_data.csv.gz", compression="gzip", encoding="utf8")

In [125]:
features_dict = {}

for author, group in test_data.groupby('author'):
    features_dict[author] = extract_features(group)

In [126]:
X_test = sparse.vstack([features_dict[author] for author in test_data.author.unique()])
X_test

<34887x3866 sparse matrix of type '<type 'numpy.float64'>'
	with 345539 stored elements in Compressed Sparse Row format>

In [127]:
X.shape

(10000, 3866)

In [128]:
X_test.shape

(34887, 3866)

In [129]:
X_test[:,1]

<34887x1 sparse matrix of type '<type 'numpy.float64'>'
	with 207 stored elements in Compressed Sparse Row format>

In [130]:
y_pred = model.predict_proba(X_test)[:,1]


In [131]:
solution = pd.DataFrame({"author":test_data.author.unique(), "gender":y_pred})
solution.head()

Unnamed: 0,author,gender
0,Asks_Politely,7.821589e-10
1,smartphone-redditor,0.01360886
2,Simcom,5.55629e-13
3,ZenDragon,3.169147e-11
4,imgurtranscriber,1.565051e-74


In [132]:
solution.to_csv("logistic_regression.csv", index=False)