In [1]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


# Load the train data

In [2]:
test_data = pd.read_csv("test_data.csv.gz", compression="gzip", encoding="utf8")

In [3]:
train_data = pd.read_csv("train_data.csv.gz", compression="gzip", encoding="utf8")
target = pd.read_csv("train_target.csv")

In [4]:
train_data.head()    
test_data.head()     
type(test_data)
print(train_data.shape , test_data.shape)


(296042, 4) (1107946, 4)


In [5]:
data=train_data.append(test_data)                              #merge train e test for feature extraction
data.shape
data.head()


Unnamed: 0,author,subreddit,created_utc,body
0,Shamus_Aran,mylittlepony,1388534000.0,I don't think we'd get nearly as much fanficti...
1,Riddance,sex,1388534000.0,"Thanks. I made it up, that's how I got over my..."
2,Secret_Wizard,DragonsDogma,1388534000.0,Are you sure you aren't confusing Cyclops (the...
3,Penultimatum,malefashionadvice,1388534000.0,dont do this to me bro
4,7-SE7EN-7,todayilearned,1388534000.0,That's what we do when we can't find a mate


In [6]:
data.author.unique().shape        # 20000 authors 

(20000,)

In [7]:
ordered_author=target.author.append(test_data.author)
ordered_author=ordered_author.unique()                  
ordered_author.shape

(20000,)

In [8]:
target.head()               #gender of the 5000 authors in the test

Unnamed: 0,author,gender
0,RedThunder90,0
1,Lirkmor,1
2,In0chi,0
3,ProjectGrudge,0
4,TehTurtleHermit,0


# Feature Extraction

In [9]:
subreddits = data.subreddit.unique()      #topics 
subreddits_map = pd.Series(index=subreddits, data=arange(subreddits.shape[0])) #list subreddits: index= name, data= number

subreddits_map.head()

mylittlepony         0
sex                  1
DragonsDogma         2
malefashionadvice    3
todayilearned        4
dtype: int32

In [10]:
from scipy import sparse

In [11]:
def extract_features(group):                       #build a function that links authors with subreddits (numbered)
    group_subreddits = group['subreddit'].values
    idxs = subreddits_map[group_subreddits].values
    v = sparse.dok_matrix((1, subreddits.shape[0]))   # matrix of zeroes
    for idx in idxs:
        if not np.isnan(idx):
            v[0, idx] = 1
    return v.tocsr()             #sparse matrix of 0 and 1

print(extract_features(data[data.author=='RedThunder90'])) #example: posts by author 'RedThunder90'

  (0, 103)	1.0


In [12]:
features_dict = {}         #key = author, value = subreddit index

for author, group in data.groupby('author'):
    features_dict[author] = extract_features(group)

In [13]:
data.groupby('author')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F2AF9769D0>

In [14]:
print(features_dict['RedThunder90'])

  (0, 103)	1.0


In [15]:
X = sparse.vstack([features_dict[author] for author in ordered_author])
X   # author x subreddit: indicator 1\0

<20000x4033 sparse matrix of type '<class 'numpy.float64'>'
	with 196281 stored elements in Compressed Sparse Row format>

In [16]:
y = target.gender   

In [17]:
def extract_text(group):     # extract text from author (merge all text from same author)
    group_text = group['body'].values
    return " ".join(group_text)

extract_text(data[data.author=='RedThunder90'])

'I still prefer to buy foods either grown locally or where animals are treated better, but this definitely has me looking at organic food differently.'

In [18]:
data=data.astype(dtype='str') 

text_dict = {}       #dictionary with key = author and value = text

for author, group in data.groupby('author'):
    text_dict[author] = extract_text(group)

In [19]:
author_text = [text_dict[author] for author in ordered_author]

# Model Selection

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

pattern='(?u)\\b[A-Za-z]{3,}'   

cv = CountVectorizer(stop_words=None, token_pattern=pattern) #consider monograms only.
                     
C = cv.fit_transform(author_text) 

tfidf = TfidfTransformer(sublinear_tf=True)             #use term-frequency/inverse document frequency
                                                        #TERM FREQUENCY : frequency of a word in an document
                                                        #INVERSE DOCUMENT FREQUENCY: number of documents with a word
                        
#calculate features using tf-idf and create a training set 
X_train = tfidf.fit_transform(C)
print ("X_train is a sparse matrix with shape: %s" % str(X_train.shape))

X_train is a sparse matrix with shape: (20000, 266828)


In [22]:
from sklearn.linear_model import LogisticRegression
logit =LogisticRegression(class_weight='balanced',C=1.5)  #balanced!

In [23]:
X_train1=X_train.tocsr()         #subreddits matrix
X_train1=X_train1[0:5000,:]

X1=X.tocsr()                      #words matrix
X1=X1[0:5000,:]


In [24]:
data_train = sparse.hstack([X_train1,X1]) #merge matrices
data_train

<5000x270861 sparse matrix of type '<class 'numpy.float64'>'
	with 2523930 stored elements in COOrdinate format>

In [35]:
%%time
logit.fit(data_train,y)

Wall time: 5.34 s


LogisticRegression(C=1.5, class_weight='balanced')

In [36]:
pred= logit.predict(data_train)

In [37]:
%%time
from sklearn.model_selection import cross_val_score
logit2=LogisticRegression(class_weight='balanced',C=1.5)
scores = cross_val_score(logit2, data_train, y, cv=10)
print(mean(scores))        

0.8576
Wall time: 50.4 s


In [38]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y, pred)
print(metrics.auc(fpr, tpr))
print(metrics.confusion_matrix(y,pred))
                        

0.983992017378384
[[3572   79]
 [  14 1335]]


# Prepare the solution

In [39]:
test_data.head()

Unnamed: 0,author,subreddit,created_utc,body
0,ejchristian86,TwoXChromosomes,1388534000.0,I hadn't ever heard of them before joining thi...
1,ZenDragon,gaming,1388534000.0,"At 7680 by 4320 with 64x AA, right?"
2,savoytruffle,AskReddit,1388534000.0,bite me
3,hentercenter,stlouisblues,1388534000.0,Damn that was a good penalty :(
4,rick-o-suave,army,1388534000.0,I swore into DEP on 6-OCT and I left 5-NOV und...


In [40]:
X_test1=X_train.tocsr()
X_test1=X_test1[5000:20001,:]

X2=X.tocsr()
X2=X2[5000:20001,:]

data_test1 = sparse.hstack([X_test1,X2])
print(data_test1.shape)

(15000, 270861)


In [41]:
y_pred = logit.predict_proba(data_test1)[:,1]
y_pred # probability of being in class 1

array([0.99999598, 0.00298943, 0.01807153, ..., 0.20146071, 0.35516218,
       0.23498549])

In [42]:
solution = pd.DataFrame({"author":test_data.author.unique(), "gender":y_pred})
solution.head()

Unnamed: 0,author,gender
0,ejchristian86,0.999996
1,ZenDragon,0.002989
2,savoytruffle,0.018072
3,hentercenter,0.066428
4,rick-o-suave,0.423016


In [None]:
solution.to_csv("solution.csv", index=False)