In [1]:
from zipfile import ZipFile
import os
import sys
import time
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, hstack
import numpy as np
import pandas as pd
from pandas.io import sql
import sqlite3
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn import cross_validation

datapath = "/Users/Renato Aranha/Documents/Python Scripts/bases_TD/"

In [2]:
app_event = "app_events.csv"
app_label = "app_labels.csv"
event = "events.csv"
gender_ag_test = "gender_age_test.csv"
gender_ag_train = "gender_age_train.csv"
label_category = "label_categories.csv"
phone_brand_device_mode = "phone_brand_device_model.csv"
sample_submissio = "sample_submission.csv"

In [3]:
app_events = pd.read_csv(os.path.join(datapath,app_event),dtype={'is_active':bool})
app_labels = pd.read_csv(os.path.join(datapath,app_label))
events = pd.read_csv(os.path.join(datapath,event))
gender_age_test = pd.read_csv(os.path.join(datapath,gender_ag_test),index_col='device_id')
gender_age_train = pd.read_csv(os.path.join(datapath,gender_ag_train),index_col='device_id')
label_categories = pd.read_csv(os.path.join(datapath,label_category))
phone = pd.read_csv(os.path.join(datapath,phone_brand_device_mode))
phone_dedup = phone.drop_duplicates('device_id',keep='first').set_index('device_id')
sample_submission = pd.read_csv(os.path.join(datapath,sample_submissio))

In [4]:
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [5]:
brandencoder = LabelEncoder().fit(phone_dedup.phone_brand)
phone_dedup['brand'] = brandencoder.transform(phone_dedup['phone_brand'])
gender_age_train['brand'] = phone_dedup['brand']
gender_age_test['brand'] = phone_dedup['brand']

In [6]:
phone_dedup.head()

Unnamed: 0_level_0,phone_brand,device_model,brand
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-8890648629457979026,小米,红米,51
1277779817574759137,小米,MI 2,51
5137427614288105724,三星,Galaxy S4,15
3669464369358936369,SUGAR,时尚手机,9
-5019277647504317457,三星,Galaxy Note 2,15


In [7]:
gender_age_train['trainrow'] = np.arange(gender_age_train.shape[0]) 
#acima: criando um vetor coluna cujas componentes são os números das linhas do dataframe gender_age_train
gender_age_test['testrow'] = np.arange(gender_age_test.shape[0])

In [8]:
gender_age_train.head()

Unnamed: 0_level_0,gender,age,group,brand,trainrow
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-8076087639492063270,M,35,M32-38,51,0
-2897161552818060146,M,35,M32-38,51,1
-8260683887967679142,M,35,M32-38,51,2
-4938849341048082022,M,30,M29-31,51,3
245133531816851882,M,30,M29-31,51,4


## gerando matrizes esparsas para relacionar device_id com brand:

In [9]:
Xtr_brand = csr_matrix((np.ones(gender_age_train.shape[0]), (gender_age_train.trainrow, gender_age_train.brand)))
#criando matriz com: 
#qtd linhas = qtd linhas do dataframe gender_age_train
#qtd colunas = qtd de marcas distintas (qtd de brands) no dataframe gender_age_train
#os elementos das células com as combinações acima sendo iguais a 1

#abaixo: fazendo a mesma operação com a base de teste
Xte_brand = csr_matrix((np.ones(gender_age_test.shape[0]), (gender_age_test.testrow, gender_age_test.brand)))

In [10]:
Xte_brand.shape

(112071, 131)

## gerando matrizes esparsas para relacionar device_id com concatenado de model e brand :

In [11]:
m = phone_dedup.phone_brand.str.cat(phone_dedup.device_model) #concatenando phone_brand e device_model (e fazendo o encode)
modelencoder = LabelEncoder().fit(m)
phone_dedup['model'] = modelencoder.transform(m)

gender_age_train['model'] = phone_dedup['model']
gender_age_test['model'] = phone_dedup['model']

Xtr_model = csr_matrix((np.ones(gender_age_train.shape[0]), (gender_age_train.trainrow, gender_age_train.model)))
Xte_model = csr_matrix((np.ones(gender_age_test.shape[0]), (gender_age_test.testrow, gender_age_test.model)))

In [12]:
m.head()

device_id
-8890648629457979026               小米红米
 1277779817574759137             小米MI 2
 5137427614288105724        三星Galaxy S4
 3669464369358936369          SUGAR时尚手机
-5019277647504317457    三星Galaxy Note 2
Name: phone_brand, dtype: object

In [13]:
phone_dedup.head()

Unnamed: 0_level_0,phone_brand,device_model,brand,model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-8890648629457979026,小米,红米,51,858
1277779817574759137,小米,MI 2,51,843
5137427614288105724,三星,Galaxy S4,15,371
3669464369358936369,SUGAR,时尚手机,9,166
-5019277647504317457,三星,Galaxy Note 2,15,347


In [14]:
Xtr_model.shape

(74645, 1667)

# gerando matrizes esparsas para saber quais aplicativos estão em cada device 

In [15]:
app_events.head()

Unnamed: 0,event_id,app_id,is_installed,is_active
0,2,5927333115845830913,1,True
1,2,-5720078949152207372,1,False
2,2,-1633887856876571208,1,False
3,2,-653184325010919369,1,True
4,2,8693964245073640147,1,True


In [16]:
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [17]:
appencoder = LabelEncoder().fit(app_events.app_id)
app_events['app'] = appencoder.transform(app_events.app_id)
napps = len(appencoder.classes_)

app_events.head()

Unnamed: 0,event_id,app_id,is_installed,is_active,app
0,2,5927333115845830913,1,True,15408
1,2,-5720078949152207372,1,False,3384
2,2,-1633887856876571208,1,False,7620
3,2,-653184325010919369,1,True,8902
4,2,8693964245073640147,1,True,18686


In [18]:
deviceapps = pd.merge(app_events,events,how='left',on='event_id')
deviceapps2 = pd.merge(deviceapps[['device_id','app']],gender_age_train.reset_index(),how='left',on='device_id')

In [24]:
deviceapps3 = pd.merge(deviceapps2[['device_id','app','trainrow']],gender_age_test.reset_index(),how='left',on='device_id')

In [26]:
deviceapps3.drop_duplicates('device_id',keep='first')

Unnamed: 0,device_id,app,trainrow,brand,testrow,model
0,-6401643145415154744,15408,,15.0,68691.0,329.0
1,-6401643145415154744,3384,,15.0,68691.0,329.0
2,-6401643145415154744,7620,,15.0,68691.0,329.0
3,-6401643145415154744,8902,,15.0,68691.0,329.0
4,-6401643145415154744,18686,,15.0,68691.0,329.0


In [35]:
deviceapps = deviceapps3.drop_duplicates('device_id',keep='first')

In [42]:
apoio = deviceapps.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((np.ones(apoio.shape[0]), (apoio.trainrow, apoio.app)), shape=(gender_age_train.shape[0],napps))

apoio = deviceapps.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(apoio.shape[0]), (apoio.testrow, apoio.app)),  shape=(gender_age_test.shape[0],napps))

print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


## concatenando as features

In [43]:
Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app), format='csr')
Xtest =  hstack((Xte_brand, Xte_model, Xte_app), format='csr')
print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

All features: train shape (74645, 21035), test shape (112071, 21035)


## cross validation

In [44]:
targetencoder = LabelEncoder().fit(gender_age_train.group)
y = targetencoder.transform(gender_age_train.group)
nclasses = len(targetencoder.classes_)

In [45]:
def score(clf, random_state = 0):
    kf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=random_state)
    pred = np.zeros((y.shape[0],nclasses))
    for itrain, itest in kf:
        Xtr, Xte = Xtrain[itrain, :], Xtrain[itest, :]
        ytr, yte = y[itrain], y[itest]
        clf.fit(Xtr, ytr)
        pred[itest,:] = clf.predict_proba(Xte)
        # Downsize to one fold only for kernels
        return log_loss(yte, pred[itest, :])
        print("{:.5f}".format(log_loss(yte, pred[itest,:])), end=' ')
    print('')
    return log_loss(y, pred)

In [89]:
from sklearn.ensemble import RandomForestClassifier
score(RandomForestClassifier(n_estimators=300, 
                              criterion='gini', 
                              max_depth=None, 
                              min_samples_split=2, 
                              min_samples_leaf=1, 
                              min_weight_fraction_leaf=0.0, 
                              max_features='auto', 
                              max_leaf_nodes=None, 
                              bootstrap=True, 
                              oob_score=False, 
                              n_jobs=-1, 
                              random_state=0, 
                              verbose=0, 
                              warm_start=False, 
                              class_weight=None))

3.2998230008324936

In [50]:
from sklearn.ensemble import RandomForestClassifier
score(RandomForestClassifier(n_estimators=300, 
                              criterion='entropy', 
                              max_depth=None, 
                              min_samples_split=2, 
                              min_samples_leaf=1, 
                              min_weight_fraction_leaf=0.0, 
                              max_features='auto', 
                              max_leaf_nodes=None, 
                              bootstrap=True, 
                              oob_score=False, 
                              n_jobs=-1, 
                              random_state=0, 
                              verbose=0, 
                              warm_start=False, 
                              class_weight=None))

KeyboardInterrupt: 

In [92]:
from sklearn.ensemble import AdaBoostClassifier
score(AdaBoostClassifier(base_estimator=None,
                          n_estimators=200,
                          learning_rate=0.1,
                          algorithm='SAMME.R',
                          random_state=0))

2.4768583305048693

In [46]:
from sklearn.linear_model import SGDClassifier
score(SGDClassifier(loss='log',
                     penalty='l2', 
                     alpha=0.0001,
                     l1_ratio=0.15, 
                     fit_intercept=True,
                     n_iter=200, 
                     shuffle=True,
                     verbose=0,
                     epsilon=0.1,
                     n_jobs=-1,
                     random_state=0,
                     learning_rate='optimal',
                     eta0=0.0, 
                     power_t=0.5,
                     class_weight=None,
                     warm_start=False, 
                     average=False))

2.3842726382274599

In [110]:
from sklearn.tree import DecisionTreeClassifier
score(DecisionTreeClassifier(criterion='gini', 
                              splitter='best', 
                              max_depth=None, 
                              min_samples_split=1, 
                              min_samples_leaf=1, 
                              min_weight_fraction_leaf=0.0, 
                              max_features=None, 
                              random_state=0, 
                              max_leaf_nodes=None, 
                              class_weight=None, 
                              presort=False))

3.8666639037237269

In [67]:
#testando Regressão Logística (vamos tentar calibrar a constante de regularização C, cujo valor padrão é 1)
#rodar várias vezes (tentativa e erro)

In [47]:
score(LogisticRegression(C=0.2))

2.3842483511861028

In [48]:
score(LogisticRegression(C=0.2, multi_class='multinomial',solver='lbfgs'))

2.3842655664407402

In [49]:
score(LogisticRegression(C=0.2, multi_class='multinomial',solver='newton-cg'))

2.3842271877651493

In [66]:
#mesmo padrão do exemplo do Titanic:
score(LogisticRegression(penalty='l2',
                           dual=False, 
                           tol=0.0001, 
                           C=1.0, 
                           fit_intercept=True, 
                           intercept_scaling=1, 
                           class_weight=None, 
                           random_state=0, 
                           solver='liblinear', 
                           max_iter=100, 
                           multi_class='ovr', 
                           verbose=0, 
                           warm_start=False, 
                           n_jobs=-1))

2.4045222947077129

In [108]:
clf = RandomForestClassifier(n_estimators=300, 
                              criterion='gini', 
                              max_depth=None, 
                              min_samples_split=2, 
                              min_samples_leaf=1, 
                              min_weight_fraction_leaf=0.0, 
                              max_features='auto', 
                              max_leaf_nodes=None, 
                              bootstrap=True, 
                              oob_score=False, 
                              n_jobs=-1, 
                              random_state=0, 
                              verbose=0, 
                              warm_start=False, 
                              class_weight=None)
clf.fit(Xtrain, y)
pred = pd.DataFrame(clf.predict_proba(Xtest), index = gender_age_test.index, columns=targetencoder.classes_)
pred.head()

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,0.040561,0.054695,0.053897,0.063217,0.057758,0.046373,0.111232,0.161476,0.091746,0.100351,0.126029,0.092665
-1547860181818787117,0.044528,0.04893,0.039932,0.037627,0.057528,0.059067,0.097567,0.155305,0.079579,0.105545,0.1351,0.139292
7374582448058474277,0.047236,0.035022,0.034532,0.04156,0.091961,0.077951,0.085574,0.114978,0.075253,0.103815,0.129973,0.162145
-6220210354783429585,0.040772,0.054878,0.025916,0.057818,0.067045,0.054203,0.078283,0.155741,0.082718,0.119748,0.151784,0.111095
-5893464122623104785,0.031834,0.0751,0.039745,0.063715,0.047289,0.035804,0.061484,0.169774,0.113966,0.108139,0.152462,0.10069


In [109]:
pred.to_csv('logreg_subm.csv',index=True)