In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

from sklearn.cross_validation import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
data_dir = os.path.join(os.path.expanduser('~'), 'workspace/talkinData/data')
sub_dir = os.path.join(os.path.expanduser('~'), 'workspace/talkinData/sub')

In [3]:
train = pd.read_csv(os.path.join(data_dir, 'gender_age_train.csv'))

pbdm = pd.read_csv(os.path.join(data_dir, 'phone_brand_device_model.csv'))
pbdm.drop_duplicates('device_id', keep='first', inplace=True)

test = pd.read_csv(os.path.join(data_dir, 'gender_age_test.csv'))

sample_sub = pd.read_csv(os.path.join(sub_dir, 'sample_submission.csv'))

In [4]:
train.head()

Unnamed: 0,device_id,gender,age,group
0,-8076087639492063270,M,35,M32-38
1,-2897161552818060146,M,35,M32-38
2,-8260683887967679142,M,35,M32-38
3,-4938849341048082022,M,30,M29-31
4,245133531816851882,M,30,M29-31


In [5]:
pbdm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186716 entries, 0 to 187243
Data columns (total 3 columns):
device_id       186716 non-null int64
phone_brand     186716 non-null object
device_model    186716 non-null object
dtypes: int64(1), object(2)
memory usage: 5.7+ MB


In [6]:
train_all = pd.merge(train, pbdm, on='device_id', how='left', left_index=True)

test_all = pd.merge(test, pbdm, on='device_id', how='left', left_index=True)

In [9]:
train_all.head()

Unnamed: 0,device_id,gender,age,group,phone_brand,device_model
56800,-8076087639492063270,M,35,M32-38,51,749
41294,-2897161552818060146,M,35,M32-38,51,749
9993,-8260683887967679142,M,35,M32-38,51,749
179893,-4938849341048082022,M,30,M29-31,51,1524
102570,245133531816851882,M,30,M29-31,51,753


In [8]:
cat_col = ['phone_brand', 'device_model']

lb = LabelEncoder()
for var in cat_col:
    full_data = pd.concat((train_all[var], test_all[var]), axis=0).astype('str')
    lb.fit(full_data)
    train_all[var] = lb.transform(train_all[var].astype('str'))
    test_all[var] = lb.transform(test_all[var].astype('str'))

In [24]:
# phone_count
def phone_putter(var):
    if phone_count.has_key(var):
        return phone_count[var]
    else:
        return 0
        
def device_putter(var):
    if dev_model_count.has_key(var):
        return dev_model_count[var]
    else:
        return 0

phone_count = train_all.groupby('phone_brand')['device_model'].count().to_dict()
train_all['phone_count'] = train_all.phone_brand.apply(phone_putter)
test_all['phone_count'] = test_all.phone_brand.apply(phone_putter)

# device_count
dev_model_count = train_all.groupby('device_model')['phone_brand'].count().to_dict()
train_all['device_count'] = train_all.device_model.apply(device_putter)
test_all['device_count'] = test_all.device_model.apply(device_putter)

In [25]:
train_all.head()

Unnamed: 0,device_id,gender,age,group,phone_brand,device_model,phone_count,device_count
56800,-8076087639492063270,M,35,M32-38,51,749,17299,511
41294,-2897161552818060146,M,35,M32-38,51,749,17299,511
9993,-8260683887967679142,M,35,M32-38,51,749,17299,511
179893,-4938849341048082022,M,30,M29-31,51,1524,17299,3013
102570,245133531816851882,M,30,M29-31,51,753,17299,2299


In [29]:
test_all.head()

Unnamed: 0,device_id,phone_brand,device_model,phone_count,device_count
78253,1002079943728939269,51,1482,17299,620
84501,-1547860181818787117,51,1519,17299,824
104765,7374582448058474277,31,1371,12960,0
145592,-6220210354783429585,31,1544,12960,1239
12144,-5893464122623104785,51,749,17299,511


In [31]:
train_cols = ['phone_brand', 'device_model', 'phone_count', 'device_count']

data_x = train_all.ix[:, train_cols]
data_y = train_all.group.values

data_x_test = test_all.ix[:, train_cols]

In [32]:
clf1 = RandomForestClassifier(n_estimators=250, n_jobs=3, max_depth=6)

In [77]:
clf2 = LogisticRegression(n_jobs=3)

In [99]:
clf3 = GradientBoostingClassifier(max_depth=6, n_estimators=250)

## Benchmark score

### Features : 

* phone_brand
* device_model

### Model

* RandomForestClassifier(n_estimators=250, n_jobs=3, max_depth=5)

### Score
* array([-2.40141466, -2.40489151, -2.40238346])

### LB Score
* 2.39

---------------

## Highest local score

### Features : 

* phone_brand
* device_model
* phone_count
* device_count


### Model

* RandomForestClassifier(n_estimators=250, n_jobs=3, max_depth=6)

### Score
* [-2.40089375 -2.40244605 -2.40174421] 
  * -2.40169467067

In [33]:
a = cross_val_score(clf1, data_x, data_y, cv = 3, scoring='log_loss')
print a, a.mean()

[-2.40080737 -2.4024285  -2.40175078] -2.40166221725


In [91]:
cross_val_score(clf2, data_x, data_y, cv = 3, scoring='log_loss')

array([-2.43617993, -2.45988826, -2.44778023])

In [127]:
a = cross_val_score(clf3, data_x, data_y, cv = 3, scoring='log_loss')
print a, a.mean()

[-2.49159475 -2.49242333 -2.49094507] -2.49165438419


In [34]:
clf1.fit(data_x, data_y);
zip(data_x.columns, clf1.feature_importances_)

[('phone_brand', 0.32060275593571735),
 ('device_model', 0.27714569337949552),
 ('phone_count', 0.23572972044937235),
 ('device_count', 0.16652183023541475)]

In [97]:
clf3.fit(data_x, data_y);
zip(data_x.columns, clf3.feature_importances_)

[('phone_brand', 0.13498266959399377),
 ('device_model', 0.7708095973313418),
 ('phone_count', 0.094207733074664152)]

In [35]:
clf1.fit(data_x, data_y);

In [36]:
pred = clf1.predict_proba(data_x_test)

In [37]:
sub = pd.DataFrame(pred, columns=clf1.classes_)
sub['device_id'] = test_all.device_id.values

In [39]:
sub.to_csv(os.path.join(sub_dir, 'sub05.csv'), index=False)

In [112]:
a = train_all.groupby('device_model')['phone_brand'].count().to_dict()

In [114]:
b = train_all.device_model.apply(lambda x : a[x])

In [116]:
b.shape

(74645,)