In [96]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

from sklearn.cross_validation import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
data_dir = os.path.join(os.path.expanduser('~'), 'workspace/talkinData/data')
sub_dir = os.path.join(os.path.expanduser('~'), 'workspace/talkinData/sub')

In [3]:
train = pd.read_csv(os.path.join(data_dir, 'gender_age_train.csv'))

pbdm = pd.read_csv(os.path.join(data_dir, 'phone_brand_device_model.csv'))
pbdm.drop_duplicates('device_id', keep='first', inplace=True)

test = pd.read_csv(os.path.join(data_dir, 'gender_age_test.csv'))

sample_sub = pd.read_csv(os.path.join(sub_dir, 'sample_submission.csv'))

In [40]:
train.head()

Unnamed: 0,device_id,gender,age,group
0,-8076087639492063270,M,35,M32-38
1,-2897161552818060146,M,35,M32-38
2,-8260683887967679142,M,35,M32-38
3,-4938849341048082022,M,30,M29-31
4,245133531816851882,M,30,M29-31


In [5]:
pbdm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186716 entries, 0 to 187243
Data columns (total 3 columns):
device_id       186716 non-null int64
phone_brand     186716 non-null object
device_model    186716 non-null object
dtypes: int64(1), object(2)
memory usage: 5.7+ MB


In [6]:
train_all = pd.merge(train, pbdm, on='device_id', how='left', left_index=True)

test_all = pd.merge(test, pbdm, on='device_id', how='left', left_index=True)

In [42]:
train_all.head()

Unnamed: 0,device_id,gender,age,group,phone_brand,device_model
56800,-8076087639492063270,M,35,M32-38,51,749
41294,-2897161552818060146,M,35,M32-38,51,749
9993,-8260683887967679142,M,35,M32-38,51,749
179893,-4938849341048082022,M,30,M29-31,51,1524
102570,245133531816851882,M,30,M29-31,51,753


In [8]:
cat_col = ['phone_brand', 'device_model']

lb = LabelEncoder()
for var in cat_col:
    full_data = pd.concat((train_all[var], test_all[var]), axis=0).astype('str')
    lb.fit(full_data)
    train_all[var] = lb.transform(train_all[var].astype('str'))
    test_all[var] = lb.transform(test_all[var].astype('str'))

In [76]:
data_x = train_all.ix[:, ['phone_brand', 'device_model']].values
data_y = train_all.group

data_x_test = test_all.ix[:, ['phone_brand', 'device_model']].values

In [77]:
data_x_test.shape

(112071, 2)

In [101]:
clf1 = RandomForestClassifier(n_estimators=250, n_jobs=3, max_depth=5)

In [97]:
clf2 = LogisticRegression(n_jobs=3)

In [102]:
cross_val_score(clf1, data_x, data_y, cv = 3, scoring='log_loss')

array([-2.40135642, -2.40474334, -2.40253446])

In [98]:
cross_val_score(clf2, data_x, data_y, cv = 3, scoring='log_loss')

array([-2.42194961, -2.42435122, -2.42665763])

In [103]:
clf1.fit(data_x, data_y);

In [104]:
pred = clf1.predict_proba(data_x_test)

In [105]:
clf1.classes_

array(['F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+', 'M22-',
       'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+'], dtype=object)

In [106]:
sub = pd.DataFrame(pred, columns=clf1.classes_)
sub['device_id'] = test_all.device_id.values

In [107]:
sub.to_csv(os.path.join(sub_dir, 'sub04.csv'), index=False)