# MLND Capstone project

## Fire up GraphLab Create & import the datasets


In [1]:
import graphlab as gl
from datetime import datetime
# Load the raw CSV tables into SFrames. The app_id / device_id columns are pinned
# to str through column_type_hints; every other column type is left to read_csv's
# own inference (label_categories.csv gets no hints at all).
app_labels = gl.SFrame.read_csv(
    'source_data/app_labels.csv',
    column_type_hints={'app_id': str},
)
gender_age_train = gl.SFrame.read_csv(
    'source_data/gender_age_train.csv',
    column_type_hints={'device_id': str},
)
label_categories = gl.SFrame.read_csv('source_data/label_categories.csv')
phone_brand_device_model = gl.SFrame.read_csv(
    'source_data/phone_brand_device_model.csv',
    column_type_hints={'device_id': str},
)
sample_submission = gl.SFrame.read_csv(
    'source_data/sample_submission.csv',
    column_type_hints={'device_id': str},
)
app_events = gl.SFrame.read_csv(
    'source_data/app_events.csv',
    column_type_hints={'app_id': str},
)
events = gl.SFrame.read_csv(
    'source_data/events.csv',
    column_type_hints={'device_id': str},
)

A newer version of GraphLab Create (v2.1) is available! Your current version is v1.9.

You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.


This non-commercial license of GraphLab Create for academic use is assigned to rufang@ucsd.edu and will expire on May 08, 2017.


2016-07-31 20:21:43,786 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.9 started. Logging: /tmp/graphlab_server_1469996503.log


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [2]:
# Preview the first three rows of the gender/age training table.
gender_age_train.head(3)

device_id,gender,age,group
-8076087639492063270,M,35,M32-38
-2897161552818060146,M,35,M32-38
-8260683887967679142,M,35,M32-38


## Creating training and testing sets

In [3]:
# Random split of the labelled rows with fraction 0.8 into ga_train and the
# remainder into ga_test; the fixed seed keeps the split repeatable.
ga_train, ga_test = gender_age_train.random_split(0.8, seed=5)

## Benchmark score

In [4]:
# Count the training rows in each demographic group; these counts are the basis
# of the frequency-only benchmark.
benchmark = ga_train.groupby(
    'group',
    {'count': gl.aggregate.COUNT('device_id')},
)
benchmark

group,count
F33-42,4449
F23-,4083
F27-28,2515
M27-28,4424
M32-38,7575
F29-32,3711
F24-26,3367
M23-26,7692
M29-31,5820
M39+,6877


In [5]:
# Convert the per-group counts into relative frequencies.
total = benchmark['count'].sum()
benchmark['freq'] = benchmark['count'] / total

# The class labels are the sample_submission columns, sorted, minus device_id.
classes = list(sample_submission.column_names())
classes.sort()
classes.remove("device_id")

# Start the benchmark prediction table with one row per row of ga_test.
submission = gl.SFrame({'device_id': ga_test['device_id']})

In [6]:
# Give every test row the same prediction: for each class, the class's relative
# frequency in the training split (a constant column per class).
# The enumerate() index of the original loop was never used, so it is dropped.
for label in classes:
    label_rows = benchmark[benchmark['group'] == label]
    # [0] raises IndexError if a class never occurs in the training split.
    submission[label] = label_rows['freq'][0]
submission.head(1)

device_id,F23-,F24-26,F27-28,F29-32,F33-42
236877999787307864,0.0681954836986,0.0562366381614,0.0420062800641,0.0619822287547,0.074308524853

F43+,M22-,M23-26,M27-28,M29-31,M32-38
0.0566207910208,0.0996960181721,0.128474078033,0.0738909673971,0.0972073757349,0.126519909139

M39+
0.114861704971


In [7]:
# True labels for the held-out rows.
target = ga_test['group']

# remove_column's result is used as the class-probability table from here on.
predictions = submission.remove_column('device_id')

# Every row carries the same probabilities, so read them once from row 0 in
# `classes` order and repeat that single vector for each row of ga_test.
row = [predictions[0][label] for label in classes]
col = [row] * len(ga_test)

# From here on `submission` is an SArray of probability vectors, not an SFrame.
submission = gl.SArray(col)

In [8]:
# Multi-class log loss of the constant-frequency benchmark on the held-out rows.
log_loss = gl.evaluation.log_loss(target, submission)
print('the benchmark logloss is {}'.format(log_loss))

the benchmark logloss is 2.4234927185


## Merging data into one table

In [9]:
# Collapse app_labels to one row per app_id, collecting all of its label_id
# values into a single list column.
app_label_list = app_labels.groupby(
    'app_id',
    {'label_list': gl.aggregate.CONCAT('label_id')},
)
app_label_list.head(3)

app_id,label_list
-539730114641165062,"[548, 710, 704]"
-736899763926984371,"[405, 796, 794, 795]"
3258653066506014655,"[548, 549, 128]"


In [10]:
# Attach each app's label list to its app_events rows (the inner join drops rows
# whose app_id has no entry in app_label_list).
app_events = app_events.join(
    app_label_list,
    on='app_id',
    how='inner',
)
# Flatten the label list into one space-separated string per row.
app_events['label_list'] = app_events['label_list'].apply(
    lambda labels: ' '.join(map(str, labels))
)

In [11]:
# Partition app_events on the is_active flag:
# rows with is_active == 0 go to app_events_installed, rows with 1 to app_events_active.
app_events_installed = app_events[app_events['is_active'] == 0]
app_events_active = app_events[app_events['is_active'] == 1]

In [12]:
# One row per event_id: collect the label strings and the app ids of the rows
# flagged active for that event.
ga_events_active = app_events_active.groupby(
    "event_id",
    {
        "active_label": gl.aggregate.CONCAT('label_list'),
        'active_app_id': gl.aggregate.CONCAT('app_id'),
    },
)
# Merge the per-app label strings into a single space-separated string ...
ga_events_active['active_label'] = ga_events_active['active_label'].apply(
    lambda labels: ' '.join(map(str, labels))
)
# ... and derive a TF-IDF column from that string.
ga_events_active['active_tfidf'] = gl.text_analytics.tf_idf(
    ga_events_active['active_label']
)
ga_events_active.head(3)

event_id,active_label,active_app_id,active_tfidf
2658191,549 710 169 271 302 548 303 318 704 306 843 840 ...,"[5195654633279707117, -6793861127573349654] ...","{'318': 3.0342132763787455, ..."
846413,548 549 152 548 704 713 717 721 232 721 548 704 ...,"[543880124725657021, -755461362045697404, ...","{'152': 2.67227674683976, '704': 1.184211267210 ..."
2683136,548 549 549 710 548 704 172 548 549 714 548 704 ...,"[-7680145830980282919, 5927333115845830913, ...","{'704': 0.7401320420064275, ..."


In [13]:
# One row per event_id: collect the label strings and the app ids of the rows
# with is_active == 0 for that event.
ga_events_install = app_events_installed.groupby(
    "event_id",
    {
        "installed_label": gl.aggregate.CONCAT('label_list'),
        'installed_app_id': gl.aggregate.CONCAT('app_id'),
    },
)
# Merge the per-app label strings into a single space-separated string ...
ga_events_install['installed_label'] = ga_events_install['installed_label'].apply(
    lambda labels: ' '.join(map(str, labels))
)
# ... and derive a TF-IDF column from that string.
ga_events_install['install_tfidf'] = gl.text_analytics.tf_idf(
    ga_events_install['installed_label']
)
ga_events_install.head(3)

event_id,installed_label,installed_app_id,install_tfidf
561271,549 405 548 730 756 761 777 782 787 1012 713 704 ...,"[3722410295315095646, 133935734026685494, ...","{'704': 0.04857517511347787, ..."
1718105,549 710 548 704 172 548 549 168 549 721 724 186 ...,"[5927333115845830913, -1805603084550565936, ...","{'214': 1.6798201754695075, ..."
895926,549 713 237 704 548 549 721 302 303 704 548 183 ...,"[8483751493632839871, -7377004479023402858, ...","{'775': 0.9557122571870953, ..."


In [14]:
# events has more event ids than app_events, so both joins are left joins:
# events without app data keep their row and get missing app columns.
events = events.join(
    ga_events_active,
    on='event_id',
    how='left',
)
events = events.join(
    ga_events_install,
    on='event_id',
    how='left',
)
events.head(3)

event_id,device_id,timestamp,longitude,latitude,active_label
1,29182687948017175,2016-05-01 00:55:25,121.38,31.24,
2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,549 710 548 704 172 405 730 747 749 776 782 785 ...
3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7,

active_app_id,active_tfidf,installed_label,installed_app_id
,,,
"[5927333115845830913, -653184325010919369, ...","{'747': 4.9236469360864525, ...",721 548 302 303 704 251 263 405 730 756 757 775 ...,"[-5720078949152207372, -1633887856876571208, ..."
,,,

install_tfidf
""
"{'704': 0.08500655644858628, ..."
""


In [15]:
def period(x):
    """Map a timestamp to a coarse part-of-day label.

    x must expose an ``hour`` attribute (e.g. a datetime). Hours 0-5 give
    'midnight', 6-11 'morning', 12-17 'afternoon'; anything else gives 'evening'.
    """
    hour = x.hour
    # (first hour, last hour, label), both bounds inclusive.
    bands = (
        (0, 5, 'midnight'),
        (6, 11, 'morning'),
        (12, 17, 'afternoon'),
    )
    for first, last, label in bands:
        if first <= hour <= last:
            return label
    return 'evening'

In [16]:
# Day names indexed so that ISO weekday 1 (Monday) maps to position 0.
# Fixes the misspelled 'Saturaday', which would otherwise end up as a value of
# the categorical `weekday` feature.
name_of_weekday = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
def weekday (x):
    """Return the English day name for an ISO weekday number.

    x is expected to be in 1..7 (1 = Monday, 7 = Sunday), as produced by
    datetime.isoweekday(). Values above 7 raise IndexError; note that 0 would
    wrap around to 'Sunday' through negative indexing.
    """
    return name_of_weekday[x-1]

In [17]:
# Time features: parse the timestamp strings, then derive the day name and the
# part-of-day label from the parsed values.
events["timestamp"] = events["timestamp"].str_to_datetime(str_format="%Y-%m-%dT%H:%M:%S")

# ISO weekday number first, then its name via weekday().
events['weekday'] = events['timestamp'].apply(lambda ts: ts.isoweekday())
events['weekday'] = events['weekday'].apply(weekday)

events['period'] = events['timestamp'].apply(period)

In [18]:
# Add the phone brand / device model columns to every event; the inner join
# keeps only events whose device_id appears in phone_brand_device_model.
events = events.join(
    phone_brand_device_model,
    on='device_id',
    how='inner',
)

In [19]:
# Preview the enriched events table (app, time and phone columns now attached).
events.head(3)

event_id,device_id,timestamp,longitude,latitude,active_label
1,29182687948017175,2016-05-01 00:55:25,121.38,31.24,
2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,549 710 548 704 172 405 730 747 749 776 782 785 ...
3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7,

active_app_id,active_tfidf,installed_label,installed_app_id
,,,
"[5927333115845830913, -653184325010919369, ...","{'747': 4.9236469360864525, ...",721 548 302 303 704 251 263 405 730 756 757 775 ...,"[-5720078949152207372, -1633887856876571208, ..."
,,,

install_tfidf,weekday,period,phone_brand,device_model
,Sunday,midnight,小米,红米note
"{'704': 0.08500655644858628, ...",Sunday,midnight,三星,Galaxy Grand Prime
,Sunday,midnight,魅族,MX4 Pro


In [20]:
# Merge the event-level features into the training split. After this join
# ga_train has one row per (device, event); devices without events are dropped
# by the inner join.
ga_train = ga_train.join(
    events,
    on='device_id',
    how='inner',
)
ga_train.head(3)

device_id,gender,age,group,event_id,timestamp,longitude,latitude,active_label
-8195816569128397698,M,39,M39+,10,2016-05-01 00:41:31,119.34,26.04,
6130108008013735751,F,45,F43+,20,2016-05-01 00:17:33,129.51,42.91,
-1663840927569383079,F,19,F23-,22,2016-05-01 00:43:15,116.45,33.93,

active_app_id,active_tfidf,installed_label,installed_app_id,install_tfidf,weekday,period,phone_brand
,,,,,Sunday,midnight,锤子
,,,,,Sunday,midnight,酷派
,,,,,Sunday,midnight,vivo

device_model
坚果手机
大神F1
X5Max+


In [21]:
# Merge the event-level features into the held-out split, exactly as for ga_train.
ga_test = ga_test.join(
    events,
    on='device_id',
    how='inner',
)

## Applying Gradient boosting decision trees

In [22]:
# Creating validation set
import time

# Folds are built over distinct device ids, not over event rows, so that all
# rows of one device land in the same fold.
unique_devices = ga_train['device_id'].unique()
ga_train_id = gl.SFrame({'device_id': unique_devices})
ga_train_id = gl.cross_validation.shuffle(ga_train_id, random_seed=5)
folds = gl.cross_validation.KFold(ga_train_id, num_folds=5)

In [23]:
# Use the first fold for the feature trials: take its train/validation device
# ids and pull in the full feature rows for those devices from ga_train.
first_fold = folds[0]
fold_train = first_fold[0].join(ga_train, on='device_id')
fold_val = first_fold[1].join(ga_train, on='device_id')

In [24]:
# Sanity check: no device_id may appear in both the train and validation folds.
val_devices = set(fold_val['device_id'].unique())
train_devices = set(fold_train['device_id'].unique())
assert val_devices.isdisjoint(train_devices)

### 1) app features

In [25]:
# Feature selection trials: each cell below fits a boosted-trees classifier on a
# single feature (or a small group) and reports validation metrics on fold_val.
# Trial: active_tfidf only.
target = "group"
features = ['active_tfidf']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

In [66]:
# Trial: install_tfidf only, validated on fold_val, capped at 50 iterations.
target = "group"
features = ['install_tfidf']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

In [67]:
# Trial: active_label (the raw space-separated label string) only.
target = "group"
features = ['active_label']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

In [89]:
# Trial: installed_label (the raw space-separated label string) only.
target = "group"
features = ['installed_label']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

In [69]:
# Trial: active_app_id (list of app ids active in the event) only.
target = "group"
features = ['active_app_id']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

In [70]:
# Trial: installed_app_id (list of app ids with is_active == 0 in the event) only.
target = "group"
features = ['installed_app_id']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

### 2) time features

In [71]:
# Trial: period (part-of-day label) only.
target = "group"
features = ['period']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

In [72]:
# Trial: weekday (day name) only.
target = "group"
features = ['weekday']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

In [73]:
# Trial: both time features together (period + weekday).
target = "group"
features = ['period', 'weekday']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

### 3) location features

In [75]:
# Trial: longitude only.
target = "group"
features = ['longitude']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

In [76]:
# Trial: latitude only.
target = "group"
features = ['latitude']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

In [77]:
# Trial: both location features together (longitude + latitude).
target = "group"
features = ['longitude', 'latitude']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

### 4) phone brand device model features

In [79]:
# Trial: phone_brand only.
target = "group"
features = ['phone_brand']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

In [80]:
# Trial: device_model only.
target = "group"
features = ['device_model']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

In [81]:
# Trial: both phone features together (phone_brand + device_model).
target = "group"
features = ['phone_brand', 'device_model']
model = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=50,
)

## final model

In [41]:
# Final model: location, app TF-IDF, phone and time features combined, trained
# on fold_train and validated on fold_val. Compared with the single-feature
# trials it runs 100 iterations and sets max_depth, step_size and
# column_subsample explicitly.
target = "group"
features = [
    'longitude', 'latitude',
    'active_tfidf', 'install_tfidf',
    'phone_brand', 'device_model',
    'weekday', 'period',
]
model2 = gl.classifier.boosted_trees_classifier.create(
    fold_train,
    target=target,
    features=features,
    validation_set=fold_val,
    max_iterations=100,
    max_depth=6,
    step_size=0.1,
    column_subsample=0.3,
)

## cross validation

In [47]:
# 5-fold cross-validation of the final model configuration.
#
# The loop uses its own names (cv_train, cv_val, cv_model) instead of rebinding
# fold_train, fold_val and model2. Previously this cell overwrote model2 with
# the model of the last fold, so on a top-to-bottom run the test-set cell that
# calls model2.evaluate(ga_test, ...) scored a CV fold model, not the final model.
target = "group"
features = ['longitude', 'latitude', 'active_tfidf', 'install_tfidf','phone_brand','device_model','weekday','period']

log_loss_values = []
for i in range(5):  # must match num_folds used when `folds` was built
    cv_train_ids, cv_val_ids = folds[i]
    # Expand the per-fold device ids back into full feature rows.
    cv_train = cv_train_ids.join(ga_train, on='device_id')
    cv_val = cv_val_ids.join(ga_train, on='device_id')
    cv_model = gl.classifier.boosted_trees_classifier.create(cv_train,
                                                      target=target, features=features,
                                                      validation_set=cv_val, max_iterations = 100, max_depth=6,
                                                      step_size=0.1, column_subsample = 0.3,verbose=False)

    log_loss_values.append(cv_model.evaluate(cv_val, metric="log_loss")["log_loss"])

for i, ll in enumerate(log_loss_values):
    print("log loss for fold %d: %f" % (i+1, ll))

log loss for fold 1: 2.275107
log loss for fold 2: 2.255033
log loss for fold 3: 2.271240
log loss for fold 4: 2.266012
log loss for fold 5: 2.250840


## score on test set

In [42]:
# Log loss of model2 on the held-out ga_test rows.
# NOTE(review): model2 is whatever object was bound to that name last; make sure
# it is the final model and not a model left over from a cross-validation fold.
model2.evaluate(ga_test, metric="log_loss")

{'log_loss': 2.254168160891516}