In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn import cross_validation as cv
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score
from scipy.sparse import csr_matrix,hstack,vstack
import time
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.optimizers import SGD,RMSprop
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest
from sklearn.cross_validation import StratifiedKFold,KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
import xgboost as xgb
seed = 7
np.random.seed(seed)

Using Theano backend.


In [3]:
events = pd.read_csv('input/events.csv',index_col='event_id')
label = pd.read_csv('input/label_categories.csv')
app_event = pd.read_csv('input/app_events.csv')
app_label = pd.read_csv('input/app_labels.csv')
device = pd.read_csv('input/phone_brand_device_model.csv')
device = device.drop_duplicates('device_id').set_index('device_id')
gender_age_train = pd.read_csv('input/gender_age_train.csv',index_col = 'device_id')
gender_age_test = pd.read_csv('input/gender_age_test.csv',index_col = 'device_id')

For the convinience of creating sparse matrix, we create another integer index for the training and testing set.

In [4]:
print('gender_age_train\n',gender_age_train.head(1))
print('\ngender_age_test\n',gender_age_test.head(1))
print('\ndevice\n',device.head(1))
print('\nevents\n',events.head(1))
print('\nlabel\n',label.head(1))
print('\napp_event\n',app_event.head(1))
print('\napp_label\n',app_label.head(1))

gender_age_train
                      gender  age   group
device_id                               
-8076087639492063270      M   35  M32-38

gender_age_test
 Empty DataFrame
Columns: []
Index: [1002079943728939269]

device
                      phone_brand device_model
device_id                                    
-8890648629457979026          小米           红米

events
                   device_id            timestamp  longitude  latitude
event_id                                                             
1         29182687948017175  2016-05-01 00:55:25     121.38     31.24

label
    label_id category
0         1      NaN

app_event
    event_id               app_id  is_installed  is_active
0         2  5927333115845830913             1          1

app_label
                 app_id  label_id
0  7324884708820027918       251


# Separation of two set of devices: with events and without events

In [5]:
gender_age_train_with = gender_age_train.loc[gender_age_train.index.isin(events.device_id.unique())].copy()
gender_age_train_without = gender_age_train.loc[~gender_age_train.index.isin(events.device_id.unique())].copy()
gender_age_test_with = gender_age_test.loc[gender_age_test.index.isin(events.device_id.unique())].copy()
gender_age_test_without = gender_age_test.loc[~gender_age_test.index.isin(events.device_id.unique())].copy()
print('{0:<40.40}{1:5}'.format('Size of training set without events:',gender_age_train_without.shape[0]))
print('{0:<40.40}{1:5}'.format('Size of training set with events:',gender_age_train_with.shape[0]))
print('{0:<40.40}{1:5}'.format('Total size of training set:',gender_age_train.shape[0]))
print('{0:<40.40}{1:5}'.format('Size of testing set without events:',gender_age_test_without.shape[0]))
print('{0:<40.40}{1:5}'.format('Size of training set with events:',gender_age_test_with.shape[0]))
print('{0:<40.40}{1:5}'.format('Total size of training set:',gender_age_test.shape[0]))

Size of training set without events:    51336
Size of training set with events:       23309
Total size of training set:             74645
Size of testing set without events:     76877
Size of training set with events:       35194
Total size of training set:             112071


In [6]:
gender_age_train_with['int_index'] = np.arange(gender_age_train_with.shape[0])
gender_age_test_with['int_index'] = np.arange(gender_age_test_with.shape[0])
gender_age_train_without['int_index'] = np.arange(gender_age_train_without.shape[0])
gender_age_test_without['int_index'] = np.arange(gender_age_test_without.shape[0])

# Feature engineering I: phone brand

Encoding the phone brand, and feed this as a feature into the training and testing test.

In [7]:
encoder = LabelEncoder()
encoder.fit(device.phone_brand)
device['brand'] = encoder.transform(device.phone_brand)
gender_age_train_with['brand'] = device['brand']
gender_age_test_with['brand'] = device['brand']
gender_age_train_without['brand'] = device['brand']
gender_age_test_without['brand'] = device['brand']
brandnumber = len(encoder.classes_)

Create sparse matrix with each row representing one device and each column representing one brand

In [8]:
X_train_brand_with = csr_matrix((np.ones(gender_age_train_with.shape[0]),
                            (gender_age_train_with.int_index,gender_age_train_with.brand)),
                               shape = (gender_age_train_with.shape[0],brandnumber))
X_test_brand_with = csr_matrix((np.ones(gender_age_test_with.shape[0]),
                           (gender_age_test_with.int_index,gender_age_test_with.brand)),
                              shape = (gender_age_test_with.shape[0],brandnumber))
print('X_train_brand_with shape:',X_train_brand_with.shape)
print('X_test_brand_with shape:',X_test_brand_with.shape)
X_train_brand_without = csr_matrix((np.ones(gender_age_train_without.shape[0]),
                            (gender_age_train_without.int_index,gender_age_train_without.brand)),
                                  shape = (gender_age_train_without.shape[0],brandnumber))
X_test_brand_without = csr_matrix((np.ones(gender_age_test_without.shape[0]),
                           (gender_age_test_without.int_index,gender_age_test_without.brand)),
                                 shape = (gender_age_test_without.shape[0],brandnumber))
print('X_train_brand_without shape:',X_train_brand_without.shape)
print('X_test_brand_without shape:',X_test_brand_without.shape)

X_train_brand_with shape: (23309, 131)
X_test_brand_with shape: (35194, 131)
X_train_brand_without shape: (51336, 131)
X_test_brand_without shape: (76877, 131)


# Feature engineering II: phone device model

In [9]:
brand_model = device.phone_brand.str.cat(device.device_model)
encoder3 = LabelEncoder()
encoder3.fit(brand_model)
device['model'] = encoder3.transform(brand_model)
gender_age_train_with['model'] = device['model']
gender_age_test_with['model'] = device['model']
gender_age_train_without['model'] = device['model']
gender_age_test_without['model'] = device['model']
modelnumber = len(encoder3.classes_)

In [10]:
X_train_model_with = csr_matrix((np.ones(gender_age_train_with.shape[0]),
                                 (gender_age_train_with.int_index,gender_age_train_with.model)),
                               shape = (gender_age_train_with.shape[0],modelnumber))
X_test_model_with = csr_matrix((np.ones(gender_age_test_with.shape[0]),
                                (gender_age_test_with.int_index,gender_age_test_with.model)),
                              shape = (gender_age_test_with.shape[0],modelnumber))
X_train_model_without = csr_matrix((np.ones(gender_age_train_without.shape[0]),
                                    (gender_age_train_without.int_index,gender_age_train_without.model)),
                                  shape = (gender_age_train_without.shape[0],modelnumber))
X_test_model_without = csr_matrix((np.ones(gender_age_test_without.shape[0]),
                                   (gender_age_test_without.int_index,gender_age_test_without.model)),
                                 shape = (gender_age_test_without.shape[0],modelnumber))

print('X_train_brand_with shape:',X_train_model_with.shape)
print('X_test_brand_with shape:',X_test_model_with.shape)
print('X_train_brand_without shape:',X_train_model_without.shape)
print('X_test_brand_without shape:',X_test_model_without.shape)
del device,brand_model

X_train_brand_with shape: (23309, 1667)
X_test_brand_with shape: (35194, 1667)
X_train_brand_without shape: (51336, 1667)
X_test_brand_without shape: (76877, 1667)


# Feature engineering III: Installed app

Encoding the app_id and store it into app column, and feed this as a feature into the training and testing test.

In [11]:
encoder2 = LabelEncoder()
encoder2.fit(app_event.app_id)
app_event['app'] = encoder2.transform(app_event.app_id)

In [12]:
np.sort(app_event.app.unique())

array([    0,     1,     2, ..., 19234, 19235, 19236])

In [13]:
print(app_event.head(1))
print(events.head(1))

   event_id               app_id  is_installed  is_active    app
0         2  5927333115845830913             1          1  15408
                  device_id            timestamp  longitude  latitude
event_id                                                             
1         29182687948017175  2016-05-01 00:55:25     121.38     31.24


In [14]:
installed_app = pd.merge(events[['device_id']],app_event[['event_id','app','is_installed']],
                         how='right',right_on = 'event_id',left_index = True)
print(installed_app.head())

             device_id  event_id    app  is_installed
0 -6401643145415154744         2  15408             1
1 -6401643145415154744         2   3384             1
2 -6401643145415154744         2   7620             1
3 -6401643145415154744         2   8902             1
4 -6401643145415154744         2  18686             1


In [15]:
installed_app_grouped = installed_app.groupby(['device_id','app'])['app'].agg(['size'])
print('installed_app_grouped:')
print(installed_app_grouped.head())
installed_app_train_with = pd.merge(installed_app_grouped,gender_age_train_with[['int_index']],
                               how = 'right',right_index=True,left_index=True)
installed_app_test_with = pd.merge(installed_app_grouped,gender_age_test_with[['int_index']],
                              how = 'right',right_index=True,left_index=True)
print('installed_app_train:')
print(installed_app_train_with.head())

installed_app_grouped:
                           size
device_id            app       
-9222956879900151005 548     18
                     1096    18
                     1248    26
                     1545    12
                     1664    18
installed_app_train:
                           size  int_index
device_id            app                  
-9222956879900151005 548     18       5145
                     1096    18       5145
                     1248    26       5145
                     1545    12       5145
                     1664    18       5145


In [16]:
installed_app_train_with = installed_app_train_with.reset_index()
installed_app_test_with = installed_app_test_with.reset_index()
installed_app_train_with = installed_app_train_with.dropna(subset=['int_index'])
installed_app_test_with = installed_app_test_with.dropna(subset=['int_index'])
print(installed_app_train_with.head())
print(installed_app_test_with.head())

             device_id   app  size  int_index
0 -9222956879900151005   548    18       5145
1 -9222956879900151005  1096    18       5145
2 -9222956879900151005  1248    26       5145
3 -9222956879900151005  1545    12       5145
4 -9222956879900151005  1664    18       5145
             device_id    app  size  int_index
0 -9222661944218806987   1867     3       2851
1 -9222661944218806987   7519     8       2851
2 -9222661944218806987   7843     1       2851
3 -9222661944218806987   8704     4       2851
4 -9222661944218806987  10000     1       2851


Calculate the number of unique app:

In [17]:
print(gender_age_test_with.shape[0])
print(installed_app_train_with.shape[0])

35194
915632


In [18]:
print('The number of unique apps:')
print(np.size(installed_app.app.unique()))
appnumber = np.size(installed_app.app.unique())

The number of unique apps:
19237


In [19]:
print(np.sort(installed_app_train_with.app.unique()))
print(installed_app_test_with.shape[0])

[    0     1     2 ..., 19234 19235 19236]
1387337


In [20]:
X_train_installed_with = csr_matrix((np.ones(installed_app_train_with.shape[0]),
                                (installed_app_train_with.int_index,installed_app_train_with.app)), 
                               shape = (gender_age_train_with.shape[0],appnumber))
X_test_installed_with = csr_matrix((np.ones(installed_app_test_with.shape[0]),
                               (installed_app_test_with.int_index,installed_app_test_with.app)),
                               shape = (gender_age_test_with.shape[0],appnumber))
print('X_train_installed shape:',X_train_installed_with.shape)
print('X_test_installed shape:',X_test_installed_with.shape)
del installed_app_test_with,installed_app_train_with

X_train_installed shape: (23309, 19237)
X_test_installed shape: (35194, 19237)


# Feature engineering IV: app label

In [21]:
print(app_event[['app_id','event_id']].head(1))
print(app_label[['app_id','label_id']].head(1))

                app_id  event_id
0  5927333115845830913         2
                app_id  label_id
0  7324884708820027918       251


In [22]:
app_label_new = app_label.loc[app_label.app_id.isin(app_event.app_id.unique())].copy()
app_label_new['app'] = encoder2.transform(app_label_new.app_id)
encoder4 = LabelEncoder().fit(app_label_new.label_id)
app_label_new['label'] = encoder4.transform(app_label_new.label_id)
labelnumber = len(encoder4.classes_)
print('app_label_new:')
print(app_label_new.head())

app_label_new:
                app_id  label_id    app  label
0  7324884708820027918       251  17355    207
1 -4494216993218550286       251   4618    207
2  6058196446775239644       406  15548    247
3  6058196446775239644       407  15548    248
4  8694625920731541625       406  18689    247


In [23]:
print(app_label.size)
print(installed_app.size)
print('installed_app_grouped:')
print(installed_app_grouped.head())
installed_label_grouped = (installed_app_grouped.reset_index()[['device_id','app']]
                          .merge(app_label_new[['app','label']])
                          .groupby(['device_id','label']))['app'].agg(['size']).reset_index()
                          
print('installed_label_grouped:')
print(installed_label_grouped.head())

919886
129892268
installed_app_grouped:
                           size
device_id            app       
-9222956879900151005 548     18
                     1096    18
                     1248    26
                     1545    12
                     1664    18
installed_label_grouped:
             device_id  label  size
0 -9222956879900151005    117     1
1 -9222956879900151005    120     1
2 -9222956879900151005    126     1
3 -9222956879900151005    138     2
4 -9222956879900151005    147     2


In [24]:
label_app_train_with = pd.merge(installed_label_grouped,gender_age_train_with[['int_index']],
                               how = 'right',right_index=True,left_on='device_id')
label_app_test_with = pd.merge(installed_label_grouped,gender_age_test_with[['int_index']],
                              how = 'right',right_index=True,left_on ='device_id' )
label_app_train_with = label_app_train_with.dropna(subset= ['int_index','label'])
label_app_test_with = label_app_test_with.dropna(subset= ['int_index','label'])

In [25]:
#binary
X_train_label_with = csr_matrix((np.ones(label_app_train_with.shape[0]),
                                 (label_app_train_with.int_index,label_app_train_with.label)),
                              shape = (gender_age_train_with.shape[0],labelnumber))
X_test_label_with = csr_matrix((np.ones(label_app_test_with.shape[0]),(label_app_test_with.int_index,label_app_test_with.label)),
                              shape = (gender_age_test_with.shape[0],labelnumber))
#count
'''X_train_label_with = csr_matrix((label_app_train_with['size'],
                                 (label_app_train_with.int_index,label_app_train_with.label)),
                              shape = (gender_age_train_with.shape[0],labelnumber))
X_test_label_with = csr_matrix((label_app_test_with['size'],
                                (label_app_test_with.int_index,label_app_test_with.label)),
                              shape = (gender_age_test_with.shape[0],labelnumber))'''
print('X_train_installed_with shape:',X_train_label_with.shape)
print('X_test_installed_with shape:',X_test_label_with.shape)
del installed_app_grouped,label,app_label,app_label_new,label_app_test_with,label_app_train_with,encoder4,installed_app

X_train_installed_with shape: (23309, 492)
X_test_installed_with shape: (35194, 492)


# Feature engineering V: active app

In [None]:
active_app = pd.merge(events[['device_id']],app_event[['event_id','app','is_active']],
                         how='right',right_on = 'event_id',left_index = True)
active_app = active_app.loc[active_app.is_active==1]
print(active_app.head())

In [None]:
active_app_grouped = active_app.groupby(['device_id','app'])['app'].agg(['size'])
print('active_app_grouped:')
print(active_app_grouped.head())

In [None]:
active_app_train_with = pd.merge(active_app_grouped,gender_age_train_with[['int_index']],
                               how = 'left',right_index=True,left_index=True)
active_app_test_with = pd.merge(active_app_grouped,gender_age_test_with[['int_index']],
                              how = 'left',right_index=True,left_index=True)
print('active_app_train_with:')
print(active_app_train_with.head())

In [None]:
active_app_train_with = active_app_train_with.reset_index()
active_app_test_with = active_app_test_with.reset_index()
active_app_train_with = active_app_train_with.dropna(subset=['int_index'])
active_app_test_with = active_app_test_with.dropna(subset=['int_index'])
print(active_app_train_with.head())
print(active_app_test_with.head())

In [None]:
# binary active map
#X_train_active_with = csr_matrix((np.ones(active_app_train_with.shape[0]),
#                            (active_app_train_with.int_index,active_app_train_with.app)), 
#                            shape = (gender_age_train_with.shape[0],appnumber))
#X_test_active = csr_matrix((np.ones(active_app_test_with.shape[0]),
#                            (active_app_test.int_index,active_app_test_with.app)),
#                            shape = (gender_age_test_with.shape[0],appnumber))
# count the number of active app
X_train_active_with = csr_matrix((active_app_train_with['size'],
                            (active_app_train_with.int_index,active_app_train_with.app)), 
                            shape = (gender_age_train_with.shape[0],appnumber))
X_test_active_with = csr_matrix((active_app_test_with['size'],
                            (active_app_test_with.int_index,active_app_test_with.app)),
                            shape = (gender_age_test_with.shape[0],appnumber))
print('X_train_active shape:',X_train_active_with.shape)
print('X_test_active shape:',X_test_active_with.shape)

In [None]:
'''scaler = StandardScaler(with_mean=False)
X_train_active_with = scaler.fit_transform(X_train_active_with)
X_test_active_with = scaler.transform(X_test_active_with)'''

# Feature engineering VI: active time period

In [27]:
events_time = events[['device_id','timestamp']].copy()
events_time['time']  = events_time['timestamp'].str[11:13].astype(int)
events_time.drop(['timestamp'],axis=1,inplace=True)
events_time = events_time.groupby(['device_id','time'])['time'].agg({'times':'count'}).reset_index()
print(events_time.head())
timenumber= events_time.time.unique().shape[0]

             device_id  time  times
0 -9222956879900151005     7      2
1 -9222956879900151005    11      7
2 -9222956879900151005    12     13
3 -9222956879900151005    13      3
4 -9222956879900151005    14      5


In [28]:
time_train_with = pd.merge(events_time,gender_age_train_with[['int_index']],
                               right_index=True,left_on='device_id')
time_test_with = pd.merge(events_time,gender_age_test_with[['int_index']],
                               right_index=True,left_on ='device_id' )
#binary
X_train_time_with = csr_matrix((np.ones(time_train_with.shape[0]),
                            (time_train_with.int_index,time_train_with.time)), 
                            shape = (gender_age_train_with.shape[0],timenumber))
X_test_time_with = csr_matrix((np.ones(time_test_with.shape[0]),
                            (time_test_with.int_index,time_test_with.time)),
                            shape = (gender_age_test_with.shape[0],timenumber))

#number
#X_train_time_with = csr_matrix((time_train_with['times'],
#                            (time_train_with.int_index,time_train_with.time)), 
#                            shape = (gender_age_train_with.shape[0],timenumber))
#X_test_time_with = csr_matrix((time_test_with['times'],
#                            (time_test_with.int_index,time_test_with.time)),
#                            shape = (gender_age_test_with.shape[0],timenumber))
print('X_train_time_with shape:',X_train_time_with.shape)
print('X_test_time_with shape:',X_test_time_with.shape)

X_train_time_with shape: (23309, 24)
X_test_time_with shape: (35194, 24)


Normalization of the time period count.

In [None]:
'''scaler = StandardScaler(with_mean=False)
X_train_time_with = scaler.fit_transform(X_train_time_with)
X_test_time_with = scaler.transform(X_test_time_with)'''

# feature join and selection

## Device without events

In [26]:
X_train_total_without = hstack((X_train_brand_without,X_train_model_without),format='csr')
temp_train = hstack((X_train_brand_with,X_train_model_with),format='csr')
X_test_total_without = hstack((X_test_brand_without,X_test_model_without),format='csr')

X_train_total_without= vstack((X_train_total_without,temp_train),format = 'csr')
gender_age_train_without_temp = pd.concat((gender_age_train_without,gender_age_train_with))

print('Training shape:')
print(X_train_total_without.shape)
print('Testing shape:')
print(X_test_total_without.shape)
print('y shape:')
print(gender_age_train_without_temp.shape)

Training shape:
(74645, 1798)
Testing shape:
(76877, 1798)
y shape:
(74645, 6)


## Device with events

In [28]:
X_train_total_with = hstack((X_train_brand_with,X_train_model_with,
                             #X_train_active_with,
                             #X_train_time_with,
                             X_train_installed_with,X_train_label_with),format='csr')
X_test_total_with = hstack((X_test_brand_with,X_test_model_with,
                            #X_test_active_with,
                            #X_test_time_with,
                           X_test_installed_with,X_test_label_with),format='csr')
print('Training shape:')
print(X_train_total_with.shape)
print('Testing shape:')
print(X_test_total_with.shape)

Training shape:
(23309, 21527)
Testing shape:
(35194, 21527)


In [29]:
#percentile selction
#selector = SelectPercentile(f_classif, percentile=80)
#selector.fit(X_train_total, y_train_total)
#X_train_total = selector.transform(X_train_total)
#X_test_total = selector.transform(X_test_total)
#X_val.shape

# Selection using chi-square
#selector = SelectKBest(chi2, k=15155).fit(X_train_total, y_train_total)
#X_train_total = selector.transform(X_train_total)
#X_test_total = selector.transform(X_test_total)
#print('Training shape:')
#print(X_train_total.shape)
#print('Testing shape:')
#print(X_test_total.shape)

# Clear the memory before we do the learning

In [30]:
del app_event

# Start training model

encode the target

In [31]:
targetencoder = LabelEncoder().fit(gender_age_train_with.group)
nclasses = len(targetencoder.classes_)
y_train_total_with = targetencoder.transform(gender_age_train_with.group)

# Subset I: device with events

## Neural network

Defining functions:

In [39]:
def batch_generator(X, y, batch_size, shuffle):
    #chenglong code for fiting from generator 
    #(https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0
def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0



In [None]:
def with_model(X_dim_input):
    # create model
    model = Sequential()
    #model.add(Dense(10, input_dim=X_train_total.shape[1], init='normal', activation='relu'))
    #model.add(Dropout(0.2))
    model.add(Dense(50, input_dim=X_dim_input, init='normal', activation='tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(150, input_dim=X_dim_input, init='normal', activation='tanh'))
    model.add(Dropout(0.4))
    model.add(Dense(100, input_dim=X_dim_input, init='normal', activation='relu'))
    model.add(Dropout(0.1))
    #model.add(Dropout(0.1))
    model.add(Dense(12, init='normal', activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  #logloss
    return model


dummy_y_with = np_utils.to_categorical(y_train_total_with)
for i in range(8):
    kf = StratifiedKFold(y_train_total_with,n_folds = 10,random_state = i)
    score_list_with=[]
    val_loss_list_with = []
    for index,(train, test) in enumerate(kf):
        X_train_with = X_train_total_with[train]
        y_train_with = dummy_y_with[train]
        X_val_with = X_train_total_with[test]
        y_val_with = dummy_y_with[test]
        #print(X_val.shape)
        print('*****************************************************')
        print('{}_fold'.format(index))
        model=with_model(X_train_total_with.shape[1])
        fit= model.fit_generator(generator=batch_generator(X_train_with, y_train_with, 1024, True),
                             nb_epoch=20,
                             samples_per_epoch=40000,
                             validation_data=(X_val_with.todense(), y_val_with), verbose=2
                             )
        scores_val_with = model.predict_generator(generator=batch_generatorp(X_val_with, 512, False), 
                                             val_samples=X_val_with.shape[0])
        scores_with = model.predict_generator(generator=batch_generatorp(X_test_total_with, 512, False), 
                                         val_samples=X_test_total_with.shape[0])
        score_list_with.append(scores_with)
        val_loss = log_loss(y_val_with, scores_val_with)
        val_loss_list_with.append(val_loss)
        print('logloss val {}'.format(val_loss))

    for index,i in enumerate(val_loss_list_with):
        if(index==0):
            sumi = i
        else:
            sumi = i+sumi
    val_loss_ave_with = sumi/len(val_loss_list_with)
    print('average logloss val {}'.format(val_loss_ave_with))
    for index,i in enumerate(score_list_with):
        if(index==0):
            sumi = i
        else:
            sumi = i+sumi
    score_ave_with = sumi/len(score_list_with)
    pred_with = pd.DataFrame(score_ave_with, index = gender_age_test_with.index, columns=targetencoder.classes_)
    pred_with.to_csv('nnet_with_50tanh_150tanh_100relu_softmax{}.csv'.format(val_loss_ave_with))

*****************************************************
0_fold
Epoch 1/20




11s - loss: 2.4084 - acc: 0.1427 - val_loss: 2.3477 - val_acc: 0.1627
Epoch 2/20
11s - loss: 2.2397 - acc: 0.2160 - val_loss: 2.2432 - val_acc: 0.1973
Epoch 3/20
12s - loss: 2.1262 - acc: 0.2479 - val_loss: 2.2271 - val_acc: 0.2145
Epoch 4/20
11s - loss: 2.0607 - acc: 0.2724 - val_loss: 2.1842 - val_acc: 0.2295
Epoch 5/20
11s - loss: 2.0194 - acc: 0.2851 - val_loss: 2.1608 - val_acc: 0.2504
Epoch 6/20
11s - loss: 1.9814 - acc: 0.2982 - val_loss: 2.2211 - val_acc: 0.2230
Epoch 7/20
11s - loss: 1.9610 - acc: 0.3092 - val_loss: 2.1383 - val_acc: 0.2539
Epoch 8/20
11s - loss: 1.9338 - acc: 0.3167 - val_loss: 2.1167 - val_acc: 0.2551
Epoch 9/20
11s - loss: 1.9031 - acc: 0.3271 - val_loss: 2.1202 - val_acc: 0.2628
Epoch 10/20
11s - loss: 1.8904 - acc: 0.3308 - val_loss: 2.1307 - val_acc: 0.2586
Epoch 11/20
11s - loss: 1.8791 - acc: 0.3373 - val_loss: 2.1143 - val_acc: 0.2594
Epoch 12/20
11s - loss: 1.8607 - acc: 0.3396 - val_loss: 2.1657 - val_acc: 0.2513
Epoch 13/20
11s - loss: 1.8418 - acc

## XGboost

In [32]:
(X_train,X_val,y_train,y_val) = cv.train_test_split(X_train_total_with,y_train_total_with,test_size=0.2,
                                                    stratify=y_train_total_with)
xg_train = xgb.DMatrix(X_train,label = y_train)
xg_test = xgb.DMatrix(X_val,label = y_val)
param = {'max_depth':50,
         'eta':0.05,
         'silent':0,
         'objective':'multi:softprob',
         'nthread':2,
         'num_class':12,
         'eval_metric':'mlogloss',
        'lambda':5,
        'lambda_bias':0,
        'alpha':2}
num_round = 200
watchlist = [(xg_train,'train'),(xg_test,'test')]
bst = xgb.train(param,xg_train,num_round,watchlist)
yprob = bst.predict(xg_test).reshape(y_val.shape[0],12)
val_loss = log_loss(y_val, yprob)
print('logloss val {}'.format(val_loss))
del X_train,X_val,y_train,y_val

[0]	train-mlogloss:2.43511	test-mlogloss:2.46524
[1]	train-mlogloss:2.38744	test-mlogloss:2.4473
[2]	train-mlogloss:2.34223	test-mlogloss:2.43129
[3]	train-mlogloss:2.29861	test-mlogloss:2.41536
[4]	train-mlogloss:2.25617	test-mlogloss:2.40103
[5]	train-mlogloss:2.21544	test-mlogloss:2.3872
[6]	train-mlogloss:2.17581	test-mlogloss:2.37492
[7]	train-mlogloss:2.13781	test-mlogloss:2.36293
[8]	train-mlogloss:2.10104	test-mlogloss:2.3511
[9]	train-mlogloss:2.06506	test-mlogloss:2.34079
[10]	train-mlogloss:2.03024	test-mlogloss:2.33037
[11]	train-mlogloss:1.99595	test-mlogloss:2.32055
[12]	train-mlogloss:1.96324	test-mlogloss:2.31174
[13]	train-mlogloss:1.93135	test-mlogloss:2.30254
[14]	train-mlogloss:1.9004	test-mlogloss:2.29434
[15]	train-mlogloss:1.87004	test-mlogloss:2.28647
[16]	train-mlogloss:1.84064	test-mlogloss:2.27927
[17]	train-mlogloss:1.81241	test-mlogloss:2.27213
[18]	train-mlogloss:1.78478	test-mlogloss:2.26535
[19]	train-mlogloss:1.75778	test-mlogloss:2.25913
[20]	train-mlo

# training with all data set

In [33]:
xg_train = xgb.DMatrix(X_train_total_with,label = y_train_total_with)
xg_test = xgb.DMatrix(X_val)
bst = xgb.train(param,xg_train,num_round)
yprob = bst.predict(xg_test).reshape(X_test_total_with.shape[0],12)
pred_with_xgb = pd.DataFrame(yprob, 
                            index = gender_age_test_with.index, 
                            columns=targetencoder.classes_)
pred_with_xgb.to_csv('xgb_with_result{}.csv'.format(val_loss))

AttributeError: feature_names not found

## Logistic regression

In [67]:
(X_train,X_val,y_train,y_val) = cv.train_test_split(X_train_total_with,y_train_total_with,test_size=0.2,
                                                    stratify=y_train_total_with)
#lr grid search
lr = LogisticRegression(multi_class='multinomial')
solver_value = ['newton-cg']
C_value = np.logspace(-2,0,40)
clf = GridSearchCV(estimator=lr,param_grid = dict(C=C_value,solver=solver_value),
                   scoring='log_loss')
clf.fit(X_train,y_train)
print(clf.best_estimator_.C)
print(clf.best_estimator_.solver)
val_loss_ave_without= -clf.score(X_val,y_val)
print(val_loss_ave_without)

0.0180472176683
newton-cg
1.99455721395




In [68]:
lr = LogisticRegression(C=0.0180472176683,multi_class='multinomial',solver='newton-cg')
lr.fit(X_train_total_with,y_train_total_with)
score_ave_with_lr = lr.predict_proba(X_test_total_with)
pred_with_lr = pd.DataFrame(score_ave_with_lr, 
                            index = gender_age_test_with.index, 
                            columns=targetencoder.classes_)
pred_with_lr.to_csv('lr_with_result{}.csv'.format(val_loss_ave_with))

# Subset II: device without events

In [43]:
y_train_total_without = targetencoder.transform(gender_age_train_without_temp.group)

##  Device without eventsI: Naive Bayes, Logistic Regression

Grid search of parameters.

In [61]:
#NB grid search
nbc = MultinomialNB()
alpha_value = np.logspace(-3,4,100)
clf = GridSearchCV(estimator=nbc,param_grid = dict(alpha=alpha_value),scoring='log_loss')
clf.fit(X_train,y_train)
print(clf.best_estimator_.alpha)
print(-clf.score(X_val,y_val))

1204.50354026
2.42330723892




In [55]:
#lr grid search
(X_train,X_val,y_train,y_val) = cv.train_test_split(X_train_total_with,y_train_total_with,test_size=0.2,
                                                    stratify=y_train_total_without)
lr = LogisticRegression(multi_class='multinomial')
solver_value = ['lbfgs']
C_value = np.logspace(-2,0,40)
clf = GridSearchCV(estimator=lr,param_grid = dict(C=C_value,solver=solver_value),
                   scoring='log_loss')
clf.fit(X_train,y_train)
print(clf.best_estimator_.C)
print(clf.best_estimator_.solver)
val_loss_ave_without= -clf.score(X_val,y_val)
print(val_loss_ave_without)
del X_train,X_val,y_train,y_val

0.0744380301325
lbfgs
2.39019928895




Actual training

In [56]:
#lr = LogisticRegression(C=0.02,multi_class='multinomial',solver='newton-cg')
lr = LogisticRegression(C=0.0744380301325,multi_class='multinomial',solver='lbfgs')
lr.fit(X_train_total_without,y_train_total_without)
score_ave_without_lr = lr.predict_proba(X_test_total_without)
pred_without_lr = pd.DataFrame(score_ave_without_lr, 
                            index = gender_age_test_without.index, 
                            columns=targetencoder.classes_)
pred_without_lr.to_csv('lr_without_result{}.csv'.format(val_loss_ave_without))

In [40]:
'''kf = StratifiedKFold(y_train_total_without,n_folds = 10,random_state = 1)
score_list_without=[]
val_loss_list_without = []
for index,(train, test) in enumerate(kf):
    X_train_without = X_train_total_without[train]
    y_train_without = y_train_total_without[train]
    X_val_without = X_train_total_without[test]
    y_val_without = y_train_total_without[test]
    print('*****************************************************')
    print('{}_fold'.format(index))
    lr = LogisticRegression(C=0.0774263682681,multi_class='multinomial',solver='newton-cg')
    lr.fit(X_train_without,y_train_without)
    scores_val_without = lr.predict_proba(X_val_without)
    val_loss = log_loss(y_val_without, scores_val_without)
    val_loss_list_without.append(val_loss)
    print('logloss val {}'.format(val_loss))
    
    scores_without = lr.predict_proba(X_test_total_without)
    score_list_without.append(scores_without)
    
for index,i in enumerate(val_loss_list_without):
    if(index==0):
        sumi = i
    else:
        sumi = i+sumi
val_loss_ave_without = sumi/len(score_list_without)
print('average logloss val {}'.format(val_loss_ave_without))
for index,i in enumerate(score_list_without):
    if(index==0):
        sumi = i
    else:
        sumi = i+sumi
score_ave_without = sumi/len(score_list_without)
pred_without = pd.DataFrame(score_ave_without, index = gender_age_test_without.index, columns=targetencoder.classes_)'''

"kf = StratifiedKFold(y_train_total_without,n_folds = 10,random_state = 1)\nscore_list_without=[]\nval_loss_list_without = []\nfor index,(train, test) in enumerate(kf):\n    X_train_without = X_train_total_without[train]\n    y_train_without = y_train_total_without[train]\n    X_val_without = X_train_total_without[test]\n    y_val_without = y_train_total_without[test]\n    print('*****************************************************')\n    print('{}_fold'.format(index))\n    lr = LogisticRegression(C=0.0774263682681,multi_class='multinomial',solver='newton-cg')\n    lr.fit(X_train_without,y_train_without)\n    scores_val_without = lr.predict_proba(X_val_without)\n    val_loss = log_loss(y_val_without, scores_val_without)\n    val_loss_list_without.append(val_loss)\n    print('logloss val {}'.format(val_loss))\n    \n    scores_without = lr.predict_proba(X_test_total_without)\n    score_list_without.append(scores_without)\n    \nfor index,i in enumerate(val_loss_list_without):\n    if(i

## Device without eventsII: Deep learning

In [57]:
# define baseline model
def without_model(X_dim_input):
    # create model
    model = Sequential()
    #model.add(Dense(10, input_dim=X_train_total.shape[1], init='normal', activation='relu'))
    #model.add(Dropout(0.2))
    model.add(Dense(100, input_dim=X_dim_input, init='normal', activation='relu'))
    #model.add(Dropout(0.1))
    model.add(Dense(40, input_dim=X_dim_input, init='normal', activation='relu'))
    model.add(Dense(40, input_dim=X_dim_input, init='normal', activation='relu'))
    #model.add(Dropout(0.1))
    model.add(Dense(12, init='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  #logloss
    return model

dummy_y_without = np_utils.to_categorical(y_train_total_without)
kf = StratifiedKFold(y_train_total_without,n_folds = 10,random_state = 1)
score_list_without=[]
val_loss_list_without = []
for index,(train, test) in enumerate(kf):
    X_train_without = X_train_total_without[train]
    y_train_without = dummy_y_without[train]
    X_val_without = X_train_total_without[test]
    y_val_without = dummy_y_without[test]
    print('*****************************************************')
    print('{}_fold'.format(index))
    model=without_model(X_train_total_without.shape[1])
    fit= model.fit_generator(generator=batch_generator(X_train_without, y_train_without, 512, True),
                         nb_epoch=20,
                         samples_per_epoch=80000,
                         validation_data=(X_val_without.todense(), y_val_without), verbose=2
                         )
    scores_val_without = model.predict_generator(generator=batch_generatorp(X_val_without, 512, False), 
                                         val_samples=X_val_without.shape[0])
    scores_without = model.predict_generator(generator=batch_generatorp(X_test_total_without, 512, False), 
                                     val_samples=X_test_total_without.shape[0])
    score_list_without.append(scores_without)
    
    val_loss = log_loss(y_val_without, scores_val_without)
    val_loss_list_without.append(val_loss)
    print('logloss val {}'.format(val_loss))

for index,i in enumerate(val_loss_list_without):
    if(index==0):
        sumi = i
    else:
        sumi = i+sumi
val_loss_ave_without = sumi/len(score_list_without)
print('average logloss val {}'.format(val_loss_ave_without))
for index,i in enumerate(score_list_without):
    if(index==0):
        sumi = i
    else:
        sumi = i+sumi
score_ave_without = sumi/len(score_list_without)
pred_without = pd.DataFrame(score_ave_without, index = gender_age_test_without.index, columns=targetencoder.classes_)

*****************************************************
0_fold
Epoch 1/20




1s - loss: 2.4629 - acc: 0.1261 - val_loss: 2.4279 - val_acc: 0.1287
Epoch 2/20
1s - loss: 2.4256 - acc: 0.1298 - val_loss: 2.4264 - val_acc: 0.1287
Epoch 3/20
1s - loss: 2.4268 - acc: 0.1271 - val_loss: 2.4253 - val_acc: 0.1287
Epoch 4/20
1s - loss: 2.4219 - acc: 0.1285 - val_loss: 2.4250 - val_acc: 0.1287
Epoch 5/20
1s - loss: 2.4209 - acc: 0.1289 - val_loss: 2.4248 - val_acc: 0.1287
Epoch 6/20
1s - loss: 2.4194 - acc: 0.1308 - val_loss: 2.4247 - val_acc: 0.1293
Epoch 7/20
1s - loss: 2.4163 - acc: 0.1341 - val_loss: 2.4246 - val_acc: 0.1342
Epoch 8/20
1s - loss: 2.4120 - acc: 0.1401 - val_loss: 2.4236 - val_acc: 0.1407
Epoch 9/20
1s - loss: 2.4066 - acc: 0.1465 - val_loss: 2.4183 - val_acc: 0.1399
Epoch 10/20
1s - loss: 2.4017 - acc: 0.1440 - val_loss: 2.4161 - val_acc: 0.1390
Epoch 11/20
1s - loss: 2.3980 - acc: 0.1443 - val_loss: 2.4128 - val_acc: 0.1402
Epoch 12/20
1s - loss: 2.3968 - acc: 0.1431 - val_loss: 2.4130 - val_acc: 0.1431
Epoch 13/20
1s - loss: 2.3945 - acc: 0.1450 - va

In [58]:
pred_without.to_csv('nnt_without_result{}.csv'.format(val_loss_ave_without))

In [None]:
score_ensemble = (score_ave_without+score_ave_without_lr)/2

## Device without eventsIII: XGBoost

In [None]:
(X_train,X_val,y_train,y_val) = cv.train_test_split(X_train_total_without,y_train_total_without,test_size=0.02,
                                                    stratify=y_train_total_without)
xg_train = xgb.DMatrix(X_train,label = y_train)
xg_test = xgb.DMatrix(X_val,label = y_val)
param = {'max_depth':50,
         'eta':0.03,
         'silent':0,
         'objective':'multi:softprob',
         'nthread':2,
         'num_class':12,
         'eval_metric':'mlogloss',
        'lambda':4,
        'lambda_bias':0,
        'alpha':2}
num_round = 200
watchlist = [(xg_train,'train'),(xg_test,'test')]
bst = xgb.train(param,xg_train,num_round,watchlist)
yprob = bst.predict(xg_test).reshape(y_val.shape[0],12)
val_loss = log_loss(y_val, yprob)
print('logloss val {}'.format(val_loss))
del X_train,X_val,y_train,y_val

## training with all data

In [None]:
xg_train = xgb.DMatrix(X_train_total_without,label = y_train_total_without)
xg_test = xgb.DMatrix(X_val)
bst = xgb.train(param,xg_train,num_round)
yprob = bst.predict(xg_test).reshape(X_test_total_without.shape[0],12)
pred_without_xgb = pd.DataFrame(yprob, 
                            index = gender_age_test_without.index, 
                            columns=targetencoder.classes_)
pred_without_xgb.to_csv('xgb_without_result{}.csv'.format(val_loss))

# putting together and save into final file

Final score based on the percentage of testing set

In [87]:
#val_score_final = val_loss_ave_without*76877/112071+val_loss_ave_with*35194/112071
val_score_final = (val_loss_ave_without*X_test_total_without.shape[0]+
                   val_loss_ave_with*X_test_total_with.shape[0])/(X_test_total_without.shape[0]+X_test_total_with.shape[0])
print('with score:{}'.format(val_loss_ave_with))
print('without score:{}'.format(val_loss_ave_without))
print('final validation score:{}'.format(val_score_final))

with score:1.9392115926279074
without score:2.3931157227003554
final validation score:2.2505748160093315


In [88]:
pred = pd.concat((pred_with,pred_without))
pred.to_csv('doublemodel_v6.csv',index=True)