# Talking data demographic

## Import necessary packages

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import time
import os
seed = 7
np.random.seed(seed)

In [2]:
import xgboost as xgb



In [3]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.optimizers import SGD,RMSprop

Using TensorFlow backend.


In [4]:
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score
from scipy.sparse import csr_matrix,hstack,vstack
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB

## Read data from input folder and print the format of the data.

In [5]:
# When a user uses TalkingData SDK, the event gets logged in this data. 
# Each event has an event id, location (lat/long), and the event 
# corresponds to a list of apps in app_events.
events = pd.read_csv('input/events.csv',index_col='event_id')
app_event = pd.read_csv('input/app_events.csv')
print('\nevents\n',events.head(1), '\n')
print('\napp_event\n',app_event.head(1), '\n')

# Apps' labels and their categories in text
label = pd.read_csv('input/label_categories.csv')
print('\nlabel\n',label.head(1), '\n')

# Apps and their labels, the label_id's can be used to join with 
# label_categories
app_label = pd.read_csv('input/app_labels.csv')
print('\napp_label\n',app_label.head(1), '\n')

# Device ids, brand, and models
device = pd.read_csv('input/phone_brand_device_model.csv')
device = device.drop_duplicates('device_id').set_index('device_id')
print('\ndevice\n',device.head(1), '\n')

# Training set
gender_age_train = pd.read_csv('input/gender_age_train.csv',index_col = 'device_id')
print('gender_age_train\n',gender_age_train.head(1),'\n')

# Test set. Group: this is the target variable you are going to predict.
gender_age_test = pd.read_csv('input/gender_age_test.csv',index_col = 'device_id')
print('\ngender_age_test\n',gender_age_test.head(1))

  mask |= (ar1 == a)



events
                   device_id            timestamp  longitude  latitude
event_id                                                             
1         29182687948017175  2016-05-01 00:55:25     121.38     31.24 


app_event
    event_id               app_id  is_installed  is_active
0         2  5927333115845830913             1          1 


label
    label_id category
0         1      NaN 


app_label
                 app_id  label_id
0  7324884708820027918       251 


device
                      phone_brand device_model
device_id                                    
-8890648629457979026          小米           红米 

gender_age_train
                      gender  age   group
device_id                               
-8076087639492063270      M   35  M32-38 


gender_age_test
 Empty DataFrame
Columns: []
Index: [1002079943728939269]


# Separation of two set of devices: with events and without events

We noticed that about 70% of the data has no event information. This could come from the fact that the users disable the data sharing function or they are inactive users. If it is because of the first reason, treating the features as no active usage will clearly mislead the model. So we separated the data into two groups and built models for both sets of data. Test data were also separated into two groups and fed into the corresponding models, after which the output was combined into the final output.

In [6]:
gender_age_train_with = gender_age_train.loc[gender_age_train.index.isin(events.device_id.unique())].copy()

gender_age_train_without = gender_age_train.loc[~gender_age_train.index.isin(events.device_id.unique())].copy()

gender_age_test_with = gender_age_test.loc[gender_age_test.index.isin(events.device_id.unique())].copy()

gender_age_test_without = gender_age_test.loc[~gender_age_test.index.isin(events.device_id.unique())].copy()

print('{0:<40.40}{1:5}'.format('Size of training set without events:',gender_age_train_without.shape[0]))
print('{0:<40.40}{1:5}'.format('Size of training set with events:',gender_age_train_with.shape[0]))
print('{0:<40.40}{1:5}'.format('Total size of training set:',gender_age_train.shape[0]))
print('{0:<40.40}{1:5}'.format('Size of testing set without events:',gender_age_test_without.shape[0]))
print('{0:<40.40}{1:5}'.format('Size of training set with events:',gender_age_test_with.shape[0]))
print('{0:<40.40}{1:5}'.format('Total size of training set:',gender_age_test.shape[0]))

Size of training set without events:    51336
Size of training set with events:       23309
Total size of training set:             74645
Size of testing set without events:     76877
Size of training set with events:       35194
Total size of training set:             112071


For the convinience of creating sparse matrix, we create another integer index for the training and testing set.

In [7]:
gender_age_train_with['int_index'] = np.arange(gender_age_train_with.shape[0])
gender_age_test_with['int_index'] = np.arange(gender_age_test_with.shape[0])
gender_age_train_without['int_index'] = np.arange(gender_age_train_without.shape[0])
gender_age_test_without['int_index'] = np.arange(gender_age_test_without.shape[0])

# Feature engineering I: phone brand

Encoding the phone brand, and feed this as a feature into the training and testing test.

In [8]:
# Encode labels 'brand' with value between 0 and n_classes-1
encoder = LabelEncoder()
encoder.fit(device.phone_brand)
device['brand'] = encoder.transform(device.phone_brand)

# The training and test sets share the same index col 
# with device data frame, so simply insert the encoded
# brand number. 
gender_age_train_with['brand'] = device['brand']
gender_age_test_with['brand'] = device['brand']
gender_age_train_without['brand'] = device['brand']
gender_age_test_without['brand'] = device['brand']

brandnumber = len(encoder.classes_)
print('There are {0} kinds of phone brands.'.format(brandnumber))

There are 131 kinds of phone brands.


There are a lot of zeros in the matrix. In order to improve efficiency, we created sparse matrix with each row representing one device and each column representing one brand

In [9]:
# The sparse matrix satisfies the following condition:
#     X_train_brand_with(gender_age_train_with.int_index[k], gender_age_train_with.brand) = 1
X_train_brand_with = csr_matrix(arg1 = (np.ones(gender_age_train_with.shape[0]),
                                        (gender_age_train_with.int_index,
                                         gender_age_train_with.brand)),
                                shape = (gender_age_train_with.shape[0],
                                         brandnumber)
                               )

X_test_brand_with = csr_matrix(arg1 = (np.ones(gender_age_test_with.shape[0]),
                                       (gender_age_test_with.int_index,
                                        gender_age_test_with.brand)),
                              shape = (gender_age_test_with.shape[0],
                                       brandnumber))


X_train_brand_without = csr_matrix(arg1 = (np.ones(gender_age_train_without.shape[0]),
                                           (gender_age_train_without.int_index,
                                            gender_age_train_without.brand)),
                                  shape = (gender_age_train_without.shape[0],
                                           brandnumber))

X_test_brand_without = csr_matrix(arg1 = (np.ones(gender_age_test_without.shape[0]),
                                          (gender_age_test_without.int_index,
                                           gender_age_test_without.brand)),
                                 shape = (gender_age_test_without.shape[0],
                                          brandnumber))

print('X_train_brand_with shape:',X_train_brand_with.shape)
print('X_test_brand_with shape:',X_test_brand_with.shape)
print('X_train_brand_without shape:',X_train_brand_without.shape)
print('X_test_brand_without shape:',X_test_brand_without.shape)

X_train_brand_with shape: (23309, 131)
X_test_brand_with shape: (35194, 131)
X_train_brand_without shape: (51336, 131)
X_test_brand_without shape: (76877, 131)


# Feature engineering II: phone device model

This feature not only contains the phone brand but also the model information.

In [10]:
# Concatenate the phone brand and model strings
brand_model = device.phone_brand.str.cat(device.device_model)

# Encode the feature
encoder3 = LabelEncoder()
encoder3.fit(brand_model)
device['model'] = encoder3.transform(brand_model)

gender_age_train_with['model'] = device['model']
gender_age_test_with['model'] = device['model']
gender_age_train_without['model'] = device['model']
gender_age_test_without['model'] = device['model']

modelnumber = len(encoder3.classes_)
print('There are {0} kinds of phone brand and model combination.'.format(modelnumber))

There are 1667 kinds of phone brand and model combination.


In [11]:
# Create the sparse matrix as above.
X_train_model_with = csr_matrix(arg1 = (np.ones(gender_age_train_with.shape[0]),
                                        (gender_age_train_with.int_index,
                                         gender_age_train_with.model)),
                               shape = (gender_age_train_with.shape[0],
                                        modelnumber))

X_test_model_with = csr_matrix(arg1 = (np.ones(gender_age_test_with.shape[0]),
                                       (gender_age_test_with.int_index,
                                        gender_age_test_with.model)),
                              shape = (gender_age_test_with.shape[0],
                                       modelnumber))

X_train_model_without = csr_matrix(arg1 = (np.ones(gender_age_train_without.shape[0]),
                                           (gender_age_train_without.int_index,
                                            gender_age_train_without.model)),
                                  shape = (gender_age_train_without.shape[0],
                                           modelnumber))

X_test_model_without = csr_matrix(arg1 = (np.ones(gender_age_test_without.shape[0]),
                                          (gender_age_test_without.int_index,
                                           gender_age_test_without.model)),
                                 shape = (gender_age_test_without.shape[0],
                                          modelnumber))

print('X_train_brand_with shape:',X_train_model_with.shape)
print('X_test_brand_with shape:',X_test_model_with.shape)
print('X_train_brand_without shape:',X_train_model_without.shape)
print('X_test_brand_without shape:',X_test_model_without.shape)

X_train_brand_with shape: (23309, 1667)
X_test_brand_with shape: (35194, 1667)
X_train_brand_without shape: (51336, 1667)
X_test_brand_without shape: (76877, 1667)


In [12]:
# Clean the memory related to devices and brand model.
del device,brand_model

# Feature engineering III: Installed app
From the mobile events, we extracted the apps installed and used that as a feature. Only the data set with events has such feature. 

In [13]:
#Encoding the app_id and store it into app column, and feed this 
#as a feature into the training and testing test.
encoder2 = LabelEncoder()
encoder2.fit(app_event.app_id)
app_event['app'] = encoder2.transform(app_event.app_id)

In [14]:
print('The events dataframe:')
print(events.head(2),'\n')
print('The app_event dataframe:')
print(app_event.head(2), '\n')

# Merge the events dataframe with the app_events data frame
installed_app = pd.merge(events[['device_id']],app_event[['event_id','app','is_installed']],
                         how='right',right_on = 'event_id',left_index = True)

print(installed_app.head(), '\n')

appnumber = np.size(installed_app.app.unique())
print('Consider the installation of {0} apps. This will be\
    the number of features after encoding.'.format(appnumber))

The events dataframe:
                    device_id            timestamp  longitude  latitude
event_id                                                               
1           29182687948017175  2016-05-01 00:55:25     121.38     31.24
2        -6401643145415154744  2016-05-01 00:54:12     103.65     30.97 

The app_event dataframe:
   event_id               app_id  is_installed  is_active    app
0         2  5927333115845830913             1          1  15408
1         2 -5720078949152207372             1          0   3384 

             device_id  event_id    app  is_installed
0 -6401643145415154744         2  15408             1
1 -6401643145415154744         2   3384             1
2 -6401643145415154744         2   7620             1
3 -6401643145415154744         2   8902             1
4 -6401643145415154744         2  18686             1 

Consider the installation of 19237 apps. This will be    the number of features after encoding.


In [15]:
# Use a new dataFrame installed_app_grouped to store the 
# installed column for each device,app index
installed_app_grouped = pd.DataFrame()
installed_app_grouped['installed'] = installed_app\
                                        .groupby(['device_id','app'])['app'].size()>0

print('installed_app_grouped:')
print(installed_app_grouped.head(), '\n')
installed_app_train_with = pd.merge(installed_app_grouped,
                                    gender_age_train_with[['int_index']],
                                    how = 'right',
                                    right_index=True,
                                    left_index=True)
installed_app_test_with = pd.merge(installed_app_grouped,
                                   gender_age_test_with[['int_index']],
                                   how = 'right',
                                   right_index=True,
                                   left_index=True)
print('installed_app_train:')
print(installed_app_train_with.head())

installed_app_grouped:
                           installed
device_id            app            
-9222956879900151005 548        True
                     1096       True
                     1248       True
                     1545       True
                     1664       True 

installed_app_train:
                           installed  int_index
device_id            app                       
-9222956879900151005 548        True       5145
                     1096       True       5145
                     1248       True       5145
                     1545       True       5145
                     1664       True       5145


In [16]:
installed_app_train_with = installed_app_train_with.reset_index()
installed_app_test_with = installed_app_test_with.reset_index()

# Make sure there is no NA in int_index column
installed_app_train_with = installed_app_train_with.dropna(subset=['int_index'])
installed_app_test_with = installed_app_test_with.dropna(subset=['int_index'])
print('The training set:')
print(installed_app_train_with.head())
print('The test set:')
print(installed_app_test_with.head())

The training set:
             device_id   app  installed  int_index
0 -9222956879900151005   548       True       5145
1 -9222956879900151005  1096       True       5145
2 -9222956879900151005  1248       True       5145
3 -9222956879900151005  1545       True       5145
4 -9222956879900151005  1664       True       5145
The test set:
             device_id    app  installed  int_index
0 -9222661944218806987   1867       True       2851
1 -9222661944218806987   7519       True       2851
2 -9222661944218806987   7843       True       2851
3 -9222661944218806987   8704       True       2851
4 -9222661944218806987  10000       True       2851


In [17]:
# Used the table above, we can create the sparse matrix that corresponds to the 
# app installation feature.
X_train_installed_with = csr_matrix(arg1 = (np.ones(installed_app_train_with.shape[0]),
                                            (installed_app_train_with.int_index,
                                             installed_app_train_with.app)), 
                                   shape = (gender_age_train_with.shape[0],
                                            appnumber))

X_test_installed_with = csr_matrix(arg1 = (np.ones(installed_app_test_with.shape[0]),
                                           (installed_app_test_with.int_index,
                                            installed_app_test_with.app)),
                                   shape = (gender_age_test_with.shape[0],
                                            appnumber))

print('X_train_installed shape:',X_train_installed_with.shape)
print('X_test_installed shape:',X_test_installed_with.shape)

del installed_app_test_with,installed_app_train_with

X_train_installed shape: (23309, 19237)
X_test_installed shape: (35194, 19237)


# Feature engineering IV: app label

While knowing the installation of different individual apps might be useful, we also grouped the apps based on their types (label_id in the data). After this step, instead of the dichotomous features we had above, we will have numerical features. This set of feature essentially represent the total number usage of different types of apps.

In [18]:
print('The two dataframe used:')
print('app_events:\n',app_event[['app_id','event_id']].head(2),'\n')
print('app_label:\n',app_label[['app_id','label_id']].head(2),'\n')

The two dataframe used:
app_events:
                 app_id  event_id
0  5927333115845830913         2
1 -5720078949152207372         2 

app_label:
                 app_id  label_id
0  7324884708820027918       251
1 -4494216993218550286       251 



In [19]:
# Not every app in the app_label data frame shows up in the app_event.
# So we use only the ones appear in the app_event to save memory.
# This part need to be fixed for streaming data set. 
app_label_new = app_label.loc[app_label.app_id.isin(app_event.app_id.unique())].copy()

# Make use of the encoder in Feature engineering II to get label of app.
app_label_new['app'] = encoder2.transform(app_label_new.app_id)

# Encode the labels into smaller range to save memory.
encoder4 = LabelEncoder().fit(app_label_new.label_id)
app_label_new['label'] = encoder4.transform(app_label_new.label_id)

labelnumber = len(encoder4.classes_)

print('Total number of labels:',labelnumber)
print('app_label_new:')
print(app_label_new.sort_values(by='label').head(5))

Total number of labels: 492
app_label_new:
                     app_id  label_id    app  label
434376 -2600987541603275322         2   6493      0
434377 -2600987541603275322         4   6493      1
435078  3489720147367481003         5  13156      2
435221 -3009285907035616624         5   6118      2
418666 -1968479138889230354         6   7100      3


In [20]:
print('Before merging: installed_app_grouped:')
print(installed_app_grouped.head(),'\n')

installed_label_grouped = (installed_app_grouped.reset_index()[['device_id','app']]
                                                # merge with the app_label_new data frame
                                                .merge(app_label_new[['app','label']])
                                                .groupby(['device_id','label'])
                           # calculate the size of different device_id, label pairs
                          )['app'].agg(['size'])\
                          .reset_index()
                          
print('After merging: installed_label_grouped:')
print(installed_label_grouped.head())

Before merging: installed_app_grouped:
                           installed
device_id            app            
-9222956879900151005 548        True
                     1096       True
                     1248       True
                     1545       True
                     1664       True 

After merging: installed_label_grouped:
             device_id  label  size
0 -9222956879900151005    117     1
1 -9222956879900151005    120     1
2 -9222956879900151005    126     1
3 -9222956879900151005    138     2
4 -9222956879900151005    147     2


In [21]:
# Merge the feature into the training and testing sets.
label_app_train_with = pd.merge(installed_label_grouped,
                                gender_age_train_with[['int_index']],
                                how = 'right',
                                right_index=True,
                                left_on='device_id')

label_app_test_with = pd.merge(installed_label_grouped,
                               gender_age_test_with[['int_index']],
                               how = 'right',
                               right_index=True,
                               left_on ='device_id' )

# Make sure there is no NA value in the int_index and label since they will be used 
# as the indexes in the sparse matrix.
label_app_train_with = label_app_train_with.dropna(subset= ['int_index','label'])

label_app_test_with = label_app_test_with.dropna(subset= ['int_index','label'])

In [22]:
# Dichotomous feature as to whether apps with such label was installed or not.
'''X_train_label_with = csr_matrix(arg1 = (np.ones(label_app_train_with.shape[0]),
                                            (label_app_train_with.int_index,
                                             label_app_train_with.label)),
                                   shape = (gender_age_train_with.shape[0],
                                            labelnumber))
                                            
X_test_label_with = csr_matrix(arg1 = (np.ones(label_app_test_with.shape[0]),
                                        (label_app_test_with.int_index,
                                         label_app_test_with.label)),
                              shape = (gender_age_test_with.shape[0],
                                       labelnumber))'''

# Numerical feature as to how many apps with such label was installed.
X_train_label_with = csr_matrix(arg1 = (label_app_train_with['size'],
                                         (label_app_train_with.int_index,
                                          label_app_train_with.label)),
                                shape = (gender_age_train_with.shape[0],
                                         labelnumber))

X_test_label_with = csr_matrix(arg1 = (label_app_test_with['size'],
                                        (label_app_test_with.int_index,
                                         label_app_test_with.label)),
                              shape = (gender_age_test_with.shape[0],
                                       labelnumber))

print('X_train_installed_with shape:',X_train_label_with.shape)

print('X_test_installed_with shape:',X_test_label_with.shape)

X_train_installed_with shape: (23309, 492)
X_test_installed_with shape: (35194, 492)


In [23]:
del installed_app_grouped,label,app_label,app_label_new,\
    label_app_test_with,label_app_train_with,encoder4,installed_app

# Feature engineering V: active app

In [24]:
# Merge the events dataframe with the app_events data frame
active_app = pd.merge(events[['device_id']],app_event[['event_id','app','is_active']],
                         how='right',right_on = 'event_id',left_index = True)

# Select the active ones
active_app = active_app.loc[active_app.is_active==1]

print(active_app.head())

             device_id  event_id    app  is_active
0 -6401643145415154744         2  15408          1
3 -6401643145415154744         2   8902          1
4 -6401643145415154744         2  18686          1
5 -6401643145415154744         2  14346          1
9 -6401643145415154744         2  16908          1


In [25]:
# Group by (device_id, app) index and calculat the number of active usage
active_app_grouped = active_app.groupby(['device_id','app'])['app'].agg(['size'])

print('active_app_grouped:')
print(active_app_grouped.head())

active_app_grouped:
                           size
device_id            app       
-9222956879900151005 548      4
                     1248    15
                     1545     2
                     1848    31
                     2236    17


In [26]:
active_app_train_with = pd.merge(active_app_grouped,gender_age_train_with[['int_index']],
                               how = 'left',right_index=True,left_index=True)

active_app_test_with = pd.merge(active_app_grouped,gender_age_test_with[['int_index']],
                              how = 'left',right_index=True,left_index=True)

print('active_app_train_with:')
print(active_app_train_with.head())

active_app_train_with:
                           size  int_index
device_id            app                  
-9222956879900151005 548      4     5145.0
                     1248    15     5145.0
                     1545     2     5145.0
                     1848    31     5145.0
                     2236    17     5145.0


In [27]:
# reset the indexes and make sure no NA in the int_index columns
active_app_train_with = active_app_train_with.reset_index()
active_app_test_with = active_app_test_with.reset_index()
active_app_train_with = active_app_train_with.dropna(subset=['int_index'])
active_app_test_with = active_app_test_with.dropna(subset=['int_index'])
print(active_app_train_with.head())
print(active_app_test_with.head())

             device_id   app  size  int_index
0 -9222956879900151005   548     4     5145.0
1 -9222956879900151005  1248    15     5145.0
2 -9222956879900151005  1545     2     5145.0
3 -9222956879900151005  1848    31     5145.0
4 -9222956879900151005  2236    17     5145.0
              device_id    app  size  int_index
55 -9222661944218806987   1867     3     2851.0
56 -9222661944218806987   7519     7     2851.0
57 -9222661944218806987   7843     1     2851.0
58 -9222661944218806987   8704     3     2851.0
59 -9222661944218806987  10000     1     2851.0


In [28]:
# binary active map
#X_train_active_with = csr_matrix((np.ones(active_app_train_with.shape[0]),
#                            (active_app_train_with.int_index,active_app_train_with.app)), 
#                            shape = (gender_age_train_with.shape[0],appnumber))
#X_test_active = csr_matrix((np.ones(active_app_test_with.shape[0]),
#                            (active_app_test.int_index,active_app_test_with.app)),
#                            shape = (gender_age_test_with.shape[0],appnumber))
# count the number of active app
X_train_active_with = csr_matrix(arg1 = (active_app_train_with['size'],
                                         (active_app_train_with.int_index,
                                          active_app_train_with.app)), 
                                 shape = (gender_age_train_with.shape[0],
                                          appnumber))

X_test_active_with = csr_matrix(arg1 = (active_app_test_with['size'],
                                        (active_app_test_with.int_index,
                                         active_app_test_with.app)),
                                shape = (gender_age_test_with.shape[0],
                                         appnumber))

print('X_train_active shape:',X_train_active_with.shape)
print('X_test_active shape:',X_test_active_with.shape)

X_train_active shape: (23309, 19237)
X_test_active shape: (35194, 19237)


In [29]:
# Standardlizing the data
'''scaler = StandardScaler(with_mean=False)
X_train_active_with = scaler.fit_transform(X_train_active_with)
X_test_active_with = scaler.transform(X_test_active_with)'''

'scaler = StandardScaler(with_mean=False)\nX_train_active_with = scaler.fit_transform(X_train_active_with)\nX_test_active_with = scaler.transform(X_test_active_with)'

# Feature engineering VI: active time period

In [39]:
# Use the hour number to represent the time, so there are 24 different values
events_time = events[['device_id','timestamp']].copy()

events_time['time']  = events_time['timestamp'].str[11:13].astype(int)

events_time.drop(['timestamp'],axis=1,inplace=True)

events_time = events_time.groupby(['device_id','time'])['time']\
            .agg(['count']).reset_index().rename(columns = {'count':'times'})


timenumber= events_time.time.unique().shape[0]

print(events_time.head())

             device_id  time  times
0 -9222956879900151005     7      2
1 -9222956879900151005    11      7
2 -9222956879900151005    12     13
3 -9222956879900151005    13      3
4 -9222956879900151005    14      5


In [33]:
time_train_with = pd.merge(events_time,gender_age_train_with[['int_index']],
                               right_index=True,left_on='device_id')
time_test_with = pd.merge(events_time,gender_age_test_with[['int_index']],
                               right_index=True,left_on ='device_id' )
#binary
X_train_time_with = csr_matrix(arg1 = (np.ones(time_train_with.shape[0]),
                                       (time_train_with.int_index,
                                        time_train_with.time)), 
                               shape = (gender_age_train_with.shape[0],timenumber))
X_test_time_with = csr_matrix((np.ones(time_test_with.shape[0]),
                            (time_test_with.int_index,time_test_with.time)),
                            shape = (gender_age_test_with.shape[0],timenumber))

#number
#X_train_time_with = csr_matrix((time_train_with['times'],
#                            (time_train_with.int_index,time_train_with.time)), 
#                            shape = (gender_age_train_with.shape[0],timenumber))
#X_test_time_with = csr_matrix((time_test_with['times'],
#                            (time_test_with.int_index,time_test_with.time)),
#                            shape = (gender_age_test_with.shape[0],timenumber))
print('X_train_time_with shape:',X_train_time_with.shape)
print('X_test_time_with shape:',X_test_time_with.shape)

X_train_time_with shape: (23309, 24)
X_test_time_with shape: (35194, 24)


Normalization of the time period count.

In [34]:
'''scaler = StandardScaler(with_mean=False)
X_train_time_with = scaler.fit_transform(X_train_time_with)
X_test_time_with = scaler.transform(X_test_time_with)'''

'scaler = StandardScaler(with_mean=False)\nX_train_time_with = scaler.fit_transform(X_train_time_with)\nX_test_time_with = scaler.transform(X_test_time_with)'

# feature join and selection

## Device without events

In [35]:
X_train_total_without = hstack((X_train_brand_without,X_train_model_without),format='csr')
temp_train = hstack((X_train_brand_with,X_train_model_with),format='csr')
X_test_total_without = hstack((X_test_brand_without,X_test_model_without),format='csr')

X_train_total_without= vstack((X_train_total_without,temp_train),format = 'csr')
gender_age_train_without_temp = pd.concat((gender_age_train_without,gender_age_train_with))

print('Training shape:')
print(X_train_total_without.shape)
print('Testing shape:')
print(X_test_total_without.shape)
print('y shape:')
print(gender_age_train_without_temp.shape)

Training shape:
(74645, 1798)
Testing shape:
(76877, 1798)
y shape:
(74645, 6)


## Device with events

In [36]:
X_train_total_with = hstack((X_train_brand_with,X_train_model_with,
                             X_train_active_with,
                             X_train_time_with,
                             X_train_installed_with,X_train_label_with),format='csr')
X_test_total_with = hstack((X_test_brand_with,X_test_model_with,
                            X_test_active_with,
                            X_test_time_with,
                           X_test_installed_with,X_test_label_with),format='csr')
print('Training shape:')
print(X_train_total_with.shape)
print('Testing shape:')
print(X_test_total_with.shape)

Training shape:
(23309, 40788)
Testing shape:
(35194, 40788)


In [37]:
#percentile selction
#selector = SelectPercentile(f_classif, percentile=80)
#selector.fit(X_train_total, y_train_total)
#X_train_total = selector.transform(X_train_total)
#X_test_total = selector.transform(X_test_total)
#X_val.shape

# Selection using chi-square
#selector = SelectKBest(chi2, k=15155).fit(X_train_total, y_train_total)
#X_train_total = selector.transform(X_train_total)
#X_test_total = selector.transform(X_test_total)
#print('Training shape:')
#print(X_train_total.shape)
#print('Testing shape:')
#print(X_test_total.shape)

# Clear the memory before we do the learning

In [38]:
del app_event

# Start training model

encode the target

In [39]:
targetencoder = LabelEncoder().fit(gender_age_train_with.group)
nclasses = len(targetencoder.classes_)
y_train_total_with = targetencoder.transform(gender_age_train_with.group)

# Subset I: device with events

##  Device with events I: Neural network

Defining functions:

In [None]:
def batch_generator(X, y, batch_size, shuffle):
    #chenglong code for fiting from generator 
    #(https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0
def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0



In [None]:
def with_model(X_dim_input):
    # create model
    model = Sequential()
    #model.add(Dense(10, input_dim=X_train_total.shape[1], init='normal', activation='relu'))
    #model.add(Dropout(0.2))
    #model.add(Dense(50, input_dim=X_dim_input, init='normal', activation='tanh'))
    #model.add(Dropout(0.4))
    model.add(Dense(50, input_dim=X_dim_input, init='normal', activation='relu'))
    model.add(Dropout(0.7))
    #model.add(Dense(100, input_dim=X_dim_input, init='normal', activation='relu'))
    #model.add(Dropout(0.4))
    #model.add(Dropout(0.1))
    model.add(Dense(12, init='normal', activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  #logloss
    return model


dummy_y_with = np_utils.to_categorical(y_train_total_with)
for i in range(8):
    kf = StratifiedKFold(y_train_total_with,n_folds = 10,random_state = i)
    score_list_with=[]
    val_loss_list_with = []
    for index,(train, test) in enumerate(kf):
        X_train_with = X_train_total_with[train]
        y_train_with = dummy_y_with[train]
        X_val_with = X_train_total_with[test]
        y_val_with = dummy_y_with[test]
        #print(X_val.shape)
        print('*****************************************************')
        print('{}_fold'.format(index))
        model=with_model(X_train_total_with.shape[1])
        fit= model.fit_generator(generator=batch_generator(X_train_with, y_train_with, 32, True),
                             nb_epoch=15,
                             samples_per_epoch=30000,
                             validation_data=(X_val_with.todense(), y_val_with), verbose=2
                             )
        scores_val_with = model.predict_generator(generator=batch_generatorp(X_val_with, 32, False), 
                                             val_samples=X_val_with.shape[0])
        scores_with = model.predict_generator(generator=batch_generatorp(X_test_total_with, 32, False), 
                                         val_samples=X_test_total_with.shape[0])
        score_list_with.append(scores_with)
        val_loss = log_loss(y_val_with, scores_val_with)
        val_loss_list_with.append(val_loss)
        print('logloss val {}'.format(val_loss))

    for index,i in enumerate(val_loss_list_with):
        if(index==0):
            sumi = i
        else:
            sumi = i+sumi
    val_loss_ave_with = sumi/len(val_loss_list_with)
    print('average logloss val {}'.format(val_loss_ave_with))
    for index,i in enumerate(score_list_with):
        if(index==0):
            sumi = i
        else:
            sumi = i+sumi
    score_ave_with = sumi/len(score_list_with)
    pred_with = pd.DataFrame(score_ave_with, index = gender_age_test_with.index, columns=targetencoder.classes_)
    pred_with.to_csv('nnet_with_all_feature_100relu_softmax{}.csv'.format(val_loss_ave_with))

##  Device with events II: XGboost

In [None]:
y_train_total_with

In [None]:
(X_train,X_val,y_train,y_val) = cv.train_test_split(X_train_total_with,y_train_total_with,test_size=0.02,
                                                    stratify=y_train_total_with)
#grid search cv
xgbclassifier = xgb.XGBClassifier(objective="multi:softprob", nthread=1)

clf = GridSearchCV(estimator=xgbclassifier,param_grid={
        'max_depth': [ 6,7,8],
        'learning_rate': [0.15,0.2,0.25],
        'reg_lambda':[4,5],
        'reg_alpha':[2,3,4]
    },verbose=10,scoring='log_loss')
clf.fit(X_train,y_train)
clf.best_estimator_
val_loss_ave_with= -clf.score(X_val,y_val)
print(val_loss_ave_with)
del X_train,X_val,y_train,y_val
os.system("printf '\a'")

In [None]:
clf.best_estimator_

In [None]:
(X_train,X_val,y_train,y_val) = cv.train_test_split(X_train_total_with,y_train_total_with,test_size=0.02,
                                                    stratify=y_train_total_with,random_state=1)
xg_train = xgb.DMatrix(X_train,label = y_train)
xg_test = xgb.DMatrix(X_val,label = y_val)
param = {'max_depth':6,
         'eta':0.1,
         'silent':0,
         'objective':'multi:softprob',
         'nthread':2,
         'num_class':12,
         'eval_metric':'mlogloss',
        'lambda':5,
        'lambda_bias':0,
        'alpha':1}
num_round = 300
watchlist = [(xg_train,'train'),(xg_test,'test')]
bst = xgb.train(param,xg_train,num_round,watchlist)
yprob = bst.predict(xg_test).reshape(y_val.shape[0],12)
val_loss = log_loss(y_val, yprob)
print('logloss val {}'.format(val_loss))
del X_train,X_val,y_train,y_val
os.system("printf '\a'")

After obtaining the best parameters, we combined the training set with the development set and started the final training.

In [None]:
xg_train = xgb.DMatrix(X_train_total_with,label = y_train_total_with)
xg_test = xgb.DMatrix(X_test_total_with)
bst = xgb.train(param,xg_train,num_round)
yprob = bst.predict(xg_test).reshape(X_test_total_with.shape[0],12)
pred_with_xgb = pd.DataFrame(yprob, 
                            index = gender_age_test_with.index, 
                            columns=targetencoder.classes_)
pred_with_xgb.to_csv('xgb_with_result{}.csv'.format(val_loss))
os.system("printf '\a'")

##  Device with events III: Logistic regression

In [None]:
(X_train,X_val,y_train,y_val) = cv.train_test_split(X_train_total_with,y_train_total_with,test_size=0.2,
                                                    stratify=y_train_total_with)
#lr grid search
lr = LogisticRegression(multi_class='multinomial')
solver_value = ['lbfgs']
C_value = np.logspace(-2,0,40)
clf = GridSearchCV(estimator=lr,param_grid = dict(C=C_value,solver=solver_value),
                   scoring='log_loss')
clf.fit(X_train,y_train)
print(clf.best_estimator_.C)
print(clf.best_estimator_.solver)
val_loss_ave_with= -clf.score(X_val,y_val)
print(val_loss_ave_with)
os.system("printf '\a'")

In [None]:
lr = LogisticRegression(C=0.0180472176683,multi_class='multinomial',solver='lbfgs')
lr.fit(X_train_total_with,y_train_total_with)
score_ave_with_lr = lr.predict_proba(X_test_total_with)
pred_with_lr = pd.DataFrame(score_ave_with_lr, 
                            index = gender_age_test_with.index, 
                            columns=targetencoder.classes_)
pred_with_lr.to_csv('lr_with_result{}.csv'.format(val_loss_ave_with))
os.system("printf '\a'")

# Subset II: device without events

In [None]:
y_train_total_without = targetencoder.transform(gender_age_train_without_temp.group)

##  Device without events I: Naive Bayes, Logistic Regression

Grid search of parameters.

In [None]:
#NB grid search
nbc = MultinomialNB()
alpha_value = np.logspace(-3,4,100)
clf = GridSearchCV(estimator=nbc,param_grid = dict(alpha=alpha_value),scoring='log_loss')
clf.fit(X_train,y_train)
print(clf.best_estimator_.alpha)
print(-clf.score(X_val,y_val))

In [None]:
#lr grid search
(X_train,X_val,y_train,y_val) = cv.train_test_split(X_train_total_without,y_train_total_without,test_size=0.2,
                                                    stratify=y_train_total_without)
lr = LogisticRegression(multi_class='multinomial')
solver_value = ['lbfgs']
C_value = np.logspace(-2,0,40)
clf = GridSearchCV(estimator=lr,param_grid = dict(C=C_value,solver=solver_value),
                   scoring='log_loss')
clf.fit(X_train,y_train)
print(clf.best_estimator_.C)
print(clf.best_estimator_.solver)
val_loss_ave_without= -clf.score(X_val,y_val)
print(val_loss_ave_without)
del X_train,X_val,y_train,y_val

After obtaining the best parameters, we combined the training set with the development set and started the final training.

In [None]:
#lr = LogisticRegression(C=0.02,multi_class='multinomial',solver='newton-cg')
lr = LogisticRegression(C=0.0744380301325,multi_class='multinomial',solver='lbfgs')
lr.fit(X_train_total_without,y_train_total_without)
score_ave_without_lr = lr.predict_proba(X_test_total_without)
pred_without_lr = pd.DataFrame(score_ave_without_lr, 
                            index = gender_age_test_without.index, 
                            columns=targetencoder.classes_)
pred_without_lr.to_csv('lr_without_result{}.csv'.format(val_loss_ave_without))

In [None]:
'''kf = StratifiedKFold(y_train_total_without,n_folds = 10,random_state = 1)
score_list_without=[]
val_loss_list_without = []
for index,(train, test) in enumerate(kf):
    X_train_without = X_train_total_without[train]
    y_train_without = y_train_total_without[train]
    X_val_without = X_train_total_without[test]
    y_val_without = y_train_total_without[test]
    print('*****************************************************')
    print('{}_fold'.format(index))
    lr = LogisticRegression(C=0.0774263682681,multi_class='multinomial',solver='newton-cg')
    lr.fit(X_train_without,y_train_without)
    scores_val_without = lr.predict_proba(X_val_without)
    val_loss = log_loss(y_val_without, scores_val_without)
    val_loss_list_without.append(val_loss)
    print('logloss val {}'.format(val_loss))
    
    scores_without = lr.predict_proba(X_test_total_without)
    score_list_without.append(scores_without)
    
for index,i in enumerate(val_loss_list_without):
    if(index==0):
        sumi = i
    else:
        sumi = i+sumi
val_loss_ave_without = sumi/len(score_list_without)
print('average logloss val {}'.format(val_loss_ave_without))
for index,i in enumerate(score_list_without):
    if(index==0):
        sumi = i
    else:
        sumi = i+sumi
score_ave_without = sumi/len(score_list_without)
pred_without = pd.DataFrame(score_ave_without, index = gender_age_test_without.index, columns=targetencoder.classes_)'''

## Device without events II: Neural Network

In [None]:
# define baseline model
def without_model(X_dim_input):
    # create model
    model = Sequential()
    #model.add(Dense(10, input_dim=X_train_total.shape[1], init='normal', activation='relu'))
    #model.add(Dropout(0.2))
    model.add(Dense(100, input_dim=X_dim_input, init='normal', activation='relu'))
    #model.add(Dropout(0.1))
    #model.add(Dense(40, input_dim=X_dim_input, init='normal', activation='relu'))
    #model.add(Dense(40, input_dim=X_dim_input, init='normal', activation='relu'))
    #model.add(Dropout(0.1))
    model.add(Dense(12, init='normal', activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])  #logloss
    return model

dummy_y_without = np_utils.to_categorical(y_train_total_without)
for i in range(8):
    kf = StratifiedKFold(y_train_total_without,n_folds = 10,random_state = i*2)
    score_list_without=[]
    val_loss_list_without = []
    for index,(train, test) in enumerate(kf):
        X_train_without = X_train_total_without[train]
        y_train_without = dummy_y_without[train]
        X_val_without = X_train_total_without[test]
        y_val_without = dummy_y_without[test]
        print('*****************************************************')
        print('{}_fold'.format(index))
        model=without_model(X_train_total_without.shape[1])
        fit= model.fit_generator(generator=batch_generator(X_train_without, y_train_without, 512, True),
                             nb_epoch=20,
                             samples_per_epoch=80000,
                             validation_data=(X_val_without.todense(), y_val_without), verbose=2
                             )
        scores_val_without = model.predict_generator(generator=batch_generatorp(X_val_without, 512, False), 
                                             val_samples=X_val_without.shape[0])
        scores_without = model.predict_generator(generator=batch_generatorp(X_test_total_without, 512, False), 
                                         val_samples=X_test_total_without.shape[0])
        score_list_without.append(scores_without)

        val_loss = log_loss(y_val_without, scores_val_without)
        val_loss_list_without.append(val_loss)
        print('logloss val {}'.format(val_loss))

    for index,i in enumerate(val_loss_list_without):
        if(index==0):
            sumi = i
        else:
            sumi = i+sumi
    val_loss_ave_without = sumi/len(score_list_without)
    print('average logloss val {}'.format(val_loss_ave_without))
    for index,i in enumerate(score_list_without):
        if(index==0):
            sumi = i
        else:
            sumi = i+sumi
    score_ave_without = sumi/len(score_list_without)
    pred_without = pd.DataFrame(score_ave_without, index = gender_age_test_without.index, columns=targetencoder.classes_)
    pred_without.to_csv('nnt_without_100_relu{}.csv'.format(val_loss_ave_without))

## Device without events III: XGBoost

In [None]:
(X_train,X_val,y_train,y_val) = cv.train_test_split(X_train_total_without,y_train_total_without,test_size=0.02,
                                                    stratify=y_train_total_without)
#grid search cv
xgbclassifier = xgb.XGBClassifier(objective="multi:softprob", nthread=1)

clf = GridSearchCV(estimator=xgbclassifier,param_grid={
        'max_depth': [ 4,5,6,7,8,9,10],
        'learning_rate': [0.01,0.02, 0.1, 0.2],
    },verbose=10,scoring='log_loss')
clf.fit(X_train,y_train)

val_loss_ave_with= -clf.score(X_val,y_val)
print(val_loss_ave_with)
del X_train,X_val,y_train,y_val

In [None]:
(X_train,X_val,y_train,y_val) = cv.train_test_split(X_train_total_without,y_train_total_without,test_size=0.02,
                                                    stratify=y_train_total_without,random_state=1)
xg_train = xgb.DMatrix(X_train,label = y_train)
xg_test = xgb.DMatrix(X_val,label = y_val)
param = {'max_depth':6,
         'eta':0.1,
         'silent':0,
         'objective':'multi:softprob',
         'nthread':2,
         'num_class':12,
         'eval_metric':'mlogloss',
        'lambda':4,
        'lambda_bias':0,
        'alpha':0}
num_round = 300
watchlist = [(xg_train,'train'),(xg_test,'test')]
bst = xgb.train(param,xg_train,num_round,watchlist)
yprob = bst.predict(xg_test).reshape(y_val.shape[0],12)
val_loss = log_loss(y_val, yprob)
print('logloss val {}'.format(val_loss))
del X_train,X_val,y_train,y_val
os.system("printf '\a'")

After obtaining the best parameters, we combined the training set with the development set and started the final training.

In [None]:
xg_train = xgb.DMatrix(X_train_total_without,label = y_train_total_without)
xg_test = xgb.DMatrix(X_test_total_without)
bst = xgb.train(param,xg_train,num_round)
yprob = bst.predict(xg_test).reshape(X_test_total_without.shape[0],12)
pred_without_xgb = pd.DataFrame(yprob, 
                            index = gender_age_test_without.index, 
                            columns=targetencoder.classes_)
pred_without_xgb.to_csv('xgb_without_result{}.csv'.format(val_loss))
os.system("printf '\a'")

# Save the result of one model into a file.
### Note: Ensemble will be done in another notebook.
Final score based on the percentage of testing set

In [None]:
#val_score_final = val_loss_ave_without*76877/112071+val_loss_ave_with*35194/112071
val_score_final = (val_loss_ave_without*X_test_total_without.shape[0]+
                   val_loss_ave_with*X_test_total_with.shape[0])/(X_test_total_without.shape[0]+X_test_total_with.shape[0])
print('with score:{}'.format(val_loss_ave_with))
print('without score:{}'.format(val_loss_ave_without))
print('final validation score:{}'.format(val_score_final))

In [None]:
pred = pd.concat((pred_with,pred_without))
pred.to_csv('doublemodel_v6.csv',index=True)