In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.svm import LinearSVC
from sklearn import cross_validation as cv
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss,accuracy_score
from scipy.sparse import csr_matrix,hstack
import time
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


In [2]:
events = pd.read_csv('C:/Users/sxl/Study/Kaggle_project/Talking_data_project/input/events.csv',index_col='event_id')
label = pd.read_csv('C:/Users/sxl/Study/Kaggle_project/Talking_data_project/input/label_categories.csv')
app_event = pd.read_csv('C:/Users/sxl/Study/Kaggle_project/Talking_data_project/input/app_events.csv')
app_label = pd.read_csv('C:/Users/sxl/Study/Kaggle_project/Talking_data_project/input/app_labels.csv')
device = pd.read_csv('C:/Users/sxl/Study/Kaggle_project/Talking_data_project/input/phone_brand_device_model.csv')
device = device.drop_duplicates('device_id').set_index('device_id')
gender_age_train = pd.read_csv('C:/Users/sxl/Study/Kaggle_project/Talking_data_project/input/gender_age_train.csv',index_col = 'device_id')
gender_age_test = pd.read_csv('C:/Users/sxl/Study/Kaggle_project/Talking_data_project/input/gender_age_test.csv',index_col = 'device_id')

For the convinience of creating sparse matrix, we create another integer index for the training and testing set.

In [3]:
gender_age_train['int_index'] = np.arange(gender_age_train.shape[0])
gender_age_test['int_index'] = np.arange(gender_age_test.shape[0])

In [4]:
print('gender_age_train\n',gender_age_train.head(1))
print('\ngender_age_test\n',gender_age_test.head(1))
print('\ndevice\n',device.head(1))
print('\nevents\n',events.head(1))
print('\nlabel\n',label.head(1))
print('\napp_event\n',app_event.head(1))
print('\napp_label\n',app_label.head(1))

gender_age_train
                      gender  age   group  int_index
device_id                                          
-8076087639492063270      M   35  M32-38          0

gender_age_test
                      int_index
device_id                     
1002079943728939269          0

device
                      phone_brand device_model
device_id                                    
-8890648629457979026          小米           红米

events
                   device_id            timestamp  longitude  latitude
event_id                                                             
1         29182687948017175  2016-05-01 00:55:25     121.38     31.24

label
    label_id category
0         1      NaN

app_event
    event_id               app_id  is_installed  is_active
0         2  5927333115845830913             1          1

app_label
                 app_id  label_id
0  7324884708820027918       251


# Feature engineering I: phone brand

Encoding the phone brand, and feed this as a feature into the training and testing test.

In [5]:
encoder = LabelEncoder()
encoder.fit(device.phone_brand)
device['brand'] = encoder.transform(device.phone_brand)
gender_age_train['brand'] = device['brand']
gender_age_test['brand'] = device['brand']

Create sparse matrix with each row representing one device and each column representing one brand

In [6]:
X_train_brand = csr_matrix((np.ones(gender_age_train.shape[0]),
                            (gender_age_train.int_index,gender_age_train.brand)))
X_test_brand = csr_matrix((np.ones(gender_age_test.shape[0]),
                           (gender_age_test.int_index,gender_age_test.brand)))
print('X_train_brand shape:',X_train_brand.shape)
print('X_test_brand shape:',X_test_brand.shape)

X_train_brand shape: (74645, 131)
X_test_brand shape: (112071, 131)


# Feature engineering II: Installed app

Encoding the app_id and store it into app column, and feed this as a feature into the training and testing test.

In [7]:
encoder2 = LabelEncoder()
encoder2.fit(app_event.app_id)
app_event['app'] = encoder2.transform(app_event.app_id)

In [8]:
np.sort(app_event.app.unique())

array([    0,     1,     2, ..., 19234, 19235, 19236], dtype=int64)

In [9]:
print(app_event.head(1))
print(events.head(1))

   event_id               app_id  is_installed  is_active    app
0         2  5927333115845830913             1          1  15408
                  device_id            timestamp  longitude  latitude
event_id                                                             
1         29182687948017175  2016-05-01 00:55:25     121.38     31.24


In [10]:
installed_app = pd.merge(events[['device_id']],app_event[['event_id','app','is_installed']],
                         how='right',right_on = 'event_id',left_index = True)
print(installed_app.head())

             device_id  event_id    app  is_installed
0 -6401643145415154744         2  15408             1
1 -6401643145415154744         2   3384             1
2 -6401643145415154744         2   7620             1
3 -6401643145415154744         2   8902             1
4 -6401643145415154744         2  18686             1


In [11]:
installed_app_grouped = installed_app.groupby(['device_id','app'])['app'].agg(['size'])
print('installed_app_grouped:')
print(installed_app_grouped.head())
installed_app_train = pd.merge(installed_app_grouped,gender_age_train[['int_index']],
                               how = 'right',right_index=True,left_index=True)
installed_app_test = pd.merge(installed_app_grouped,gender_age_test[['int_index']],
                              how = 'right',right_index=True,left_index=True)
print('installed_app_train:')
print(installed_app_train.head())

installed_app_grouped:
                           size
device_id            app       
-9222956879900151005 548     18
                     1096    18
                     1248    26
                     1545    12
                     1664    18
installed_app_train:
                           size  int_index
device_id            app                  
-9222956879900151005 548     18      21594
                     1096    18      21594
                     1248    26      21594
                     1545    12      21594
                     1664    18      21594


In [12]:
installed_app_train = installed_app_train.reset_index()
installed_app_test = installed_app_test.reset_index()
installed_app_train = installed_app_train.dropna(subset=['int_index'])
installed_app_test = installed_app_test.dropna(subset=['int_index'])
print(installed_app_train.head())
print(installed_app_test.head())

             device_id   app  size  int_index
0 -9222956879900151005   548    18      21594
1 -9222956879900151005  1096    18      21594
2 -9222956879900151005  1248    26      21594
3 -9222956879900151005  1545    12      21594
4 -9222956879900151005  1664    18      21594
             device_id    app  size  int_index
0 -9222661944218806987   1867     3      13612
1 -9222661944218806987   7519     8      13612
2 -9222661944218806987   7843     1      13612
3 -9222661944218806987   8704     4      13612
4 -9222661944218806987  10000     1      13612


Calculate the number of unique app:

In [13]:
print('The number of unique apps:')
print(np.size(installed_app.app.unique()))
appnumber = np.size(installed_app.app.unique())

The number of unique apps:
19237


In [14]:
print(gender_age_test.shape[0])
print(installed_app_train.shape[0])

112071
915632


In [15]:
print(np.sort(installed_app_train.app.unique()))
print(installed_app_test.shape[0])

[    0     1     2 ..., 19234 19235 19236]
1387337


In [16]:
X_train_installed = csr_matrix((np.ones(installed_app_train.shape[0]),
                                (installed_app_train.int_index,installed_app_train.app)), 
                               shape = (gender_age_train.shape[0],appnumber))
X_test_installed = csr_matrix((np.ones(installed_app_test.shape[0]),
                               (installed_app_test.int_index,installed_app_test.app)),
                               shape = (gender_age_test.shape[0],appnumber))
print('X_train_installed shape:',X_train_installed.shape)
print('X_test_installed shape:',X_test_installed.shape)

X_train_installed shape: (74645, 19237)
X_test_installed shape: (112071, 19237)


In [17]:
np.sort(installed_app_test.int_index.unique())

array([     0,      1,      2, ..., 112059, 112060, 112063], dtype=int64)

# Feature engineering III: phone device model

In [18]:
brand_model = device.phone_brand.str.cat(device.device_model)
encoder3 = LabelEncoder()
encoder3.fit(brand_model)
device['model'] = encoder3.transform(brand_model)
gender_age_train['model'] = device['model']
gender_age_test['model'] = device['model']

In [19]:
X_train_model = csr_matrix((np.ones(gender_age_train.shape[0]),(gender_age_train.int_index,gender_age_train.model)))
X_test_model = csr_matrix((np.ones(gender_age_test.shape[0]),(gender_age_test.int_index,gender_age_test.model)))
print('X_train_brand shape:',X_train_model.shape)
print('X_test_brand shape:',X_test_model.shape)

X_train_brand shape: (74645, 1667)
X_test_brand shape: (112071, 1667)


# Feature engineering IV: app label

In [20]:
print(app_event[['app_id','event_id']].head(1))
print(app_label[['app_id','label_id']].head(1))

                app_id  event_id
0  5927333115845830913         2
                app_id  label_id
0  7324884708820027918       251


In [21]:
app_label_new = app_label.loc[app_label.app_id.isin(app_event.app_id.unique())].copy()
app_label_new['app'] = encoder2.transform(app_label_new.app_id)
encoder4 = LabelEncoder().fit(app_label_new.label_id)
app_label_new['label'] = encoder4.transform(app_label_new.label_id)
labelnumber = len(encoder4.classes_)
print('app_label_new:')
print(app_label_new.head())

app_label_new:
                app_id  label_id    app  label
0  7324884708820027918       251  17355    207
1 -4494216993218550286       251   4618    207
2  6058196446775239644       406  15548    247
3  6058196446775239644       407  15548    248
4  8694625920731541625       406  18689    247


In [22]:
print(app_label.size)
print(installed_app.size)
print('installed_app_grouped:')
print(installed_app_grouped.head())
installed_label_grouped = (installed_app_grouped.reset_index()[['device_id','app']]
                          .merge(app_label_new[['app','label']])
                          .groupby(['device_id','label']))['app'].agg(['size']).reset_index()
                          
print('installed_label_grouped:')
print(installed_label_grouped.head())

919886
129892268
installed_app_grouped:
                           size
device_id            app       
-9222956879900151005 548     18
                     1096    18
                     1248    26
                     1545    12
                     1664    18
installed_label_grouped:
             device_id  label  size
0 -9222956879900151005    117     1
1 -9222956879900151005    120     1
2 -9222956879900151005    126     1
3 -9222956879900151005    138     2
4 -9222956879900151005    147     2


In [23]:
label_app_train = pd.merge(installed_label_grouped,gender_age_train[['int_index']],
                               how = 'right',right_index=True,left_on='device_id')
label_app_test = pd.merge(installed_label_grouped,gender_age_test[['int_index']],
                              how = 'right',right_index=True,left_on ='device_id' )
label_app_train = label_app_train.dropna(subset= ['int_index','label'])
label_app_test = label_app_test.dropna(subset= ['int_index','label'])

In [24]:
X_train_label = csr_matrix((np.ones(label_app_train.shape[0]),(label_app_train.int_index,label_app_train.label)),
                              shape = (gender_age_train.shape[0],labelnumber))
X_test_label = csr_matrix((np.ones(label_app_test.shape[0]),(label_app_test.int_index,label_app_test.label)),
                              shape = (gender_age_test.shape[0],labelnumber))
print('X_train_installed shape:',X_train_label.shape)
print('X_test_installed shape:',X_test_label.shape)

X_train_installed shape: (74645, 492)
X_test_installed shape: (112071, 492)


# Feature engineering V: active app

In [25]:
active_app = pd.merge(events[['device_id']],app_event[['event_id','app','is_active']],
                         how='right',right_on = 'event_id',left_index = True)
active_app = active_app.loc[active_app.is_active==1]
print(active_app.head())

             device_id  event_id    app  is_active
0 -6401643145415154744         2  15408          1
3 -6401643145415154744         2   8902          1
4 -6401643145415154744         2  18686          1
5 -6401643145415154744         2  14346          1
9 -6401643145415154744         2  16908          1


In [26]:
active_app_grouped = active_app.groupby(['device_id','app'])['app'].agg(['size']).reset_index()
print('active_app_grouped:')
print(active_app_grouped.head())
# active_app_grouped = pd.merge(active_app_grouped,active_app_count[['app']]).set_index('device_id')
# print('active_app_grouped:')
# print(active_app_grouped.head())

active_app_grouped:
             device_id   app  size
0 -9222956879900151005   548     4
1 -9222956879900151005  1248    15
2 -9222956879900151005  1545     2
3 -9222956879900151005  1848    31
4 -9222956879900151005  2236    17


In [27]:
active_app_train = pd.merge(active_app_grouped,gender_age_train[['int_index']],
                               how = 'left', left_on = 'device_id', right_index = True)
active_app_test = pd.merge(active_app_grouped,gender_age_test[['int_index']],
                              how = 'left',left_on = 'device_id',right_index=True)
print('active_app_train:')
print(active_app_train.head())

active_app_train:
             device_id   app  size  int_index
0 -9222956879900151005   548     4    21594.0
1 -9222956879900151005  1248    15    21594.0
2 -9222956879900151005  1545     2    21594.0
3 -9222956879900151005  1848    31    21594.0
4 -9222956879900151005  2236    17    21594.0


In [28]:
active_app_train = active_app_train.reset_index()
active_app_test = active_app_test.reset_index()
active_app_train = active_app_train.dropna(subset=['int_index'])
active_app_test = active_app_test.dropna(subset=['int_index'])
print(active_app_train.head())
print(active_app_test.head())

   index            device_id   app  size  int_index
0      0 -9222956879900151005   548     4    21594.0
1      1 -9222956879900151005  1248    15    21594.0
2      2 -9222956879900151005  1545     2    21594.0
3      3 -9222956879900151005  1848    31    21594.0
4      4 -9222956879900151005  2236    17    21594.0
    index            device_id    app  size  int_index
55     55 -9222661944218806987   1867     3    13612.0
56     56 -9222661944218806987   7519     7    13612.0
57     57 -9222661944218806987   7843     1    13612.0
58     58 -9222661944218806987   8704     3    13612.0
59     59 -9222661944218806987  10000     1    13612.0


In [29]:
print(np.sort(active_app_train.app.unique()))
print(appnumber)

[    0     5     6 ..., 19225 19228 19236]
19237


In [30]:
appnumber = np.size(installed_app.app.unique())
# binary active map
X_train_active = csr_matrix((np.ones(active_app_train.shape[0]),
                            (active_app_train.int_index,active_app_train.app)), 
                            shape = (gender_age_train.shape[0],appnumber))
X_test_active = csr_matrix((np.ones(active_app_test.shape[0]),
                            (active_app_test.int_index,active_app_test.app)),
                            shape = (gender_age_test.shape[0],appnumber))
# count the number of active app
#X_train_active = csr_matrix((np.log(np.log(active_app_train['size']+1)+1),
#                            (active_app_train.int_index,active_app_train.app)), 
#                            shape = (gender_age_train.shape[0],appnumber))
#X_test_active = csr_matrix((np.log(np.log(active_app_test['size']+1)+1),
#                            (active_app_test.int_index,active_app_test.app)),
#                            shape = (gender_age_test.shape[0],appnumber))
print('X_train_active shape:',X_train_active.shape)
print('X_test_active shape:',X_test_active.shape)

X_train_active shape: (74645, 19237)
X_test_active shape: (112071, 19237)


# Feature engineering VI: active time period

In [31]:
events.head()

Unnamed: 0_level_0,device_id,timestamp,longitude,latitude
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,29182687948017175,2016-05-01 00:55:25,121.38,31.24
2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97
3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7
4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28
5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66


In [32]:
events_time = events[['device_id','timestamp']].copy()
events_time['time']  = events_time['timestamp'].str[11:13].astype(int)
events_time.drop(['timestamp'],axis=1,inplace=True)
events_time = events_time.groupby(['device_id','time'])['time'].agg({'times':'count'}).reset_index()
print(events_time.head())
timenumber= events_time.time.unique().shape[0]

             device_id  time  times
0 -9222956879900151005     7      2
1 -9222956879900151005    11      7
2 -9222956879900151005    12     13
3 -9222956879900151005    13      3
4 -9222956879900151005    14      5


In [33]:
time_train = pd.merge(events_time,gender_age_train[['int_index']],
                               right_index=True,left_on='device_id')
time_test = pd.merge(events_time,gender_age_test[['int_index']],
                               right_index=True,left_on ='device_id' )
#binary
X_train_time = csr_matrix((np.ones(time_train.shape[0]),
                            (time_train.int_index,time_train.time)), 
                            shape = (gender_age_train.shape[0],timenumber))
X_test_time = csr_matrix((np.ones(time_test.shape[0]),
                            (time_test.int_index,time_test.time)),
                            shape = (gender_age_test.shape[0],timenumber))

#number
#X_train_time = csr_matrix((np.log(np.log(time_train['times']+1)+1),
#                            (time_train.int_index,time_train.time)), 
#                            shape = (gender_age_train.shape[0],timenumber))
#X_test_time = csr_matrix((np.log(np.log(time_test['times']+1)+1),
#                            (time_test.int_index,time_test.time)),
#                            shape = (gender_age_test.shape[0],timenumber))
print('X_train_time shape:',X_train_time.shape)
print('X_test_time shape:',X_test_time.shape)

X_train_time shape: (74645, 24)
X_test_time shape: (112071, 24)


Feature Selection

In [34]:
X_train_total = hstack((X_train_brand, X_train_model, X_train_installed, X_train_active, X_train_label, X_train_time), format = 'csr')
X_test_total = hstack((X_test_brand, X_test_model, X_test_installed, X_test_active, X_test_label, X_test_time), format = 'csr')

targetencoder = LabelEncoder()
targetencoder.fit(gender_age_train.group)
y_train_total = targetencoder.transform(gender_age_train.group)
target_len = len(targetencoder.classes_)

In [35]:
for i in np.arange(200, 300, 100):

    X_train_sel = SelectKBest(chi2, k=i).fit_transform(X_train_total, y_train_total)
    X_train,X_test,y_train,y_test = cv.train_test_split(X_train_sel,y_train_total)
    model = LogisticRegression(C=0.02,multi_class='multinomial',solver='newton-cg')
    model.fit(X_train,y_train)
    y_pred = model.predict_proba(X_test)
    cost = log_loss(y_test,y_pred)
    score = model.score(X_test,y_test)
    print('number of features, cost and score', i, cost, score)


number of features, cost and score 200 2.32045806034 0.178598220984


# Model training and fitting: Logistic Regression

In [36]:
X_train_total = hstack((X_train_brand,X_train_active,X_train_time),format='csr')
X_test_total = hstack((X_test_brand,X_test_active,X_test_time),format='csr')
print('Training shape:')
print(X_train_total.shape)
print('Testing shape:')
print(X_test_total.shape)

Training shape:
(74645, 19392)
Testing shape:
(112071, 19392)


In [37]:
X_train_model.shape[1]

1667

In [38]:
targetencoder = LabelEncoder()
targetencoder.fit(gender_age_train.group)
y_train_total = targetencoder.transform(gender_age_train.group)
target_len = len(targetencoder.classes_)
print(target_len)

12


In [39]:
X_train,X_test,y_train,y_test = cv.train_test_split(X_train_sel,y_train_total)

In [40]:
model = LogisticRegression(C=0.02,multi_class='multinomial',solver='newton-cg')
model.fit(X_train,y_train)

LogisticRegression(C=0.02, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [41]:
y_pred = model.predict_proba(X_test)

In [42]:
log_loss(y_test,y_pred)

2.3141623514268459

In [43]:
model.score(X_test,y_test)

0.18068802915014467

In [44]:
result = model.fit(X_train_total,y_train_total)
pred = pd.DataFrame(model.predict_proba(X_test_total), index = gender_age_test.index, columns=targetencoder.classes_)
pred.head()

Unnamed: 0_level_0,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1002079943728939269,0.035388,0.047032,0.039529,0.050952,0.078769,0.069661,0.071671,0.107903,0.071642,0.100387,0.169435,0.157631
-1547860181818787117,0.020799,0.039336,0.03396,0.07088,0.087704,0.087793,0.034579,0.118753,0.120879,0.085591,0.199289,0.100438
7374582448058474277,0.044346,0.056017,0.045979,0.079115,0.138317,0.052719,0.04094,0.094286,0.056253,0.106847,0.141936,0.143244
-6220210354783429585,0.014332,0.031069,0.014213,0.028614,0.045357,0.049677,0.059665,0.117228,0.086981,0.179114,0.197897,0.175855
-5893464122623104785,0.057547,0.059611,0.045369,0.063594,0.06579,0.049493,0.123285,0.147503,0.082012,0.098812,0.114898,0.092087


In [None]:
pred.to_csv('logreg_submission.csv',index=True)

# Model fitting :RF

In [None]:
model2 = rfc(n_estimators=300,verbose=1,n_jobs=-1)
model2.fit(X_train,y_train)

In [None]:
y_pred2 = model2.predict_proba(X_test)

In [None]:
log_loss(y_test,y_pred2)

Feature Selection

In [None]:
X_train_total = hstack((X_train_brand, X_train_model, X_train_installed, X_train_active, X_train_label, X_train_time), format = 'csr')
X_test_total = hstack((X_test_brand, X_test_model, X_test_installed, X_test_active, X_test_label, X_test_time), format = 'csr')

targetencoder = LabelEncoder()
targetencoder.fit(gender_age_train.group)
y_train_total = targetencoder.transform(gender_age_train.group)
target_len = len(targetencoder.classes_)

In [None]:
for i in np.arange(200, 300, 100):

    X_train_sel = SelectKBest(chi2, k=i).fit_transform(X_train_total, y_train_total)
    X_train,X_test,y_train,y_test = cv.train_test_split(X_train_sel,y_train_total, test_size = 0.2)
    model = LogisticRegression(C=0.02,multi_class='multinomial',solver='newton-cg')
    model.fit(X_train,y_train)
    y_pred = model.predict_proba(X_test)
    cost = log_loss(y_test,y_pred)
    score = model.score(X_test,y_test)
    print('number of features is {}, cost is {} and score is {}\n'.format(i, cost, score))