In [1]:
# imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from os import (environ, path)
import sys
import calendar
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the zipped training sample from DATA_FOLDER and index the rows by ip.
sample_path = path.join(environ['DATA_FOLDER'], "train_sample.csv.zip")
df = pd.read_csv(sample_path).set_index('ip')

# Quick look at the first rows and the column dtypes.
print(df.head(5))
print(df.dtypes)

        app  device  os  channel           click_time attributed_time  \
ip                                                                      
87540    12       1  13      497  2017-11-07 09:30:38             NaN   
105560   25       1  17      259  2017-11-07 13:40:27             NaN   
101424   12       1  19      212  2017-11-07 18:05:24             NaN   
94584    13       1  13      477  2017-11-07 04:58:08             NaN   
68413    12       1   1      178  2017-11-09 09:00:09             NaN   

        is_attributed  
ip                     
87540               0  
105560              0  
101424              0  
94584               0  
68413               0  
app                 int64
device              int64
os                  int64
channel             int64
click_time         object
attributed_time    object
is_attributed       int64
dtype: object


In [3]:
def explode_date_attributes(dataframe, column_name):
    """Append one-hot weekday and hour-of-day columns derived from a timestamp column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the timestamp column. It is not modified.
    column_name : str
        Name of the column to parse with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns (including ``column_name``),
        followed by one indicator column per weekday name present in the
        data (e.g. ``Tuesday``) and one per hour present (``hour_<h>``).
    """
    # Parse once and reuse for both the weekday and the hour features.
    timestamps = pd.to_datetime(dataframe[column_name])

    # `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` returns
    # the same English weekday names.
    days = timestamps.dt.day_name()
    dummies_day = pd.get_dummies(days)

    hours = timestamps.dt.hour
    dummies_hour = pd.get_dummies(hours, prefix='hour')

    return pd.concat([dataframe, dummies_day, dummies_hour], axis=1)


# Expand click_time into weekday/hour indicator columns, then discard the raw timestamp column.
df_date_dummy = explode_date_attributes(df, 'click_time').drop('click_time', axis='columns')
print(df_date_dummy.head(5))

        app  device  os  channel attributed_time  is_attributed  Monday  \
ip                                                                        
87540    12       1  13      497             NaN              0       0   
105560   25       1  17      259             NaN              0       0   
101424   12       1  19      212             NaN              0       0   
94584    13       1  13      477             NaN              0       0   
68413    12       1   1      178             NaN              0       0   

        Thursday  Tuesday  Wednesday   ...     hour_14  hour_15  hour_16  \
ip                                     ...                                 
87540          0        1          0   ...           0        0        0   
105560         0        1          0   ...           0        0        0   
101424         0        1          0   ...           0        0        0   
94584          0        1          0   ...           0        0        0   
68413          1  

In [4]:
# attributed_time is not used as a feature yet, so it is simply removed for now
df_train = df_date_dummy.drop(columns=['attributed_time'])
print(df_train.head(5))

        app  device  os  channel  is_attributed  Monday  Thursday  Tuesday  \
ip                                                                           
87540    12       1  13      497              0       0         0        1   
105560   25       1  17      259              0       0         0        1   
101424   12       1  19      212              0       0         0        1   
94584    13       1  13      477              0       0         0        1   
68413    12       1   1      178              0       0         1        0   

        Wednesday  hour_0   ...     hour_14  hour_15  hour_16  hour_17  \
ip                          ...                                          
87540           0       0   ...           0        0        0        0   
105560          0       0   ...           0        0        0        0   
101424          0       0   ...           0        0        0        0   
94584           0       0   ...           0        0        0        0   
68413    

In [5]:
# One-hot encode the remaining categorical id columns
to_dummies = ['app', 'device', 'os', 'channel']
dummies = [pd.get_dummies(df_train[x], prefix=x) for x in to_dummies]

# DataFrame.drop returns a new frame; the previous code discarded that result,
# so the raw integer id columns stayed in df_train and ended up as features.
# Drop them from the frame that gets concatenated after the indicator columns.
dummies.append(df_train.drop(to_dummies, axis='columns'))
df_train = pd.concat(dummies, axis='columns')
print(df_train.head(5))


        app_1  app_2  app_3  app_4  app_5  app_6  app_7  app_8  app_9  app_10  \
ip                                                                              
87540       0      0      0      0      0      0      0      0      0       0   
105560      0      0      0      0      0      0      0      0      0       0   
101424      0      0      0      0      0      0      0      0      0       0   
94584       0      0      0      0      0      0      0      0      0       0   
68413       0      0      0      0      0      0      0      0      0       0   

         ...     hour_14  hour_15  hour_16  hour_17  hour_18  hour_19  \
ip       ...                                                            
87540    ...           0        0        0        0        0        0   
105560   ...           0        0        0        0        0        0   
101424   ...           0        0        0        0        1        0   
94584    ...           0        0        0        0        0       

In [6]:
# Target column name (y) and the list of feature column names (X): every column except the target
y = 'is_attributed'
all_columns = df_train.columns.values.tolist()
X = [column for column in all_columns if column != y]
print(X)

['app_1', 'app_2', 'app_3', 'app_4', 'app_5', 'app_6', 'app_7', 'app_8', 'app_9', 'app_10', 'app_11', 'app_12', 'app_13', 'app_14', 'app_15', 'app_16', 'app_17', 'app_18', 'app_19', 'app_20', 'app_21', 'app_22', 'app_23', 'app_24', 'app_25', 'app_26', 'app_27', 'app_28', 'app_29', 'app_30', 'app_31', 'app_32', 'app_33', 'app_34', 'app_35', 'app_36', 'app_37', 'app_38', 'app_39', 'app_42', 'app_43', 'app_44', 'app_45', 'app_46', 'app_47', 'app_48', 'app_49', 'app_50', 'app_52', 'app_53', 'app_54', 'app_55', 'app_56', 'app_58', 'app_59', 'app_60', 'app_61', 'app_62', 'app_64', 'app_65', 'app_66', 'app_67', 'app_68', 'app_70', 'app_71', 'app_72', 'app_74', 'app_75', 'app_76', 'app_78', 'app_79', 'app_80', 'app_81', 'app_82', 'app_83', 'app_84', 'app_85', 'app_86', 'app_87', 'app_88', 'app_91', 'app_92', 'app_93', 'app_94', 'app_95', 'app_96', 'app_97', 'app_99', 'app_100', 'app_101', 'app_103', 'app_104', 'app_105', 'app_107', 'app_108', 'app_109', 'app_110', 'app_112', 'app_115', 'app_11

In [7]:
# Splitting between train and test in our sample
# A locally seeded generator makes the ~80/20 split identical on every run
# without touching numpy's global random state. The mask is sized from
# df_train, the frame it actually indexes, rather than from df.
split_rng = np.random.RandomState(42)
msk = split_rng.rand(len(df_train)) < 0.8
train = df_train[msk]
test = df_train[~msk]

# Every row must land in exactly one of the two partitions.
assert len(train) + len(test) == len(df_train)
print(len(train))
print(len(test))

80022
19978


In [8]:
# model
print("Training..")

# RFE and LogisticRegression are already imported in the notebook's first cell,
# so they are not re-imported here.

# n_jobs: -1 to run on all cores
# NOTE(review): the warning captured in this cell's output suggests n_jobs may
# be ignored by the solver that was in use — confirm whether it has any effect.
logreg = LogisticRegression(n_jobs=-1)

# Recursive feature elimination down to 18 features. The count is passed by
# keyword because newer scikit-learn releases reject it as a positional argument.
rfe = RFE(logreg, n_features_to_select=18)
rfe = rfe.fit(train[X], train[y])

Training..


  " = {}.".format(self.n_jobs))


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

In [36]:
# Order the hold-out rows by their index and preview them.
test = test.sort_index()
print(test.head())

predictions_path = path.join(environ['DATA_FOLDER'], 'predictions.csv')

# NOTE(review): the first CSV column is labelled click_id but is filled from the
# frame's index, which was set to `ip` when the data was loaded — confirm this is intended.
with open(predictions_path, 'w') as outfile:
    outfile.write('click_id,is_attributed' + '\n')
    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = rfe.predict_proba(test[X])[:, 1]
    outfile.writelines(
        '%s,%.3f\n' % (click_id, round(proba, 3))
        for click_id, proba in zip(test.index, positive_proba)
    )

    app_1  app_2  app_3  app_4  app_5  app_6  app_7  app_8  app_9  app_10  \
ip                                                                          
20      0      0      0      0      0      0      0      0      0       0   
20      0      1      0      0      0      0      0      0      0       0   
27      0      0      0      0      0      0      0      0      0       0   
27      1      0      0      0      0      0      0      0      0       0   
27      0      0      0      0      0      0      0      0      1       0   

     ...     hour_14  hour_15  hour_16  hour_17  hour_18  hour_19  hour_20  \
ip   ...                                                                     
20   ...           0        0        0        0        0        0        0   
20   ...           0        0        0        0        0        0        0   
27   ...           0        0        0        0        0        0        0   
27   ...           0        0        0        0        0        0     