In [1]:
# imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from os import (environ, path)
import sys
import calendar
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv(path.join(environ['DATA_FOLDER'],"train_sample_full_history.csv.gz"), compression='gzip')
print(df.head(5))
print(df.dtypes)

       ip  app  device  os  channel           click_time attributed_time  \
0  164537   58       1  19      120  2017-11-06 15:14:35             NaN   
1   88180    3       1  20      379  2017-11-06 15:41:20             NaN   
2  159103   58       1  13      120  2017-11-06 15:45:33             NaN   
3   44744   14       1  19      478  2017-11-06 15:50:19             NaN   
4    5729    2       2  37      477  2017-11-06 16:00:00             NaN   

   is_attributed  
0              0  
1              0  
2              0  
3              0  
4              0  
ip                  int64
app                 int64
device              int64
os                  int64
channel             int64
click_time         object
attributed_time    object
is_attributed       int64
dtype: object


In [3]:
# Exploding date to dummy values
def explode_date_attributes(dataframe, column_name):
    years = pd.to_datetime(dataframe[column_name]).dt.year
    dummies_year = pd.get_dummies(years, prefix='year')
    
    months = pd.to_datetime(dataframe[column_name]).dt.month.apply(lambda x: calendar.month_abbr[x])
    dummies_month = pd.get_dummies(months)
    
    days = pd.to_datetime(dataframe[column_name]).dt.weekday_name
    dummies_day = pd.get_dummies(days)
    
    hours = pd.to_datetime(dataframe[column_name]).dt.hour
    dummies_hour = pd.get_dummies(hours, prefix='hour')
    
    return pd.concat([dataframe, dummies_year, dummies_month, dummies_day, dummies_hour], axis=1)
    
df_date_dummy = explode_date_attributes(df, 'click_time')
df_date_dummy = df_date_dummy.drop('click_time', axis='columns')
print(df_date_dummy.head(5))

       ip  app  device  os  channel attributed_time  is_attributed  year_2017  \
0  164537   58       1  19      120             NaN              0          1   
1   88180    3       1  20      379             NaN              0          1   
2  159103   58       1  13      120             NaN              0          1   
3   44744   14       1  19      478             NaN              0          1   
4    5729    2       2  37      477             NaN              0          1   

   Nov  Monday   ...     hour_14  hour_15  hour_16  hour_17  hour_18  hour_19  \
0    1       1   ...           0        1        0        0        0        0   
1    1       1   ...           0        1        0        0        0        0   
2    1       1   ...           0        1        0        0        0        0   
3    1       1   ...           0        1        0        0        0        0   
4    1       1   ...           0        0        1        0        0        0   

   hour_20  hour_21  hour_

In [4]:
# not sure how to use attributed_time yet, just dropping it for now
df_train = df_date_dummy.drop('attributed_time', axis='columns')
print(df_train.head(5))

       ip  app  device  os  channel  is_attributed  year_2017  Nov  Monday  \
0  164537   58       1  19      120              0          1    1       1   
1   88180    3       1  20      379              0          1    1       1   
2  159103   58       1  13      120              0          1    1       1   
3   44744   14       1  19      478              0          1    1       1   
4    5729    2       2  37      477              0          1    1       1   

   Thursday   ...     hour_14  hour_15  hour_16  hour_17  hour_18  hour_19  \
0         0   ...           0        1        0        0        0        0   
1         0   ...           0        1        0        0        0        0   
2         0   ...           0        1        0        0        0        0   
3         0   ...           0        1        0        0        0        0   
4         0   ...           0        0        1        0        0        0   

   hour_20  hour_21  hour_22  hour_23  
0        0        0   

In [5]:
# All rest to dummy values
    
dummies = [pd.get_dummies(df_train[x], prefix=x) for x in ['ip', 'app', 'device', 'os', 'channel']]
dummies.append(df_train)
df_train = pd.concat(dummies, axis=1)
print(df_train.head(5))

   ip_107  ip_441  ip_525  ip_1293  ip_1659  ip_2128  ip_2399  ip_3052  \
0       0       0       0        0        0        0        0        0   
1       0       0       0        0        0        0        0        0   
2       0       0       0        0        0        0        0        0   
3       0       0       0        0        0        0        0        0   
4       0       0       0        0        0        0        0        0   

   ip_3088  ip_3745   ...     hour_14  hour_15  hour_16  hour_17  hour_18  \
0        0        0   ...           0        1        0        0        0   
1        0        0   ...           0        1        0        0        0   
2        0        0   ...           0        1        0        0        0   
3        0        0   ...           0        1        0        0        0   
4        0        0   ...           0        0        1        0        0   

   hour_19  hour_20  hour_21  hour_22  hour_23  
0        0        0        0        0      

In [11]:
# Getting features X and dependent variable Y
Y = ['is_attributed']
X = [x for x in df_train.columns.values.tolist() if x not in Y ]
print(X)

['ip_107', 'ip_441', 'ip_525', 'ip_1293', 'ip_1659', 'ip_2128', 'ip_2399', 'ip_3052', 'ip_3088', 'ip_3745', 'ip_4343', 'ip_4505', 'ip_5280', 'ip_5683', 'ip_5729', 'ip_5988', 'ip_6043', 'ip_6277', 'ip_6726', 'ip_6974', 'ip_7051', 'ip_7115', 'ip_7157', 'ip_7583', 'ip_7593', 'ip_7891', 'ip_8129', 'ip_8343', 'ip_8692', 'ip_8715', 'ip_8975', 'ip_9298', 'ip_9916', 'ip_9959', 'ip_10254', 'ip_11102', 'ip_11644', 'ip_11924', 'ip_12129', 'ip_12879', 'ip_13065', 'ip_13222', 'ip_13411', 'ip_13487', 'ip_13643', 'ip_13900', 'ip_13982', 'ip_14084', 'ip_14389', 'ip_14424', 'ip_14597', 'ip_14888', 'ip_15372', 'ip_16163', 'ip_16626', 'ip_16929', 'ip_17139', 'ip_17188', 'ip_17292', 'ip_17814', 'ip_17922', 'ip_18208', 'ip_18551', 'ip_18716', 'ip_18750', 'ip_18780', 'ip_18855', 'ip_19054', 'ip_19515', 'ip_20376', 'ip_20456', 'ip_20465', 'ip_20547', 'ip_20938', 'ip_21660', 'ip_21741', 'ip_22043', 'ip_22214', 'ip_22324', 'ip_22491', 'ip_22730', 'ip_23067', 'ip_23139', 'ip_24278', 'ip_24666', 'ip_26029', 'ip_

In [14]:
# Splitting between train and test in our sample
msk = np.random.rand(len(df)) < 0.8
train = df_train[msk]
test = df_train[~msk]
print(len(train))
print(len(test))

1647327
410945


In [None]:
# model

from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

rfe = RFE(logreg, 18)
rfe = rfe.fit(train[X], train[Y] )
print(rfe.support_)
print(rfe.ranking_)