In [1]:
import tarfile
import json
import pandas as pd
from pandas.io.json import json_normalize

# read in training data
TRAIN_TAR = 'training_data.tgz'
train_archive = tarfile.open(TRAIN_TAR, 'r:gz')

train_example_list = []
for member in train_archive:
    member_file = train_archive.extractfile(member)
    for example in member_file:
        example_dict = json.loads(example.decode('utf-8'))
        train_example_list.append(example_dict)

train_df = json_normalize(train_example_list)
train_archive.close()

y_train = train_df.loc[:, 'event.conversion']
X_train = train_df.drop('event.conversion', axis=1).set_index('event.id')

# read in test data
TEST_TAR = 'all_test_data.tgz'
test_archive = tarfile.open(TRAIN_TAR, 'r:gz')

test_example_list = []
for member in train_archive:
    member_file = test_archive.extractfile(member)
    for example in member_file:
        example_dict = json.loads(example.decode('utf-8'))
        test_example_list.append(example_dict)

test_df = json_normalize(test_example_list)
test_archive.close()

y_test = pd.read_csv('test_labels.txt', sep=' ', index_col=0)
X_test = test_df.drop('event.conversion', axis=1).set_index('event.id')

In [2]:
import numpy as np

# select desired features
num_features = [
    'event.placement.attributes.publisherCvr7Days', 
    'event.placement.attributes.publisherForensiqTraffic', 
    'event.placement.attributes.publisherMargin',
    'event.placement.attributes.publisherQualityScoreAverage3Weeks',
    'event.user.attributes.local_minutes_since_midnight'
]

cat_features = [
    'event.user.attributes.browser_family',
    'event.user.attributes.country_code',
    'event.user.attributes.ip_isp',
    'event.user.attributes.ip_connection_type',
    'event.user.attributes.language',
    'event.user.attributes.local_day_of_month',
    'event.user.attributes.local_day_of_week',
    'event.user.attributes.local_timezone',
    'event.user.attributes.os_family',
]

X_train_num = X_train[num_features]
X_train_cat = pd.get_dummies(X_train[cat_features], columns=cat_features)
X_train_selected = pd.concat([X_train_num, X_train_cat], axis=1)

X_test_num = X_test[num_features]
X_test_cat = pd.get_dummies(X_test[cat_features], columns=cat_features)
X_test_selected = pd.concat([X_test_num, X_test_cat], axis=1)

# replace missing value identifier so it can play well with sklearn
X_train_selected = X_train_selected.replace('\\N', np.nan)
X_test_selected = X_test_selected.replace('\\N', np.nan)

In [3]:
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

imputer = Imputer()
scaler = StandardScaler()
model = DecisionTreeClassifier()

pipeline = Pipeline([('imputation', imputer), ('scaling', scaler), ('model', model)])
pipeline.fit(X_train_selected, y_train)

Pipeline(steps=[('imputation', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scaling', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

In [4]:
from sklearn.metrics import roc_auc_score

#y_predict = pipeline.predict(X_test_selected)
print(X_test.index)
ordered_predictions = []
for sample_id in y_test.index.values:
    idx = np.nonzero(X_test.index.values == sample_id)[0]
    if len(idx) > 0:
        ordered_predictions.append(y_predict[idx])
    
roc_auc_score(y_test, ordered_predictions)

Index(['56158bed-0a23-40e1-bcda-1a87179b97a6',
       '9834ebc1-78e0-4a48-a42f-be9d2d5604b8',
       'a26c21bc-b950-4ea7-ab67-4437b13e88cb',
       '0dfbaa00-546c-471f-a43c-4a3fa0b9bd44',
       'fcf0216f-af9b-4bad-896f-b733584759c5',
       'a475b673-0ef1-492e-9a4b-8745f5c46d09',
       '312f5171-a021-49f7-88d4-ead5c716571d',
       '3a7a37be-aafe-453d-99a2-31a20639201a',
       '541bb777-1420-428d-92bb-8cf4f80731c2',
       '1f41b8e3-f1e4-4f7e-af1f-788e19e63e83',
       ...
       '26309552-dc87-48c7-8f1b-9cec46f4f8eb',
       '0847d5bf-4559-4730-9197-04e0ad46f6da',
       '1d0f1bee-5814-4231-ab8e-14b58832fbda',
       'f12b8ab3-4c10-4f94-8391-53baf1c1f8b0',
       '0a89e9fb-b43e-4205-8c2c-df4c51094f8b',
       '1532bbca-4c63-4894-a370-e21b75570fbe',
       'e3ec022d-9f69-41fc-8748-58c2b7b7d7ff',
       '48a36aaa-2018-4348-a610-0cc6cc8c2c41',
       '42b99fe0-7cc8-42a5-9cc9-64d36346f33a',
       'a5ed85d4-ce24-4526-b975-1127bdbf7dbc'],
      dtype='object', name='event.id', length=26

ValueError: Found input variables with inconsistent numbers of samples: [75999, 0]

In [6]:
print(len(ordered_predictions))

0
