In [1]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder 
from sklearn.svm import SVC

In [2]:
#Cargo los archivos
eventos = pd.read_csv("../events_up_to_01062018.csv")
labels = pd.read_csv("../labels_training_set.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#Veo las columnas de eventos 
list(eventos)

['timestamp',
 'event',
 'person',
 'url',
 'sku',
 'model',
 'condition',
 'storage',
 'color',
 'skus',
 'search_term',
 'staticpage',
 'campaign_source',
 'search_engine',
 'channel',
 'new_vs_returning',
 'city',
 'region',
 'country',
 'device_type',
 'screen_resolution',
 'operating_system_version',
 'browser_version']

In [4]:
#Dropeo columnas que son irrelevantes
train_set = eventos.drop(columns=['city','staticpage','url','timestamp','color','skus', 'channel','search_term','campaign_source','search_engine','screen_resolution','operating_system_version','browser_version'])
train_set.head()

Unnamed: 0,event,person,sku,model,condition,storage,new_vs_returning,region,country,device_type
0,viewed product,4886f805,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,,,,
1,viewed product,ad93850f,304.0,iPhone 5s,Muito Bom,32GB,,,,
2,viewed product,0297fc1e,6888.0,iPhone 6S,Muito Bom,64GB,,,,
3,viewed product,2d681dd8,11890.0,iPhone 7,Bom,128GB,,,,
4,viewed product,cccea85e,7517.0,LG G4 H818P,Excelente,32GB,,,,


In [5]:
#Me quedo con los set de datos que coincidan con los labels y sean inferiores al 01/06/2018
set_entrenamiento = pd.merge(train_set, labels, on='person', how='inner')


In [6]:
list(set_entrenamiento)

['event',
 'person',
 'sku',
 'model',
 'condition',
 'storage',
 'new_vs_returning',
 'region',
 'country',
 'device_type',
 'label']

In [7]:
#Encodeamos columnas Strings

lb_make = LabelEncoder()
set_entrenamiento['event'] = lb_make.fit_transform(set_entrenamiento['event'].astype(str))
set_entrenamiento['sku'] = lb_make.fit_transform(set_entrenamiento['sku'].astype(str))
set_entrenamiento['model'] = lb_make.fit_transform(set_entrenamiento['model'].astype(str))

set_entrenamiento['person'] = lb_make.fit_transform(set_entrenamiento['person'].astype(str))
set_entrenamiento['condition'] = lb_make.fit_transform(set_entrenamiento['condition'].astype(str))
set_entrenamiento['storage'] = lb_make.fit_transform(set_entrenamiento['storage'].astype(str))
set_entrenamiento['new_vs_returning'] = lb_make.fit_transform(set_entrenamiento['new_vs_returning'].astype(str))
set_entrenamiento['region'] = lb_make.fit_transform(set_entrenamiento['region'].astype(str))
set_entrenamiento['country'] = lb_make.fit_transform(set_entrenamiento['country'].astype(str))
set_entrenamiento['device_type'] = lb_make.fit_transform(set_entrenamiento['device_type'].astype(str))

set_entrenamiento.head()


Unnamed: 0,event,person,sku,model,condition,storage,new_vs_returning,region,country,device_type,label
0,9,13068,1104,187,3,3,2,84,39,4,0
1,9,13068,1139,187,3,6,2,84,39,4,0
2,9,13068,1492,187,1,1,2,84,39,4,0
3,9,13068,1500,187,1,1,2,84,39,4,0
4,0,13068,2220,198,5,8,2,84,39,4,0


In [11]:
##Empezamos a cronstruir el modelo. Nos basamos en un ejemplo

# Split-out validation dataset
array = set_entrenamiento.values
X = array[:,0:10]
Y = array[:,10]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [12]:
scoring = 'accuracy'

In [None]:
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

LR: 0.914817 (0.000607)
LDA: 0.914817 (0.000607)
KNN: 0.970933 (0.000479)
CART: 0.998153 (0.000143)
NB: 0.914817 (0.000607)


In [13]:
knn = KNeighborsClassifier(n_neighbors=20, weights='distance', n_jobs=-1)
knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
           weights='distance')

In [16]:
#Leo el set de datos para predecir
predictions = pd.read_csv("../trocafone_kaggle_test.csv");
predictions.head()

Unnamed: 0,person
0,4886f805
1,0297fc1e
2,2d681dd8
3,cccea85e
4,4c8a8b93


In [17]:
predicciones = pd.merge(train_set,predictions, on='person', how='inner')
predicciones.head()

Unnamed: 0,event,person,sku,model,condition,storage,new_vs_returning,region,country,device_type
0,viewed product,4886f805,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,,,,
1,viewed product,4886f805,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,,,,
2,search engine hit,4886f805,,,,,,,,
3,checkout,4886f805,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,,,,
4,viewed product,4886f805,9287.0,Samsung Galaxy J7 Prime,Muito Bom,32GB,,,,
