In [2]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split


In [3]:
#Cargo los archivos de entrenamiento
eventos = pd.read_csv("../events_up_to_01062018.csv")
labels = pd.read_csv("../labels_training_set.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
#Veo las columnas de eventos 
list(eventos)

['timestamp',
 'event',
 'person',
 'url',
 'sku',
 'model',
 'condition',
 'storage',
 'color',
 'skus',
 'search_term',
 'staticpage',
 'campaign_source',
 'search_engine',
 'channel',
 'new_vs_returning',
 'city',
 'region',
 'country',
 'device_type',
 'screen_resolution',
 'operating_system_version',
 'browser_version']

In [5]:
#Me quedo con los set de datos que coincidan con los labels y sean inferiores al 01/06/2018
set_entrenamiento = pd.merge(eventos, labels, on='person', how='inner')
set_entrenamiento.head()

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,...,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
0,2018-05-18 00:11:27,viewed product,ad93850f,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,...,,,,,,,,,,0
1,2018-05-18 00:23:33,viewed product,ad93850f,,318.0,iPhone 5s,Muito Bom,64GB,Prateado,,...,,,,,,,,,,0
2,2018-05-18 00:16:10,viewed product,ad93850f,,5907.0,iPhone 5s,Bom - Sem Touch ID,16GB,Cinza espacial,,...,,,,,,,,,,0
3,2018-05-18 00:14:55,viewed product,ad93850f,,6023.0,iPhone 5s,Bom - Sem Touch ID,16GB,Dourado,,...,,,,,,,,,,0
4,2018-05-18 00:11:26,ad campaign hit,ad93850f,/comprar/iphone/iphone-5s,,,,,,,...,,,,,,,,,,0


In [6]:
set_entrenamiento = set_entrenamiento.set_index('person')


In [7]:
set_entrenamiento.head()

Unnamed: 0_level_0,timestamp,event,url,sku,model,condition,storage,color,skus,search_term,...,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version,label
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ad93850f,2018-05-18 00:11:27,viewed product,,304.0,iPhone 5s,Muito Bom,32GB,Cinza espacial,,,...,,,,,,,,,,0
ad93850f,2018-05-18 00:23:33,viewed product,,318.0,iPhone 5s,Muito Bom,64GB,Prateado,,,...,,,,,,,,,,0
ad93850f,2018-05-18 00:16:10,viewed product,,5907.0,iPhone 5s,Bom - Sem Touch ID,16GB,Cinza espacial,,,...,,,,,,,,,,0
ad93850f,2018-05-18 00:14:55,viewed product,,6023.0,iPhone 5s,Bom - Sem Touch ID,16GB,Dourado,,,...,,,,,,,,,,0
ad93850f,2018-05-18 00:11:26,ad campaign hit,/comprar/iphone/iphone-5s,,,,,,,,...,,,,,,,,,,0


In [8]:
#Armamos arrays con los features que vamos a tener en cuenta
features=list(set_entrenamiento.columns)
features

['timestamp',
 'event',
 'url',
 'sku',
 'model',
 'condition',
 'storage',
 'color',
 'skus',
 'search_term',
 'staticpage',
 'campaign_source',
 'search_engine',
 'channel',
 'new_vs_returning',
 'city',
 'region',
 'country',
 'device_type',
 'screen_resolution',
 'operating_system_version',
 'browser_version',
 'label']

In [9]:
#Eliminamos algunos features que consideramos irrelevantes
#features.remove('person')
features.remove('timestamp')
features.remove('url')
features.remove('skus')
features.remove('sku')
features.remove('staticpage')
features.remove('search_engine')
features.remove('browser_version')
features.remove('label')


In [10]:
#Lista de Features
features

['event',
 'model',
 'condition',
 'storage',
 'color',
 'search_term',
 'campaign_source',
 'channel',
 'new_vs_returning',
 'city',
 'region',
 'country',
 'device_type',
 'screen_resolution',
 'operating_system_version']

In [11]:
#Dividimos el set de entrenamientos para usar para la construccion del modelo y validarlo
train, validation=train_test_split(set_entrenamiento,test_size=0.20)
print("Tamaño set entrenamiento: ", len(train))
print("Tamaño set de validacion: ", len(validation))

Tamaño set entrenamiento:  937508
Tamaño set de validacion:  234378


In [12]:
#Encodeamos nuestras features
lb_make = LabelEncoder()

train['event'] = lb_make.fit_transform(train['event'].astype(str))
train['model'] = lb_make.fit_transform(train['model'].astype(str))
train['condition'] = lb_make.fit_transform(train['condition'].astype(str))
train['color'] = lb_make.fit_transform(train['color'].astype(str))
train['search_term'] = lb_make.fit_transform(train['search_term'].astype(str))
train['campaign_source'] = lb_make.fit_transform(train['campaign_source'].astype(str))
train['channel'] = lb_make.fit_transform(train['channel'].astype(str))
train['storage'] = lb_make.fit_transform(train['storage'].astype(str))
train['new_vs_returning'] = lb_make.fit_transform(train['new_vs_returning'].astype(str))
train['city'] = lb_make.fit_transform(train['city'].astype(str))
train['region'] = lb_make.fit_transform(train['region'].astype(str))
train['country'] = lb_make.fit_transform(train['country'].astype(str))
train['device_type'] = lb_make.fit_transform(train['device_type'].astype(str))
train['screen_resolution'] = lb_make.fit_transform(train['screen_resolution'].astype(str))
train['operating_system_version'] = lb_make.fit_transform(train['operating_system_version'].astype(str))

validation['event'] = lb_make.fit_transform(validation['event'].astype(str))
validation['model'] = lb_make.fit_transform(validation['model'].astype(str))
validation['condition'] = lb_make.fit_transform(validation['condition'].astype(str))
validation['color'] = lb_make.fit_transform(validation['color'].astype(str))
validation['search_term'] = lb_make.fit_transform(validation['search_term'].astype(str))
validation['campaign_source'] = lb_make.fit_transform(validation['campaign_source'].astype(str))
validation['channel'] = lb_make.fit_transform(validation['channel'].astype(str))
validation['storage'] = lb_make.fit_transform(validation['storage'].astype(str))
validation['new_vs_returning'] = lb_make.fit_transform(validation['new_vs_returning'].astype(str))
validation['city'] = lb_make.fit_transform(validation['city'].astype(str))
validation['region'] = lb_make.fit_transform(validation['region'].astype(str))
validation['country'] = lb_make.fit_transform(validation['country'].astype(str))
validation['device_type'] = lb_make.fit_transform(validation['device_type'].astype(str))
validation['screen_resolution'] = lb_make.fit_transform(validation['screen_resolution'].astype(str))
validation['operating_system_version'] = lb_make.fit_transform(validation['operating_system_version'].astype(str))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [13]:
#Empezamos a definir la variables x_train y_train 
x_train=train[features]
y_train=train['label']
x_validation = validation[features]
y_validation = validation['label']



In [14]:
#Validamos nuestro modelo
model = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)
model.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='uniform')

In [15]:
#Hacemos un score del modelo

score = model.score(x_validation, y_validation)*100
print(score)

#predicted = model.predict(x_validation)
#print('Finalize')

91.38229697326541


In [17]:
#Nos traemos el set de datos de para predecir
#Leo el set de datos para predecir
kaggle_test = pd.read_csv("../trocafone_kaggle_test.csv");
kaggle_test.head()

Unnamed: 0,person
0,4886f805
1,0297fc1e
2,2d681dd8
3,cccea85e
4,4c8a8b93


In [18]:
test = pd.merge(kaggle_test,eventos, on='person', how='inner')
test.set_index('person', inplace=True)
test.head()

Unnamed: 0_level_0,timestamp,event,url,sku,model,condition,storage,color,skus,search_term,...,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4886f805,2018-05-18 00:11:59,viewed product,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,...,,,,,,,,,,
4886f805,2018-05-18 00:30:30,viewed product,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,...,,,,,,,,,,
4886f805,2018-05-18 00:07:23,search engine hit,,,,,,,,,...,Google,,,,,,,,,
4886f805,2018-05-18 00:11:56,checkout,,9288.0,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,...,,,,,,,,,,
4886f805,2018-05-18 00:11:35,viewed product,,9287.0,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,,...,,,,,,,,,,


In [19]:
test= test[features]
test['label'] = 0 
test.head()

Unnamed: 0_level_0,event,model,condition,storage,color,search_term,campaign_source,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,label
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
4886f805,viewed product,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,0
4886f805,viewed product,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,0
4886f805,search engine hit,,,,,,,,,,,,,,,0
4886f805,checkout,Samsung Galaxy J7 Prime,Excelente,32GB,Dourado,,,,,,,,,,,0
4886f805,viewed product,Samsung Galaxy J7 Prime,Muito Bom,32GB,Dourado,,,,,,,,,,,0


In [20]:
test['event'] = lb_make.fit_transform(test['event'].astype(str))
test['model'] = lb_make.fit_transform(test['model'].astype(str))
test['condition'] = lb_make.fit_transform(test['condition'].astype(str))
test['color'] = lb_make.fit_transform(test['color'].astype(str))
test['search_term'] = lb_make.fit_transform(test['search_term'].astype(str))
test['campaign_source'] = lb_make.fit_transform(test['campaign_source'].astype(str))
test['channel'] = lb_make.fit_transform(test['channel'].astype(str))
test['storage'] = lb_make.fit_transform(test['storage'].astype(str))
test['new_vs_returning'] = lb_make.fit_transform(test['new_vs_returning'].astype(str))
test['city'] = lb_make.fit_transform(test['city'].astype(str))
test['region'] = lb_make.fit_transform(test['region'].astype(str))
test['country'] = lb_make.fit_transform(test['country'].astype(str))
test['device_type'] = lb_make.fit_transform(test['device_type'].astype(str))
test['screen_resolution'] = lb_make.fit_transform(test['screen_resolution'].astype(str))
test['operating_system_version'] = lb_make.fit_transform(test['operating_system_version'].astype(str))

In [21]:
test = test[features]
prediccion_final=model.predict(test)
prediccion_final

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
test['label'] = prediccion_final
test.head()

Unnamed: 0_level_0,event,model,condition,storage,color,search_term,campaign_source,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,label
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
4886f805,9,100,2,3,28,5675,16,7,2,1799,95,37,4,268,110,0
4886f805,9,100,2,3,28,5675,16,7,2,1799,95,37,4,268,110,0
4886f805,6,202,5,8,62,5675,16,7,2,1799,95,37,4,268,110,0
4886f805,2,100,2,3,28,5675,16,7,2,1799,95,37,4,268,110,0
4886f805,9,100,3,3,28,5675,16,7,2,1799,95,37,4,268,110,0


In [23]:
test = test.reset_index().drop_duplicates(subset='person', keep='last').set_index('person')
test.head()

Unnamed: 0_level_0,event,model,condition,storage,color,search_term,campaign_source,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,label
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
4886f805,10,202,5,8,62,5675,16,2,0,330,77,6,1,169,23,0
0297fc1e,5,192,5,8,62,5675,16,7,2,1799,95,37,4,268,110,0
2d681dd8,10,202,5,8,62,5675,16,3,1,877,81,6,0,55,69,0
cccea85e,10,202,5,8,62,5675,16,3,1,1147,81,6,0,83,67,0
4c8a8b93,10,202,5,8,62,5675,16,4,1,1487,81,6,1,169,23,0


In [24]:
test.drop(features, axis=1, inplace=True )


In [25]:
test.reset_index()

Unnamed: 0,person,label
0,4886f805,0
1,0297fc1e,0
2,2d681dd8,0
3,cccea85e,0
4,4c8a8b93,0
5,29ebb414,0
6,3dc1950f,0
7,8ea4c165,0
8,d8cfe234,0
9,d6bc64df,0


In [26]:
test.to_csv("../knn_n=10.csv")