# Resultados

Ajustamos os modelos definidos e calculamos a métrica da competição para facilitar a comparação com os modelos dos colegas de classe.

### Bibliotecas Necessárias

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
import math

### Carregar os dados

In [5]:
data_coupon1 = pd.read_csv('data/data_coupon1.csv', sep='|').set_index('Unnamed: 0')
data_coupon2 = pd.read_csv('data/data_coupon2.csv', sep='|').set_index('Unnamed: 0')
data_coupon3 = pd.read_csv('data/data_coupon3.csv', sep='|').set_index('Unnamed: 0')
y_basket = pd.read_csv('data/DMC_2015_orders_train.txt', sep='|')['basketValue']

In [6]:
data = data_coupon1.merge(data_coupon2, on=['orderTime', 'couponsReceived'])
data = data.merge(data_coupon3, on=['orderTime', 'couponsReceived'])

X = data.copy()
del X['coupon1Used']
del X['coupon2Used']
del X['coupon3Used']

In [7]:
y_coupon1 = data['coupon1Used']
y_coupon2 = data['coupon2Used']
y_coupon3 = data['coupon3Used']

### Modelos Selecionados

In [8]:
knnCoupon1 = KNeighborsClassifier(n_neighbors=35, n_jobs=-1)
knnCoupon2 = KNeighborsClassifier(n_neighbors=51, n_jobs=-1)
knnCoupon3 = KNeighborsClassifier(n_neighbors=41, n_jobs=-1)
knnBasket = KNeighborsRegressor(n_neighbors=101, n_jobs=-1)

In [9]:
knnCoupon1.fit(X, y_coupon1)
knnCoupon2.fit(X, y_coupon2)
knnCoupon3.fit(X, y_coupon3)
knnBasket.fit(X, y_basket)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=101, p=2,
          weights='uniform')

### Tratamento dos dados da competição

In [10]:
data_class = pd.read_csv('data/DMC_2015_orders_class.txt', sep='|')

In [11]:
del data_class['userID']
del data_class['orderID']
del data_class['couponID1']
del data_class['couponID2']
del data_class['couponID3']
data_class['orderTime'] = pd.DatetimeIndex(data_class['orderTime']).astype('int')
data_class['couponsReceived'] = pd.DatetimeIndex(data_class['couponsReceived']).astype('int')

In [12]:
data_class_coupon1 = data_class.copy()
data_class_coupon2 = data_class.copy()
data_class_coupon3 = data_class.copy()

In [13]:
data_class_coupon1 = data_class_coupon1.loc[:,[(x[-1] != '2' and x[-1] != '3') for x in data_class_coupon1.columns]].iloc[:,:-3]

data_class_coupon2 = data_class_coupon2.loc[:,[(x[-1] != '1' and x[-1] != '3') for x in data_class_coupon2.columns]].iloc[:,:-2]
del data_class_coupon2['coupon1Used']

data_class_coupon3 = data_class_coupon3.loc[:,[(x[-1] != '1' and x[-1] != '2') for x in data_class_coupon3.columns]].iloc[:,:-1]
del data_class_coupon3['coupon1Used']
del data_class_coupon3['coupon2Used']

In [14]:
data_class_coupon1 = pd.get_dummies(data_class_coupon1, columns=["brand1", "productGroup1"])
data_class_coupon2 = pd.get_dummies(data_class_coupon2, columns=["brand2", "productGroup2"])
data_class_coupon3 = pd.get_dummies(data_class_coupon3, columns=["brand3", "productGroup3"])

In [15]:
vetor1 = [d.split(',') for d in data_class_coupon1['categoryIDs1']]
categoryIDs1 = set(x for l in vetor1 for x in l)
vetor2 = [d.split(',') for d in data_class_coupon2['categoryIDs2']]
categoryIDs2 = set(x for l in vetor2 for x in l)
vetor3 = [d.split(',') for d in data_class_coupon3['categoryIDs3']]
categoryIDs3 = set(x for l in vetor3 for x in l)

In [16]:
d1 = dict()
for c in categoryIDs1:
    d1['categoryIDs1_'+c] = []
for i in vetor1:
    for c in categoryIDs1:
        if c in i:
            d1['categoryIDs1_'+c].append(1.0)
        else:
            d1['categoryIDs1_'+c].append(0)
            
d2 = dict()
for c in categoryIDs2:
    d2['categoryIDs2_'+c] = []
for i in vetor2:
    for c in categoryIDs2:
        if c in i:
            d2['categoryIDs2_'+c].append(1.0)
        else:
            d2['categoryIDs2_'+c].append(0)     
            
d3 = dict()
for c in categoryIDs3:
    d3['categoryIDs3_'+c] = []
for i in vetor3:
    for c in categoryIDs3:
        if c in i:
            d3['categoryIDs3_'+c].append(1.0)
        else:
            d3['categoryIDs3_'+c].append(0)     

In [17]:
data_class_coupon1 = data_class_coupon1.join(pd.DataFrame(d1))
del data_class_coupon1['categoryIDs1']
data_class_coupon2 = data_class_coupon2.join(pd.DataFrame(d2))
del data_class_coupon2['categoryIDs2']
data_class_coupon3 = data_class_coupon3.join(pd.DataFrame(d3))
del data_class_coupon3['categoryIDs3']

In [18]:
data_class_coupon1.fillna(data_class_coupon1.mean())
data_class_coupon2.fillna(data_class_coupon2.mean())
data_class_coupon3.fillna(data_class_coupon3.mean())

def normalize(df):
    return (df - df.min())/(df.max() - df.min())

data_class_coupon1 = normalize(data_class_coupon1)
data_class_coupon2 = normalize(data_class_coupon2)
data_class_coupon3 = normalize(data_class_coupon3)

In [19]:
data_c_s = data_class_coupon1.merge(data_class_coupon2, on=['orderTime', 'couponsReceived'])
data_c_s = data_c_s.merge(data_class_coupon3, on=['orderTime', 'couponsReceived'])

X_class = data_c_s.copy()

In [20]:
del X_class['coupon1Used']
del X_class['coupon2Used']
del X_class['coupon3Used']


In [21]:
col_to_delete = [c for c in X.columns if 'category' in c]
for c in col_to_delete:
    del X[c]

In [22]:
col_to_delete = [c for c in X_class.columns if 'category' in c]
for c in col_to_delete:
    del X_class[c]

### Treinamento dos modelos na base da competição

In [23]:
knnCoupon1.fit(X, y_coupon1)
knnCoupon2.fit(X, y_coupon2)
knnCoupon3.fit(X, y_coupon3)
knnBasket.fit(X, y_basket)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=101, p=2,
          weights='uniform')

In [24]:
train_brands = [c for c in X.columns if 'brand' in c]
class_brands = [c for c in X_class.columns if 'brand' in c]

In [25]:
different_brands = [c for c in train_brands if c not in class_brands]

In [76]:
for c in different_brands:
    del X[c]

In [26]:
different_brands = [c for c in class_brands if c not in train_brands]

In [78]:
for c in different_brands:
    del X_class[c]

In [30]:
class_groups = [c for c in X_class.columns if 'Group' in c]

In [31]:
train_groups = [c for c in X.columns if 'Group' in c]

In [32]:
different_groups = [c for c in train_groups if c not in class_groups]

In [33]:
different_groups = [c for c in class_groups if c not in train_groups]

In [93]:
for g in different_groups:
    del X[g]

In [95]:
for g in different_groups:
    del X_class[g]

### Previsões e cálculo da métrica da competição

In [100]:
predicted_coupon1 = knnCoupon1.predict(X_class)
predicted_coupon2 = knnCoupon2.predict(X_class)
predicted_coupon3 = knnCoupon3.predict(X_class)
predicted_basket = knnBasket.predict(X_class)

In [105]:
results = pd.read_csv('data/DMC_2015_realclass.txt', sep='|')

In [107]:
def E(pre_c1, pre_c2, pre_c3, pre_basket, real_c1, real_c2, real_c3, real_basket):
    c1 = []
    c2 = []
    c3 = []
    basket = []
    for i in range(0, len(pre_c1)):
        c1.append(math.pow(math.fabs(real_c1[i]-pre_c1[i])/(sum(real_c1)/len(real_c1)), 2))
        c2.append(math.pow(math.fabs(real_c2[i]-pre_c2[i])/(sum(real_c2)/len(real_c2)), 2))
        c3.append(math.pow(math.fabs(real_c3[i]-pre_c3[i])/(sum(real_c3)/len(real_c3)), 2))
        basket.append(math.pow(math.fabs(real_basket[i]-pre_basket[i])/(sum(real_basket)/len(real_basket)), 2))
    return sum(c1+c2+c3+basket)

In [108]:
E(predicted_coupon1, predicted_coupon2, predicted_coupon3, predicted_basket, results['coupon1Used'], results['coupon2Used'], results['coupon3Used'], results['basketValue'])

15136.2378092399