# Tratamento dos Dados Brutos

### Bibliotecas Necessárias

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

### Importar os dados

Depois de importar os dados, é necessário remover as colunas que não serão utilizadas, ou seja, as colunas de identificadores.

Em seguida transformamos as strings de data para valores inteiros.

In [2]:
# Load the pipe-separated training file.
data = pd.read_csv('data/DMC_2015_orders_train.txt', sep="|")

# Identifier columns are not used as features, so drop them all at once.
data = data.drop(
    columns=['userID', 'orderID', 'couponID1', 'couponID2', 'couponID3']
)

# Turn the date strings into pandas' integer timestamp representation.
# 'int64' is spelled out because plain 'int' is platform-dependent: where it
# resolves to a 32-bit integer, pandas rejects (or overflows on) the
# datetime -> integer conversion.
for time_column in ('orderTime', 'couponsReceived'):
    data[time_column] = pd.DatetimeIndex(data[time_column]).astype('int64')

### Criação dos dados para cada classificador

Cada classificador terá seu próprio conjunto de dados, e cada conjunto receberá um tratamento para eliminar as informações dos demais.

In [3]:
# One independent copy of the cleaned data per coupon classifier, so each can
# be trimmed separately without affecting the others.
data_coupon1, data_coupon2, data_coupon3 = (data.copy() for _ in range(3))

In [4]:
def _without_suffixes(frame, suffixes):
    """Return `frame` keeping only columns whose last character is not in `suffixes`."""
    keep = [name[-1] not in suffixes for name in frame.columns]
    return frame.loc[:, keep]


# NOTE(review): the trailing `.iloc[:, :-k]` trims the last k remaining
# columns by position, so it relies on the column order of the input file.

# Coupon 1: remove columns ending in '2' or '3', then the last three columns.
data_coupon1 = _without_suffixes(data_coupon1, ('2', '3')).iloc[:, :-3]

# Coupon 2: remove columns ending in '1' or '3', then the last two columns,
# and finally coupon 1's usage column.
data_coupon2 = _without_suffixes(data_coupon2, ('1', '3')).iloc[:, :-2]
data_coupon2 = data_coupon2.drop(columns=['coupon1Used'])

# Coupon 3: remove columns ending in '1' or '2', then the last column, and
# finally the usage columns of coupons 1 and 2.
data_coupon3 = _without_suffixes(data_coupon3, ('1', '2')).iloc[:, :-1]
data_coupon3 = data_coupon3.drop(columns=['coupon1Used', 'coupon2Used'])

### Substituir valores categóricos

Para cada valor distinto de uma coluna categórica, incluímos uma nova coluna com valores \[1, 0\] indicando presença ou ausência.

In [5]:
# One-hot encode the categorical brand / product-group columns of each
# coupon's frame: every distinct value becomes its own indicator column.
data_coupon1, data_coupon2, data_coupon3 = [
    pd.get_dummies(frame, columns=categorical)
    for frame, categorical in (
        (data_coupon1, ["brand1", "productGroup1"]),
        (data_coupon2, ["brand2", "productGroup2"]),
        (data_coupon3, ["brand3", "productGroup3"]),
    )
]

In [7]:
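# Disabled exploratory check: largest number of comma-separated entries
# found in a single 'categoryIDs1' value.
# vetor = [d.split(',') for d in data_coupon1['categoryIDs1']]
# tam = [len(d) for d in vetor]
# print(max(tam))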

In [10]:
# The comma-separated category ID lists are not encoded as features; drop them.
data_coupon1 = data_coupon1.drop(columns=['categoryIDs1'])
data_coupon2 = data_coupon2.drop(columns=['categoryIDs2'])
data_coupon3 = data_coupon3.drop(columns=['categoryIDs3'])

In [11]:
# Replace missing values with the mean of their column.
# `fillna` returns a new DataFrame instead of modifying the caller, so the
# result has to be assigned back; otherwise the imputation is thrown away.
data_coupon1 = data_coupon1.fillna(data_coupon1.mean())
data_coupon2 = data_coupon2.fillna(data_coupon2.mean())
data_coupon3 = data_coupon3.fillna(data_coupon3.mean())

def normalize(df):
    """Min-max scale `df` so that each column spans the interval [0, 1].

    Args:
        df: A pandas DataFrame (scaled column by column) or Series of
            numeric values.

    Returns:
        A new object of the same shape holding
        ``(df - min) / (max - min)``. Columns whose values are all equal
        have a zero range; they are mapped to 0 instead of NaN.
    """
    lowest = df.min()
    span = df.max() - lowest
    # Avoid 0/0 on constant columns: wherever the range is zero, `span == 0`
    # is True (i.e. 1), so the divisor becomes 1 and the column scales to 0.
    # Written as arithmetic so it works for both a per-column Series and the
    # scalar range obtained when `df` is itself a Series.
    span = span + (span == 0)
    return (df - lowest) / span

# Rescale every coupon frame with the same min-max normalisation.
data_coupon1, data_coupon2, data_coupon3 = map(
    normalize, (data_coupon1, data_coupon2, data_coupon3)
)

In [15]:
def makeAccsKnn(X, y, test_values):
    """Cross-validate a k-NN classifier for every candidate k.

    Args:
        X: Feature data handed to `cross_val_score`.
        y: Target labels handed to `cross_val_score`.
        test_values: Iterable of `n_neighbors` values to try.

    Returns:
        A tuple ``(accs, maxi, best_k)``: the list of mean 10-fold scores
        (one per candidate, in iteration order), the highest mean score,
        and the k that produced it. Ties keep the earliest k. If no score
        exceeds 0 (including when `test_values` is empty), the best score
        and k stay at their initial values ``0.`` and ``1``.
    """
    mean_scores = []
    top_score, top_k = 0., 1
    for n_neighbors in test_values:
        model = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1)
        mean_score = np.mean(cross_val_score(model, X, y, cv=10))
        mean_scores.append(mean_score)
        # Strict comparison: only a strictly better score replaces the best.
        if mean_score > top_score:
            top_score, top_k = mean_score, n_neighbors
    return mean_scores, top_score, top_k

In [16]:
def plotAcc(X, y, test_values):
    """Plot the mean cross-validation score for each k and print the best one.

    Args:
        X: Feature data forwarded to `makeAccsKnn`.
        y: Target labels forwarded to `makeAccsKnn`.
        test_values: Sequence of k values; also used as the x axis.

    NOTE(review): `plt` is not imported in the visible cells; presumably it
    is `matplotlib.pyplot`. Confirm it is imported before calling this.
    """
    scores, best_score, best_neighbors = makeAccsKnn(X, y, test_values)

    plt.plot(test_values, scores)
    plt.xlabel('K')
    plt.ylabel('Acurácia')
    plt.title('Acurácias por valores de K')
    plt.show()

    print(f"{'Maior acurácia: '}{best_score!s}")
    print(f"{'Melhor k: '}{best_neighbors!s}")

In [None]:
# Coupon 1: split the target column from the features, then sweep the odd
# values of k from 1 to 201 and plot the resulting scores.
y = data_coupon1['coupon1Used']
X = data_coupon1.drop(columns=['coupon1Used'])
plotAcc(X, y, list(range(1, 203, 2)))

In [12]:
# Coupon 1: mean 10-fold cross-validation score.
# NOTE(review): `knn` is not assigned at module level in the visible cells
# (it only exists as a local inside makeAccsKnn); it must already be defined
# by an earlier cell for this to run.
y = data_coupon1['coupon1Used']
X = data_coupon1.drop(columns=['coupon1Used'])

scores = cross_val_score(knn, X, y, cv=10)
scores.mean()

0.76160872348523223

In [13]:
# Coupon 2: mean 10-fold cross-validation score.
# NOTE(review): `knn` is not assigned at module level in the visible cells
# (it only exists as a local inside makeAccsKnn); it must already be defined
# by an earlier cell for this to run.
y = data_coupon2['coupon2Used']
X = data_coupon2.drop(columns=['coupon2Used'])

scores = cross_val_score(knn, X, y, cv=10)
scores.mean()

0.8134810651444434

In [14]:
# Coupon 3: mean 10-fold cross-validation score.
# NOTE(review): `knn` is not assigned at module level in the visible cells
# (it only exists as a local inside makeAccsKnn); it must already be defined
# by an earlier cell for this to run.
y = data_coupon3['coupon3Used']
X = data_coupon3.drop(columns=['coupon3Used'])

scores = cross_val_score(knn, X, y, cv=10)
scores.mean()

0.83347143925711065