In [None]:
# Modelo y performance
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Visualización de la data
import matplotlib.pyplot as plt
import seaborn as sns

# Manipulación de la data
import pandas as pd
import numpy as np

In [None]:
def plot_corre_heatmap(corr):
    '''
    Draw an annotated heatmap of a correlation matrix.

    Receives a dataframe holding the correlations between columns and
    displays it on a 30x26 figure with two-decimal cell annotations.
    '''
    heatmap_options = {
        'cbar': True,
        'square': False,
        'annot': True,
        'fmt': '.2f',
        'annot_kws': {'size': 15},
        'cmap': 'coolwarm',
    }

    plt.figure(figsize=(30, 26))
    sns.heatmap(corr, **heatmap_options)

    # Display workaround: move the bottom y-limit by +0.5 and the top
    # y-limit by -0.5 before showing the figure.
    lower, upper = plt.ylim()
    plt.ylim(lower + 0.5, upper - 0.5)
    plt.show()

In [None]:
def transform(df, target='', test_size=False, random_state=False, ros=False):
    '''
    Encode, binarize and standardize an order dataframe for classification.

    Steps performed:
      1. Ordinal-encode the id/state columns and one-hot encode
         ``payment_type`` (the original ``payment_type`` column is dropped).
      2. Binarize ``target``: values <= 3 become 0, values > 3 become 1.
      3. If ``test_size`` is truthy, split into train/test, optionally
         oversample the training split, and standardize with statistics
         learned from the training split only. Otherwise standardize the
         whole feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target``, ``payment_type`` and the columns listed in
        ``columns_to_ordinal``. The caller's dataframe is not modified.
    target : str
        Name of the score column to binarize and use as label.
    test_size : float, int or False
        Forwarded to ``train_test_split``. A falsy value disables the split.
    random_state : int or False
        Forwarded to ``train_test_split`` and to the oversampler.
    ros : float, str, dict or False
        ``sampling_strategy`` for ``RandomOverSampler``. A falsy value
        disables oversampling. Only used when ``test_size`` is truthy.

    Returns
    -------
    (X_Train, X_Test, Y_Train, Y_Test) when ``test_size`` is truthy,
    otherwise (X, y). Feature matrices are the arrays produced by
    ``StandardScaler``; labels are single-column dataframes.

    Notes
    -----
    A fresh OrdinalEncoder, OneHotEncoder and StandardScaler are fit on
    every call, so features produced by two separate calls (for example a
    training file and a prediction file) do not share category codes,
    one-hot columns or scaling statistics.
    NOTE(review): reusing the transformers fitted on the training data would
    require returning them, which changes this function's interface.
    '''
    # Work on a copy so the caller's dataframe is left untouched.
    df = df.copy()

    ord_enc = OrdinalEncoder(dtype=np.int64)
    one_hot = OneHotEncoder(dtype=np.int64)

    columns_to_ordinal = ['product_id', 'seller_id', 'customer_id', 'cod_estado_customer', 'cod_estado_seller']
    columns_to_one_hot = ['payment_type']

    df[columns_to_ordinal] = ord_enc.fit_transform(df[columns_to_ordinal])

    one_hot_values = one_hot.fit_transform(df[columns_to_one_hot]).toarray()
    one_hot_labels = np.concatenate(one_hot.categories_).tolist()
    # Reuse df's index so the concat aligns row by row even when df does not
    # carry a default RangeIndex (e.g. after filtering).
    df_one_hot = pd.DataFrame(one_hot_values, columns=one_hot_labels, index=df.index)
    df = pd.concat([df, df_one_hot], axis=1).drop(columns=columns_to_one_hot)

    # Binarize the target: <= 3 -> 0, > 3 -> 1.
    scores = df[target]
    df[target] = scores.mask(scores <= 3, 0).mask(scores > 3, 1)

    y = df[[target]]
    X = df.drop(target, axis=1)

    scaler = StandardScaler()

    if not test_size:
        return scaler.fit_transform(X), y

    X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    if ros:
        # Oversample AFTER splitting and only on the training split: resampling
        # before the split puts copies of the same rows in both train and test,
        # which leaks information and inflates the test metrics.
        sampler = RandomOverSampler(sampling_strategy=ros, random_state=random_state)
        X_Train, Y_Train = sampler.fit_resample(X_Train, Y_Train)

    # Scaling statistics come from the training split only.
    X_Train = scaler.fit_transform(X_Train)
    X_Test = scaler.transform(X_Test)

    return X_Train, X_Test, Y_Train, Y_Test

In [None]:
# Forward slash works on every OS; a backslash followed by 'o' is an invalid
# escape sequence inside a regular string literal.
df_train_test = pd.read_csv('datasets/olist_to_train_test.csv')
X_Train, X_Test, Y_Train, Y_Test = transform(df_train_test, target='review_score', test_size=0.2, random_state=40, ros=0.8)

In [None]:
# Build a single frame (standardized features + label) to compute correlations on.

b = Y_Train.reset_index(drop=True)
a = pd.concat([pd.DataFrame(X_Train.copy()), b], axis=1, ignore_index=True)


In [None]:
#corr = a.corr()
#plot_corre_heatmap(corr)

In [None]:
# Fixed seed so the fitted forest and its predictions are reproducible across runs.
rfc_model = RandomForestClassifier(random_state=40)
rfc_model.fit(X_Train, Y_Train.values.ravel())
rfc_Y_Pred = rfc_model.predict(X_Test)

In [None]:
# Gaussian Naive Bayes: fit on the training split, then predict the test split.
naives_model = GaussianNB().fit(X_Train, Y_Train.to_numpy().ravel())
gaus_Y_Pred = naives_model.predict(X_Test)

In [None]:
# Fixed seed so the fitted tree and its predictions are reproducible across runs.
tree_model = DecisionTreeClassifier(random_state=40)
tree_model.fit(X_Train, Y_Train.values.ravel())
tree_Y_Pred = tree_model.predict(X_Test)

In [None]:
# k-nearest neighbours (k=5): fit on the training split, then predict the test split.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_Train, Y_Train.to_numpy().ravel())
knn_Y_Pred = knn_model.predict(X_Test)

In [None]:
# Linear-kernel SVM: fit on the training split, then predict the test split.
svc_model = SVC(kernel='linear').fit(X_Train, Y_Train.to_numpy().ravel())
svc_Y_Pred = svc_model.predict(X_Test)

In [None]:
# Forward slash works on every OS; a backslash followed by 'o' is an invalid
# escape sequence inside a regular string literal.
df_to_pred = pd.read_csv('datasets/olist_to_predict.csv')
X_to_pred, y_real = transform(df_to_pred, target='review_score')

In [None]:
# Predict the prediction dataset with every fitted model, in the same order
# as the target names on the left-hand side.
rfc_Y_Pred2, gaus_Y_Pred2, tree_Y_Pred2, knn_Y_Pred2, svc_Y_Pred2 = (
    fitted_model.predict(X_to_pred)
    for fitted_model in (rfc_model, naives_model, tree_model, knn_model, svc_model)
)


In [None]:
# RandomForestClassifier
# 'Test' is the hold-out split (Y_Test); 'Real' is the separate prediction dataset (y_real).
# Each label is printed on its own line so it does not shift the report's header row.
print('Test', classification_report(Y_Test, rfc_Y_Pred), 'Real', classification_report(y_real, rfc_Y_Pred2), sep='\n')

In [None]:
# GaussianNB
# 'Test' is the hold-out split (Y_Test); 'Real' is the separate prediction dataset (y_real).
# Each label is printed on its own line so it does not shift the report's header row.
print('Test', classification_report(Y_Test, gaus_Y_Pred), 'Real', classification_report(y_real, gaus_Y_Pred2), sep='\n')

In [None]:
# DecisionTreeClassifier
# 'Test' is the hold-out split (Y_Test); 'Real' is the separate prediction dataset (y_real).
# Each label is printed on its own line so it does not shift the report's header row.
print('Test', classification_report(Y_Test, tree_Y_Pred), 'Real', classification_report(y_real, tree_Y_Pred2), sep='\n')

In [None]:
# KNeighborsClassifier
# 'Test' is the hold-out split (Y_Test); 'Real' is the separate prediction dataset (y_real).
# Each label is printed on its own line so it does not shift the report's header row.
print('Test', classification_report(Y_Test, knn_Y_Pred), 'Real', classification_report(y_real, knn_Y_Pred2), sep='\n')

In [None]:
# SVC
# 'Test' is the hold-out split (Y_Test); 'Real' is the separate prediction dataset (y_real).
# Each label is printed on its own line so it does not shift the report's header row.
print('Test', classification_report(Y_Test, svc_Y_Pred), 'Real', classification_report(y_real, svc_Y_Pred2), sep='\n')

In [None]:
# Disabled code, kept as a bare triple-quoted string so that it is never executed.
# The enclosed snippet connects to the SQLite database 'olist.db', runs a query
# joining order items with orders, products, product categories, customers,
# sellers, payments and geolocation, and loads the rows into a DataFrame `df`
# whose column names are taken from the cursor description.
# The query excludes canceled orders, excludes rows whose customer delivery date
# equals '2016-01-01 00:00:00.000000', and keeps purchases between '2018-08-22'
# and the current time.
'''
import sqlite3 as sql

query = """SELECT
    oi.product_id, oi.seller_id, strftime('%s', oi.shipping_limit_date) shipping_limit_date, oi.price, oi.freight_value,
    o.customer_id, strftime('%s', o.order_purchase_timestamp) order_purchase_timestamp, strftime('%s', o.order_approved_at) order_approved_at, strftime('%s', o.order_delivered_carrier_date) order_delivered_carrier_date, strftime('%s', o.order_delivered_customer_date) order_delivered_customer_date, strftime('%s', o.order_estimated_delivery_date) order_estimated_delivery_date,
    p.product_name_lenght, p.product_description_lenght, p.product_photos_qty, p.product_weight_g, p.product_length_cm, p.product_height_cm, p.product_width_cm,
    pc.product_category_id,
    c.CEP AS CEP_customer,
    g_cus.cod_estado AS cod_estado_customer,
    s.CEP AS CEP_seller,
    g_sell.cod_estado AS cod_estado_seller,
    op.payment_sequential, op.payment_type, op.payment_installments, op.payment_value
FROM order_items oi
JOIN orders o ON (oi.order_id = o.order_id)
JOIN products p ON (oi.product_id = p.product_id)
JOIN product_category pc ON (p.product_category_id = pc.product_category_id)
JOIN customers c ON (o.customer_id = c.customer_id)
JOIN geolocation g_cus ON (c.CEP = g_cus.CEP)
JOIN sellers s ON (oi.seller_id = s.seller_id)
JOIN order_payments op ON (op.order_id = o.order_id)
JOIN geolocation g_sell ON (s.CEP = g_sell.CEP)
WHERE o.order_status != 'canceled'
AND o.order_delivered_customer_date != '2016-01-01 00:00:00.000000'
AND o.order_purchase_timestamp BETWEEN '2018-08-22' AND DATETIME('now')"""

conn = sql.connect('olist.db')
cursor = conn.cursor()
cursor.execute(query)
nombres_columnas = [desc[0] for desc in cursor.description]
resultados = cursor.fetchall()
df = pd.DataFrame(resultados, columns=nombres_columnas)
conn.close()
'''  