In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import helpers.Utilidades as utilidades
import helpers.Preprocessors as pp
from sklearn.svm import SVC

In [2]:
dataset = pd.read_csv("./data/dataset.csv")
dataset.head()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,5/07/2003 00:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/01/2003 00:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,10/10/2003 00:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


In [3]:
columnasNumericasDiscretas, columnasNumericasContinuas, columnasCategoricas, columnasFecha = utilidades.ObtenerTiposColumnas(dataset)

In [4]:
y = "DEALSIZE"
columnasCategoricas.remove(y)
columnasCategoricas.remove("ORDERDATE")
columnasCategoricas.remove("TERRITORY")
columnasCategoricas.remove("ADDRESSLINE2")
columnasCategoricas.remove("STATE")
columnasNumericasDiscretas.remove("ORDERLINENUMBER")
columnasNumericasContinuas.remove("ORDERNUMBER")

# FEATURE ENGINEERING

## Variables Categoricas

In [5]:
dataset[columnasCategoricas].isnull().mean()

STATUS              0.000000
PRODUCTLINE         0.000000
PRODUCTCODE         0.000000
CUSTOMERNAME        0.000000
PHONE               0.000000
ADDRESSLINE1        0.000000
CITY                0.000000
POSTALCODE          0.026632
COUNTRY             0.000000
CONTACTLASTNAME     0.000000
CONTACTFIRSTNAME    0.000000
dtype: float64

## Varabiles Númericas Discretas

In [6]:
dataset[columnasNumericasDiscretas].isnull().mean()

QTR_ID      0.0
MONTH_ID    0.0
YEAR_ID     0.0
dtype: float64

## Variables Númericas Continuas 

In [7]:
dataset[columnasNumericasContinuas].isnull().mean()

QUANTITYORDERED    0.0
PRICEEACH          0.0
SALES              0.0
MSRP               0.0
dtype: float64

In [8]:
dataset[columnasNumericasContinuas]

Unnamed: 0,QUANTITYORDERED,PRICEEACH,SALES,MSRP
0,30,95.70,2871.00,95
1,34,81.35,2765.90,95
2,41,94.74,3884.34,95
3,45,83.26,3746.70,95
4,49,100.00,5205.27,95
...,...,...,...,...
2661,20,100.00,2244.40,54
2662,29,100.00,3978.51,54
2663,43,100.00,5417.57,54
2664,34,62.24,2116.16,54


## Pipeline

## Prepración

In [9]:
def instanciatePipeline(df, y):
    columnasNumericasDiscretas, columnasNumericasContinuas, columnasCategoricas, columnasFecha = utilidades.ObtenerTiposColumnas(df)
    
    columnasCategoricas.remove(y)
    columnasCategoricas.remove("ORDERDATE")
    columnasCategoricas.remove("TERRITORY")
    columnasCategoricas.remove("ADDRESSLINE2")
    columnasCategoricas.remove("STATE")
    columnasNumericasDiscretas.remove("ORDERLINENUMBER")
    columnasNumericasContinuas.remove("ORDERNUMBER")

    bankChurner_Pipeline = Pipeline(steps=[
        ('frequency-encoder',
            pp.CategoricalEncoderOperator(columnasCategoricas)),
        ('CCA-encoder',
            pp.CategoricalCCAEnconderOperator("POSTALCODE")),
        ('outlier-treatment',
            pp.OutliersTreatmentOperator(columnas= columnasNumericasContinuas))
    ])

    return bankChurner_Pipeline

In [10]:
X = dataset.drop(['ORDERNUMBER', 'ORDERLINENUMBER', 'ORDERDATE', 'TERRITORY', 'ADDRESSLINE2', 'STATE', 'DEALSIZE'], axis=1)
y = dataset['DEALSIZE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=2022)
dfSalida = instanciatePipeline(dataset, 'DEALSIZE').fit_transform(X_train, y_train)
dfSalida["DEALSIZE"] = pd.get_dummies(y, drop_first=True)

In [11]:
dfSalida.to_csv("./data/fe_dataset.csv")

## Aplicación Modelo

In [26]:
def instanciatePipelinePrediction():

    bankChurner_Pipeline = Pipeline(steps=[
        ('classifier',
            SVC(C= 0.1, kernel = "linear"))
    ])

    return bankChurner_Pipeline

In [27]:
datasetProcesado = pd.read_csv("./data/fe_dataset.csv")
X_fe = datasetProcesado.drop(["DEALSIZE"], axis = 1)
y_fe = datasetProcesado["DEALSIZE"]

X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(X_fe, y_fe, test_size=0.3, shuffle=True, random_state=2022)
lr = instanciatePipelinePrediction().fit(X_train_fe, y_train_fe)
y_fe_prediccion = pd.DataFrame(lr.predict(X_test_fe), columns=["predicción"])
y_fe_prediccion["y_fe"] = y_fe
y_fe_prediccion

Unnamed: 0,predicción,y_fe
0,0,0
1,0,1
2,1,1
3,0,0
4,0,0
...,...,...
541,1,0
542,0,0
543,1,0
544,1,1
