# Clean data

### Import Libraries

In [None]:
import pandas as pd # Librería para la manipulación y el análisis de datos
import numpy as np # Librería para la manipulación de datos y para la ejecución de operaciones matemáticas
import matplotlib.pyplot as plt # Librería para la visualización de datos
import seaborn as sns # Librería para la visualización de datos
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder # Librería para crear modelos de ML

### Import Dataset

In [None]:
# Read the insurance dataset from the CSV file in the working directory.
# `data` keeps the frame exactly as loaded; every later transformation is
# applied to the independent copy `data2`.
data = pd.read_csv('dataset_seguros_sin_nulos.csv')
data2 = data.copy()
# Preview the first rows of the working copy.
data2.head()

### Analyse dataset

In [None]:
# Column names, non-null counts and dtypes of the working copy.
data2.info()

In [None]:
# (number of rows, number of columns)
data2.shape

In [None]:
# Frequency of every distinct value, column by column.
# The trailing '\n' leaves a blank line between consecutive columns.
for col in data2.columns:
    print(col, data2[col].value_counts(), '\n')

In [None]:
# Print the dtype "kind" code of each column; these one-letter codes are what
# get_vtype_grps inspects to classify the columns.
for col_name, col_dtype in data2.dtypes.items():
    print(col_name, col_dtype.kind)

### Set target

In [None]:
# Column name(s) treated as the prediction target. Kept as a list so that the
# membership test in get_vtype_grps compares whole column names.
target = ["PRECIO"]

### Functions
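`num_list, category_list, bool_list = get_vtype_grps(dataset, target)` — splits the non-target columns of `dataset` into numeric, categorical (object dtype) and binary (two-valued) column lists.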

In [None]:
def get_vtype_grps(dataset, target):
    """Group the non-target columns of ``dataset`` by variable type.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are classified according to ``dtype.kind``.
    target : str or iterable of str
        Column name(s) to leave out of every group. A single name may be
        given as a plain string; it is matched as a whole column name.

    Returns
    -------
    num_list : list
        Integer / unsigned integer / float columns whose number of distinct
        values is not 2.
    category_list : list
        Columns with object dtype.
    bool_list : list
        Columns with bool dtype, plus integer / unsigned integer / float
        columns that hold exactly two distinct values.

    Notes
    -----
    Columns of any other dtype kind are not placed in any group. Distinct
    values are counted with ``Series.unique()``, so a missing value counts
    as one distinct value.
    """
    # Normalise ``target`` to a set of whole names: with a bare string the
    # ``in`` operator would perform substring matching instead.
    excluded = {target} if isinstance(target, str) else set(target)

    num_list, category_list, bool_list = [], [], []

    for name in dataset:
        if name in excluded:
            continue

        kind = dataset[name].dtype.kind
        if kind == "b":
            # Native boolean columns are binary by definition.
            bool_list.append(name)
        elif kind in ("i", "u", "f"):
            # Numeric columns with exactly two distinct values are treated as flags.
            if len(dataset[name].unique()) == 2:
                bool_list.append(name)
            else:
                num_list.append(name)
        elif kind == "O":
            category_list.append(name)

    return num_list, category_list, bool_list

### Transform data

Convert the variable FUMADOR from bool (True, False) to integer (1, 0). 
First create the transformed variable FUMADOR_BOOL, then delete the original variable FUMADOR.

In [None]:
# Cast the FUMADOR flag to an integer under a new column name, then drop the
# original column. (Alternative if the column held strings: map({"SI": 1, "NO": 0}).)
data2 = (
    data2
    .assign(FUMADOR_BOOL=data2["FUMADOR"].astype(int))
    .drop(columns=["FUMADOR"])
)
data2.head()

Convert the variable SEXO from string to integer (MASCULINO → 0, FEMENINO → 1), stored as SEXO_BOOL; the original SEXO column is then deleted.

In [None]:
# Encode SEXO as an integer code (MASCULINO -> 0, FEMENINO -> 1) in a new
# column and drop the original; values missing from the mapping become NaN.
# Alternative encoding: np.where(data2["SEXO"] == "MASCULINO", 0, 1)
data2 = (
    data2
    .assign(SEXO_BOOL=data2["SEXO"].map({"MASCULINO": 0, "FEMENINO": 1}))
    .drop(columns=["SEXO"])
)
data2.head()

### Data analysis II

In [None]:
# Split the non-target columns of data2 into the three lists returned by
# get_vtype_grps; the analysis cells below iterate over these lists.
num_list, category_list, bool_list = get_vtype_grps(data2, target)

In [None]:
# One 20-bin histogram per numeric feature. DataFrame.hist returns an array of
# Axes; it is deliberately not printed so the output contains only the figures.
for col in num_list:
    data2.hist(column=col, bins=20)

In [None]:
# Frequency table for every categorical column, separated by a blank line.
for col in category_list:
    counts = data2[col].value_counts()
    print(counts, '\n')

### Analyse Correlations

In [None]:
# Pairwise correlation between the numeric columns only
# (non-numeric columns are skipped via numeric_only=True).
corr = data2.corr(numeric_only=True)
# Render the matrix as a colour-graded table using the 'coolwarm' colormap.
corr.style.background_gradient(cmap='coolwarm')

Since PESO and IMC are strongly correlated, they carry largely redundant information, so we can drop one of them (here, PESO).

In [None]:
# Remove the PESO column from the working frame and preview the result.
data2 = data2.drop(columns=["PESO"])
data2.head()

In [None]:
# Box plot of the raw price.
sns.boxplot(data=data2, x="PRECIO")

In [None]:
# Add the base-10 logarithm of the price as a new column and preview the frame.
data2 = data2.assign(LOG_PRECIO=np.log10(data2["PRECIO"]))
data2.head()

In [None]:
# Histogram of the log-transformed price (default number of bins).
data2.hist(column="LOG_PRECIO")

In [None]:
# Box plot of the log-transformed price.
sns.boxplot(data=data2, x="LOG_PRECIO")

### Target VS Features

In [None]:
# Update target to include the new log-transformed price, so that any later
# call to get_vtype_grps(data2, target) leaves both price columns out of the
# feature lists.

target = ["PRECIO","LOG_PRECIO"]

In [None]:
# Price distribution for each SEXO_BOOL code.
sns.violinplot(data=data2, x="SEXO_BOOL", y="PRECIO", palette="Wistia")

In [None]:
# Price quartiles and outliers for each SEXO_BOOL code.
sns.boxplot(data=data2, x="SEXO_BOOL", y="PRECIO", palette="Wistia")

In [None]:
# Log-price distribution for each SEXO_BOOL code.
sns.violinplot(data=data2, x="SEXO_BOOL", y="LOG_PRECIO", palette="Wistia")

Fumador VS Precio

In [None]:
# Price distribution for each FUMADOR_BOOL code.
sns.violinplot(data=data2, x="FUMADOR_BOOL", y="PRECIO", palette="Wistia")

In [None]:
# Price quartiles and outliers for each FUMADOR_BOOL code.
sns.boxplot(data=data2, x="FUMADOR_BOOL", y="PRECIO", palette="Wistia")

In [None]:
# Log-price distribution for each FUMADOR_BOOL code.
sns.violinplot(data=data2, x="FUMADOR_BOOL", y="LOG_PRECIO", palette="Wistia")

Tipo Coche VS Precio

In [None]:
# Price quartiles and outliers for each TIPO_COCHE value.
sns.boxplot(data=data2, x="TIPO_COCHE", y="PRECIO", palette="Wistia")

Ciudad VS Precio

In [None]:
# Price quartiles and outliers for each CIUDAD value.
sns.boxplot(data=data2, x="CIUDAD", y="PRECIO", palette="Wistia")