# Cargamos un dataset de ejemplo 

Vamos a cargar un archivo .csv que hemos dejado en una carpeta dentro de nuestro Drive

In [0]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import numpy as np
import pandas as pd

In [0]:
# Load the air-quality CSV from the mounted Drive. The path is anchored at the
# mount point used by drive.mount('/content/gdrive') so the read does not depend
# on the kernel's current working directory.
data1 = pd.read_csv('/content/gdrive/My Drive/Introducción a Machine Learning/Clase/Python/dataset/airquality.csv')

# Summary statistics of the freshly loaded frame.
data1.describe()

In [0]:
# Show the first six rows of the loaded frame.
data1.head(6)

In [0]:
# Overwrite some cells of data1 with NaN, in place:
# rows 3-9 of the column at position 2 and rows 0-4 of the column at position 3.
data1.iloc[3:10,2] = np.nan
data1.iloc[0:5,3] = np.nan

# Summary statistics after the overwrite.
data1.describe()

In [0]:
# Show the first ten rows, which cover the row ranges overwritten with NaN.
data1.head(10)

In [0]:
# Count the missing values in each column.
data1.isna().sum()

# Imputación de datos numéricos

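`sklearn.preprocessing.Imputer` fue eliminado en scikit-learn 0.22; su reemplazo es `sklearn.impute.SimpleImputer`: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html (documentación antigua de `Imputer`: https://sklearn.org/modules/generated/sklearn.preprocessing.Imputer.html)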

In [0]:
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer
# is its replacement and lives in sklearn.impute.
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# strategy: "mean", "median", "most_frequent", "constant"
# SimpleImputer has no `axis` argument: it always imputes column by column
# (what Imputer did with axis=0).

In [0]:
# Fit the imputer on data1, the frame that now contains the injected NaN cells.
imp.fit(data1)

In [0]:
# Apply the fitted imputer to data1 and keep the result.
completedData = imp.transform(data1)
# Inspect what kind of container transform() returned.
type(completedData)

In [0]:
# Wrap the imputed values in a DataFrame that reuses data1's column labels,
# then preview the first rows.
completedData = pd.DataFrame(completedData, columns=data1.columns)
completedData.head()

Otra opción: 

https://scikit-learn.org/stable/modules/impute.html

## Eliminar filas o columnas

In [0]:
# Drop the "Month" and "Day" columns
data2 = data1.drop(columns=["Month", "Day"])

# Drop the rows that contain any NaN. The result is only displayed here:
# it is not assigned, so data2 itself is left unchanged.
data2.dropna(axis="index", how="any")

# axis: 0 or 'index' (drop rows)
#       1 or 'columns' (drop columns)

# how : 'any' (if any NA values are present)
#       'all' (if all values are NA)


In [0]:
# Shape of data2; the dropna() call above was not assigned back to it.
data2.shape

In [0]:
# Preview the first rows of data2.
data2.head()

In [0]:
# Keep only the rows without any NaN (dropna defaults: axis=0, how='any'),
# this time rebinding data2 to the result, and report the new shape.
data2 = data2.dropna()
data2.shape

# Escalamiento

In [0]:
# Rebind data2 to a copy of the imputed frame, so the scaling cells below
# work on completedData's values rather than on the frame with dropped rows.
data2 = completedData.copy()
# One box per column, drawn on the columns' original scales.
data2.boxplot()

## Estandarización

In [0]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

from sklearn.preprocessing import StandardScaler

# Fit the scaler on data2 and transform data2 in a single call;
# `scaler` keeps the fitted state afterwards.
scaler = StandardScaler()
data2_standarized = scaler.fit_transform(data2)

# Inspect what kind of container the scaler returned.
type(data2_standarized)

In [0]:
import matplotlib.pyplot as plt

# Box plot of the standardized columns on the current axes, with the grid toggled on.
_ = plt.gca().boxplot(data2_standarized)
plt.gca().grid()

In [0]:
#print(scaler.mean_)
#print(scaler.scale_)
#print(scaler.var_[0])

## Normalización (MinMax)

In [0]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on data2 and transform data2 in a single call;
# `scalerN` keeps the fitted state afterwards.
scalerN = MinMaxScaler()
data2_normalized = scalerN.fit_transform(data2)

# Inspect what kind of container the scaler returned.
type(data2_normalized)

In [0]:
import matplotlib.pyplot as plt

# Box plot of the min-max scaled columns on the current axes, with the grid toggled on.
_ = plt.gca().boxplot(data2_normalized)
plt.gca().grid()

# Variables Dummy 

In [0]:
#np.repeat(["A","B","C"], 50)

In [0]:
# Preview the imputed frame before the "Country" column is added.
completedData.head(3)

In [0]:
# Add a synthetic categorical column in place: "A", "B", "C" repeated 50 times,
# followed by three extra "A" labels (153 values in total).
completedData["Country"] = np.array(["A", "B", "C"] * 50 + ["A"] * 3)

In [0]:
# Preview the frame again, now including the "Country" column.
completedData.head(3)

In [0]:
# One indicator column per distinct value of "Country".
newVars = pd.get_dummies(completedData["Country"])

In [0]:
# Preview the indicator (dummy) columns.
newVars.head()

In [0]:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

# Place the dummy columns side by side with the original frame.
dataDummy = pd.concat([completedData, newVars], axis="columns")

dataDummy.head(n=3)

# Binarización (variables dicotómicas)

In [0]:
from sklearn.preprocessing import Binarizer

# Threshold "Ozone" and "Wind" at 10 in a single fit-and-transform call.
binarizer = Binarizer(threshold=10)
dataBinary = binarizer.fit_transform(completedData[["Ozone", "Wind"]])

In [0]:
# Inspect what kind of container the binarizer returned.
type(dataBinary)

In [0]:
# New frame: completedData plus the two binarized columns
# (column 0 of dataBinary is Ozone, column 1 is Wind).
data3 = completedData.assign(
    OzoneBin=dataBinary[:, 0],
    WindBin=dataBinary[:, 1],
)
data3.head(10)

# Codificar Categorías

In [0]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the "Country" labels, then map each label to its integer code.
enc = LabelEncoder().fit(completedData["Country"])
dataCod = enc.transform(completedData["Country"])

In [0]:
# New frame: completedData plus the integer-encoded country column.
data4 = completedData.assign(CountryCod=dataCod)
data4.head()

# Generar características polinómicas y de interacción

In [0]:
# Preview the frame whose numeric columns feed the polynomial expansion below.
completedData.head(3)

In [0]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial and interaction terms of the four measurement columns,
# fitted and generated in a single call.
poly = PolynomialFeatures(degree=2)
tmp = poly.fit_transform(completedData[["Ozone", "Solar.R", "Wind", "Temp"]])

In [0]:
# PolynomialFeatures(2) with its default include_bias=True turns the 4 inputs
# (a, b, c, d) into 15 columns, in this order:
# 1 a b c d a**2 ab ac ad b**2 bc bd c**2 cd d**2
tmp.shape

In [0]:
# All generated features for the first row.
tmp[0, :]

In [0]:
# First six rows, first four generated columns.
tmp[0:6,0:4]

# Matriz de correlación

In [0]:
# Column dtypes; the "Country" column added earlier holds strings, not numbers.
completedData.dtypes

In [0]:
# Correlate the numeric columns only. completedData also holds the string
# column "Country", and since pandas 2.0 DataFrame.corr() no longer drops
# non-numeric columns silently: it raises on them.
completedData.select_dtypes(include="number").corr()

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10,10))
# Heatmap of the correlation matrix of the numeric columns only: the string
# column "Country" makes DataFrame.corr() raise in pandas >= 2.0, so it is
# excluded explicitly before correlating.
sns.heatmap(completedData.select_dtypes(include="number").corr(), square=True, annot=True)

# Reducción de dimensionalidad

In [0]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [0]:
# http://www.ub.edu/stat/docencia/Mates/ejemploACP.PDF
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Boolean mask over the columns, True for the numeric ones. Selecting by
# "number" instead of testing `dtype != "object"` also keeps out string and
# categorical dtypes, which StandardScaler cannot process.
indextmp = completedData.columns.to_series().isin(
    completedData.select_dtypes(include="number").columns
)

# Standardize the numeric columns; X is rebound from the selected DataFrame
# to the scaled values returned by the scaler.
X = completedData.iloc[:,indextmp.values]
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [0]:
# Fit a two-component PCA on the standardized matrix, then project X onto it.
pca = PCA(n_components=2).fit(X)
Xnew = pca.transform(X)

In [0]:
# Shape of the projected data.
Xnew.shape

In [0]:
# Inspect what kind of container the PCA transform returned.
type(Xnew)

In [0]:
# Wrap the two projected components in a labelled DataFrame.
Xnew_df = pd.DataFrame(Xnew, columns=["pc1", "pc2"])

**Consideraciones al reducir dimensionalidad**

Lo ideal es que las nuevas variables cubran al menos el 90 % de la varianza total.

In [0]:
# Variance explained by each retained component.
pca.explained_variance_

In [0]:
# Fraction of the total variance explained by each retained component.
pca.explained_variance_ratio_

In [0]:
# Running total of the explained-variance fractions across the components.
pca.explained_variance_ratio_.cumsum()

In [0]:
# Refit the PCA asking for 0.9 as n_components (a variance fraction rather than
# a component count), project X again and report the resulting shape.
# Note that `pca` and `Xnew` are rebound here.
pca = PCA(n_components=0.9).fit(X)
Xnew = pca.transform(X)
Xnew.shape

In [0]:
# Shape of the standardized input, for comparison with the projected shape.
X.shape