# Principal Component Analysis (PCA)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [None]:
# Toy example used to explain the model parameters: 200 correlated 2-D points.

# Seeded generator so every run draws the same sample.
rng = np.random.RandomState(1)

# A random 2x2 matrix mixes two rows of standard-normal draws; the transpose
# leaves one sample per row, i.e. an array of shape (200, 2).
components = rng.rand(2, 2)
X = components.dot(rng.randn(2, 200)).T

# Scatter the first column against the second, with equal axis scaling.
plt.scatter(*X.T)
plt.axis('equal');

In [None]:
from sklearn.decomposition import PCA
# X has two columns, so n_components=2 keeps every principal component.
pca = PCA(n_components=2)
# Fit on the toy data; as the last expression of the cell, the fitted
# estimator is also what the notebook displays.
pca.fit(X)

In [None]:
# Project the toy samples onto the fitted principal components.
X_pca = pca.transform(X)

In [None]:
# components_ holds one principal axis per row; explained_variance_ holds the
# variance of the data along each of those axes.
print(f'PCA components: \n{pca.components_}')
print(f'PCA Explained variance: {pca.explained_variance_}')

In [None]:
def draw_vector(v0, v1, ax=None):
    """Draw a red arrow from point ``v0`` to point ``v1``.

    Args:
        v0: Coordinates of the arrow tail.
        v1: Coordinates of the arrow head.
        ax: Axes-like object whose ``annotate`` method is called. When
            ``None``, the current pyplot axes (``plt.gca()``) are used.
    """
    # Compare against None explicitly instead of relying on truthiness, so
    # any axes object supplied by the caller is honoured.
    if ax is None:
        ax = plt.gca()
    arrowprops = {
        'arrowstyle': '->',
        'linewidth': 2,
        'shrinkA': 0,  # no gap between the arrow and its end points
        'shrinkB': 0,
        'color': 'r',
    }
    # annotate(text, xy, xytext): with an empty label only the arrow is
    # drawn, running from xytext (v0) to xy (v1).
    ax.annotate('', v1, v0, arrowprops=arrowprops)

# Overlay the raw samples and their PCA projection (drawn in yellow).
plt.scatter(*X.T, alpha=0.5)
plt.scatter(*X_pca.T, alpha=0.5, c='y')

# One arrow per principal axis, starting at the data mean and extending
# 3 * sqrt(explained variance) along that axis.
origin = pca.mean_
for variance, axis in zip(pca.explained_variance_, pca.components_):
    draw_vector(origin, origin + axis * 3 * np.sqrt(variance))
plt.axis('equal');

# Importamos el dataset

In [None]:
# Load the agriculture dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv("dataset_agricultura.csv")

In [None]:
# Preview the first rows of the frame (head() defaults to five).
df.head()

In [None]:
# Remove the 'ID' column in place (presumably a row identifier with no
# predictive value; confirm against the dataset).
# NOTE(review): re-running this cell raises KeyError because the column is
# already gone.
df.drop(columns='ID', inplace=True)

# Preparamos los datos

In [None]:
# Separate the target column from the remaining feature columns.
y = df['Cultivo_Daño']
X = df.drop(columns=['Cultivo_Daño'])

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Standardise the features. The scaler learns its statistics from every row
# of X, i.e. before any train/validation split takes place.
Scaler = StandardScaler().fit(X)
X_scaled = Scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split
# Split the *unscaled* features first, then learn the scaling statistics from
# the training rows only. Fitting the scaler on the full dataset (as was done
# for X_scaled) lets validation rows influence the mean/std applied to the
# training data, which leaks information and inflates the validation score.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,       # keep the class proportions equal in both splits
    random_state=15,  # reproducible partition
)

# Same scaler for both splits, fitted on the training rows alone.
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_val = train_scaler.transform(X_val_raw)

In [None]:
# Apply PCA to the real dataset.

# Keep six components and request the exact full SVD solver. The projection
# is learned from the training split only.
pca = PCA(n_components=6, svd_solver='full').fit(X_train)

# Project the training and validation splits with the same fitted model.
X_train_pca, X_val_pca = (pca.transform(split) for split in (X_train, X_val))

In [None]:
# Fraction of the total variance retained by the kept components.
pca.explained_variance_ratio_.sum()

In [None]:
# Bar chart of the share of variance explained by each principal component.
sns.set_style('whitegrid')

# x positions: one integer index per fitted component.
features = range(pca.n_components_)
plt.figure(figsize=(20, 10))

plt.bar(features, pca.explained_variance_ratio_)
plt.xticks(features)
plt.xlabel('PCA feature')
# NOTE(review): the bars are explained-variance *ratios*, not raw variances.
plt.ylabel('Variance')

plt.show()

# Predicciones

Luego de realizar el análisis de componentes principales, podemos utilizar el resultado para armar un modelo de predicción. En este caso volveremos a utilizar la regresión logística, pero cambiaremos un poco el modelado del PCA.

In [None]:
# importamos la libreria de regresion logistica
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the PCA-projected training data. max_iter is
# raised well above the library default to give the solver room to converge.
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train_pca, y_train)

# Label each score with the split it was computed on: the first is the
# training accuracy, the second the validation (held-out) accuracy.
print('Los resultados son: \n')
print('El accuracy de entrenamiento es de: {}'.format(lr.score(X_train_pca, y_train)))
print('El accuracy de validacion es de: {}'.format(lr.score(X_val_pca, y_val)))

In [None]:
# Ahora vamos a probar con el mismo modelo de la semana pasada para ver la diferencia de performance

import xgboost as xgb
from sklearn.metrics import accuracy_score

In [None]:
# Gradient-boosted trees trained on the same PCA features, for comparison
# with the logistic regression.
xg_cl = xgb.XGBClassifier(
    objective='binary:logistic',
    learning_rate=1,
    reg_lambda=100,
    min_split_loss=1,
    colsample_bytree=0.5,
    n_estimators=100,
    seed=123,
).fit(X_train_pca, y_train)

# Predict on both splits, then report the accuracy of each.
y_train_pred, y_val_pred = (
    xg_cl.predict(split) for split in (X_train_pca, X_val_pca)
)

print(f'Accuracy train: {accuracy_score(y_train, y_train_pred)}')
print(f'Accuracy val: {accuracy_score(y_val, y_val_pred)}')