# **Imports**

In [1]:
import numpy as np
import pandas as pd
import time
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import StratifiedKFold


# **Modelo KNN**

In [11]:
from pickle import NONE
class KNearestNeighbors(BaseEstimator, ClassifierMixin):
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features), optional
        Training features. May be omitted here and supplied through ``fit``.
    y_train : array-like of shape (n_samples,), optional
        Training labels. ``predict`` feeds them to ``np.bincount``, so they
        must be non-negative integers.
    n_neighbors : int, default=5
        Number of nearest training samples that vote for each prediction.
    weights : {'uniform', 'distance'}, default='uniform'
        'uniform' gives every neighbour one vote; 'distance' weights each
        vote by ``1 / (distance + 1e-6)``.

    Attributes
    ----------
    prediction : numpy.ndarray or None
        Output of the most recent ``predict`` call. The metric methods read
        it, so ``predict`` has to be called before any of them.
    """

    def __init__(self, X_train=None, y_train=None, n_neighbors=5, weights='uniform'):
        self.X_train = X_train
        self.y_train = y_train

        self.n_neighbors = n_neighbors
        self.weights = weights

        # Kept for backward compatibility; no method of this class reads it.
        self.n_classes = 10
        self.prediction = None

    def euclidian_distance(self, a, b):
        """Return the Euclidean distance from point ``a`` to every row of ``b``."""
        return np.sqrt(np.sum((a - b) ** 2, axis=1))

    def kneighbors(self, X_test, return_distance=False):
        """Find the ``n_neighbors`` closest training samples of each query row.

        Parameters
        ----------
        X_test : array-like of shape (n_queries, n_features)
        return_distance : bool, default=False
            When True, also return the distances to the neighbours.

        Returns
        -------
        neigh_ind : numpy.ndarray
            Indices into the training set, nearest first.
        (dist, neigh_ind) : tuple of numpy.ndarray
            Returned instead when ``return_distance`` is True.

        Raises
        ------
        ValueError
            If no training data has been provided yet.
        """
        if self.X_train is None:
            raise ValueError("No training data: pass X_train/y_train or call fit() first.")

        train = np.asarray(self.X_train)
        dist = []
        neigh_ind = []
        # One query row at a time: only a single distance vector is alive at
        # any moment instead of the full n_queries x n_train matrix.
        for x_test in np.asarray(X_test):
            row = self.euclidian_distance(x_test, train)
            # A stable sort breaks distance ties by training index, exactly
            # like sorting (index, distance) pairs by distance.
            nearest = np.argsort(row, kind="stable")[:self.n_neighbors]
            neigh_ind.append(nearest)
            dist.append(row[nearest])

        if return_distance:
            return np.array(dist), np.array(neigh_ind)

        return np.array(neigh_ind)

    def fit(self, X_train, y_train):
        """Store the training data and return ``self`` (scikit-learn convention)."""
        self.X_train = X_train
        self.y_train = y_train
        return self

    def predict(self, X_test):
        """Predict a label for each row of ``X_test`` by (weighted) majority vote.

        The result is also cached in ``self.prediction`` for the metric methods.

        Raises
        ------
        ValueError
            If ``weights`` is neither 'uniform' nor 'distance'.
        """
        if self.weights not in ('uniform', 'distance'):
            raise ValueError(
                "weights must be 'uniform' or 'distance', got %r" % (self.weights,)
            )

        labels = np.asarray(self.y_train)
        distances, neighbors = self.kneighbors(X_test, return_distance=True)

        if self.weights == 'uniform':
            y_pred = np.array([
                np.argmax(np.bincount(labels[idx]))
                for idx in neighbors
            ])
        else:
            # The small epsilon avoids a division by zero when a query point
            # coincides with a training point.
            y_pred = np.array([
                np.argmax(np.bincount(labels[idx], weights=1.0 / (d + 1e-6)))
                for idx, d in zip(neighbors, distances)
            ])

        self.prediction = y_pred
        return y_pred

    def _checked_prediction(self, y_test):
        """Return ``(cached prediction, y_test as ndarray)`` after validating both."""
        if self.prediction is None:
            raise ValueError("No cached prediction: call predict() before computing metrics.")
        y_pred = np.asarray(self.prediction)
        y_true = np.asarray(y_test)
        if y_true.size == 0:
            raise ValueError("y_test is empty.")
        if y_pred.shape != y_true.shape:
            raise ValueError(
                "y_test has shape %s but the cached prediction has shape %s"
                % (y_true.shape, y_pred.shape)
            )
        return y_pred, y_true

    @staticmethod
    def _positive_counts(y_pred, y_true):
        """Count true positives, false positives and false negatives for label 1."""
        predicted_pos = y_pred == 1
        actual_pos = y_true == 1
        tp = int(np.sum(predicted_pos & actual_pos))
        fp = int(np.sum(predicted_pos & ~actual_pos))
        fn = int(np.sum(~predicted_pos & actual_pos))
        return tp, fp, fn

    @staticmethod
    def _ratio(numerator, denominator):
        """Divide, returning 0.0 when the denominator is zero."""
        return float(numerator) / float(denominator) if denominator else 0.0

    def accuracy(self, y_test):
        """Return ``(accuracy, y_pred, y_test)`` for the cached prediction."""
        y_pred, y_true = self._checked_prediction(y_test)
        return float(np.mean(y_pred == y_true)), y_pred, y_test

    def precision(self, y_test):
        """Return ``(TP / (TP + FP), y_pred, y_test)`` with label 1 as positive."""
        y_pred, y_true = self._checked_prediction(y_test)
        tp, fp, _ = self._positive_counts(y_pred, y_true)
        return self._ratio(tp, tp + fp), y_pred, y_test

    def recall(self, y_test):
        """Return ``(TP / (TP + FN), y_pred, y_test)`` with label 1 as positive."""
        y_pred, y_true = self._checked_prediction(y_test)
        tp, _, fn = self._positive_counts(y_pred, y_true)
        return self._ratio(tp, tp + fn), y_pred, y_test

    def fonescore(self, y_test):
        """Return ``(F1, y_pred, y_test)``: the harmonic mean of precision and recall."""
        y_pred, y_true = self._checked_prediction(y_test)
        tp, fp, fn = self._positive_counts(y_pred, y_true)
        precision = self._ratio(tp, tp + fp)
        recall = self._ratio(tp, tp + fn)
        f1 = self._ratio(2 * recall * precision, precision + recall)
        return f1, y_pred, y_test

    def AUC(self, y_test):
        """Return ``(accuracy, y_pred, y_test)``.

        NOTE(review): placeholder. Despite its name this has always returned
        plain accuracy; an area under the ROC curve needs per-sample scores,
        which this classifier does not expose.
        """
        return self.accuracy(y_test)

    def ROC(self, y_test):
        """Return ``(accuracy, y_pred, y_test)``.

        NOTE(review): placeholder. Despite its name this has always returned
        plain accuracy rather than ROC curve points.
        """
        return self.accuracy(y_test)


# **Modelo Árbol de Decisión**

# **Modelo Regresión Logística**

# **Dataset WeatherAUS**

In [12]:
dataset_path = "weatherAUS.csv"
dataset = pd.read_csv(dataset_path)

# Drop the rows that have no RainTomorrow label.
dataset = dataset.dropna(subset=["RainTomorrow"])

# Drop the text features that make no major difference to the approximation,
# then separate the features from the target.
X = dataset.drop(["RainTomorrow", "Date", "WindDir9am", "WindDir3pm", "WindGustDir"], axis=1)
# .map with an explicit dtype is used instead of .replace so the numeric dtype
# does not depend on pandas' implicit (deprecated) downcasting of object columns.
# astype(int) fails loudly if a label other than 'No'/'Yes' shows up.
y = dataset["RainTomorrow"].map({'No': 0, 'Yes': 1}).astype(int)

# Encode RainToday as numbers (missing values stay NaN) and every location as
# an integer id; location_number keeps the name -> id reference.
X["RainToday"] = X["RainToday"].map({'No': 0.0, 'Yes': 1.0})
unique_classes = X["Location"].unique()
location_number = {cls: idx for idx, cls in enumerate(unique_classes)}
X["Location"] = X["Location"].map(location_number)

# Min-max normalise every feature.
# NOTE(review): the min/max are taken over the whole dataset, before the
# train/test split below.
min_val = X.min(axis=0)
max_val = X.max(axis=0)
X = (X - min_val) / (max_val - min_val)

# Among the wind features, WindGustSpeed is related to the other two, so it
# is removed.
windCov = X[["WindGustSpeed", "WindSpeed9am", "WindSpeed3pm"]].cov()
#print(windCov)
X = X.drop("WindGustSpeed", axis=1)

# Humidity features: not related strongly enough to drop either of them.
humidCov = X[["Humidity9am", "Humidity3pm"]].cov()
#print(humidCov)

# Pressure features: highly related, so one of them is dropped arbitrarily.
pressureCov = X[["Pressure9am", "Pressure3pm"]].cov()
#print(pressureCov)
X = X.drop("Pressure9am", axis=1)

# Cloud features: not related strongly enough to drop either of them.
cloudCov = X[["Cloud9am", "Cloud3pm"]].cov()
#print(cloudCov)

# Temperature features: highly related, so the 9am one is dropped, as was
# done for pressure.
tempCov = X[["Temp9am", "Temp3pm"]].cov()
#print(tempCov)
X = X.drop("Temp9am", axis=1)

# Check the remaining features for other highly related pairs: MaxTemp is
# highly related to Temp3pm, so Temp3pm is dropped.
XCov = X.cov()
#print(XCov)
X = X.drop("Temp3pm", axis=1)

#print(len(X))

# Replace the NaNs with mean values so that both sklearn and our own model
# can accept the data.
X = X.to_numpy(dtype=float)
y = y.to_numpy()
WAX_train, WAX_test, WAy_train, WAy_test = train_test_split(X, y, test_size=0.15, random_state=45)

# One mean per feature, computed on the training split only. The same
# training means fill the test split, so no test-set statistic is used.
mean_train = np.nanmean(WAX_train, axis=0)

WAX_train = np.where(np.isnan(WAX_train), mean_train, WAX_train)
WAX_test = np.where(np.isnan(WAX_test), mean_train, WAX_test)



# **Dataset Notas del Curso**

# **Dataset escogido**

# **Gridsearch KNN - WeatherAUS**

In [13]:
# One distance-weighted KNN model per candidate neighbourhood size (3, 5, 7).
distance_classifier_3, distance_classifier_5, distance_classifier_7 = (
    KNearestNeighbors(WAX_train, WAy_train, n_neighbors=k, weights='distance')
    for k in (3, 5, 7)
)

# With weights='distance', predict() also stores its result on the instance,
# which is what the metrics cell reads afterwards.
for knn in (distance_classifier_3, distance_classifier_5):
    knn.predict(WAX_test)
distance_classifier_7.predict(WAX_test)






KeyboardInterrupt: 

# **Métricas KNN - WeatherAUS**

In [None]:
# Accuracy of each fitted model, in the order k = 3, 5, 7. accuracy() returns
# a tuple whose first element is the score.
accuracy = [
    knn.accuracy(WAy_test)[0]
    for knn in (distance_classifier_3, distance_classifier_5, distance_classifier_7)
]
# The remaining metrics are still disabled:
#precision = distance_classifier.precision(WAy_test)
#recall = distance_classifier.recall(WAy_test)
#fonescore = distance_classifier.fonescore(WAy_test)

print(f"Accuracy: {accuracy}")
#print("Precision: "+str(precision[0]))
#print("Recall: "+str(recall[0]))
#print("F1 Score: "+str(fonescore[0]))


# **Gridsearch KNN - Notas**

# **Métricas KNN - Notas**

# **Gridsearch KNN - Escogido**

# **Métricas KNN - Escogido**

# **Gridsearch Logit - WeatherAUS**

# **Métricas Logit - WeatherAUS**

# **Gridsearch Logit - Notas**

# **Métricas Logit - Notas**

# **Gridsearch Logit - Escogido**

# **Métricas Logit - Escogido**

# **Gridsearch Árbol - WeatherAUS**

# **Métricas Árbol - WeatherAUS**

# **Gridsearch Árbol - Notas**

# **Métricas Árbol - Notas**

# **Gridsearch Árbol - Escogido**

# **Métricas Árbol - Escogido**