## Importación de librerías

In [1]:
import pandas as pd
import numpy as np

## Cargue de datos

In [2]:
# Load the match results from the JSON file. pd.read_json already returns a
# DataFrame (the "Dict" suffix in the name is historical), so no extra
# DataFrame.from_dict conversion is required.
laLigaDict = pd.read_json(r'../01. Flat Files/Results_LaLiga.json')
# Swap rows and columns of the parsed frame so the later cells can address
# "division", "round", etc. as columns.
# NOTE(review): presumably this leaves one match per row - confirm against the JSON layout.
data = laLigaDict.transpose()

## ETL

In [3]:
# Restrict the table to the rows whose division equals 1
enPrimera = data["division"] == 1
data = data.loc[enPrimera]

In [4]:
# Cast the numeric columns to int with a single astype mapping. astype returns
# a new frame, so the cell rebinds `data` instead of writing column by column
# into the filtered frame, and it can be re-run safely.
data = data.astype({
    "division": int,
    "round": int,
    "localGoals": int,
    "visitorGoals": int,
})

In [5]:
# Total goals per match = local goals + visitor goals
golesPartido = data["localGoals"] + data["visitorGoals"]
data["totalGoals"] = golesPartido

# Label each match: "Yes" when more than two goals were scored, "No" otherwise
superaDos = data["totalGoals"] > 2
noSuperaDos = data["totalGoals"] <= 2
data.loc[superaDos, "more2Goals"] = "Yes"
data.loc[noSuperaDos, "more2Goals"] = "No"

In [6]:
# Remove the columns that are not needed from here on
columnasSobrantes = [
    "season",
    "division",
    "round",
    "totalGoals",
    "date",
    "timestamp",
    "visitorGoals",
    "localGoals",
]
data = data.drop(columns=columnasSobrantes)

## Modelo K-NN

In [7]:
# Independent copy of the frame taken before encoding (team names still as text)
copiaData = data.copy()

# One-hot encode the nominal team columns. The dtype is pinned to uint8 so the
# indicator columns are 0/1 integers on every pandas version; from pandas 2.0
# onwards the default would otherwise be bool.
data = pd.get_dummies(data=data, columns=['localTeam', 'visitorTeam'], dtype="uint8")

In [8]:
# Preview the first five rows of the encoded frame
data.head(n=5)

Unnamed: 0,more2Goals,localTeam_AD Almeria,localTeam_Alaves,localTeam_Albacete,localTeam_Almeria,localTeam_Atletico de Bilbao,localTeam_Atletico de Madrid,localTeam_Barcelona,localTeam_Betis,localTeam_Burgos,...,visitorTeam_Salamanca,visitorTeam_Santander,visitorTeam_Sevilla,visitorTeam_Sporting de Gijon,visitorTeam_Tenerife,visitorTeam_Valencia,visitorTeam_Valladolid,visitorTeam_Villarreal,visitorTeam_Xerez,visitorTeam_Zaragoza
1,No,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,No,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,No,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,No,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,No,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Preview the first five rows of the copy taken before encoding
copiaData.head(n=5)

Unnamed: 0,localTeam,visitorTeam,more2Goals
1,Atletico de Bilbao,Barcelona,No
2,Las Palmas,Atletico de Madrid,No
3,Real Madrid,Valencia,No
4,Celta de Vigo,Sporting de Gijon,No
5,Elche,Granada,No


In [10]:
# Name of the column the model has to predict
columnaObjetivo = "more2Goals"

# Target vector: the label column on its own
y = data[columnaObjetivo]
# Feature matrix: every remaining column
x = data.drop(columns=[columnaObjetivo])

In [11]:
# Hold-out splitting helper
from sklearn.model_selection import train_test_split

# Reserve 30% of the rows for testing; the fixed random_state makes the split repeatable
particion = train_test_split(x, y, test_size=0.3, random_state=101)
xTrain, xTest, yTrain, yTest = particion

In [12]:
# k-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier

# Number of neighbours used by this baseline model
vecinos = 3

# Create the classifier and train it on the training split; fit() stays the
# last expression so the fitted estimator is echoed as the cell output
knn = KNeighborsClassifier(n_neighbors=vecinos)
knn.fit(xTrain, yTrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [13]:
# Predict the labels of the test split with the trained model
predictions = knn.predict(xTest)

# Metrics helper
from sklearn.metrics import classification_report

# Build the report as a dict, then turn it into a table with one row per
# class / aggregate and one column per metric
reporte = classification_report(yTest, predictions, output_dict=True)
resultado = pd.DataFrame.from_dict(reporte).T
resultado

Unnamed: 0,precision,recall,f1-score,support
No,0.53458,0.562361,0.548119,2694.0
Yes,0.491152,0.463166,0.476749,2457.0
accuracy,0.515046,0.515046,0.515046,0.515046
macro avg,0.512866,0.512764,0.512434,5151.0
weighted avg,0.513865,0.515046,0.514076,5151.0


## Modelo K-NN Optimizado

In [36]:
# Smallest number of neighbours to evaluate
grupos = 2

# Exclusive upper bound for k: square root of the number of records, plus 3
maxGrupos = int(x.shape[0]**0.5) + 3

# Hold-out splitting helper
from sklearn.model_selection import train_test_split
# k-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier
# Metrics helper
from sklearn.metrics import classification_report

# Split again with the same proportion and seed so the cell does not depend on
# variables left over from earlier cells
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.3, random_state=101)

# Number of k values that will be evaluated (one result row each)
cantFilas = maxGrupos - grupos

# NOTE(review): every k is scored on the test split, so choosing the best row
# from this table gives an optimistic estimate - consider a separate validation
# split or cross-validation for the selection.

# One record per evaluated k; the table is built once after the loop instead of
# filling a pre-allocated object-dtype frame row by row
filas = []
for i in range(grupos, maxGrupos):

    # Train a classifier with i neighbours on the training split
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(xTrain, yTrain)

    # Predict the labels of the test split
    predictions = knn.predict(xTest)

    # Report table: one row per class / aggregate, one column per metric
    resultado = pd.DataFrame.from_dict(classification_report(yTest, predictions, output_dict=True))
    resultado = resultado.transpose()

    # Read the metrics by label rather than by position: integer keys on a
    # label-indexed Series are deprecated, and labels do not depend on row order
    filas.append({
        'k': i,
        'preNo': resultado.loc['No', 'precision'],
        'preYes': resultado.loc['Yes', 'precision'],
        'accuracy': resultado.loc['accuracy', 'precision'],
    })

resultados = pd.DataFrame(filas, columns=['k', 'preNo', 'preYes', 'accuracy'])
assert len(resultados) == cantFilas, "expected one result row per evaluated k"

# Show the results table
resultados

Unnamed: 0,k,preNo,preYes,accuracy
0,2,0.535076,0.513199,0.529606
1,3,0.53458,0.491152,0.515046
2,4,0.536572,0.511757,0.529606
3,5,0.545191,0.502504,0.525335
4,6,0.543603,0.52012,0.536012
...,...,...,...,...
127,129,0.552336,0.525502,0.542225
128,130,0.546741,0.522727,0.538536
129,131,0.551828,0.528773,0.543584
130,132,0.54918,0.52853,0.542225


In [38]:
# Write the results table to an Excel workbook: header row included, index column omitted
rutaSalida = r'../01. Flat Files/ResultsKnn_LaLiga.xlsx'
resultados.to_excel(rutaSalida, index=False, header=True)