# KFold Validation para el dataset de Airline Delays

In [1]:
# Importamos Librerías

import pandas as pd
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.metrics import r2_score
import numpy as np

In [2]:
# Leemos dataset

data = pd.read_csv("../eggomPY_Datasets/AirlineDelays/DelayedFlights.csv")

In [5]:
data.shape

(1936758, 30)

In [9]:
data.head(4)

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,...,4.0,8.0,0,N,0,,,,,
1,1,2008,1,3,4,754.0,735,1002.0,1000,WN,...,5.0,10.0,0,N,0,,,,,
2,2,2008,1,3,4,628.0,620,804.0,750,WN,...,3.0,17.0,0,N,0,,,,,
3,4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,...,3.0,10.0,0,N,0,2.0,0.0,0.0,0.0,32.0


In [23]:
# Limpiamos ArrDelay (que es nuestra variable obj) de NaN.
df = data.dropna(subset=["ArrDelay"])

# Sampleamos el dataset y lo capamos a un total de 5000 filas.
df = df.sample(frac=1,replace=True, random_state=12).head(5000)

In [24]:
df.shape

(5000, 30)

In [25]:
df.head(4)

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
1527239,5401245,2008,10,8,3,1018.0,1000,1114.0,1100,WN,...,4.0,7.0,0,N,0,,,,,
1815352,6676223,2008,12,11,4,1821.0,1814,1958.0,1958,UA,...,6.0,12.0,0,N,0,,,,,
1308984,4386650,2008,8,1,5,1602.0,1516,1830.0,1759,OH,...,7.0,8.0,0,N,0,0.0,31.0,0.0,0.0,0.0
1467138,5051904,2008,9,4,4,2129.0,1914,21.0,2217,UA,...,2.0,20.0,0,N,0,0.0,8.0,0.0,0.0,116.0


Si nos fijamos vemos que nos ha devuelto filas aleatorias empezando por el índice '1527239' del dataframe original 'data'.

Para emplear la función KFold necesitamos que éste dataframe 'df' tenga índices de fila de 0 en adelante.

Así que debemos resetear los número de fila

In [26]:
df = df.reset_index()

In [27]:
df.head(4)

Unnamed: 0.1,index,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,1527239,5401245,2008,10,8,3,1018.0,1000,1114.0,1100,...,4.0,7.0,0,N,0,,,,,
1,1815352,6676223,2008,12,11,4,1821.0,1814,1958.0,1958,...,6.0,12.0,0,N,0,,,,,
2,1308984,4386650,2008,8,1,5,1602.0,1516,1830.0,1759,...,7.0,8.0,0,N,0,0.0,31.0,0.0,0.0,0.0
3,1467138,5051904,2008,9,4,4,2129.0,1914,21.0,2217,...,2.0,20.0,0,N,0,0.0,8.0,0.0,0.0,116.0


In [28]:
# De todo el 'df' construimos un dataframe con las columnas que queramos emplear como  variables predictoras
#

X = df[["AirTime", "Distance", "DepDelay"]]


# Construimos un dataframe (en realidad es una Series) con la variable objetivo 'ArrDelay'
y = df["ArrDelay"]

## Trabajando con el objeto KFold

In [42]:
# Creamos un objeto KFold con los siguientes parámetros:

#
# n_splits: número de particiones que quiero que me haga de mi dataframe
#
# shuffle = True ---> Habilita que las particiones se cogan de manera aleatoria y no por bloques
#

kf = KFold(n_splits = 10, shuffle = True, random_state=12)

In [43]:
# Método .get_n_splits()
#
# Devuelve el número de splits del objeto KFold

kf.get_n_splits()

10

In [125]:
# Creamos un modelo de regresión lineal

lm = linear_model.LinearRegression()

In [126]:
# Creamos una lista vacía para ir guardando los resultados de R2

results_R2 = []

In [131]:
# Creamos un bucle que dividirá el DF 'X' 10 veces en conjuntos de train y testing, cada vez diferentes, y devolverá el
# valor de R2 para cada fracción:

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.loc[train_idx], X.loc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    lm.fit(X_train, y_train)
    y_guess = lm.predict(X_test)
    print("R2: ",r2_score(y_test, y_guess))
    results_R2.append(r2_score(y_test, y_guess))

R2:  0.9024548025756755
R2:  0.9123486937077726
R2:  0.919166798249019
R2:  0.9454020017974929
R2:  0.83701906569881
R2:  0.8656846835298084
R2:  0.9206452755623555
R2:  0.874398991093401
R2:  0.8889971093728205
R2:  0.9568581444585045


In [132]:
# La media de R2 es:

print("R2 Medio: ", np.mean(results_R2))

R2 Medio:  0.9022975566045662
