## Model Training

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score

# Split data.
# Work on a 10% random subsample of the flights to keep model fitting fast.
# The subsample is drawn WITHOUT replacement: with replacement the same row can be
# drawn several times, and its copies could land in both the training and the test
# partition, leaking test rows into training and inflating the test metrics.
# A fixed seed makes the subsample (and everything fitted on it) reproducible.
sample_count = df_flights.shape[0]
sample_flights = df_flights.sample(n=sample_count // 10, replace=False, random_state=42)

# ARRIVAL_DELAY is the regression target; every other column is used as a feature.
X_train, X_test, y_train, y_test = train_test_split(
    sample_flights.drop(columns=['ARRIVAL_DELAY']),
    sample_flights['ARRIVAL_DELAY'],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
display(X_train.shape)
display(y_train.shape)

### Linear Regression

In [None]:
# Perform SGD: fit a linear model by stochastic gradient descent and evaluate it
# on the held-out split.
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

# Fixed seed: SGDRegressor shuffles the training data between epochs by default,
# so without a random_state the weights and scores differ on every run.
# NOTE(review): SGD is sensitive to feature scale -- confirm the features were
# standardized before this cell; nothing here scales them.
reg = SGDRegressor(random_state=42)
reg.fit(X=X_train, y=y_train)

y_pred = reg.predict(X_test)
mse1 = mean_squared_error(y_test, y_pred)

print("Model weights: ")
print(reg.coef_)
print('Testing MSE:',mse1)
# For a regressor, score() is the coefficient of determination (R^2).
print("Model score:",reg.score(X_test, y_test) )
# Author's reading of the weights from their own run (not re-verified here):
# higher departure delay and longer air time go with a longer arrival delay;
# the last weight (for distance) is negative, i.e. a shorter distance goes with
# a longer arrival delay.

In [None]:
# Overlay each feature against the arrival delay on the current axes.
# x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y with a TypeError.
for feature in ("AIR_TIME", "DISTANCE", "DEPARTURE_DELAY"):
    sns.scatterplot(x=X_train[feature], y=y_train)

### Polynomial Regression

In [None]:
# Polynomial regression: expand the features into polynomial terms, then fit an
# ordinary least-squares model on the expanded matrix.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

degree = 3
poly = PolynomialFeatures(degree=degree, include_bias=False)

# Fit the expansion on the training split only and reuse it for the test split.
# No reshape is needed on X_poly: fit_transform already returns a 2-D matrix.
X_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

plr = LinearRegression()
plr.fit(X_poly, y_train)

predicted = plr.predict(X_test_poly)

display(plr.intercept_)
display(plr.coef_[0:3])  # first three coefficients only

print(f'Polynomial regression with degree = {degree}')
print(f'Training MSE error is:', mean_squared_error(y_train, plr.predict(X_poly)))
print(f'Testing MSE error is:', mean_squared_error(y_test, predicted))
# The model was fitted on the expanded features, so it must be scored on the
# expanded test matrix; scoring on the raw X_test fails with a feature-count mismatch.
print("Model score:", plr.score(X_test_poly, y_test))

### Further approaches

#### Lasso Regression

In [None]:
# Lasso (L1-regularised) linear regression with alpha=0.1, evaluated on both splits.
from sklearn import linear_model

# fit() returns the estimator itself, so construction and fitting can be chained.
reg = linear_model.Lasso(alpha=0.1).fit(X_train, y_train)

# Predict both partitions up front; the metrics below are derived from these.
y_train_pred = reg.predict(X_train)
y_hat = reg.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
print('Training MSE:', train_mse)

lasso_test_mse = round(mean_squared_error(y_hat, y_test), 4)
print(f'Testing MSE error is: {lasso_test_mse}')
print("Model score:", reg.score(X_test, y_test))

#### Ridge Regression

In [None]:
# Ridge (L2-regularised) linear regression with alpha=0.5, evaluated on both splits.
from sklearn import linear_model
reg = linear_model.Ridge(alpha=.5)
reg.fit(X_train, y_train)

y_train_pred = reg.predict(X_train)

train_mse = mean_squared_error(y_train,y_train_pred)
print('Training MSE:',train_mse)
y_hat = reg.predict(X_test)
print(f'Testing MSE error is: {round(mean_squared_error(y_hat, y_test),4)}')

print("Model weights: ")
print(reg.coef_)

# score() on a regressor is the coefficient of determination (R^2), not an
# accuracy, so it is labelled "Model score" like the other model cells.
z = reg.score(X_test, y_test)
print("Model score:",z)

In [None]:

# Overlay every training feature against the arrival delay in a single figure.
print("Full scatter plot for the training data")
plt.figure(figsize=(7, 7))
for column in X_train.columns:
    # x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y with a TypeError.
    sns.scatterplot(x=X_train[column], y=y_train)
plt.show()

In [None]:
# Overlay the three selected features against the arrival delay in a single figure.
print("Most Relvant Features")
plt.figure(figsize=(7, 7))
for feature in ("DEPARTURE_DELAY", "AIR_TIME", "DISTANCE"):
    # x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y with a TypeError.
    sns.scatterplot(x=X_train[feature], y=y_train)
plt.show()