### Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
import plotly.express as px

import copy
import seaborn as sns
import os
from scipy import stats
import datetime

import sqlite3 as sqlite
from sqlalchemy import create_engine
%matplotlib inline

In [2]:
data_path = '/mnt/d/lighthouse/Midterm_data/'

In [3]:
# 15927485 rows × 42
flights = pd.read_csv(data_path + 'flights.csv', header = 0, low_memory=False)
flights_test =pd.read_csv(data_path + 'flights_test.csv', header = 0)

In [4]:
# making duplicate so we dont have to re-read large csv files
flights_clean = copy.deepcopy(flights)

## Nan

In [5]:
total = flights_clean.isnull().sum().sort_values(ascending = False)
percent = (flights_clean.isnull().sum()/flights_clean.isnull().count()).sort_values(ascending = False)
missing_data = pd.concat([total,percent], axis=1, keys=['Total', 'Percent'])

In [6]:
#missing_data.head(15)

In [6]:
# dropping columsn with 80% missing AND [origin_city_name, dest_city_name] since we have [orgin, dest]
# dropping [cancelled, diverted, flights, dup] since they only contain one value
cols_to_drop_nan = list(missing_data[missing_data['Percent'] > 0.8].index)
#ols_to_drop_other = ['dep_delay','taxi_out','taxi_in', 'wheels_off', 'wheels_on', 'arr_time', 'cancelled', 'diverted','actual_elapsed_time','air_time', 'flights', 'dup', 'dep_time']
cols_to_drop_other = ['taxi_out','taxi_in', 'wheels_off', 'wheels_on', 'cancelled', 'diverted','actual_elapsed_time','air_time', 'flights', 'dup', 'dep_time']
flights_clean.drop(cols_to_drop_nan, axis = 1, inplace = True)
flights_clean.drop(cols_to_drop_other, axis = 1, inplace = True)
flights_test.drop(['dup','flights'], axis =1, inplace=True)

## rows with Nan

In [7]:
# before: (15927485, 20)
# after: 15652397 rows × 20 columns
# lost 3% of data, no biggie
flights_clean.dropna(inplace = True)

In [5]:
# (15605076, 19)
flights_clean = pd.read_csv(data_path + 'flights_cleaned_model.csv', header = 0, low_memory=False)

### seperate numerical/categorical

In [6]:
# 9 numerical features
numerical = list(flights_clean.dtypes[flights_clean.dtypes != 'object'].index)
# 10 categorical features (10 if include city and state)
categorical = list(flights_clean.dtypes[flights_clean.dtypes == 'object'].index)

In [7]:
flights_numerical = copy.deepcopy(flights_clean[numerical])
flights_categorical = copy.deepcopy(flights_clean[categorical])

### seperate year-month-day

In [8]:
flights_clean['fl_date'] = pd.to_datetime(flights_clean['fl_date'])
flights_clean['year'] = flights_clean['fl_date'].dt.year
flights_clean['month'] = flights_clean['fl_date'].dt.month
flights_clean['date'] = flights_clean['fl_date'].dt.day
# Dropping the old date column
flights_clean.drop('fl_date', axis = 1, inplace = True)

In [9]:
flights_clean

Unnamed: 0.1,Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,crs_dep_time,dep_delay,crs_arr_time,arr_time,arr_delay,crs_elapsed_time,distance,year,month,date
0,0,WN,WN,WN,2098,WN,N8540V,2098,13198,MCI,...,1755,-2.0,1850,1838.0,-12.0,175.0,1044,2019,3,25
1,1,WN,WN,WN,2238,WN,N8656B,2238,13198,MCI,...,2000,-5.0,2055,2036.0,-19.0,175.0,1044,2019,3,25
2,2,WN,WN,WN,2451,WN,N8583Z,2451,13198,MCI,...,540,1.0,635,625.0,-10.0,175.0,1044,2019,3,25
3,3,WN,WN,WN,2213,WN,N737JW,2213,13198,MCI,...,1550,99.0,1905,2039.0,94.0,135.0,904,2019,3,25
4,4,WN,WN,WN,2096,WN,N705SW,2096,13198,MCI,...,1045,166.0,1430,1702.0,152.0,165.0,1155,2019,3,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15611136,15927480,WN,WN,WN,2189,WN,N7702A,2189,13198,MCI,...,2055,16.0,2245,2242.0,-3.0,110.0,689,2019,3,25
15611137,15927481,WN,WN,WN,1291,WN,N7878A,1291,13198,MCI,...,1200,88.0,1355,1534.0,99.0,235.0,1489,2019,3,25
15611138,15927482,WN,WN,WN,2470,WN,N760SW,2470,13198,MCI,...,920,-5.0,1110,1052.0,-18.0,230.0,1482,2019,3,25
15611139,15927483,WN,WN,WN,1651,WN,N8542Z,1651,13198,MCI,...,1125,-2.0,1215,1204.0,-11.0,170.0,1044,2019,3,25


### encode all string variables

In [39]:
# converting category to numbers
flights_categorical['mkt_unique_carrier'] = pd.factorize(flights_categorical['mkt_unique_carrier'])[0]
flights_categorical['branded_code_share'] = pd.factorize(flights_categorical['branded_code_share'])[0]
flights_categorical['mkt_carrier'] = pd.factorize(flights_categorical['mkt_carrier'])[0]
flights_categorical['op_unique_carrier'] = pd.factorize(flights_categorical['op_unique_carrier'])[0]
flights_categorical['tail_num'] = pd.factorize(flights_categorical['tail_num'])[0]
flights_categorical['origin'] = pd.factorize(flights_categorical['origin'])[0]
flights_categorical['origin_city_name'] = pd.factorize(flights_categorical['origin_city_name'])[0]
flights_categorical['dest'] = pd.factorize(flights_categorical['dest'])[0]
flights_categorical['dest_city_name'] = pd.factorize(flights_categorical['dest_city_name'])[0]

### merging into into model data

In [40]:
flights_final = pd.concat([flights_numerical, flights_categorical], axis=1)
flights_final.reset_index(drop=True)

Unnamed: 0,mkt_carrier_fl_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,arr_delay,crs_elapsed_time,distance,mkt_unique_carrier,...,mkt_carrier,op_unique_carrier,tail_num,origin,origin_city_name,dest,dest_city_name,year,month,date
0,2098,2098,13198,14107,1755,1850,-12.0,175.0,1044,0,...,0,0,0,0,0,0,0,2019,3,25
1,2238,2238,13198,14107,2000,2055,-19.0,175.0,1044,0,...,0,0,1,0,0,0,0,2019,3,25
2,2451,2451,13198,14107,540,635,-10.0,175.0,1044,0,...,0,0,2,0,0,0,0,2019,3,25
3,2213,2213,13198,14492,1550,1905,94.0,135.0,904,0,...,0,0,3,0,0,1,1,2019,3,25
4,2096,2096,13198,14635,1045,1430,152.0,165.0,1155,0,...,0,0,4,0,0,2,2,2019,3,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15615735,2189,2189,13198,13495,2055,2245,-3.0,110.0,689,0,...,0,0,143,0,0,30,30,2019,3,25
15615736,1291,1291,13198,13796,1200,1355,99.0,235.0,1489,0,...,0,0,197,0,0,31,31,2019,3,25
15615737,2470,2470,13198,14057,920,1110,-18.0,230.0,1482,0,...,0,0,290,0,0,58,58,2019,3,25
15615738,1651,1651,13198,14107,1125,1215,-11.0,170.0,1044,0,...,0,0,293,0,0,0,0,2019,3,25


In [41]:
#flights_final.to_csv('model_1_encode_only.csv')

# Model

In [7]:
flights_final = pd.read_csv(data_path + 'model_1_encode_only.csv', header = 0)
flights_final.drop('Unnamed: 0', axis = 1, inplace= True)

In [8]:
flights_test =pd.read_csv(data_path + 'flights_test.csv', header = 0)
flights_test.drop(['dup','flights'], axis =1, inplace=True)
flights_test['fl_date'] = pd.to_datetime(flights_test['fl_date'])
flights_test['year'] = flights_test['fl_date'].dt.year
flights_test['month'] = flights_test['fl_date'].dt.month
flights_test['date'] = flights_test['fl_date'].dt.day
# Dropping the old date column
flights_test.drop('fl_date', axis = 1, inplace = True)

In [9]:
flights_final.head()

Unnamed: 0,mkt_carrier_fl_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,arr_delay,crs_elapsed_time,distance,mkt_unique_carrier,...,mkt_carrier,op_unique_carrier,tail_num,origin,origin_city_name,dest,dest_city_name,year,month,date
0,2098,2098,13198,14107,1755,1850,-12.0,175.0,1044,0,...,0,0,0,0,0,0,0,2019,3,25
1,2238,2238,13198,14107,2000,2055,-19.0,175.0,1044,0,...,0,0,1,0,0,0,0,2019,3,25
2,2451,2451,13198,14107,540,635,-10.0,175.0,1044,0,...,0,0,2,0,0,0,0,2019,3,25
3,2213,2213,13198,14492,1550,1905,94.0,135.0,904,0,...,0,0,3,0,0,1,1,2019,3,25
4,2096,2096,13198,14635,1045,1430,152.0,165.0,1155,0,...,0,0,4,0,0,2,2,2019,3,25


In [18]:
set(flights_final.columns) ^ set(flights_test.columns)

{'arr_delay'}

In [12]:
X = flights_final.drop('arr_delay', axis = 1)
y = flights_final['arr_delay']

### sklearn

In [13]:
reg = LinearRegression().fit(X, y)

In [14]:
reg.score(X, y)

0.01238503161987592

In [15]:
reg.coef_

array([-2.39532148e-03,  2.80051692e-03, -2.73754387e-04, -1.07119122e-04,
        6.70934775e-03,  3.60701911e-03, -2.17027134e-02,  1.98669931e-03,
        6.96435790e-01, -7.05113789e-01,  6.96435790e-01,  8.70016795e-02,
        3.46353461e-04,  3.48285237e-02, -3.69090916e-02,  3.54394663e-02,
       -4.42136118e-02,  4.95042385e-01, -9.46044984e-02, -1.77355894e-02])

### Statsmodel

In [16]:
res = sm.OLS(y, X).fit()

In [17]:
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:              arr_delay   R-squared (uncentered):                   0.024
Model:                            OLS   Adj. R-squared (uncentered):              0.024
Method:                 Least Squares   F-statistic:                          2.032e+04
Date:                Wed, 26 Aug 2020   Prob (F-statistic):                        0.00
Time:                        14:25:32   Log-Likelihood:                     -8.3110e+07
No. Observations:            15615740   AIC:                                  1.662e+08
Df Residuals:                15615721   BIC:                                  1.662e+08
Df Model:                          19                                                  
Covariance Type:            nonrobust                                                  
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------