### Encoding PIPELINE

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

In [3]:
# Load the combined dataset from the working directory.
df = pd.read_csv('combined_df.csv')

In [4]:
# Remove the 'Unnamed: 0' column (presumably a saved index from an earlier
# to_csv call -- confirm against how combined_df.csv was produced).
df = df.drop(columns=['Unnamed: 0'])

### Adding Part of Day

In [5]:
# function for separating clock times into parts of the day

def daypart(x):
    """Return a coarse part-of-day label for a numeric clock value.

    The value is compared against fixed inclusive windows (the bounds look
    like HHMM-encoded times -- confirm against the data). Anything that falls
    outside every window, including NaN, is labelled 'midnight'.

    NOTE(review): both 1000-1359 and 1400-1759 map to 'noon'; the downstream
    dummy-column lists rely on exactly these labels, so they are kept as is.
    """
    windows = (
        (200, 559, 'dawn'),
        (600, 959, 'morning'),
        (1000, 1359, 'noon'),
        (1400, 1759, 'noon'),
        (1800, 2259, 'evening'),
    )
    for start, end, label in windows:
        if start <= x <= end:
            return label
    return 'midnight'


In [6]:
# Derive a part-of-day label for the scheduled departure and arrival times.
for source_col, target_col in (('crs_dep_time', 'dep_day_part'),
                               ('crs_arr_time', 'arr_day_part')):
    df[target_col] = df[source_col].apply(daypart)

### Adding Day of Week

In [7]:
# Parse the flight date so the .dt accessor can be used on it.
df['fl_date'] = pd.to_datetime(df['fl_date'])

In [8]:
# Weekday name (e.g. 'Monday') of each flight date.
df['day_of_week'] = df['fl_date'].dt.day_name()

In [9]:
#df.isna().sum()

In [10]:
# Columns removed before modelling; the scheduled times have already been
# turned into the dep_day_part / arr_day_part features.
unused_cols = [
    'crs_dep_time',
    'crs_arr_time',
    'Carrier_Origin_Dest_Airport',
    'seats',
    'passengers',
    'class',
]
df = df.drop(columns=unused_cols)

* Split the data into train/test

In [11]:
from sklearn.model_selection import train_test_split

# Separate the target (arrival delay) from the features, then hold out 30%
# of the rows for testing with a fixed seed for reproducibility.
target_col = 'arr_delay'
X = df.drop(columns=target_col)
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)

## DROP 1 DUMMY COLUMN EACH

#### Encoding Weather(Type)

In [12]:
# One-hot encode the weather type column in both splits with the same settings.
X_train, X_test = (
    pd.get_dummies(part, prefix=['Type'], columns=['Type_Mode'])
    for part in (X_train, X_test)
)

In [13]:
# Drop one dummy ('Type_Storm') so it acts as the reference level.
X_train = X_train.drop(columns=['Type_Storm'])
X_test = X_test.drop(columns=['Type_Storm'])

In [14]:
# Names of the remaining weather dummy columns, for filtering later on.
Type_list = ['Type_' + kind for kind in
             ('Cold', 'Fog', 'Hail', 'Precipitation', 'Rain', 'Snow')]

### Encode Day of Week

In [15]:
# One-hot encode the weekday name in both splits with the same settings.
X_train, X_test = (
    pd.get_dummies(part, prefix=['dow'], columns=['day_of_week'])
    for part in (X_train, X_test)
)

In [16]:
# Drop one dummy ('dow_Wednesday') so it acts as the reference level.
X_train = X_train.drop(columns=['dow_Wednesday'])
X_test = X_test.drop(columns=['dow_Wednesday'])

In [17]:
# Day-of-week dummy columns that remain after 'dow_Wednesday' is dropped.
# The names must match what day_name() + the 'dow' prefix produce, i.e.
# 'dow_Thursday' (the previous spelling 'dow_Thrusday' matched no column).
dow_list = ['dow_Friday','dow_Monday','dow_Saturday','dow_Sunday','dow_Thursday','dow_Tuesday']

### Encode Time of Day (Day_Part)

In [18]:
# One-hot encode the arrival part-of-day label in both splits.
X_train, X_test = (
    pd.get_dummies(part, prefix=['arr'], columns=['arr_day_part'])
    for part in (X_train, X_test)
)

In [19]:
# One-hot encode the departure part-of-day label in both splits.
X_train, X_test = (
    pd.get_dummies(part, prefix=['dep'], columns=['dep_day_part'])
    for part in (X_train, X_test)
)

In [20]:
# Drop one dummy ('arr_midnight') so it acts as the reference level.
X_train = X_train.drop(columns=['arr_midnight'])
X_test = X_test.drop(columns=['arr_midnight'])

In [21]:
# Drop one dummy ('dep_midnight') so it acts as the reference level.
X_train = X_train.drop(columns=['dep_midnight'])
X_test = X_test.drop(columns=['dep_midnight'])

In [22]:
# Names of the remaining arrival / departure part-of-day dummy columns.
_kept_day_parts = ('dawn', 'evening', 'morning', 'noon')
arr_day_list = ['arr_' + part for part in _kept_day_parts]
dep_day_list = ['dep_' + part for part in _kept_day_parts]

### Encode Aircraft Group

In [23]:
# One-hot encode the aircraft group in both splits with the same settings.
X_train, X_test = (
    pd.get_dummies(part, prefix=['aircraft'], columns=['aircraft_group'])
    for part in (X_train, X_test)
)

In [24]:
# Drop one aircraft dummy as the reference level.
# NOTE(review): the dropped category is the literal label "['4' '6']", which
# looks like a stringified array -- confirm how aircraft_group was built.
aircraft_reference_col = "aircraft_['4' '6']"
X_train = X_train.drop(columns=[aircraft_reference_col])
X_test = X_test.drop(columns=[aircraft_reference_col])

In [25]:
# Names of the remaining aircraft-group dummy columns.
aircraft_list = ['aircraft_' + group for group in ('4', '6')]

### Encode Market Unique Carrier

In [26]:
# One-hot encode the marketing carrier on the training split; drop_first
# removes the first category so it acts as the reference level.
carrier_dummy_kwargs = dict(prefix=['muc'], columns=['mkt_unique_carrier'], drop_first=True)
X_train = pd.get_dummies(X_train, **carrier_dummy_kwargs)

In [27]:
# One-hot encode the marketing carrier on the test split.
# drop_first=True is deliberately NOT used here: it drops whichever category
# sorts first *within this split*, which may differ from the one dropped in
# X_train if that carrier is absent from the test rows. Instead, encode every
# carrier and then keep exactly the columns X_train ended up with (missing
# dummies are filled with 0, extra ones are discarded).
X_test = pd.get_dummies(X_test, prefix=['muc'], columns=['mkt_unique_carrier'])
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [48]:
# Names of the marketing-carrier dummy columns expected after encoding.
mrk_unique_list = ['muc_' + code for code in
                   ('AS', 'B6', 'DL', 'F9', 'G4', 'HA', 'NK', 'UA', 'WN')]

### Encode Market_Op_Carrier_Difference

In [28]:
# Convert the boolean flag to 1 / 0 in both splits.
bool_to_int = {True: 1, False: 0}
X_train['mkt_op_carrier_difference'] = X_train['mkt_op_carrier_difference'].replace(bool_to_int)
X_test['mkt_op_carrier_difference'] = X_test['mkt_op_carrier_difference'].replace(bool_to_int)

### SCALE NUMERICAL FEATURES

#### Passengers_Seat_Ratio, Distance, Taxi_Holdup, crs_elapsed_time

In [29]:
from sklearn.preprocessing import StandardScaler

In [30]:
# Standardize the continuous features on the training split. The scaler is
# fitted here and reused for the test split. The scaled columns are appended
# at the end of the frame in place of the originals.
train_numeric_cols = ['Passengers_Seat_Ratio', 'distance', 'Taxi_Holdup', 'crs_elapsed_time']

scaler = StandardScaler()
# A fresh 0..n-1 index is needed so the positional concat below lines up.
X_train = X_train.reset_index(drop=True)
X_train_scaled = scaler.fit_transform(X_train[train_numeric_cols])
s_df = pd.DataFrame(X_train_scaled, columns=train_numeric_cols)
X_train = pd.concat([X_train.drop(columns=train_numeric_cols), s_df], axis=1)

In [31]:
# Apply the scaler fitted on the training split to the same continuous
# features of the test split (transform only, no refit).
test_numeric_cols = ['Passengers_Seat_Ratio', 'distance', 'Taxi_Holdup', 'crs_elapsed_time']

# A fresh 0..n-1 index is needed so the positional concat below lines up.
X_test = X_test.reset_index(drop=True)
X_test_scaled = scaler.transform(X_test[test_numeric_cols])
ss_df = pd.DataFrame(X_test_scaled, columns=test_numeric_cols)
X_test = pd.concat([X_test.drop(columns=test_numeric_cols), ss_df], axis=1)

In [32]:
# print(X_train.isna().sum(),X_test.isna().sum())

# The city names are encoded in two different ways below (label encoding and feature hashing); use whichever works better

In [33]:
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelEncoder

In [34]:
# Integer-encode origin and destination city names. Each encoder is fitted on
# the training split only and then applied to both splits.
# NOTE(review): transform() on the test split will fail for a city that never
# appears in the training split -- confirm this cannot happen with this data.
LE = LabelEncoder().fit(X_train['origin_city_name'])
X_train['origin_label'] = LE.transform(X_train['origin_city_name'])
X_test['origin_label'] = LE.transform(X_test['origin_city_name'])

le = LabelEncoder().fit(X_train['dest_city_name'])
X_train['dest_label'] = le.transform(X_train['dest_city_name'])
X_test['dest_label'] = le.transform(X_test['dest_city_name'])

#### Encoding Origin_City_Name

In [35]:
# Hash each origin city name into a fixed-width numeric vector
# (columns origin_0 .. origin_9) for both splits.
#
# FeatureHasher(input_type='string') expects every sample to be an iterable
# of string tokens. Passing the bare city string makes it iterate over the
# characters of the name (and newer scikit-learn versions raise a ValueError
# for a bare string), so each city is wrapped in a one-element list to be
# hashed as a single token.
n_origin_features = 10
origin_hash_cols = ['origin_{}'.format(i) for i in range(n_origin_features)]

fh = FeatureHasher(n_features=n_origin_features, input_type='string')

hashed_features = fh.fit_transform([[city] for city in X_train['origin_city_name']])
hashed_features = hashed_features.toarray()
# Reuse the frame's own index so the column-wise concat aligns row for row.
hf = pd.DataFrame(hashed_features, columns=origin_hash_cols, index=X_train.index)
X_train = pd.concat([X_train, hf], axis=1)

hashed_features_test = fh.transform([[city] for city in X_test['origin_city_name']])
hashed_features_test = hashed_features_test.toarray()
hff = pd.DataFrame(hashed_features_test, columns=origin_hash_cols, index=X_test.index)
X_test = pd.concat([X_test, hff], axis=1)

In [50]:
# Names of the ten hashed origin-city feature columns.
origin_list = ['origin_' + str(i) for i in range(10)]

### Encoding dest_city_name

In [36]:
# Hash each destination city name into a fixed-width numeric vector
# (columns dest_0 .. dest_9) for both splits.
#
# FeatureHasher(input_type='string') expects every sample to be an iterable
# of string tokens. Passing the bare city string makes it iterate over the
# characters of the name (and newer scikit-learn versions raise a ValueError
# for a bare string), so each city is wrapped in a one-element list to be
# hashed as a single token.
n_dest_features = 10
dest_hash_cols = ['dest_{}'.format(i) for i in range(n_dest_features)]

fh2 = FeatureHasher(n_features=n_dest_features, input_type='string')

hashed_features2 = fh2.fit_transform([[city] for city in X_train['dest_city_name']])
hashed_features2 = hashed_features2.toarray()
# Reuse the frame's own index so the column-wise concat aligns row for row.
hf2 = pd.DataFrame(hashed_features2, columns=dest_hash_cols, index=X_train.index)
X_train = pd.concat([X_train, hf2], axis=1)

hashed_features_test2 = fh2.transform([[city] for city in X_test['dest_city_name']])
hashed_features_test2 = hashed_features_test2.toarray()
hff2 = pd.DataFrame(hashed_features_test2, columns=dest_hash_cols, index=X_test.index)
X_test = pd.concat([X_test, hff2], axis=1)

In [None]:
# Names of the ten hashed destination-city feature columns.
dest_list = ['dest_' + str(i) for i in range(10)]

In [37]:
# Drop the raw columns that have been replaced by engineered features
# (day-of-week dummies, city labels and hashed city features).
raw_cols = ['fl_date', 'origin_city_name', 'dest_city_name']
X_train = X_train.drop(columns=raw_cols)
X_test = X_test.drop(columns=raw_cols)

# Train and test were one-hot encoded independently, so a category that occurs
# in only one split would leave the two frames with different columns. Force
# the test frame onto the training schema: same columns, same order, a missing
# dummy filled with 0, and any column unseen in training discarded.
# This is a no-op when the two frames already agree.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

### Standardize Y set

In [40]:
# Give both targets a fresh 0..n-1 index, discarding the shuffled one left by
# the train/test split.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [41]:
from sklearn.preprocessing import StandardScaler

In [42]:
# Standardize the target: fit on the training target only, then apply the same
# transform to the test target. StandardScaler needs 2-D input, hence the
# reshape to a single column. Each result is stored as a one-column DataFrame.
scale = StandardScaler()

y_train_scaled = scale.fit_transform(np.asarray(y_train).reshape(-1, 1))
y_train = y_train_df = pd.DataFrame(y_train_scaled)

y_test_scaled = scale.transform(np.asarray(y_test).reshape(-1, 1))
y_test = y_test_df = pd.DataFrame(y_test_scaled)

### TO CSV

In [46]:
# Write the four processed frames to CSV in the working directory
# (the index is written too, as the default to_csv behaviour).
for frame, csv_path in (
    (X_train, 'X_train.csv'),
    (X_test, 'X_test.csv'),
    (y_train, 'y_train.csv'),
    (y_test, 'y_test.csv'),
):
    frame.to_csv(csv_path)