In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import datetime as dt
import sys
import scipy.stats as stats
import urllib
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import torch
from torch import cuda




In [None]:
# Load the raw competition tables.
# NOTE(review): absolute local Windows paths — the notebook only runs where these files exist.
train = pd.read_csv("C:/vscode/open/train.csv")
test = pd.read_csv("C:/vscode/open/test.csv")

In [6]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))


# 칼럼 관리

In [None]:
# For every column print the number of distinct values (NaN counts as one value)
# followed by the number of missing entries.
columns = train.columns

for i in columns:
    column_values = train[i]
    distinct_count = len(column_values.unique())
    missing_count = column_values.isnull().sum()
    print("unique {}:{}".format(i, distinct_count))
    print("count null {}:{}".format(i, missing_count))
    print("")

TABLE의 칼럼들 중, Cancelled(항공편 취소 여부)와 Diverted(회항 여부) 칼럼은 NULL도 없으며 모든 관측값이 동일합니다. 따라서 분석에 영향을 주지 못한다고 판단하여 해당 칼럼을 제거하였습니다. 아래 셀에서는 이와 함께 ID, Origin_Airport_ID, Destination_Airport_ID 칼럼도 제거합니다.

In [None]:
# Keep the row identifiers aside, then remove the identifier columns and the
# two constant columns from the working frame.
ID = train['ID']

_columns_to_remove = ['ID', 'Origin_Airport_ID', 'Destination_Airport_ID', 'Cancelled', 'Diverted']
train.drop(columns=_columns_to_remove, inplace=True)

In [None]:
train.columns

이들 중, NULL이 다수 포함된 칼럼은 'Estimated_Departure_Time', 'Estimated_Arrival_Time', 'Origin_State', 'Destination_State', 'Airline'의 다섯 칼럼입니다.

# Time칼럼 분단위 변환

In [None]:
# Convert the HHMM-encoded scheduled departure time into minutes after midnight.
# Done on the whole column at once instead of a per-row .loc loop.
_dep_hhmm = train['Estimated_Departure_Time']
# Rows that are missing or not strictly positive stay NaN in `edt`, which is
# what the original `> 0` guard produced on a fresh run.
# NOTE(review): a 00:00 departure is encoded as 0 and is therefore treated as
# missing here — confirm that this is intended.
train['edt'] = ((_dep_hhmm // 100) * 60 + _dep_hhmm % 100).where(_dep_hhmm > 0)

In [None]:
# Convert the HHMM-encoded scheduled arrival time into minutes after midnight.
# Done on the whole column at once instead of a per-row .loc loop.
_arr_hhmm = train['Estimated_Arrival_Time']
# Rows that are missing or not strictly positive stay NaN in `eat`, which is
# what the original `> 0` guard produced on a fresh run.
# NOTE(review): a 00:00 arrival is encoded as 0 and is therefore treated as
# missing here — confirm that this is intended.
train['eat'] = ((_arr_hhmm // 100) * 60 + _arr_hhmm % 100).where(_arr_hhmm > 0)

이후, 빈칸을 채우는 데 활용할 수 있도록, eat칼럼에 대한 변형 작업을 거칩니다.
eat < edt라면, 시간대가 변하지 않았다는 가정 하에서는 24시를 넘겨 다음날 도착했다고 보는 것이 타당하므로 해당 조건의 값들에 1440(하루를 분으로 표현)을 더해줍니다.

In [None]:
# An arrival earlier than the departure is treated as a next-day arrival:
# add one day (1440 minutes) to `eat` for those rows.
# The comparison is False whenever either side is NaN, so rows with a missing
# time are left untouched. Works for any number of rows (no hard-coded 999999).
_arrives_next_day = train['eat'] < train['edt']
train.loc[_arrives_next_day, 'eat'] += 1440

# 날짜(월, 일)칼럼 D(eparture)와 A로 별도 분리

추가로, 세분화해서 활용 가능하도록, Month와 Day_of_Month를 도착일자와 출발일자로 분리합니다. 출발 일자는 변경이 없겠지만, 도착일자는 위의 조건과 같은 상황에 따라 변동이 가능합니다.

In [None]:
# Departure date: a straight copy of the scheduled month and day of month.
train = train.assign(dmonth=train['Month'], dday=train['Day_of_Month'])

In [None]:
# Arrival date starts out equal to the departure date; it is shifted later for
# flights that land after midnight.
train = train.assign(amonth=train['Month'], aday=train['Day_of_Month'])

이에, 주별로 공항 데이터를 groupby하여 주 결측값이 있는 데이터에 공항을 통해 주값을 메워주도록 하였습니다. OS(Origin_State)와 DS(Destination_State)를 통해 값을 배정하였습니다.

In [None]:
# One row per state, holding the array of distinct airports observed in that state.
_airports_per_origin_state = train.groupby('Origin_State')['Origin_Airport'].unique()
_airports_per_destination_state = train.groupby("Destination_State")['Destination_Airport'].unique()

OS = _airports_per_origin_state.reset_index()
DS = _airports_per_destination_state.reset_index()

In [None]:
type(OS['Origin_Airport'])

In [None]:
# Fill missing Origin_State values from the airport -> state lookup in OS.
# Replaces the nested row/state loops (hard-coded to 1,000,000 rows and 52
# states) with a single mapping applied to the whole column.
_origin_airport_to_state = (
    OS.explode('Origin_Airport')
      .dropna(subset=['Origin_Airport'])
      # If an airport is listed under several states the last one wins,
      # matching the original loop, which kept overwriting on later matches.
      .drop_duplicates('Origin_Airport', keep='last')
      .set_index('Origin_Airport')['Origin_State']
)
train['Origin_State'] = train['Origin_State'].fillna(
    train['Origin_Airport'].map(_origin_airport_to_state)
)

In [None]:
# Fill missing Destination_State values from the airport -> state lookup in DS.
# Replaces the nested row/state loops (hard-coded to 1,000,000 rows and 52
# states) with a single mapping applied to the whole column.
_destination_airport_to_state = (
    DS.explode('Destination_Airport')
      .dropna(subset=['Destination_Airport'])
      # If an airport is listed under several states the last one wins,
      # matching the original loop, which kept overwriting on later matches.
      .drop_duplicates('Destination_Airport', keep='last')
      .set_index('Destination_Airport')['Destination_State']
)
train['Destination_State'] = train['Destination_State'].fillna(
    train['Destination_Airport'].map(_destination_airport_to_state)
)

이렇게 하면 아무래도 100만개짜리 파일이니 어딘가엔 반드시 매칭되는 공항 정보와 주 정보가 있을 것이고, null값을 처리할 수 있을 거라 봅니다. 그러면 이제 train의 null 정보를 확인해보도록 하겠습니다.

In [None]:
train[['Origin_State', 'Destination_State']].isnull().sum()

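? (위 방법으로도 결측값이 모두 채워지지 않은 것으로 보입니다. 아래에서는 도착 공항을 출발 공항 기준 매핑(OS)과 대조하여 Destination_State를 한 번 더 채워 봅니다.)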

In [None]:
# Second attempt at the remaining Destination_State gaps: look the destination
# airport up in the ORIGIN lookup (OS), in case the airport only ever has a
# known state when it appears as an origin.
# Fixes the original `DS.loc[j, 'Origin_State']`, which raised KeyError because
# DS has no 'Origin_State' column; the state must be read from OS.
_airport_to_state_via_origin = (
    OS.explode('Origin_Airport')
      .dropna(subset=['Origin_Airport'])
      # Last listed state wins if an airport appears under several states.
      .drop_duplicates('Origin_Airport', keep='last')
      .set_index('Origin_Airport')['Origin_State']
)
train['Destination_State'] = train['Destination_State'].fillna(
    train['Destination_Airport'].map(_airport_to_state_via_origin)
)

In [None]:
train[['Origin_State', 'Destination_State']].isnull().sum()

In [None]:
train[train['Destination_State'].isnull()]

In [None]:
train[train['Origin_Airport'] == 'YNG']

In [None]:
train[train['Destination_Airport'] == 'YNG']

In [None]:
train.describe()

In [None]:
plt.hist(train['Distance'], bins=50)

In [None]:
train[train['Distance']>3000].head(50)

In [None]:
# Scheduled flight duration in minutes, only where both times are known (> 0).
# The original loop used `i < 999999` and therefore skipped the last row; the
# vectorised form covers every row regardless of the frame's length.
_both_times_known = (train['edt'] > 0) & (train['eat'] > 0)
train['time'] = (train['eat'] - train['edt']).where(_both_times_known)
    

In [None]:
# Scatter of distance against scheduled duration for rows with a known duration.
_rows_with_duration = train[train['time'].notnull()]
plt.scatter(_rows_with_duration['Distance'], _rows_with_duration['time'])

In [None]:
train[train['time']>1400]

In [None]:
# Mean distance for every (origin state, destination state) pair.
distance = train.groupby(['Origin_State', 'Destination_State'], as_index=False)['Distance'].mean()

In [None]:
# Mean departure minute for every (origin state, destination state) pair.
deptimes = train.groupby(["Origin_State", "Destination_State"], as_index=False)['edt'].mean()

In [None]:
# Mean arrival minute for every (origin state, destination state) pair.
atimes = train.groupby(['Origin_State', 'Destination_State'], as_index=False)['eat'].mean()

In [None]:
# Attach the mean departure and arrival minutes to the per-route distance table.
_route_keys = ['Origin_State', 'Destination_State']
distance = distance.merge(deptimes, how='left', on=_route_keys)
distance = distance.merge(atimes, how='left', on=_route_keys)

In [None]:
# Typical duration of a route: mean arrival minute minus mean departure minute.
distance['times'] = distance['eat'].sub(distance['edt'])

In [None]:
# Row label of the California -> Texas route in `distance`
# (raises IndexError when that route is absent, as before).
_is_ca_to_tx = (distance['Origin_State'] == 'California') & (distance['Destination_State'] == 'Texas')
int(distance.index[_is_ca_to_tx][0])

In [None]:
distance

In [None]:
train

In [None]:
distance[distance['Destination_State']==train.loc[5, 'Destination_State']]

In [None]:
# Impute a missing departure minute as (arrival minute - typical route duration).
# The original re-filtered `distance` twice for every one of 1,000,000 rows;
# here the route duration is attached to all rows with a single left merge.
_route_keys = ['Origin_State', 'Destination_State']
_route_minutes = pd.Series(
    train[_route_keys]
    # many_to_one guards against duplicated routes in `distance`, which would
    # otherwise multiply rows and break the positional alignment below.
    .merge(distance[_route_keys + ['times']], how='left', on=_route_keys, validate='many_to_one')['times']
    .to_numpy(),
    index=train.index,
)
# Only rows without a usable departure, with a known arrival, on a route that
# has a duration are filled; everything else is left as it was.
_needs_edt = ~(train['edt'] > 0) & (train['eat'] > 0) & _route_minutes.notna()
train.loc[_needs_edt, 'edt'] = train['eat'] - _route_minutes

In [None]:
# Impute a missing arrival minute as (departure minute + typical route duration).
# The original re-filtered `distance` twice for every one of 1,000,000 rows;
# here the route duration is attached to all rows with a single left merge.
_route_keys = ['Origin_State', 'Destination_State']
_route_minutes = pd.Series(
    train[_route_keys]
    # many_to_one guards against duplicated routes in `distance`, which would
    # otherwise multiply rows and break the positional alignment below.
    .merge(distance[_route_keys + ['times']], how='left', on=_route_keys, validate='many_to_one')['times']
    .to_numpy(),
    index=train.index,
)
# Only rows without a usable arrival, with a known departure, on a route that
# has a duration are filled; everything else is left as it was.
_needs_eat = ~(train['eat'] > 0) & (train['edt'] > 0) & _route_minutes.notna()
train.loc[_needs_eat, 'eat'] = train['edt'] + _route_minutes

In [None]:
train.isnull().sum()

In [None]:
from sklearn.linear_model import LinearRegression

# Regress Distance on scheduled minutes through the origin (no intercept);
# the single coefficient is the distance covered per minute.
_timed_rows = train.loc[train['time'].notnull(), ['Distance', 'time']]

model = LinearRegression(fit_intercept=False)
model.fit(_timed_rows[['time']], _timed_rows[['Distance']])

print(model.coef_)
print(model.intercept_)

In [None]:
train

In [None]:
# Distance covered per scheduled minute, read from the regression fitted above
# instead of a hand-copied literal (previously 5.38393855), so it stays in sync
# with the data the notebook was actually run on.
dpm = float(np.ravel(model.coef_)[0])

In [None]:
# Replace `time` for every row with the regression-based estimate: distance / (distance per minute).
train['time'] = train['Distance'].div(dpm)

In [None]:
# An imputed departure can fall before midnight (negative minutes); move such
# flights forward by one day, shifting departure and arrival together.
# Vectorised; works for any number of rows (no hard-coded 999999).
_departs_previous_day = train['edt'] < 0
train.loc[_departs_previous_day, ['edt', 'eat']] += 1440

In [None]:
# Round the minute columns to whole minutes for rows with a positive departure.
# Series.round() leaves NaN untouched, whereas the built-in round() used in the
# original per-row loop can raise on a NaN arrival value.
_has_departure = train['edt'] > 0
train.loc[_has_departure, ['eat', 'edt']] = train.loc[_has_departure, ['eat', 'edt']].round()

In [None]:
# Checkpoint the frame to disk. The index is written as well (to_csv default),
# so reading this file back yields an extra unnamed leading column.
# NOTE(review): absolute local path — only valid on the author's machine.
train.to_csv("C:/vscode/open/final/training.csv")

In [None]:
# Distinct airlines (possibly including NaN) seen at each origin airport.
_airlines_per_origin = train.groupby('Origin_Airport')['Airline'].unique()
oa = _airlines_per_origin.reset_index().dropna()

In [None]:
# Distinct airlines (possibly including NaN) seen at each destination airport.
_airlines_per_destination = train.groupby('Destination_Airport')['Airline'].unique()
da = _airlines_per_destination.reset_index().dropna()

In [None]:
# Distinct airlines (possibly including NaN) seen for each tail number.
_airlines_per_tail = train.groupby('Tail_Number')['Airline'].unique()
ta = _airlines_per_tail.reset_index().dropna()

In [None]:
import random

# The airline imputation in the following cells draws candidates at random;
# seed the generator so a Restart & Run All reproduces the same filled values.
random.seed(42)

In [None]:
# Distinct airlines per origin airport and per destination airport
# (same grouping as oa/da, but without the dropna step).
_by_origin_airport = train.groupby('Origin_Airport')['Airline'].unique()
_by_destination_airport = train.groupby('Destination_Airport')['Airline'].unique()

oairlineairport = _by_origin_airport.reset_index()
dairlineairport = _by_destination_airport.reset_index()

In [None]:
# Distinct airlines observed for each DOT carrier id.
_airlines_per_carrier_id = train.groupby('Carrier_ID(DOT)')['Airline'].unique()
carr_id = _airlines_per_carrier_id.reset_index()

In [None]:
# Distinct airlines observed for each IATA carrier code.
_airlines_per_carrier_code = train.groupby("Carrier_Code(IATA)")['Airline'].unique()
ccod = _airlines_per_carrier_code.reset_index()

In [None]:
# Impute a missing Airline with a random airline that has been observed at the
# row's origin airport AND destination airport AND DOT carrier id.
#
# Changes versus the original triple-nested scan:
#   * lookups are dictionaries, and only rows with a missing Airline are visited;
#   * NaN is removed from the candidate sets, so a gap is never "filled" with NaN;
#   * rows with no common candidate are skipped instead of crashing in
#     random.randint(0, -1).
_origin_airlines = {
    key: {a for a in airlines if pd.notna(a)}
    for key, airlines in zip(oairlineairport['Origin_Airport'], oairlineairport['Airline'])
}
_destination_airlines = {
    key: {a for a in airlines if pd.notna(a)}
    for key, airlines in zip(dairlineairport['Destination_Airport'], dairlineairport['Airline'])
}
_carrier_id_airlines = {
    key: {a for a in airlines if pd.notna(a)}
    for key, airlines in zip(carr_id['Carrier_ID(DOT)'], carr_id['Airline'])
}

_nothing = set()
for _row in train.index[train['Airline'].isnull()]:
    _candidates = (
        _origin_airlines.get(train.at[_row, 'Origin_Airport'], _nothing)
        & _destination_airlines.get(train.at[_row, 'Destination_Airport'], _nothing)
        & _carrier_id_airlines.get(train.at[_row, 'Carrier_ID(DOT)'], _nothing)
    )
    if _candidates:
        # sorted() fixes the order so the draw is reproducible under a fixed seed.
        train.at[_row, 'Airline'] = random.choice(sorted(_candidates))

In [None]:
# Impute a missing Airline from airlines seen at the origin airport, the
# destination airport and the tail number; when the three sets have nothing in
# common, fall back to the airlines shared by the two airports.
#
# Changes versus the original nested scan:
#   * lookups are dictionaries, and only rows with a missing Airline are visited;
#   * NaN is removed from the candidate sets;
#   * an empty candidate set skips the row instead of crashing in randint(0, -1).
_origin_airlines = {
    key: {a for a in airlines if pd.notna(a)}
    for key, airlines in zip(oa['Origin_Airport'], oa['Airline'])
}
_destination_airlines = {
    key: {a for a in airlines if pd.notna(a)}
    for key, airlines in zip(da['Destination_Airport'], da['Airline'])
}
_tail_airlines = {
    key: {a for a in airlines if pd.notna(a)}
    for key, airlines in zip(ta['Tail_Number'], ta['Airline'])
}

for _row in train.index[train['Airline'].isnull()]:
    _at_origin = _origin_airlines.get(train.at[_row, 'Origin_Airport'])
    _at_destination = _destination_airlines.get(train.at[_row, 'Destination_Airport'])
    _for_tail = _tail_airlines.get(train.at[_row, 'Tail_Number'])
    # As in the original, a row is only handled when all three keys are known.
    if _at_origin is None or _at_destination is None or _for_tail is None:
        continue
    _shared_by_airports = _at_origin & _at_destination
    _candidates = (_shared_by_airports & _for_tail) or _shared_by_airports
    if _candidates:
        # sorted() fixes the order so the draw is reproducible under a fixed seed.
        train.at[_row, 'Airline'] = random.choice(sorted(_candidates))

In [None]:
# Impute a missing Airline from airlines seen at the origin airport, the
# destination airport and the IATA carrier code; when the three sets have
# nothing in common, fall back to the airlines shared by the two airports.
#
# Fixes versus the original nested scan:
#   * the random index bound was computed from `ta` (tail-number lookup) while
#     the list being indexed came from `ccod` — a copy/paste slip that could
#     index out of range or fail on a missing `ta` row;
#   * NaN is removed from the candidate sets;
#   * an empty candidate set skips the row instead of crashing in randint(0, -1);
#   * lookups are dictionaries, and only rows with a missing Airline are visited.
_origin_airlines = {
    key: {a for a in airlines if pd.notna(a)}
    for key, airlines in zip(oa['Origin_Airport'], oa['Airline'])
}
_destination_airlines = {
    key: {a for a in airlines if pd.notna(a)}
    for key, airlines in zip(da['Destination_Airport'], da['Airline'])
}
_carrier_code_airlines = {
    key: {a for a in airlines if pd.notna(a)}
    for key, airlines in zip(ccod['Carrier_Code(IATA)'], ccod['Airline'])
}

for _row in train.index[train['Airline'].isnull()]:
    _at_origin = _origin_airlines.get(train.at[_row, 'Origin_Airport'])
    _at_destination = _destination_airlines.get(train.at[_row, 'Destination_Airport'])
    _for_code = _carrier_code_airlines.get(train.at[_row, 'Carrier_Code(IATA)'])
    # As in the original, a row is only handled when all three keys are known.
    if _at_origin is None or _at_destination is None or _for_code is None:
        continue
    _shared_by_airports = _at_origin & _at_destination
    _candidates = (_shared_by_airports & _for_code) or _shared_by_airports
    if _candidates:
        # sorted() fixes the order so the draw is reproducible under a fixed seed.
        train.at[_row, 'Airline'] = random.choice(sorted(_candidates))

In [None]:
train.isnull().sum()

In [None]:
# Impute a missing Airline with a random airline seen at both the origin and
# the destination airport.
#
# Fixes versus the original:
#   * the scans stopped at `j < 52` / `k < 52` (the number of states), so only
#     the first 52 airports of each lookup were ever considered;
#   * NaN is removed from the candidate sets;
#   * an empty intersection skips the row instead of crashing in randint(0, -1).
_origin_airlines = {
    key: {a for a in airlines if pd.notna(a)}
    for key, airlines in zip(oa['Origin_Airport'], oa['Airline'])
}
_destination_airlines = {
    key: {a for a in airlines if pd.notna(a)}
    for key, airlines in zip(da['Destination_Airport'], da['Airline'])
}

_nothing = set()
for _row in train.index[train['Airline'].isnull()]:
    _candidates = (
        _origin_airlines.get(train.at[_row, 'Origin_Airport'], _nothing)
        & _destination_airlines.get(train.at[_row, 'Destination_Airport'], _nothing)
    )
    if _candidates:
        # sorted() fixes the order so the draw is reproducible under a fixed seed.
        train.at[_row, 'Airline'] = random.choice(sorted(_candidates))

In [None]:
train.isnull().sum()

In [None]:
# Impute a missing Airline with a random airline seen at the row's origin airport.
#
# The original repeated full passes until at most 50 gaps were left, because
# NaN itself was among the candidates and could be drawn again; that loop never
# ends when more than 50 rows have no non-missing candidate. With NaN removed
# from the candidates a single pass fills every row that can be filled, and
# rows without any candidate are left for the later state-level fallback.
_origin_airlines = {
    key: sorted(a for a in airlines if pd.notna(a))
    for key, airlines in zip(oa['Origin_Airport'], oa['Airline'])
}

for _row in train.index[train['Airline'].isnull()]:
    _candidates = _origin_airlines.get(train.at[_row, 'Origin_Airport'])
    if _candidates:
        train.at[_row, 'Airline'] = random.choice(_candidates)

In [None]:
train.isnull().sum()

In [None]:
# Distinct airlines (possibly including NaN) observed per origin state.
# NOTE(review): the name `os` is also the name of the standard-library module;
# it would shadow that module if it were ever imported in this notebook.
os = train.groupby("Origin_State")['Airline'].unique().reset_index()

In [None]:
# Last-resort imputation: pick a random airline seen in the row's origin state.
# Uses every row of the `os` lookup (the original stopped at a hard-coded 52)
# and never draws NaN as the replacement value.
_state_airlines = {
    key: sorted(a for a in airlines if pd.notna(a))
    for key, airlines in zip(os['Origin_State'], os['Airline'])
}

for _row in train.index[train['Airline'].isnull()]:
    _candidates = _state_airlines.get(train.at[_row, 'Origin_State'])
    if _candidates:
        train.at[_row, 'Airline'] = random.choice(_candidates)

In [None]:
train.isnull().sum()

In [None]:
missing_airport = train[train['Destination_State'].isnull()]['Airline']

In [None]:

train[train['Airline']=='Allegiant Air']['Destination_State'].mode()
    

In [None]:
# Fill every remaining Destination_State gap with the constant 'Florida'.
# NOTE(review): presumably the mode printed by the previous cell for
# 'Allegiant Air' — confirm; the value is hard-coded rather than computed.
train['Destination_State'] = train['Destination_State'].fillna('Florida')

In [None]:
train.isnull().sum()

In [None]:
# The carrier identifiers were only needed for the airline imputation.
_carrier_columns = ['Carrier_Code(IATA)', 'Carrier_ID(DOT)']
train = train.drop(_carrier_columns, axis=1)

In [None]:
train.describe()

In [None]:
train.to_csv("C:/vscode/open/final/training2.csv")

In [None]:
# Reload the working frame from disk.
# NOTE(review): this reads "training2a.csv", while the preceding cell writes
# "training2.csv"; nothing visible in this notebook produces the "2a" file, so
# a fresh Restart & Run All depends on a file created outside the notebook.
train = pd.read_csv("C:/vscode/open/final/training2a.csv")

# edt, eat 칼럼 정리, 날짜변환

In [None]:
# Arrivals at or beyond 1440 minutes land on the next calendar day:
# bring `eat` back into the 0-1439 range and advance the arrival day by one.
# Vectorised; works for any number of rows (no hard-coded 999999).
_lands_next_day = train['eat'] >= 1440
train.loc[_lands_next_day, 'eat'] -= 1440
train.loc[_lands_next_day, 'aday'] += 1

In [None]:
train['edt'].describe()

In [None]:
train['aday'].describe()

In [None]:
# A day number of 32 or more cannot exist: roll over to day 1 of the next month.
# Vectorised; works for any number of rows (no hard-coded 999999).
# NOTE(review): a rollover from December yields amonth == 13 (no wrap to 1);
# kept as in the original — confirm whether downstream code expects that.
_past_day_31 = train['aday'] >= 32
train.loc[_past_day_31, 'amonth'] += 1
train.loc[_past_day_31, 'aday'] = 1

In [None]:
# Roll arrival dates that run past the end of a short month into the next month:
# February is treated as 28 days, and months 4, 6, 9, 11 as 30 days.
# Both masks are computed before anything is modified, so a row is adjusted at
# most once — the same outcome as the original per-row if/elif.
# NOTE(review): leap years are not considered (29 February always rolls over).
_past_february = (train['amonth'] == 2) & (train['aday'] >= 29)
_past_30_day_month = train['amonth'].isin([4, 6, 9, 11]) & (train['aday'] >= 31)

train.loc[_past_february, 'aday'] -= 28
train.loc[_past_30_day_month, 'aday'] -= 30
train.loc[_past_february | _past_30_day_month, 'amonth'] += 1

In [None]:
train.isnull().sum()

# 테이블 컬럼 정리

In [None]:
# Month / Day_of_Month are superseded by the dmonth/dday and amonth/aday columns.
train.drop(['Month', 'Day_of_Month'], axis=1, inplace=True)

In [None]:
# Remove the index column that came back from the CSV round trip.
train.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
# The raw HHMM columns are superseded by the minute-based edt / eat columns.
train.drop(['Estimated_Departure_Time', 'Estimated_Arrival_Time'], axis=1, inplace=True)

In [None]:
# Remove the row labelled 1000000; raises KeyError when no such row exists.
# NOTE(review): presumably a stray extra row in the reloaded CSV — confirm where it comes from.
train = train.drop(index = 1000000)

In [None]:
train

In [None]:
# Keep only the modelling columns, in a fixed order (times, dates, route, carrier, target).
_final_columns = [
    'edt', 'eat', 'dmonth', 'dday', 'amonth', 'aday',
    'Origin_Airport', 'Origin_State', 'Destination_Airport',
    'Destination_State', 'Distance', 'Airline', 'Tail_Number', 'time', 'Delay',
]
train = train[_final_columns]

In [None]:
train.isnull().sum()

# 모델 적합

In [None]:
train.to_csv("C:/vscode/open/final/training3a.csv")

In [None]:
train = pd.read_csv("C:/vscode/open/final/training3a.csv")

In [None]:
# Only rows with a known Delay label can be used for supervised training.
_has_label = train['Delay'].notna()
data = train.loc[_has_label]

In [None]:
# Separate the label from the features.
# `data` is a filtered view of `train`; dropping the column in place on it
# triggers SettingWithCopyWarning, so build a new frame instead.
target = data['Delay']
data = data.drop(columns='Delay')

In [None]:
data

In [None]:
# Encode the label: Not_Delayed -> 0, Delayed -> 1 (other values are left unchanged).
_label_codes = {'Not_Delayed': 0, 'Delayed': 1}
target = target.replace(_label_codes)

In [None]:
data = data.reset_index()

In [None]:
data

In [None]:
target=target.reset_index()

In [None]:
# Re-encode the minute-of-day columns as hours * 100 + (remaining minutes / 60).
# Same formula as the original per-row loop, applied to whole columns.
# NOTE(review): the remainder is stored as a fraction of an hour (90 minutes ->
# 100.5), not as HHMM (130). Kept unchanged so the training features stay
# consistent with any test set preprocessed the same way — confirm the intent.
for _time_column in ('edt', 'eat'):
    _minutes = data[_time_column].astype(float)
    data[_time_column] = (_minutes // 60) * 100 + (_minutes % 60) / 60

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
train.to_csv("C:/vscode/open/final/training3b.csv")

In [None]:
# Airport encoder shared by the origin and destination columns.
# Two fixes:
#   * `encoder.fit(...)` returns the encoder itself, so `encoder1` used to be
#     just another name for the one shared `encoder` object and lost its
#     classes as soon as that object was re-fitted on another column; it now
#     gets its own LabelEncoder instance;
#   * it is fitted on the airports of BOTH columns, so a destination airport
#     that never occurs as an origin no longer makes transform() fail.
_all_airports = pd.concat([data['Origin_Airport'], data['Destination_Airport']])
encoder1 = LabelEncoder().fit(_all_airports)
data["Origin_Airport"] = encoder1.transform(data['Origin_Airport'])
data['Destination_Airport'] = encoder1.transform(data['Destination_Airport'])

In [None]:
# State encoder shared by the origin and destination columns.
# Two fixes:
#   * `encoder.fit(...)` returns the encoder itself, so `encoder2` used to be
#     just another name for the one shared `encoder` object and lost its
#     classes as soon as that object was re-fitted on another column; it now
#     gets its own LabelEncoder instance;
#   * it is fitted on the states of BOTH columns, so an origin state that never
#     occurs as a destination no longer makes transform() fail.
_all_states = pd.concat([data['Origin_State'], data['Destination_State']])
encoder2 = LabelEncoder().fit(_all_states)
data['Origin_State'] = encoder2.transform(data['Origin_State'])
data['Destination_State'] = encoder2.transform(data['Destination_State'])

In [None]:
# Airline encoder. A dedicated LabelEncoder instance is used because
# `encoder.fit(...)` returns the shared `encoder` object itself, whose classes
# are overwritten the next time it is fitted on a different column.
encoder3 = LabelEncoder().fit(data['Airline'])
data['Airline'] = encoder3.transform(data['Airline'])

In [None]:
data2 = pd.read_csv("C:/vscode/open/final/dta.csv")

In [None]:
# Tail-number encoding for the auxiliary `data2` frame. Uses its own
# LabelEncoder instance: fitting the shared `encoder` here would silently
# replace the classes behind every other name bound to that same object.
# NOTE(review): `encoder4` is fitted again on `data` in the next cell, so the
# fit on `data2` only affects `data2` itself.
encoder4 = LabelEncoder().fit(data2['Tail_Number'])
data2['Tail_Number'] = encoder4.transform(data2['Tail_Number'])

In [None]:
# Tail-number encoder for the training features. A dedicated LabelEncoder
# instance is used because `encoder.fit(...)` returns the shared `encoder`
# object itself, which would make this fit overwrite the classes seen through
# every other name bound to that object.
encoder4 = LabelEncoder().fit(data['Tail_Number'])
data['Tail_Number'] = encoder4.transform(data['Tail_Number'])

In [None]:
# Remove the old row labels that reset_index() turned into a column.
data = data.drop('index', axis=1)

In [None]:
# Remove the index column that came back from the CSV round trip.
data = data.drop('Unnamed: 0', axis=1)

In [None]:
data.dtypes

In [None]:
# A second CSV round trip can leave one more stray index column; drop it only when present.
_stray_column = 'Unnamed: 0.1'
if _stray_column in data.columns:
    data.drop(_stray_column, axis=1, inplace=True)

In [None]:
# Mark the encoded origin airport as categorical.
data = data.astype({'Origin_Airport': 'category'})

In [None]:
# Mark the encoded destination airport as categorical.
data = data.astype({'Destination_Airport': 'category'})

In [None]:
# Mark the encoded origin state as categorical.
data = data.astype({'Origin_State': 'category'})

In [None]:
# Mark the encoded destination state as categorical.
data = data.astype({'Destination_State': 'category'})

In [None]:
# Mark the encoded airline as categorical.
data = data.astype({'Airline': 'category'})

In [None]:
# Mark the encoded tail number as categorical.
data = data.astype({'Tail_Number': 'category'})

In [None]:
# Names of the categorical feature columns.
col = [
    'Origin_Airport', 'Destination_Airport',
    'Origin_State', 'Destination_State',
    'Airline', 'Tail_Number',
]

In [None]:
# Make sure none of the categorical features carries an ordering.
for i in col:
    _categorical = data[i]
    data[i] = _categorical.cat.as_unordered()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training distances AND apply it to them.
# Previously the scaler was only fitted: the model was trained on the raw
# Distance column while the test set's Distance is passed through
# scaler.transform() before prediction, so the two were on different scales.
# NOTE: not idempotent — re-running this cell would scale the column twice.
data['Distance'] = scaler.fit_transform(data[['Distance']]).ravel()

In [None]:
data.dtypes

In [None]:
target

In [None]:
# Remove the old row labels that reset_index() turned into a column.
target.drop('index', axis=1, inplace=True)

In [None]:
# Store the 0/1 label as integers.
target['Delay'] = target['Delay'].astype(int)

In [None]:
# Expand the 0/1 label into two indicator columns, in this order:
#   Delayed     = 1 when Delay is non-zero, else 0
#   Not_Delayed = 1 when Delay is zero, else 0
# Column-wise comparison replaces the per-row .loc loop; the values are kept
# as floats, which is what the row-wise assignment produced.
target['Delayed'] = (target['Delay'] != 0).astype(float)
target['Not_Delayed'] = (target['Delay'] == 0).astype(float)

In [None]:
# Keep only the two indicator columns and store them as integers.
target.drop('Delay', axis=1, inplace=True)

for _indicator in ('Delayed', 'Not_Delayed'):
    target[_indicator] = target[_indicator].astype('int')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled rows for validation. A fixed random_state makes
# the split — and therefore the reported log-loss — reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(data, target, test_size=0.25, random_state=42)

In [None]:
import xgboost as xgb

In [None]:
from xgboost import XGBClassifier

In [None]:
# Wrap the train / validation splits for XGBoost, letting it consume the
# pandas categorical columns directly.
dtrain = xgb.DMatrix(xtrain, label=ytrain, enable_categorical=True)
dtest = xgb.DMatrix(xtest, label=ytest, enable_categorical=True)

# Booster settings: deep trees with a small learning rate; positive examples
# are up-weighted by a factor of 5.
params = dict(
    max_depth=14,
    learning_rate=0.03,
    scale_pos_weight=5,
    min_child_weight=3,
    objective='binary:logistic',
    eval_metric='logloss',
    booster='gbtree',
)

num_rounds = 150

In [None]:
xgbmodel = xgb.train(params = params, num_boost_round = num_rounds, dtrain=dtrain)

In [None]:
pred = xgbmodel.predict(dtest)

In [None]:
from sklearn.metrics import log_loss

In [None]:
log_loss(ytest, pred)

In [None]:
pred

In [None]:
testdata = pd.read_csv("C:/vscode/open/final/testdata4.csv")

In [None]:
# Second, independent read of the same CSV that `testdata` was loaded from.
# NOTE(review): `testdata2` is not referenced anywhere else in the visible notebook.
testdata2 = pd.read_csv("C:/vscode/open/final/testdata4.csv")

In [None]:
# Register airports that occur only in the test set so encoder1.transform()
# does not fail on them. Both airport columns are scanned, because encoder1 is
# applied to Destination_Airport as well; previously only Origin_Airport was
# checked, so an unseen destination airport still raised at transform time.
_test_airports = np.unique(pd.concat([testdata['Origin_Airport'], testdata['Destination_Airport']]))
_known_airports = set(encoder1.classes_)
_unseen_airports = [label for label in _test_airports if label not in _known_airports]
if _unseen_airports:
    encoder1.classes_ = np.append(encoder1.classes_, _unseen_airports)

In [None]:
# Encode both airport columns of the test set with the airport encoder.
for _airport_column in ('Origin_Airport', 'Destination_Airport'):
    testdata[_airport_column] = encoder1.transform(testdata[_airport_column])

In [None]:
testdata['Origin_State'].mode()

In [None]:
testdata['Destination_State'].mode()

In [None]:
# Missing origin states in the test set default to 'California'.
_default_origin_state = 'California'
testdata['Origin_State'] = testdata['Origin_State'].fillna(_default_origin_state)

In [None]:
# Missing destination states in the test set default to 'California'.
_default_destination_state = 'California'
testdata['Destination_State'] = testdata['Destination_State'].fillna(_default_destination_state)

In [None]:
# Encode both state columns of the test set with the state encoder.
for _state_column in ('Origin_State', 'Destination_State'):
    testdata[_state_column] = encoder2.transform(testdata[_state_column])

In [None]:
testdata['Airline'].mode()

In [None]:
# Missing airlines in the test set default to "Southwest Airlines Co.".
_default_airline = "Southwest Airlines Co."
testdata['Airline'] = testdata['Airline'].fillna(_default_airline)

In [None]:
# Register airlines that occur only in the test set so encoder3.transform()
# does not fail on them.
# Fixes a typo: the original assigned to `encoder3.classes` (no trailing
# underscore), which created an unrelated attribute and never extended the
# encoder's real `classes_` array.
for label in np.unique(testdata['Airline']):
    if label not in encoder3.classes_:
        encoder3.classes_ = np.append(encoder3.classes_, label)

In [None]:
testdata['Airline'] = encoder3.transform(testdata['Airline'])

In [None]:
# Append every tail number that the encoder has not seen yet, one at a time,
# in the sorted order returned by np.unique.
_test_tail_numbers = np.unique(testdata['Tail_Number'])
for label in _test_tail_numbers:
    _already_known = label in encoder4.classes_
    if not _already_known:
        encoder4.classes_ = np.append(encoder4.classes_, label)

In [None]:
testdata['Tail_Number'] = encoder4.transform(testdata['Tail_Number'])

In [None]:
samsub = pd.read_csv("C:/vscode/open/sample_submission.csv")

In [None]:
testdata.isnull().sum()

In [None]:
# Standardise the test distances with the scaler fitted on the training Distance column.
# NOTE(review): the model must have been trained on a Distance column that went
# through this same scaler; otherwise train and test are on different scales.
testdata['Distance'] = scaler.transform(pd.DataFrame(testdata['Distance']))

In [None]:
testdata

In [None]:
data

In [None]:
# Remove the index column that came back from the CSV round trip.
testdata.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
# Names of the categorical feature columns in the test set.
clist = [
    'Origin_Airport', 'Origin_State',
    'Destination_State', 'Destination_Airport',
    'Airline', 'Tail_Number',
]

In [None]:
# Convert every categorical feature of the test set to an unordered pandas category.
# NOTE(review): the categories are derived from the values present in the test
# set alone. If XGBoost consumes the pandas category codes (as documented for
# enable_categorical), these codes only line up with the training frame when
# both frames contain exactly the same set of values per column — confirm.
for i in clist:
    testdata[i] = testdata[i].astype('category')
    testdata[i] = testdata[i].cat.as_unordered()

In [None]:
testdata

In [None]:
testdata.dtypes

# Model Predict I

In [None]:
# Keep the submission ids aside before the ID column is dropped.
samid = samsub.loc[:, 'ID']

In [None]:
# Leave only the probability columns in the sample submission frame.
samsub.drop('ID', axis=1, inplace=True)

In [None]:
# Put the columns in the same order as the training target: Delayed, then Not_Delayed.
_target_order = ['Delayed', 'Not_Delayed']
samsub = samsub[_target_order]

In [None]:
# Feature matrix for the final predictions. No label is attached: labels are
# not used by predict(), and passing the placeholder values of the sample
# submission only added a needless row-count dependency on that file.
dsubmit = xgb.DMatrix(data=testdata, enable_categorical=True)

In [None]:
submis1 = xgbmodel.predict(dsubmit)

In [None]:
# Wrap the raw prediction array in a DataFrame (default integer column names).
submis1 = pd.DataFrame(data=submis1)

In [None]:
# Put the ids and the predictions side by side (aligned on the row index).
_submission_parts = [samid, submis1]
samsub1 = pd.concat(_submission_parts, axis=1)

In [None]:
# Name the prediction columns in the order of the training target: Delayed, then Not_Delayed.
samsub1 = samsub1.set_axis(['ID', 'Delayed', 'Not_Delayed'], axis=1)

In [None]:
# Reorder the columns for the submission file: ID, Not_Delayed, Delayed.
_submission_order = ["ID", "Not_Delayed", 'Delayed']
samsub1 = samsub1[_submission_order]

In [None]:
samsub1

In [None]:
samsub1 = samsub1.set_index("ID")

In [None]:
samsub1

In [None]:
samsub1.to_csv("C:/vscode/open/submission/samsub9.csv")

# Model Predict II

In [None]:
samsub2 = samsub1.copy()

In [None]:
samsub2 = samsub2.reset_index()

In [None]:
# Rescale the two predicted scores of every row so that they sum to 1.
# Fixes the original loop, which overwrote Not_Delayed first and then divided
# Delayed by a total that already contained the NORMALISED Not_Delayed, so the
# pair did not sum to 1. The row total is now computed once, before either
# column is changed. Vectorised; no hard-coded row count of 1,000,000.
_row_total = samsub2['Not_Delayed'] + samsub2['Delayed']
samsub2['Not_Delayed'] = samsub2['Not_Delayed'] / _row_total
samsub2['Delayed'] = samsub2['Delayed'] / _row_total

In [None]:
samsub2 = samsub2.set_index("ID")

In [None]:
samsub2.to_csv("C:/vscode/open/submission/samsub10.csv")

In [None]:
samsub3 = samsub2.copy().reset_index()

In [None]:
# Move 0.1 of probability from Not_Delayed to Delayed for every row.
# Vectorised (no per-row loop, no hard-coded row count). The results are
# clipped to [0, 1] because the plain shift could otherwise produce negative
# or greater-than-one values, which are not valid probabilities.
samsub3['Not_Delayed'] = (samsub3['Not_Delayed'] - 0.1).clip(lower=0.0, upper=1.0)
samsub3['Delayed'] = (samsub3['Delayed'] + 0.1).clip(lower=0.0, upper=1.0)

In [None]:
# Attach the ids only when the frame does not carry them yet. `samsub3` is
# built with reset_index() from a frame indexed by ID, so it normally already
# has an 'ID' column; concatenating `samid` again produced two 'ID' columns
# and broke the following set_index("ID").
if 'ID' not in samsub3.columns:
    samsub3 = pd.concat([samid, samsub3], axis=1)

In [None]:
samsub3 = samsub3.set_index("ID")

In [None]:
samsub3