In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the weather observations from a CSV file expected in the working directory.
weather_data_prep = pd.read_csv("weather_data_prep.csv")

## Convert the DATE column to datetime format

In [3]:
# Replace the raw DATE values with parsed datetimes (column position is unchanged).
weather_data_prep = weather_data_prep.assign(
    DATE=pd.to_datetime(weather_data_prep['DATE'])
)

## Drop useless columns

We consider SNWD (snow depth) and PGTM (peak gust time) useless because they are null. SNOW is dropped as well.

In [4]:
# Remove the three weather columns that are not used further on.
weather_data_prep = weather_data_prep.drop(columns=['SNWD', 'PGTM', 'SNOW'])

## Aggregate by date

In [5]:
# Collapse the weather readings to a single row per DATE.
# Each output column is named "<source column>_<statistic>"; the keyword order
# below fixes the resulting column order.
weather_data_prep = weather_data_prep.groupby(['DATE']).agg(
    AWND_mean=('AWND', 'mean'),
    PRCP_mean=('PRCP', 'mean'),
    TAVG_mean=('TAVG', 'mean'),
    WDF5_mean=('WDF5', 'mean'),
    TMAX_max=('TMAX', 'max'),
    WSF2_max=('WSF2', 'max'),
    WSF5_max=('WSF5', 'max'),
    WT01_max=('WT01', 'max'),
    WT02_max=('WT02', 'max'),
    WT03_max=('WT03', 'max'),
    WT08_max=('WT08', 'max'),
    TMIN_min=('TMIN', 'min'),
)








## Add columns

In [6]:
# Derive two 0/1 indicator columns from the per-date aggregates, then turn the
# DATE index back into a regular column.
#   IceRoad: 1 when the minimum temperature of the date is below 37.4
#            (unit is not stated in this notebook - presumably Fahrenheit; confirm).
#   WetDay:  1 when the mean precipitation of the date is strictly positive.
# Missing inputs compare as False and therefore yield 0.
weather_data_prep = (
    weather_data_prep
    .assign(
        IceRoad=lambda daily: np.where(daily['TMIN_min'] < 37.4, 1, 0),
        WetDay=lambda daily: np.where(daily['PRCP_mean'] > 0, 1, 0),
    )
    .reset_index()
)

## Load airport dataset

In [7]:
# Load the airport movements file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of types.
# NOTE(review): the stray output left after this cell looks like the tail of a
# pandas mixed-dtype warning, which is what this option addresses - confirm.
Airport_Data = pd.read_csv("Airport_Data.csv", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


## We drop useless columns


In [8]:
# Columns of the airport file that are not used by the rest of the preparation.
airport_columns_to_drop = [
    'stand_last_change', 'sto', 'atot', 'aobt', 'chocks_on',
    'stand_scheduled', 'last_distance_to_gate', 'last_in_sector',
    'status', 'mode_s', 'acReg', 'partition',
    'vdgs_in', 'stand_active', 'stand_docking', 'aibt_received',
    'sqt', 'plb_on', 'pca_on', 'gpu_on', 'towbar_on',
    'plb_off', 'pca_off', 'gpu_off', 'acars_out', 'vdgs_out',
    'stand_free', 'eobt', 'aldt_received', 'stand_prepared',
    'stand_auto_start', 'roll', 'speed',
]

# Raises KeyError if any listed column is absent from the file.
Airport_Data = Airport_Data.drop(columns=airport_columns_to_drop)

## We add Date column


In [9]:
# Add a DATE column holding the landing timestamp ('aldt') truncated to midnight.
# Values of 'aldt' that cannot be parsed become NaT.
Airport_Data = Airport_Data.assign(
    DATE=lambda flights: pd.to_datetime(flights['aldt'], errors='coerce').dt.normalize()
)

## Merging the two

In [10]:
# Attach the per-date weather features to every flight; flights whose DATE has
# no weather row are kept, with missing values in the weather columns.
Airport_and_weather = Airport_Data.merge(weather_data_prep, how='left', on='DATE')

In [11]:
# Mapping from the raw column names to the readable names used from here on.
readable_column_names = {
    "carrier": "Airline",
    "flight": "FlightNumber",
    "DATE": "Date",
    "acType": "AircraftType",
    "ship": "ShipmentWeight",
    'runway': 'Runway',
    'stand': 'Stand',
    'aldt': 'ActualLandingTime',
    'eibt': 'EstimatedInBlockTime',
    'cibt': 'CalculatedInBlockTime',
    'aibt': 'ActualInBlockTime',
}

# errors="raise" makes a missing source column fail loudly instead of being skipped.
Airport_and_weather = Airport_and_weather.rename(
    columns=readable_column_names,
    errors="raise",
)

In [12]:
# TaxiTime = minutes elapsed between landing and arrival in-block.
# Unparseable timestamps become NaT, which makes the resulting TaxiTime NaN.
in_block_time = pd.to_datetime(Airport_and_weather['ActualInBlockTime'], errors='coerce')
landing_time = pd.to_datetime(Airport_and_weather['ActualLandingTime'], errors='coerce')

Airport_and_weather['TaxiTime'] = (in_block_time - landing_time).dt.total_seconds() / 60






In [13]:
# Remove exact duplicate rows, then keep only rows whose TaxiTime is a finite
# number (this discards NaN and infinite values).
Airport_and_weather = (
    Airport_and_weather
    .drop_duplicates()
    .loc[lambda flights: np.isfinite(flights['TaxiTime'])]
)

In [14]:
# Bucket the shipment weight into three size classes:
#   'S' : weight < 2000
#   'M' : 2000 <= weight < 6000
#   'L' : weight >= 6000
# The weight is parsed once; values that cannot be parsed become NaN and match
# none of the conditions.
shipment_weight = pd.to_numeric(Airport_and_weather['ShipmentWeight'], errors='coerce')

conditions = [
    shipment_weight < 2000,
    (shipment_weight >= 2000) & (shipment_weight < 6000),
    shipment_weight >= 6000,
]

choices = ['S', 'M', 'L']

# np.select's implicit default is the integer 0. Mixing it with string choices
# raises a TypeError on recent NumPy and is silently stringified to '0' on older
# releases. Passing the string '0' explicitly keeps that historical marker for
# rows with a missing/unparseable weight and works on every NumPy version.
Airport_and_weather['ShipmentWeightCat'] = np.select(conditions, choices, default='0')

In [15]:
# Count, for every flight, how many other flights landed in the 10 minutes
# leading up to (and including) its own landing time.
Airport_and_weather['ActualLandingTime'] = pd.to_datetime(Airport_and_weather['ActualLandingTime'], errors='coerce')

# A time-based rolling window needs a sorted datetime index.
Airport_and_weather = Airport_and_weather.sort_values(by='ActualLandingTime').set_index('ActualLandingTime')

# "10min" replaces the deprecated "10T" offset alias (same window length).
# count() tallies the non-NaN TaxiTime values in the window, which includes the
# current row itself, hence the "- 1".
Airport_and_weather['NbPlanesLast10Mn'] = Airport_and_weather['TaxiTime'].rolling("10min").count() - 1

# Restore ActualLandingTime as a regular (first) column with a fresh integer index.
Airport_and_weather = Airport_and_weather.reset_index()

In [16]:
# Keep only flights with a taxi time in the half-open range [0, 120) minutes.
Airport_and_weather = Airport_and_weather.loc[
    lambda flights: flights['TaxiTime'].ge(0) & flights['TaxiTime'].lt(120)
]



In [17]:
# Calendar features taken from the landing timestamp:
# Hour is the hour of day, DayOfTheWeek is pandas' dayofweek (Monday = 0).
Airport_and_weather = Airport_and_weather.assign(
    Hour=lambda flights: flights['ActualLandingTime'].dt.hour,
    DayOfTheWeek=lambda flights: flights['ActualLandingTime'].dt.dayofweek,
)

In [18]:
# Preview the first rows only instead of rendering the whole frame.
Airport_and_weather.head(10)

Unnamed: 0,ActualLandingTime,Airline,FlightNumber,AircraftType,ShipmentWeight,Runway,Stand,EstimatedInBlockTime,CalculatedInBlockTime,ActualInBlockTime,...,WT03_max,WT08_max,TMIN_min,IceRoad,WetDay,TaxiTime,ShipmentWeightCat,NbPlanesLast10Mn,Hour,DayOfTheWeek
0,2018-07-30 04:25:00,CAR1,2519,B737/9-WL,3845,RUNWAY02,STAND04,7/30/2018 4:33,7/30/2018 4:33,7/30/2018 4:29,...,1.0,1.0,69.0,0.0,1.0,4.0,M,0.0,4,0
1,2018-07-30 08:14:00,CAR1,1757,B757/2-WL,5655,RUNWAY02,STAND07,7/30/2018 8:19,7/30/2018 8:18,7/30/2018 8:18,...,1.0,1.0,69.0,0.0,1.0,4.0,M,0.0,8,0
2,2018-07-30 08:26:00,CAR1,1636,A321/2,3011,RUNWAY01,STAND08,7/30/2018 8:31,7/30/2018 8:32,7/30/2018 8:29,...,1.0,1.0,69.0,0.0,1.0,3.0,M,0.0,8,0
3,2018-07-30 08:42:00,CAR1,2048,B737/9-WL,3813,RUNWAY02,STAND09,7/30/2018 8:47,7/30/2018 8:45,7/30/2018 8:46,...,1.0,1.0,69.0,0.0,1.0,4.0,M,0.0,8,0
4,2018-07-30 08:52:00,CAR1,2510,A321/2,3048,RUNWAY02,STAND11,7/30/2018 8:57,7/30/2018 8:57,7/30/2018 8:57,...,1.0,1.0,69.0,0.0,1.0,5.0,M,0.0,8,0
5,2018-07-30 09:07:00,CAR1,110,A330/3,3322,RUNWAY01,STAND14,7/30/2018 9:15,7/30/2018 9:16,7/30/2018 9:17,...,1.0,1.0,69.0,0.0,1.0,10.0,M,0.0,9,0
6,2018-07-30 09:10:00,CAR1,423,B737/9-WL,3866,RUNWAY02,STAND15,7/30/2018 9:16,7/30/2018 9:17,7/30/2018 9:16,...,1.0,1.0,69.0,0.0,1.0,6.0,M,1.0,9,0
7,2018-07-30 09:10:00,CAR1,146,B767/3-WL,199,RUNWAY01,STAND16,7/30/2018 9:18,7/30/2018 9:15,7/30/2018 9:17,...,1.0,1.0,69.0,0.0,1.0,7.0,S,2.0,9,0
8,2018-07-30 09:21:00,CAR1,1943,B777/2-LR,7103,RUNWAY01,STAND03,7/30/2018 9:28,7/30/2018 9:27,7/30/2018 9:28,...,1.0,1.0,69.0,0.0,1.0,7.0,L,0.0,9,0
9,2018-07-30 09:27:00,CAR1,2678,B737/9-WL,3842,RUNWAY03,STAND17,7/30/2018 9:35,7/30/2018 9:33,7/30/2018 9:33,...,1.0,1.0,69.0,0.0,1.0,6.0,M,1.0,9,0


In [19]:
# Turn the 'RUNWAYnn' / 'STANDnn' labels into plain numbers and rename the
# columns accordingly. The numeric part is whatever follows the textual prefix,
# so the slice starts at the prefix length. The previous runway slice started
# one character too late, which only worked for single-digit runways
# ('RUNWAY10' was read as 0). Labels that do not yield a number become NaN.
Airport_and_weather = (
    Airport_and_weather
    .assign(
        Runway=lambda flights: pd.to_numeric(
            flights['Runway'].str[len('RUNWAY'):], errors='coerce'
        ),
        Stand=lambda flights: pd.to_numeric(
            flights['Stand'].str[len('STAND'):], errors='coerce'
        ),
    )
    .rename(
        columns={"Runway": "RunwayNumber", "Stand": "StandNumber"},
        errors="raise",
    )
)

In [20]:
# Make FlightNumber numeric (non-numeric values become NaN), then discard the
# two flight numbers 9956 and 1935. Rows with a NaN flight number are kept.
# NOTE(review): the reason for excluding these two flights is not stated here.
Airport_and_weather = Airport_and_weather.assign(
    FlightNumber=lambda flights: pd.to_numeric(flights['FlightNumber'], errors='coerce')
)

Airport_and_weather = Airport_and_weather[
    ~Airport_and_weather['FlightNumber'].isin([9956, 1935])
]

In [21]:
# Drop the columns that are no longer wanted in the modelling table.
Airport_and_weather = Airport_and_weather.drop(
    columns=[
        'FlightNumber',
        'EstimatedInBlockTime',
        'CalculatedInBlockTime',
        'ActualInBlockTime',
        'TMAX_max',
        'TMIN_min',
        'ShipmentWeight',
    ]
)

In [22]:
# LogTaxiTime = natural log of (TaxiTime + 1); the +1 keeps a zero taxi time finite.
Airport_and_weather = Airport_and_weather.assign(
    LogTaxiTime=lambda flights: np.log(flights['TaxiTime'] + 1)
)

In [23]:
# NOTE(review): this reads the *weather* CSV into a variable whose name suggests
# aircraft data, and `Aircraft_info` is not referenced again in the visible
# notebook (later cells use `Aircrafts_info`). Presumably a leftover - confirm,
# then remove it or point it at the intended file.
Aircraft_info = pd.read_csv("weather_data_prep.csv")

In [24]:
# Parse every sheet of the workbook into a dict {sheet name: DataFrame}.
# The context manager closes the workbook handle once all sheets are read.
# NOTE(review): `dfs` is not used by any later cell visible in this notebook.
with pd.ExcelFile("ACchar.xlsx") as xl_file:
    dfs = {sheet_name: xl_file.parse(sheet_name)
           for sheet_name in xl_file.sheet_names}

In [25]:
# Load the semicolon-separated aircraft characteristics file and rename its
# model column to 'AircraftType' (the rename fails loudly if the column is absent).
Aircrafts_info = (
    pd.read_csv(
        "aircraft_simplified_data.csv",
        sep=";",
        header=0,
        encoding='unicode_escape',
    )
    .rename(columns={"Model_Airport_Data": "AircraftType"}, errors="raise")
)

In [26]:
# Add the aircraft characteristics to each flight by aircraft type; flights with
# an unknown type are kept, with missing values in the aircraft columns.
FinalDataset = Airport_and_weather.merge(Aircrafts_info, how='left', on='AircraftType')

In [27]:
# Remove every flight dated 2018-09-26 or 2018-08-01.
# NOTE(review): the reason for excluding these two days is not stated here.
FinalDataset = FinalDataset[
    (FinalDataset['Date'] != "2018-09-26")
    & (FinalDataset['Date'] != "2018-08-01")
]

# Final data preparation 

In [28]:
# Columns renaming: remove spaces and underscores from the feature names.
# Each source name is listed once, and errors="raise" (as used by the other
# renames in this notebook) makes a missing column fail loudly.
# A new frame is returned instead of mutating the filtered one in place.
FinalDataset = FinalDataset.rename(
    columns={
        'Parking Area': 'ParkingArea',
        'WT01_max': 'WT01max',
        'WT02_max': 'WT02max',
        'WT03_max': 'WT03max',
        'WT08_max': 'WT08max',
        'WSF2_max': 'WSF2max',
        'WSF5_max': 'WSF5max',
        'Model_Acchar': 'ModelAcchar',
        'TAVG_mean': 'TAVGmean',
        'AWND_mean': 'AWNDmean',
        'DayOfTheWeek': 'WeekDay',
        'WDF5_mean': 'WDF5mean',
        'PRCP_mean': 'PRCPmean',
    },
    errors="raise",
)

# Dropping Wingspan and Length (described by the authors as correlated variables).
FinalDataset = FinalDataset.drop(columns=["Wingspan", "Length"])

# Converting dates to_datetime; assign() avoids attribute-style column writes.
FinalDataset = FinalDataset.assign(
    ActualLandingTime=pd.to_datetime(FinalDataset["ActualLandingTime"]),
    Date=pd.to_datetime(FinalDataset["Date"]),
)


In [29]:
# Inspect the dtype of every column of the prepared dataset.
FinalDataset.dtypes

ActualLandingTime    datetime64[ns]
Airline                      object
AircraftType                 object
RunwayNumber                float64
StandNumber                   int64
Date                 datetime64[ns]
AWNDmean                    float64
PRCPmean                    float64
TAVGmean                    float64
WDF5mean                    float64
WSF2max                     float64
WSF5max                     float64
WT01max                     float64
WT02max                     float64
WT03max                     float64
WT08max                     float64
IceRoad                     float64
WetDay                      float64
TaxiTime                    float64
ShipmentWeightCat            object
NbPlanesLast10Mn            float64
Hour                          int64
WeekDay                       int64
LogTaxiTime                 float64
ModelAcchar                  object
ParkingArea                 float64
dtype: object

In [30]:

# Rows lacking any of the key fields below are discarded (about 4% of rows
# according to the authors' note); every remaining missing value is set to 0.
FinalDataset = (
    FinalDataset
    .dropna(subset=["ModelAcchar", "RunwayNumber", "StandNumber", "AircraftType", "ParkingArea"])
    .fillna(0)
)

# Move TaxiTime and LogTaxiTime to the end of the frame: pop() removes each
# column and returns it, and re-assigning appends it as the last column.
taxitime = FinalDataset.pop("TaxiTime")
logtaxitime = FinalDataset.pop("LogTaxiTime")

FinalDataset["TaxiTime"] = taxitime
FinalDataset["LogTaxiTime"] = logtaxitime

In [31]:
# Write the prepared dataset to disk with a header row and without the index.
# to_csv returns None when given a path, so the result is not stored, and the
# index flag is passed as the boolean it is documented to be.
FinalDataset.to_csv('DatasetTrainingFinal.csv', index=False, header=True)

In [33]:
# Preview the first rows only instead of rendering the whole frame.
FinalDataset.head(10)

Unnamed: 0,ActualLandingTime,Airline,AircraftType,RunwayNumber,StandNumber,Date,AWNDmean,PRCPmean,TAVGmean,WDF5mean,...,IceRoad,WetDay,ShipmentWeightCat,NbPlanesLast10Mn,Hour,WeekDay,ModelAcchar,ParkingArea,TaxiTime,LogTaxiTime
0,2018-07-30 04:25:00,CAR1,B737/9-WL,2.0,4,2018-07-30,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,0.0,4,0,737-900 with winglets,16223.1,4.0,1.609438
1,2018-07-30 08:14:00,CAR1,B757/2-WL,2.0,7,2018-07-30,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,0.0,8,0,"757-200, -200PF with winglets",20919.9,4.0,1.609438
2,2018-07-30 08:26:00,CAR1,A321/2,1.0,8,2018-07-30,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,0.0,8,0,A321-200,16337.8,3.0,1.386294
3,2018-07-30 08:42:00,CAR1,B737/9-WL,2.0,9,2018-07-30,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,0.0,8,0,737-900 with winglets,16223.1,4.0,1.609438
4,2018-07-30 08:52:00,CAR1,A321/2,2.0,11,2018-07-30,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,0.0,8,0,A321-200,16337.8,5.0,1.791759
5,2018-07-30 09:07:00,CAR1,A330/3,1.0,14,2018-07-30,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,0.0,9,0,A330-300,41344.5,10.0,2.397895
6,2018-07-30 09:10:00,CAR1,B737/9-WL,2.0,15,2018-07-30,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,1.0,9,0,737-900 with winglets,16223.1,6.0,1.945910
7,2018-07-30 09:10:00,CAR1,B767/3-WL,1.0,16,2018-07-30,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,S,2.0,9,0,767-300ER with winglets,30101.8,7.0,2.079442
8,2018-07-30 09:21:00,CAR1,B777/2-LR,1.0,3,2018-07-30,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,L,0.0,9,0,777-200LR,44447.6,7.0,2.079442
9,2018-07-30 09:27:00,CAR1,B737/9-WL,3.0,17,2018-07-30,4.173333,0.006667,79.0,153.333333,...,0.0,1.0,M,1.0,9,0,737-900 with winglets,16223.1,6.0,1.945910
