In [19]:
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq
from math import *
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
### Loading data

df_1990 = pd.read_csv("/Users/youssouf/Downloads/1990.csv")
df_1990.head(2)

df_1993 = pd.read_csv("/Users/youssouf/Downloads/1993.csv")
df_1993.head(2)

df_1994 = pd.read_csv("/Users/youssouf/Downloads/1994.csv")
df_1994.head(2)

df_1998 = pd.read_csv("/Users/youssouf/Downloads/1998.csv")
df_1998.head(2)

df_2007 = pd.read_csv("/Users/youssouf/Downloads/2007.csv")
df_2007.head(2)

df_2008 = pd.read_csv("/Users/youssouf/Downloads/2008.csv")
df_2008.head(2)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,1343.0,1325,1451.0,1435,WN,588,...,4.0,9.0,0,,0,16.0,0.0,0.0,0.0,0.0
1,2008,1,3,4,1125.0,1120,1247.0,1245,WN,1343,...,3.0,8.0,0,,0,,,,,


In [3]:
### Sample of data

list_df = [df_1990, df_1993 ,df_1994, df_1998, df_2007, df_2008]
lst_df_tmp = []

for df in list_df :
    lst_df_tmp.append(df.sample(frac = 0.006))
            
df_final = pd.concat(lst_df_tmp)
df_final.head(2)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
1109802,1990,3,22,4,2120.0,2109,2219.0,2215,DL,1435,...,,,0,,0,,,,,
1809660,1990,5,27,7,1600.0,1600,1700.0,1700,WN,352,...,,,0,,0,,,,,


In [4]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184490 entries, 1109802 to 2085578
Data columns (total 29 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Year               184490 non-null  int64  
 1   Month              184490 non-null  int64  
 2   DayofMonth         184490 non-null  int64  
 3   DayOfWeek          184490 non-null  int64  
 4   DepTime            181235 non-null  float64
 5   CRSDepTime         184490 non-null  int64  
 6   ArrTime            180790 non-null  float64
 7   CRSArrTime         184490 non-null  int64  
 8   UniqueCarrier      184490 non-null  object 
 9   FlightNum          184490 non-null  int64  
 10  TailNum            91106 non-null   object 
 11  ActualElapsedTime  180790 non-null  float64
 12  CRSElapsedTime     184471 non-null  float64
 13  AirTime            88997 non-null   float64
 14  ArrDelay           180790 non-null  float64
 15  DepDelay           181235 non-null  float64


In [28]:
### Creation of the column Delayed
def delayed(x,y) :
    if (x <= 0 and y <= 0) : 
        return "N"
    return "Y"
            
vdelayed = np.vectorize(delayed)   

In [29]:
df_final["Delayed"] = vdelayed(df_final["ArrDelay"], df_final["DepDelay"])

In [8]:
### Label Encoding of categorical variables
df_final['UniqueCarrier_Cat'] = df_final['UniqueCarrier'].astype('category').cat.codes
df_final['Origin_Cat'] = df_final['Origin'].astype('category').cat.codes
df_final['Dest_Cat'] = df_final['Dest'].astype('category').cat.codes
df_final['TailNum_Cat'] = df_final['TailNum'].astype('category').cat.codes
df_final['CancellationCode_Cat'] = df_final['CancellationCode'].astype('category').cat.codes
df_final['Delayed_Cat'] = df_final['Delayed'].astype('category').cat.codes


In [None]:
df_final.head()

In [10]:
df_final.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'Delayed', 'UniqueCarrier_Cat', 'Origin_Cat', 'Dest_Cat', 'TailNum_Cat',
       'CancellationCode_Cat', 'Delayed_Cat'],
      dtype='object')

In [9]:
df_final[["Delayed"]].value_counts()

Delayed
Y          113150
N           71340
dtype: int64

In [15]:
### Data Preparation

df_final = df_final.fillna(0)

data = df_final[['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'FlightNum' ,
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
        'UniqueCarrier_Cat', 'Origin_Cat', 'Dest_Cat', 'TailNum_Cat',
       'CancellationCode_Cat']]

target = df_final[["Delayed_Cat"]]

In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184490 entries, 1109802 to 2085578
Data columns (total 29 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Year                  184490 non-null  int64  
 1   Month                 184490 non-null  int64  
 2   DayofMonth            184490 non-null  int64  
 3   DayOfWeek             184490 non-null  int64  
 4   DepTime               184490 non-null  float64
 5   CRSDepTime            184490 non-null  int64  
 6   ArrTime               184490 non-null  float64
 7   CRSArrTime            184490 non-null  int64  
 8   FlightNum             184490 non-null  int64  
 9   ActualElapsedTime     184490 non-null  float64
 10  CRSElapsedTime        184490 non-null  float64
 11  AirTime               184490 non-null  float64
 12  ArrDelay              184490 non-null  float64
 13  DepDelay              184490 non-null  float64
 14  Distance              184490 non-null  float6

In [22]:

X_train,X_test,y_train,y_test = train_test_split(data,target,test_size=0.75,random_state=0)


In [26]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X_train,y_train)

#
y_pred=logreg.predict(X_test)

  return f(**kwargs)


AttributeError: 'str' object has no attribute 'decode'

In [27]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

NameError: name 'y_pred' is not defined

In [10]:
from pycaret.classification import *


OSError: dlopen(/Users/youssouf/opt/anaconda3/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: /Users/youssouf/opt/anaconda3/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so
  Reason: image not found

In [9]:
#### 
exp_name = setup(data = df_final,  target = 'Purchase', 
                 categorical_features = ["UniqueCarrier", "Origin", "Dest", "TailNum", "CancellationCode"],
                train_size = 0.75, n_jobs = 2)



NameError: name 'setup' is not defined

In [None]:
best_model = compare_models()

In [None]:
lr = create_model('lr')