In [1]:
import pandas as pd
import datetime
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [3]:
df = pd.read_csv('../../raw_data/flights.csv', nrows=500000)

In [4]:
df.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [5]:
df['ORIGIN_AIRPORT'].duplicated().sum()

499687

In [6]:
df['ORIGIN_AIRPORT'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 500000 entries, 0 to 499999
Series name: ORIGIN_AIRPORT
Non-Null Count   Dtype 
--------------   ----- 
500000 non-null  object
dtypes: object(1)
memory usage: 3.8+ MB


In [7]:
df['ORIGIN_AIRPORT'].value_counts

<bound method IndexOpsMixin.value_counts of 0         ANC
1         LAX
2         SFO
3         LAX
4         SEA
         ... 
499995    DAL
499996    MDW
499997    MDW
499998    MDW
499999    MSP
Name: ORIGIN_AIRPORT, Length: 500000, dtype: object>

In [8]:
df['ORIGIN_AIRPORT'].value_counts()

ORIGIN_AIRPORT
ATL    31332
ORD    24967
DFW    24651
LAX    18457
DEN    18142
       ...  
OTH       19
UST       17
PPG       11
ADK       10
BGM        2
Name: count, Length: 313, dtype: int64

In [9]:
#cambiar formato a datetime 
df['DATE'] = pd.to_datetime(df[['YEAR','MONTH', 'DAY']])

In [10]:
df['DATE']

0        2015-01-01
1        2015-01-01
2        2015-01-01
3        2015-01-01
4        2015-01-01
            ...    
499995   2015-02-03
499996   2015-02-03
499997   2015-02-03
499998   2015-02-03
499999   2015-02-03
Name: DATE, Length: 500000, dtype: datetime64[ns]

In [15]:
def format_heure(chaine):
    if pd.isnull(chaine):
        return np.nan
    else:
        if chaine == 2400: chaine = 0
        chaine = "{0:04d}".format(int(chaine))
        heure = datetime.time(int(chaine[0:2]), int(chaine[2:4]))
        return int(heure)

In [16]:
def combine_date_heure(x):
    if pd.isnull(x[0]) or pd.isnull(x[1]):
        return np.nan
    else:
        return datetime.datetime.combine(x[0],x[1])

In [17]:
def create_flight_time(df, col):    
    liste = []
    for index, cols in df[['DATE', col]].iterrows():    
        if pd.isnull(cols[1]):
            liste.append(np.nan)
        elif float(cols[1]) == 2400:
            cols[0] += datetime.timedelta(days=1)
            cols[1] = datetime.time(0,0)
            liste.append(combine_date_heure(cols))
        else:
            cols[1] = format_heure(cols[1])
            liste.append(combine_date_heure(cols))
    return pd.Series(liste)

In [18]:
df['SCHEDULED_DEPARTURE'] = create_flight_time(df, 'SCHEDULED_DEPARTURE')
df['DEPARTURE_TIME'] = df['DEPARTURE_TIME'].apply(format_heure)
df['SCHEDULED_ARRIVAL'] = df['SCHEDULED_ARRIVAL'].apply(format_heure)
df['ARRIVAL_TIME'] = df['ARRIVAL_TIME'].apply(format_heure)
#__________________________________________________________________________
df.loc[:5, ['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DEPARTURE_TIME',
             'ARRIVAL_TIME', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY']]

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'datetime.time'

In [13]:
variables_to_remove = ['YEAR','FLIGHT_NUMBER','TAXI_OUT', 'WHEELS_ON', 'WHEELS_OFF', 'YEAR', 'AIRLINE_DELAY', 
                       'DATE', 'AIR_SYSTEM_DELAY','SCHEDULED_DEPARTURE','DEPARTURE_TIME','DEPARTURE_DELAY',
                       'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY','TAXI_IN','ARRIVAL_TIME','ARRIVAL_DELAY','DIVERTED',
                       'WEATHER_DELAY', 'CANCELLATION_REASON', 'TAIL_NUMBER', 'AIR_TIME','ELAPSED_TIME', 'AIR_TIME' ]

In [14]:
df.drop(variables_to_remove, axis = 1, inplace = True)

In [15]:
df_2 = df[['MONTH','DAY', 'DAY_OF_WEEK', 'CANCELLED', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
        'SCHEDULED_TIME','DISTANCE','SCHEDULED_ARRIVAL',
        'CANCELLED']]

In [16]:
df_2.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,CANCELLED,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,CANCELLED.1
0,1,1,4,0,AS,ANC,SEA,205.0,1448,04:30:00,0
1,1,1,4,0,AA,LAX,PBI,280.0,2330,07:50:00,0
2,1,1,4,0,US,SFO,CLT,286.0,2296,08:06:00,0
3,1,1,4,0,AA,LAX,MIA,285.0,2342,08:05:00,0
4,1,1,4,0,AS,SEA,ANC,235.0,1448,03:20:00,0


In [17]:
df_2['AIRLINE'].isnull().sum()

0

In [18]:
df_2['AIRLINE'].unique()

array(['AS', 'AA', 'US', 'DL', 'NK', 'UA', 'HA', 'B6', 'OO', 'EV', 'MQ',
       'F9', 'WN', 'VX'], dtype=object)

In [19]:
# Instantiate the OneHotEncoder
ohe = OneHotEncoder(sparse = False) 

# Fit encoder
ohe.fit(df_2[['AIRLINE']]) 




In [21]:
print(f"The column names for the encoded values are {ohe.get_feature_names_out()}")

# Transform the current "Street" column
ohe_t= ohe.transform(df_2[['AIRLINE']])

df_2[ohe.get_feature_names_out()]

# Drop the column "Street" which has been encoded
#df_2.drop(columns = ["AIRLINE"], inplace = True)

# Show the dataset
# df_2.head(3)

The column names for the encoded values are ['AIRLINE_AA' 'AIRLINE_AS' 'AIRLINE_B6' 'AIRLINE_DL' 'AIRLINE_EV'
 'AIRLINE_F9' 'AIRLINE_HA' 'AIRLINE_MQ' 'AIRLINE_NK' 'AIRLINE_OO'
 'AIRLINE_UA' 'AIRLINE_US' 'AIRLINE_VX' 'AIRLINE_WN']


Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,CANCELLED,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,CANCELLED.1
0,1,1,4,0,AS,ANC,SEA,205.0,1448,04:30:00,0
1,1,1,4,0,AA,LAX,PBI,280.0,2330,07:50:00,0
2,1,1,4,0,US,SFO,CLT,286.0,2296,08:06:00,0


In [22]:
df_2[ohe.get_feature_names_out()]=ohe_t
df_2

NotImplementedError: 

In [None]:
ohe.transform(df_2[['AIRLINE']]).shape

In [None]:
ohe.get_feature_names_out()

In [None]:
ohe.fit(df[['AIRLINE']]) 

In [None]:
print(f"The column names for the encoded values are {ohe.get_feature_names_out()}")

In [None]:
# Transform the current "AIRLINE" column
df[ohe.get_feature_names_out()] = ohe.transform(df[['AIRLINE']])

In [None]:
df_airlines = pd.read_csv('../../raw_data/airlines.csv')a

In [None]:
df_airlines

In [None]:
print('df_airlines_dimensiones:', df_airlines.shape)

In [None]:
df_airports = pd.read_csv('../../raw_data/airports.csv')

In [None]:
df_airports

In [None]:
df_city_attributes = pd.read_csv('../../raw_data/city_attributes.csv')

In [None]:
df_city_attributes

In [None]:
df_humidity =pd.read_csv('../../raw_data/humidity.csv')

In [None]:
df_humidity

In [None]:
df_pressure =pd.read_csv('../../raw_data/pressure.csv')

In [None]:
df_pressure

In [None]:
df_temperature =pd.read_csv('../../raw_data/temperature.csv')

In [None]:
df_temperature

In [None]:
df_weather_description =pd.read_csv('../../raw_data/weather_description.csv')

In [None]:
df_weather_description

In [None]:
df_wind_direction =pd.read_csv('../../raw_data/wind_direction.csv')

In [None]:
df_wind_direction

In [None]:
df_wind_speed =pd.read_csv('../../raw_data/wind_speed.csv')

In [None]:
df_wind_speed