In [1]:
# loading libraries

import os
import folium
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

In [41]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load a temporary lookup DataFrame (airports that changed their ICAO code have one row per code) used to add the cluster column to the map DataFrame and to the flights DataFrames

In [2]:
# Airport lookup table (ICAO code, coordinates, city, state, cluster) read from the temp CSV.
temp = pd.read_csv("./assets/csv/temp/temp_icao_cluster.csv")

In [3]:
temp

Unnamed: 0,icao_code,lat_dd,long_dd,city_primary,state,cluster
0,SBGR,-23.435556,-46.473056,Guarulhos,SP,1
1,SBSP,-23.626111,-46.656389,São paulo,SP,2
2,SBBR,-15.871111,-47.918611,Brasília,DF,1
3,SBKP,-23.006944,-47.134444,Campinas,SP,3
4,SBGL,-22.810000,-43.250556,Rio de janeiro,RJ,2
...,...,...,...,...,...,...
237,SSLT,-29.812500,-55.893333,Alegrete,RS,8
238,SILC,-13.037778,-55.950278,Lucas do rio verde,MT,8
239,SIZX,-11.296667,-57.548889,Juara,MT,8
240,SSOE,-26.781111,-53.503333,São miguel do oeste,SC,8


# Cleaning *'clusters' and 'flights'* files from 2002 to 2011

In [None]:
# 2002 clusters – no airport appears under more than one ICAO code

In [148]:
# Per-airport counts for 2002 (columns: icao_code, count).
clusters_2002 = pd.read_csv("./assets/csv/count/df2002.csv")

In [149]:
clusters_2002

Unnamed: 0,icao_code,count
0,SBBR,7972
1,SBSP,7647
2,SBGR,6722
3,SBSV,5727
4,SBGL,5189
...,...,...
63,SBKG,63
64,SNBR,40
65,SBTT,34
66,SNDM,31


In [150]:
# Left-join the airport lookup onto the 2002 counts so each ICAO code gains its
# coordinates, city, state and cluster.
clusters_2002 = pd.merge(clusters_2002, temp, on="icao_code", how="left")

In [151]:
clusters_2002

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBBR,7972,-15.871111,-47.918611,Brasília,DF,1
1,SBSP,7647,-23.626111,-46.656389,São paulo,SP,2
2,SBGR,6722,-23.435556,-46.473056,Guarulhos,SP,1
3,SBSV,5727,-12.908611,-38.322500,Salvador,BA,2
4,SBGL,5189,-22.810000,-43.250556,Rio de janeiro,RJ,2
...,...,...,...,...,...,...,...
63,SBKG,63,-7.269167,-35.895000,Campina grande,PB,7
64,SNBR,40,-12.079167,-45.009444,Barreiras,BA,8
65,SBTT,34,-4.250556,-69.937778,Tabatinga,AM,8
66,SNDM,31,-12.482222,-41.276944,Lençóis,BA,8


In [152]:
clusters_2002.city_primary.duplicated().any()

True

In [153]:
clusters_2002[clusters_2002.city_primary.duplicated()]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
11,SBRJ,2493,-22.91,-43.1625,Rio de janeiro,RJ,3


In [154]:
# city of Rio de Janeiro has two airports

clusters_2002[clusters_2002['city_primary'] == 'Rio de janeiro']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
4,SBGL,5189,-22.81,-43.250556,Rio de janeiro,RJ,2
11,SBRJ,2493,-22.91,-43.1625,Rio de janeiro,RJ,3


In [157]:
# flights 2002

In [220]:
# Load the 2002 flights; the first CSV column becomes the row index and the leftover
# "index" column is discarded.
flights_2002 = pd.read_csv("./assets/csv/temp/flights_2002.csv", index_col=0).drop(
    columns=["index"]
)

In [221]:
len(flights_2002)

89667

In [222]:
len(flights_2002[flights_2002.icao_origin == flights_2002.icao_dest])

5

In [223]:
# Keep only flights whose origin and destination airports differ.
flights_2002 = flights_2002[flights_2002.icao_origin != flights_2002.icao_dest]

In [224]:
# Renumber the rows 0..n-1 after the filter above removed some of them.
flights_2002 = flights_2002.reset_index(drop=True)

In [225]:
len(flights_2002)

89662

In [226]:
flights_2002.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata
0,GLO,G3,1720,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2002-09-02,17:46:00,2002-09-02,2002,9,2,1,SETEMBRO,SBSP,CGH,Congonhas,São Paulo,SP,SUDESTE,18:41:00,2002-09-02,2002,9,2,B737,BOEING 737-700 (WINGLETS) PAX,2225.0,15947,331.0,0.92,360.18,144,136,0,598,800,0,0,0,11598,0.0,0,0,0,197938,264800,47664,45016,5278,5278,1720_2,SBCT-SBSP,CWB-CGH
1,GLO,G3,1735,4,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2002-09-10,,,0,0,0,0,SETEMBRO,SBMH,,Maringá,Maringá,PR,SUL,,,0,0,0,B737,BOEING 737-700 (WINGLETS) PAX,2386.0,15947,367.0,1.0,1.0,144,57,0,258,0,10,0,0,4543,0.0,3670,0,0,94686,0,52848,20919,5852,5852,1735_4,SBCT-SBMH,


In [227]:
len(flights_2002[flights_2002.sched_date.isnull()])

0

In [228]:
len(flights_2002[flights_2002.dep_date.isnull()])

4545

In [229]:
# Split the sched_date text (YYYY-MM-DD) into year, month and day string columns,
# inserted at column positions 11-13, i.e. immediately after sched_date.
flights_2002.insert(11, "sched_year", flights_2002["sched_date"].str.slice(0, 4))
flights_2002.insert(12, "sched_month", flights_2002["sched_date"].str.slice(5, 7))
flights_2002.insert(13, "sched_day", flights_2002["sched_date"].str.slice(8, 10))

In [230]:
flights_2002.dep_month.unique()

array([ 9,  0, 10, 11, 12,  1])

In [231]:
len(flights_2002[flights_2002.dep_month == 1])

10

In [232]:
# creating cluster column for origin airport

In [233]:
# Look up each flight's origin airport in `temp` (left join, so every flight is kept)
# and discard the lookup columns that are not needed on the flights table.
flights_2002 = pd.merge(
    flights_2002,
    temp,
    left_on="icao_origin",
    right_on="icao_code",
    how="left",
).drop(["icao_code", "city_primary", "state"], axis=1)

In [234]:
len(flights_2002[flights_2002.cluster.isnull()])

0

In [235]:
# Tag the columns brought in by the lookup merge as belonging to the origin airport.
flights_2002 = flights_2002.rename(
    columns={
        "lat_dd": "lat_dd_origin",
        "long_dd": "long_dd_origin",
        "cluster": "cluster_origin",
    }
)

In [236]:
sorted(flights_2002.cluster_origin.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [237]:
# creating cluster column for destination airport

In [238]:
# Look up each flight's destination airport in `temp` (left join, so every flight is
# kept) and discard the lookup columns that are not needed on the flights table.
flights_2002 = pd.merge(
    flights_2002,
    temp,
    left_on="icao_dest",
    right_on="icao_code",
    how="left",
).drop(["icao_code", "city_primary", "state"], axis=1)

In [239]:
len(flights_2002[flights_2002.cluster.isnull()])

0

In [240]:
# Tag the columns brought in by the lookup merge as belonging to the destination airport.
flights_2002 = flights_2002.rename(
    columns={
        "lat_dd": "lat_dd_dest",
        "long_dd": "long_dd_dest",
        "cluster": "cluster_dest",
    }
)

In [241]:
sorted(flights_2002.cluster_dest.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [242]:
flights_2002.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,sched_year,sched_month,sched_day,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata,lat_dd_origin,long_dd_origin,cluster_origin,lat_dd_dest,long_dd_dest,cluster_dest
0,GLO,G3,1720,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2002-09-02,2002,9,2,17:46:00,2002-09-02,2002,9,2,1,SETEMBRO,SBSP,CGH,Congonhas,São Paulo,SP,SUDESTE,18:41:00,2002-09-02,2002,9,2,B737,BOEING 737-700 (WINGLETS) PAX,2225.0,15947,331.0,0.92,360.18,144,136,0,598,800,0,0,0,11598,0.0,0,0,0,197938,264800,47664,45016,5278,5278,1720_2,SBCT-SBSP,CWB-CGH,-25.531667,-49.176111,4,-23.626111,-46.656389,2
1,GLO,G3,1735,4,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2002-09-10,2002,9,10,,,0,0,0,0,SETEMBRO,SBMH,,Maringá,Maringá,PR,SUL,,,0,0,0,B737,BOEING 737-700 (WINGLETS) PAX,2386.0,15947,367.0,1.0,1.0,144,57,0,258,0,10,0,0,4543,0.0,3670,0,0,94686,0,52848,20919,5852,5852,1735_4,SBCT-SBMH,,-25.531667,-49.176111,4,-23.479444,-52.012222,6


In [182]:
# 2003 clusters – Maringá appears under two ICAO codes (SBMG and SBMH)

In [183]:
clusters_2003 = pd.read_csv('./assets/csv/count/df2003.csv', sep=',')

In [184]:
clusters_2003

Unnamed: 0,icao_code,count
0,SBSP,26994
1,SBBR,23051
2,SBSV,18994
3,SBGL,15696
4,SBGR,15555
...,...,...
63,SBKG,107
64,SBJU,87
65,SBSJ,72
66,SNDM,51


In [185]:
clusters_2003 = clusters_2003.merge(temp, how='left', on="icao_code")

In [186]:
clusters_2003

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBSP,26994,-23.626111,-46.656389,São paulo,SP,2
1,SBBR,23051,-15.871111,-47.918611,Brasília,DF,1
2,SBSV,18994,-12.908611,-38.322500,Salvador,BA,2
3,SBGL,15696,-22.810000,-43.250556,Rio de janeiro,RJ,2
4,SBGR,15555,-23.435556,-46.473056,Guarulhos,SP,1
...,...,...,...,...,...,...,...
63,SBKG,107,-7.269167,-35.895000,Campina grande,PB,7
64,SBJU,87,-7.219167,-39.269444,Juazeiro do norte,CE,6
65,SBSJ,72,-23.228889,-45.871111,São josé dos campos,SP,8
66,SNDM,51,-12.482222,-41.276944,Lençóis,BA,8


In [187]:
clusters_2003.city_primary.duplicated().any()

True

In [188]:
clusters_2003[clusters_2003.city_primary.duplicated()]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
7,SBRJ,11682,-22.91,-43.1625,Rio de janeiro,RJ,3
41,SBMH,570,-23.479444,-52.012222,Maringá,PR,6


In [189]:
clusters_2003[clusters_2003['city_primary'] == 'Rio de janeiro']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
3,SBGL,15696,-22.81,-43.250556,Rio de janeiro,RJ,2
7,SBRJ,11682,-22.91,-43.1625,Rio de janeiro,RJ,3


In [190]:
clusters_2003[clusters_2003['city_primary'] == 'Maringá']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
39,SBMG,656,-23.479444,-52.012222,Maringá,PR,6
41,SBMH,570,-23.479444,-52.012222,Maringá,PR,6


In [191]:
# Row 39 is Maringá under SBMG (656); add the 570 recorded under its other code,
# SBMH (row 41), so the airport carries a single combined count.
clusters_2003.loc[39, "count"] = 656 + 570

In [192]:
# Remove the SBMH row (Maringá's second ICAO code) from the 2003 table.
# The row is selected by ICAO code rather than by the hard-coded label 41, so the cell
# no longer depends on the CSV's row order and re-running it is a no-op instead of a
# KeyError.
clusters_2003.drop(clusters_2003.index[clusters_2003.icao_code == "SBMH"], inplace=True)

In [193]:
clusters_2003

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBSP,26994,-23.626111,-46.656389,São paulo,SP,2
1,SBBR,23051,-15.871111,-47.918611,Brasília,DF,1
2,SBSV,18994,-12.908611,-38.322500,Salvador,BA,2
3,SBGL,15696,-22.810000,-43.250556,Rio de janeiro,RJ,2
4,SBGR,15555,-23.435556,-46.473056,Guarulhos,SP,1
...,...,...,...,...,...,...,...
63,SBKG,107,-7.269167,-35.895000,Campina grande,PB,7
64,SBJU,87,-7.219167,-39.269444,Juazeiro do norte,CE,6
65,SBSJ,72,-23.228889,-45.871111,São josé dos campos,SP,8
66,SNDM,51,-12.482222,-41.276944,Lençóis,BA,8


In [194]:
# Persist the 2003 airport table (the "map" file, going by its name).
# Rows are renumbered on the way out: the SBMH row was dropped without resetting the
# index, so the row labels have a gap that would otherwise be written into the CSV's
# index column. `clusters_2003` itself is left untouched.
clusters_2003.reset_index(drop=True).to_csv("./assets/csv/cluster/cluster_2003_map.csv", sep=",")

In [None]:
# flights 2003

In [250]:
flights_2003 = pd.read_csv("./assets/csv/temp/flights_2003.csv", sep=",", index_col=0).drop(columns="index")

In [251]:
len(flights_2003)

247128

In [252]:
len(flights_2003[flights_2003.icao_origin == flights_2003.icao_dest])

17

In [253]:
flights_2003 = flights_2003[~(flights_2003.icao_origin == flights_2003.icao_dest)]

In [254]:
flights_2003.reset_index(drop=True, inplace=True)

In [255]:
len(flights_2003)

247111

In [256]:
flights_2003.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata
0,GLO,G3,1762,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2003-01-02,16:34:00,2003-01-01,2003,1,1,3,JANEIRO,SBSP,CGH,Congonhas,São Paulo,SP,SUDESTE,17:16:00,2003-01-01,2003,1,1,B737,BOEING 737-700 (WINGLETS) PAX,2266.0,15947,331.0,0.7,473.38,144,120,1,1311,0,15,0,0,10401,331.0,4965,0,0,433941,0,47664,39720,5278,5278,1762_2,SBCT-SBSP,CWB-CGH
1,GLO,G3,1720,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2003-01-04,07:40:00,2003-01-03,2003,1,3,5,JANEIRO,SBSP,CGH,Congonhas,São Paulo,SP,SUDESTE,08:16:00,2003-01-03,2003,1,3,B738,BOEING 737-800 (WINGLETS) PAX,2152.0,19800,331.0,0.6,552.28,177,79,0,1051,0,1287,0,0,8263,0.0,425997,0,0,347881,0,58587,26149,6553,6553,1720_2,SBCT-SBSP,CWB-CGH


In [257]:
# Maringá appears under two ICAO codes (SBMH and SBMG); use SBMG throughout.
# Both endpoint columns are rewritten: remapping only icao_origin would leave SBMH in
# icao_dest, so any per-airport tally over destinations would still split Maringá.
# The lookup table lists both codes with the same coordinates and cluster, so the
# cluster merges below are unaffected.
# NOTE(review): route_icao is not rebuilt here and may still contain SBMH; a
# SBMG<->SBMH leg, if one exists, would now have identical endpoints — confirm.
flights_2003[["icao_origin", "icao_dest"]] = flights_2003[
    ["icao_origin", "icao_dest"]
].replace("SBMH", "SBMG")

In [258]:
len(flights_2003[flights_2003.sched_date.isnull()])

0

In [259]:
len(flights_2003[flights_2003.dep_date.isnull()])

0

In [260]:
# Split the sched_date text (YYYY-MM-DD) into year, month and day string columns,
# inserted at column positions 11-13, i.e. immediately after sched_date.
flights_2003.insert(11, "sched_year", flights_2003["sched_date"].str.slice(0, 4))
flights_2003.insert(12, "sched_month", flights_2003["sched_date"].str.slice(5, 7))
flights_2003.insert(13, "sched_day", flights_2003["sched_date"].str.slice(8, 10))

In [261]:
sorted(flights_2003.dep_month.unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [263]:
# creating cluster column for origin airport

In [264]:
flights_2003 = flights_2003.merge(
    temp, how="left", left_on="icao_origin", right_on="icao_code"
).drop(columns=["icao_code", "city_primary", "state"])

In [265]:
len(flights_2003[flights_2003.cluster.isnull()])

0

In [266]:
flights_2003 = flights_2003.rename(
    {
        "lat_dd": "lat_dd_origin",
        "long_dd": "long_dd_origin",
        "cluster": "cluster_origin"
    },
    axis=1,
)

In [267]:
sorted(flights_2003.cluster_origin.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [268]:
# creating cluster column for destination airport

In [269]:
flights_2003 = flights_2003.merge(
    temp, how="left", left_on="icao_dest", right_on="icao_code"
).drop(columns=["icao_code", "city_primary", "state"])

In [270]:
len(flights_2003[flights_2003.cluster.isnull()])

0

In [271]:
flights_2003 = flights_2003.rename(
    {
        "lat_dd": "lat_dd_dest",
        "long_dd": "long_dd_dest",
        "cluster": "cluster_dest"
    },
    axis=1,
)

In [272]:
sorted(flights_2003.cluster_dest.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [273]:
flights_2003.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,sched_year,sched_month,sched_day,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata,lat_dd_origin,long_dd_origin,cluster_origin,lat_dd_dest,long_dd_dest,cluster_dest
0,GLO,G3,1762,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2003-01-02,2003,1,2,16:34:00,2003-01-01,2003,1,1,3,JANEIRO,SBSP,CGH,Congonhas,São Paulo,SP,SUDESTE,17:16:00,2003-01-01,2003,1,1,B737,BOEING 737-700 (WINGLETS) PAX,2266.0,15947,331.0,0.7,473.38,144,120,1,1311,0,15,0,0,10401,331.0,4965,0,0,433941,0,47664,39720,5278,5278,1762_2,SBCT-SBSP,CWB-CGH,-25.531667,-49.176111,4,-23.626111,-46.656389,2
1,GLO,G3,1720,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2003-01-04,2003,1,4,07:40:00,2003-01-03,2003,1,3,5,JANEIRO,SBSP,CGH,Congonhas,São Paulo,SP,SUDESTE,08:16:00,2003-01-03,2003,1,3,B738,BOEING 737-800 (WINGLETS) PAX,2152.0,19800,331.0,0.6,552.28,177,79,0,1051,0,1287,0,0,8263,0.0,425997,0,0,347881,0,58587,26149,6553,6553,1720_2,SBCT-SBSP,CWB-CGH,-25.531667,-49.176111,4,-23.626111,-46.656389,2


In [275]:
# 2004 clusters – Palmas (SBPJ/SBPM) and Maringá (SBMG/SBMH) each appear under two ICAO codes

In [276]:
clusters_2004 = pd.read_csv('./assets/csv/count/df2004.csv', sep=',')

In [277]:
clusters_2004

Unnamed: 0,icao_code,count
0,SBSP,28609
1,SBBR,22537
2,SBGL,19020
3,SBSV,18909
4,SBGR,16286
...,...,...
58,SBPM,87
59,SNDM,42
60,SBMN,5
61,SBMM,3


In [278]:
clusters_2004 = clusters_2004.merge(temp, how='left', on="icao_code")

In [279]:
clusters_2004

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBSP,28609,-23.626111,-46.656389,São paulo,SP,2.0
1,SBBR,22537,-15.871111,-47.918611,Brasília,DF,1.0
2,SBGL,19020,-22.810000,-43.250556,Rio de janeiro,RJ,2.0
3,SBSV,18909,-12.908611,-38.322500,Salvador,BA,2.0
4,SBGR,16286,-23.435556,-46.473056,Guarulhos,SP,1.0
...,...,...,...,...,...,...,...
58,SBPM,87,-10.290000,-48.357778,Palmas,TO,6.0
59,SNDM,42,-12.482222,-41.276944,Lençóis,BA,8.0
60,SBMN,5,,,,,
61,SBMM,3,,,,,


In [280]:
clusters_2004.city_primary.duplicated().any()

True

In [281]:
clusters_2004[clusters_2004.city_primary.duplicated()]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
8,SBRJ,9712,-22.91,-43.1625,Rio de janeiro,RJ,3.0
41,SBMH,468,-23.479444,-52.012222,Maringá,PR,6.0
58,SBPM,87,-10.29,-48.357778,Palmas,TO,6.0
61,SBMM,3,,,,,


In [282]:
clusters_2004[clusters_2004['city_primary'] == 'Rio de janeiro']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
2,SBGL,19020,-22.81,-43.250556,Rio de janeiro,RJ,2.0
8,SBRJ,9712,-22.91,-43.1625,Rio de janeiro,RJ,3.0


In [283]:
clusters_2004[clusters_2004['city_primary'] == 'Palmas']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
51,SBPJ,280,-10.29,-48.357778,Palmas,TO,6.0
58,SBPM,87,-10.29,-48.357778,Palmas,TO,6.0


In [284]:
# Row 51 is Palmas under SBPJ (280); add the 87 recorded under its other code,
# SBPM (row 58), so the airport carries a single combined count.
clusters_2004.loc[51, "count"] = 280 + 87

In [285]:
# Remove the SBPM row (Palmas' second ICAO code) from the 2004 table.
# The row is selected by ICAO code rather than by the hard-coded label 58, so the cell
# no longer depends on the CSV's row order and re-running it is a no-op instead of a
# KeyError.
clusters_2004.drop(clusters_2004.index[clusters_2004.icao_code == "SBPM"], inplace=True)

In [286]:
clusters_2004[clusters_2004['city_primary'] == 'Maringá']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
37,SBMG,663,-23.479444,-52.012222,Maringá,PR,6.0
41,SBMH,468,-23.479444,-52.012222,Maringá,PR,6.0


In [287]:
# Row 37 is Maringá under SBMG (663); add the 468 recorded under its other code,
# SBMH (row 41), so the airport carries a single combined count.
clusters_2004.loc[37, "count"] = 663 + 468

In [288]:
# Remove the SBMH row (Maringá's second ICAO code) from the 2004 table.
# The row is selected by ICAO code rather than by the hard-coded label 41, so the cell
# no longer depends on the CSV's row order and re-running it is a no-op instead of a
# KeyError.
clusters_2004.drop(clusters_2004.index[clusters_2004.icao_code == "SBMH"], inplace=True)

In [289]:
clusters_2004[clusters_2004['city_primary'].isnull()]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
60,SBMN,5,,,,,
61,SBMM,3,,,,,


In [290]:
# Drop the airports that found no match in `temp` (SBMN and SBMM, shown above): their
# coordinates, city, state and cluster are all NaN after the left merge.
# Note that this removes rows with a NaN in *any* column, not only those two.
clusters_2004.dropna(inplace=True)

In [291]:
# The unmatched rows turned `cluster` into floats; with them gone, store it as int again.
clusters_2004 = clusters_2004.astype({"cluster": int})

In [292]:
# Renumber the rows 0..n-1 (the drops above left gaps in the labels), then display.
clusters_2004 = clusters_2004.reset_index(drop=True)
clusters_2004

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBSP,28609,-23.626111,-46.656389,São paulo,SP,2
1,SBBR,22537,-15.871111,-47.918611,Brasília,DF,1
2,SBGL,19020,-22.81,-43.250556,Rio de janeiro,RJ,2
3,SBSV,18909,-12.908611,-38.3225,Salvador,BA,2
4,SBGR,16286,-23.435556,-46.473056,Guarulhos,SP,1
5,SBCT,13579,-25.531667,-49.176111,São José dos Pinhais,PR,4
6,SBRF,12064,-8.126389,-34.922778,Recife,PE,2
7,SBPA,10840,-29.994722,-51.171111,Porto alegre,RS,3
8,SBRJ,9712,-22.91,-43.1625,Rio de janeiro,RJ,3
9,SBBH,9537,-19.851944,-43.950556,Belo horizonte,MG,6


In [294]:
# flights 2004

In [317]:
flights_2004 = pd.read_csv("./assets/csv/temp/flights_2004.csv", sep=",", index_col=0).drop(columns="index")

In [318]:
len(flights_2004)

238520

In [319]:
len(flights_2004[flights_2004.icao_origin == flights_2004.icao_dest])

4

In [320]:
flights_2004 = flights_2004[~(flights_2004.icao_origin == flights_2004.icao_dest)]

In [321]:
flights_2004.reset_index(drop=True, inplace=True)

In [322]:
len(flights_2004)

238516

In [323]:
flights_2004.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata
0,VSP,VP,4375,4,SBBR,BSB,Presidente Juscelino Kubitschek,Brasília,DF,3,2004-09-22,08:15:00,2004-09-22,2004,9,22,3,SETEMBRO,SBGO,GYN,Santa Genoveva/Goiânia,Goiânia,GO,CENTRO-OESTE,08:56:00,2004-09-22,2004,9,22,B732,BOEING 737-200,833.0,10933,163.0,0.68,239.27,107,25,0,355,0,193,0,0,2423,0.0,31459,0,0,57865,0,17441,4075,1782,1782,4375_4,SBBR-SBGO,BSB-GYN
1,VSP,VP,4375,3,SBPV,PVH,Governador Jorge Teixeira De Oliveira,Porto Velho,RO,1,2004-09-22,04:04:00,2004-09-22,2004,9,22,3,SETEMBRO,SBBR,BSB,Presidente Juscelino Kubitschek,Brasília,DF,CENTRO-OESTE,06:58:00,2004-09-22,2004,9,22,B732,BOEING 737-200,9692.0,10448,1909.0,2.9,658.13,107,44,0,582,0,111,0,0,3993,0.0,211899,0,0,1111040,0,204263,83996,19945,19945,4375_3,SBPV-SBBR,PVH-BSB


In [324]:
# Palmas appears under two ICAO codes (SBPM and SBPJ); use SBPJ throughout.
# Both endpoint columns are rewritten: remapping only icao_origin would leave SBPM in
# icao_dest, so any per-airport tally over destinations would still split Palmas.
# NOTE(review): route_icao is not rebuilt here and may still contain SBPM; a
# SBPJ<->SBPM leg, if one exists, would now have identical endpoints — confirm.
flights_2004[["icao_origin", "icao_dest"]] = flights_2004[
    ["icao_origin", "icao_dest"]
].replace("SBPM", "SBPJ")

In [325]:
# Maringá appears under two ICAO codes (SBMH and SBMG); use SBMG throughout.
# Both endpoint columns are rewritten: remapping only icao_origin would leave SBMH in
# icao_dest, so any per-airport tally over destinations would still split Maringá.
# NOTE(review): route_icao is not rebuilt here and may still contain SBMH; a
# SBMG<->SBMH leg, if one exists, would now have identical endpoints — confirm.
flights_2004[["icao_origin", "icao_dest"]] = flights_2004[
    ["icao_origin", "icao_dest"]
].replace("SBMH", "SBMG")

In [326]:
len(flights_2004[flights_2004.sched_date.isnull()])

0

In [327]:
len(flights_2004[flights_2004.dep_date.isnull()])

0

In [328]:
# Split the sched_date text (YYYY-MM-DD) into year, month and day string columns,
# inserted at column positions 11-13, i.e. immediately after sched_date.
flights_2004.insert(11, "sched_year", flights_2004["sched_date"].str.slice(0, 4))
flights_2004.insert(12, "sched_month", flights_2004["sched_date"].str.slice(5, 7))
flights_2004.insert(13, "sched_day", flights_2004["sched_date"].str.slice(8, 10))

In [329]:
sorted(flights_2004.dep_month.unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [330]:
# creating cluster column for origin airport

In [331]:
flights_2004 = flights_2004.merge(
    temp, how="left", left_on="icao_origin", right_on="icao_code"
).drop(columns=["icao_code", "city_primary", "state"])

In [332]:
len(flights_2004[flights_2004.cluster.isnull()])

8

In [322]:
len(flights_2004)

238516

In [333]:
# Discard the flights whose origin airport had no match in the lookup table
# (their `cluster` is NaN after the left merge).
flights_2004 = flights_2004[flights_2004.cluster.notna()]

In [334]:
flights_2004.reset_index(drop=True, inplace=True)

In [335]:
len(flights_2004)

238508

In [336]:
flights_2004 = flights_2004.rename(
    {
        "lat_dd": "lat_dd_origin",
        "long_dd": "long_dd_origin",
        "cluster": "cluster_origin"
    },
    axis=1,
)

In [337]:
sorted(flights_2004.cluster_origin.unique())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]

In [348]:
# Cast the origin cluster labels from float to int.
# Flights without a cluster were already filtered out above, so a plain cast is
# sufficient; unlike np.nan_to_num, it raises if a NaN is still present instead of
# silently relabelling a missing cluster as 0.
flights_2004['cluster_origin'] = flights_2004['cluster_origin'].astype(int)

In [349]:
sorted(flights_2004.cluster_origin.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [338]:
# creating cluster column for destination airport

In [339]:
flights_2004 = flights_2004.merge(
    temp, how="left", left_on="icao_dest", right_on="icao_code"
).drop(columns=["icao_code", "city_primary", "state"])

In [341]:
len(flights_2004)

238508

In [340]:
len(flights_2004[flights_2004.cluster.isnull()])

8

In [342]:
# Discard the flights whose destination airport had no match in the lookup table
# (their `cluster` is NaN after the left merge).
flights_2004 = flights_2004[flights_2004.cluster.notna()]

In [343]:
flights_2004.reset_index(drop=True, inplace=True)

In [344]:
len(flights_2004)

238500

In [345]:
flights_2004 = flights_2004.rename(
    {
        "lat_dd": "lat_dd_dest",
        "long_dd": "long_dd_dest",
        "cluster": "cluster_dest"
    },
    axis=1,
)

In [346]:
sorted(flights_2004.cluster_dest.unique())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]

In [350]:
# Cast the destination cluster labels from float to int.
# Flights without a cluster were already filtered out above, so a plain cast is
# sufficient; unlike np.nan_to_num, it raises if a NaN is still present instead of
# silently relabelling a missing cluster as 0.
flights_2004['cluster_dest'] = flights_2004['cluster_dest'].astype(int)

In [351]:
# Verify the destination labels after the cast just above. The original cell
# re-inspected cluster_origin (copy-paste slip), so cluster_dest was never checked.
sorted(flights_2004.cluster_dest.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [352]:
flights_2004.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,sched_year,sched_month,sched_day,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata,lat_dd_origin,long_dd_origin,cluster_origin,lat_dd_dest,long_dd_dest,cluster_dest
0,VSP,VP,4375,4,SBBR,BSB,Presidente Juscelino Kubitschek,Brasília,DF,3,2004-09-22,2004,9,22,08:15:00,2004-09-22,2004,9,22,3,SETEMBRO,SBGO,GYN,Santa Genoveva/Goiânia,Goiânia,GO,CENTRO-OESTE,08:56:00,2004-09-22,2004,9,22,B732,BOEING 737-200,833.0,10933,163.0,0.68,239.27,107,25,0,355,0,193,0,0,2423,0.0,31459,0,0,57865,0,17441,4075,1782,1782,4375_4,SBBR-SBGO,BSB-GYN,-15.871111,-47.918611,1,-16.6325,-49.221111,4
1,VSP,VP,4375,3,SBPV,PVH,Governador Jorge Teixeira De Oliveira,Porto Velho,RO,1,2004-09-22,2004,9,22,04:04:00,2004-09-22,2004,9,22,3,SETEMBRO,SBBR,BSB,Presidente Juscelino Kubitschek,Brasília,DF,CENTRO-OESTE,06:58:00,2004-09-22,2004,9,22,B732,BOEING 737-200,9692.0,10448,1909.0,2.9,658.13,107,44,0,582,0,111,0,0,3993,0.0,211899,0,0,1111040,0,204263,83996,19945,19945,4375_3,SBPV-SBBR,PVH-BSB,-8.713611,-63.902778,4,-15.871111,-47.918611,1


In [354]:
# 2005 clusters – Maringá appears under two ICAO codes (SBMG and SBMH)

In [355]:
clusters_2005 = pd.read_csv('./assets/csv/count/df2005.csv', sep=',')

In [356]:
clusters_2005

Unnamed: 0,icao_code,count
0,SBGL,27386
1,SBSP,27228
2,SBBR,20943
3,SBGR,18042
4,SBSV,15326
5,SBCT,14311
6,SBRF,12676
7,SBCF,10723
8,SBPA,10336
9,SBFZ,7976


In [357]:
clusters_2005 = clusters_2005.merge(temp, how='left', on="icao_code")

In [358]:
clusters_2005

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBGL,27386,-22.81,-43.250556,Rio de janeiro,RJ,2
1,SBSP,27228,-23.626111,-46.656389,São paulo,SP,2
2,SBBR,20943,-15.871111,-47.918611,Brasília,DF,1
3,SBGR,18042,-23.435556,-46.473056,Guarulhos,SP,1
4,SBSV,15326,-12.908611,-38.3225,Salvador,BA,2
5,SBCT,14311,-25.531667,-49.176111,São José dos Pinhais,PR,4
6,SBRF,12676,-8.126389,-34.922778,Recife,PE,2
7,SBCF,10723,-19.624444,-43.971944,Confins,MG,3
8,SBPA,10336,-29.994722,-51.171111,Porto alegre,RS,3
9,SBFZ,7976,-3.775833,-38.532222,Fortaleza,CE,2


In [359]:
clusters_2005.city_primary.duplicated().any()

True

In [360]:
clusters_2005[clusters_2005.city_primary.duplicated()]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
38,SBRJ,593,-22.91,-43.1625,Rio de janeiro,RJ,3
44,SBMH,364,-23.479444,-52.012222,Maringá,PR,6


In [361]:
clusters_2005[clusters_2005['city_primary'] == 'Rio de janeiro']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBGL,27386,-22.81,-43.250556,Rio de janeiro,RJ,2
38,SBRJ,593,-22.91,-43.1625,Rio de janeiro,RJ,3


In [362]:
clusters_2005[clusters_2005['city_primary'] == 'Maringá']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
32,SBMG,895,-23.479444,-52.012222,Maringá,PR,6
44,SBMH,364,-23.479444,-52.012222,Maringá,PR,6


In [363]:
# Row 32 is Maringá under SBMG (895); add the 364 recorded under its other code,
# SBMH (row 44), so the airport carries a single combined count.
clusters_2005.loc[32, "count"] = 895 + 364

In [364]:
# Remove the SBMH row (Maringá's second ICAO code) from the 2005 table.
# The row is selected by ICAO code rather than by the hard-coded label 44, so the cell
# no longer depends on the CSV's row order and re-running it is a no-op instead of a
# KeyError.
clusters_2005.drop(clusters_2005.index[clusters_2005.icao_code == "SBMH"], inplace=True)

In [365]:
clusters_2005 = clusters_2005.reset_index(drop=True)
clusters_2005

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBGL,27386,-22.81,-43.250556,Rio de janeiro,RJ,2
1,SBSP,27228,-23.626111,-46.656389,São paulo,SP,2
2,SBBR,20943,-15.871111,-47.918611,Brasília,DF,1
3,SBGR,18042,-23.435556,-46.473056,Guarulhos,SP,1
4,SBSV,15326,-12.908611,-38.3225,Salvador,BA,2
5,SBCT,14311,-25.531667,-49.176111,São José dos Pinhais,PR,4
6,SBRF,12676,-8.126389,-34.922778,Recife,PE,2
7,SBCF,10723,-19.624444,-43.971944,Confins,MG,3
8,SBPA,10336,-29.994722,-51.171111,Porto alegre,RS,3
9,SBFZ,7976,-3.775833,-38.532222,Fortaleza,CE,2


In [367]:
# flights 2005

In [368]:
flights_2005 = pd.read_csv("./assets/csv/temp/flights_2005.csv", sep=",", index_col=0).drop(columns="index")

In [369]:
len(flights_2005)

233165

In [370]:
len(flights_2005[flights_2005.icao_origin == flights_2005.icao_dest])

2

In [371]:
flights_2005 = flights_2005[~(flights_2005.icao_origin == flights_2005.icao_dest)]

In [372]:
flights_2005.reset_index(drop=True, inplace=True)

In [373]:
len(flights_2005)

233163

In [374]:
flights_2005.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata
0,GLO,G3,1790,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2005-02-10,18:21:00,2005-02-10,2005,2,10,4,FEVEREIRO,SBSP,CGH,Congonhas,São Paulo,SP,SUDESTE,19:09:00,2005-02-10,2005,2,10,B737,BOEING 737-700 (WINGLETS) PAX,2203.0,15947,331.0,0.8,414.21,144,54,3,226,0,10,0,0,4511,993.0,3310,0,0,74806,0,47664,17874,5278,5278,1790_2,SBCT-SBSP,CWB-CGH
1,GLO,G3,1724,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2005-02-02,07:31:00,2005-02-02,2005,2,2,3,FEVEREIRO,SBKP,VCP,Viracopos,Campinas,SP,SUDESTE,08:28:00,2005-02-02,2005,2,2,B737,BOEING 737-700 (WINGLETS) PAX,2278.0,15947,349.0,0.95,367.12,144,74,4,810,0,105,0,0,6765,1396.0,36645,0,0,282690,0,50256,25826,5565,5565,1724_2,SBCT-SBKP,CWB-VCP


In [375]:
# Maringá appears under two ICAO codes (SBMH and SBMG); use SBMG throughout.
# Both endpoint columns are rewritten: remapping only icao_origin would leave SBMH in
# icao_dest, so any per-airport tally over destinations would still split Maringá.
# NOTE(review): route_icao is not rebuilt here and may still contain SBMH; a
# SBMG<->SBMH leg, if one exists, would now have identical endpoints — confirm.
flights_2005[["icao_origin", "icao_dest"]] = flights_2005[
    ["icao_origin", "icao_dest"]
].replace("SBMH", "SBMG")

In [376]:
len(flights_2005[flights_2005.sched_date.isnull()])

0

In [377]:
len(flights_2005[flights_2005.dep_date.isnull()])

0

In [378]:
# Split the sched_date text (YYYY-MM-DD) into year, month and day string columns,
# inserted at column positions 11-13, i.e. immediately after sched_date.
flights_2005.insert(11, "sched_year", flights_2005["sched_date"].str.slice(0, 4))
flights_2005.insert(12, "sched_month", flights_2005["sched_date"].str.slice(5, 7))
flights_2005.insert(13, "sched_day", flights_2005["sched_date"].str.slice(8, 10))

In [379]:
sorted(flights_2005.dep_month.unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [380]:
# creating cluster column for origin airport

In [381]:
flights_2005 = flights_2005.merge(
    temp, how="left", left_on="icao_origin", right_on="icao_code"
).drop(columns=["icao_code", "city_primary", "state"])

In [382]:
len(flights_2005[flights_2005.cluster.isnull()])

0

In [383]:
flights_2005 = flights_2005.rename(
    {
        "lat_dd": "lat_dd_origin",
        "long_dd": "long_dd_origin",
        "cluster": "cluster_origin"
    },
    axis=1,
)

In [384]:
sorted(flights_2005.cluster_origin.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [385]:
# creating cluster column for destination airport

In [386]:
flights_2005 = flights_2005.merge(
    temp, how="left", left_on="icao_dest", right_on="icao_code"
).drop(columns=["icao_code", "city_primary", "state"])

In [387]:
len(flights_2005[flights_2005.cluster.isnull()])

0

In [388]:
flights_2005 = flights_2005.rename(
    {
        "lat_dd": "lat_dd_dest",
        "long_dd": "long_dd_dest",
        "cluster": "cluster_dest"
    },
    axis=1,
)

In [389]:
sorted(flights_2005.cluster_dest.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [390]:
flights_2005.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,sched_year,sched_month,sched_day,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata,lat_dd_origin,long_dd_origin,cluster_origin,lat_dd_dest,long_dd_dest,cluster_dest
0,GLO,G3,1790,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2005-02-10,2005,2,10,18:21:00,2005-02-10,2005,2,10,4,FEVEREIRO,SBSP,CGH,Congonhas,São Paulo,SP,SUDESTE,19:09:00,2005-02-10,2005,2,10,B737,BOEING 737-700 (WINGLETS) PAX,2203.0,15947,331.0,0.8,414.21,144,54,3,226,0,10,0,0,4511,993.0,3310,0,0,74806,0,47664,17874,5278,5278,1790_2,SBCT-SBSP,CWB-CGH,-25.531667,-49.176111,4,-23.626111,-46.656389,2
1,GLO,G3,1724,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2005-02-02,2005,2,2,07:31:00,2005-02-02,2005,2,2,3,FEVEREIRO,SBKP,VCP,Viracopos,Campinas,SP,SUDESTE,08:28:00,2005-02-02,2005,2,2,B737,BOEING 737-700 (WINGLETS) PAX,2278.0,15947,349.0,0.95,367.12,144,74,4,810,0,105,0,0,6765,1396.0,36645,0,0,282690,0,50256,25826,5565,5565,1724_2,SBCT-SBKP,CWB-VCP,-25.531667,-49.176111,4,-23.006944,-47.134444,3


In [392]:
# 2006 clusters – Maringá listed twice (under SBMG and SBMH)

In [393]:
clusters_2006 = pd.read_csv('./assets/csv/count/df2006.csv', sep=',')

In [394]:
clusters_2006

Unnamed: 0,icao_code,count
0,SBSP,28280
1,SBGL,25906
2,SBBR,21511
3,SBSV,16247
4,SBGR,15597
...,...,...
56,SBAQ,5
57,SBRJ,1
58,SBME,1
59,SBLE,1


In [395]:
clusters_2006 = clusters_2006.merge(temp, how='left', on="icao_code")

In [396]:
clusters_2006

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBSP,28280,-23.626111,-46.656389,São paulo,SP,2
1,SBGL,25906,-22.810000,-43.250556,Rio de janeiro,RJ,2
2,SBBR,21511,-15.871111,-47.918611,Brasília,DF,1
3,SBSV,16247,-12.908611,-38.322500,Salvador,BA,2
4,SBGR,15597,-23.435556,-46.473056,Guarulhos,SP,1
...,...,...,...,...,...,...,...
56,SBAQ,5,-21.804444,-48.140278,Araraquara,SP,8
57,SBRJ,1,-22.910000,-43.162500,Rio de janeiro,RJ,3
58,SBME,1,-22.342778,-41.763889,Macaé,RJ,8
59,SBLE,1,-12.482222,-41.276944,Lençóis,BA,8


In [397]:
clusters_2006.city_primary.duplicated().any()

True

In [398]:
clusters_2006[clusters_2006.city_primary.duplicated()]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
51,SBMH,55,-23.479444,-52.012222,Maringá,PR,6
57,SBRJ,1,-22.91,-43.1625,Rio de janeiro,RJ,3


In [399]:
clusters_2006[clusters_2006['city_primary'] == 'Rio de janeiro']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
1,SBGL,25906,-22.81,-43.250556,Rio de janeiro,RJ,2
57,SBRJ,1,-22.91,-43.1625,Rio de janeiro,RJ,3


In [400]:
clusters_2006[clusters_2006['city_primary'] == 'Maringá']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
34,SBMG,716,-23.479444,-52.012222,Maringá,PR,6
51,SBMH,55,-23.479444,-52.012222,Maringá,PR,6


In [401]:
# SBMG and SBMH share the same city and coordinates (Maringá), so SBMH's 55
# flights are folded into SBMG's 716. The row is selected by ICAO code rather
# than by the positional label 34, so the update still lands on SBMG if the
# row order of the count file ever changes. Assigning the constant (instead of
# accumulating) keeps the cell safe to re-run.
clusters_2006.loc[clusters_2006.icao_code == "SBMG", "count"] = 716 + 55

In [402]:
# Remove the duplicate Maringá row (SBMH) now that its count lives on SBMG.
# Filtering by ICAO code instead of dropping label 51 in place makes the cell
# idempotent: once the index is reset, label 51 belongs to a different
# airport, and re-running a positional drop would silently delete it.
clusters_2006 = clusters_2006[clusters_2006.icao_code != "SBMH"]

In [403]:
# Renumber the rows 0..n-1 after the SBMH row was removed (the old index is
# discarded, not kept as a column), then display the cleaned table.
clusters_2006 = clusters_2006.reset_index(drop=True)
clusters_2006

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBSP,28280,-23.626111,-46.656389,São paulo,SP,2
1,SBGL,25906,-22.81,-43.250556,Rio de janeiro,RJ,2
2,SBBR,21511,-15.871111,-47.918611,Brasília,DF,1
3,SBSV,16247,-12.908611,-38.3225,Salvador,BA,2
4,SBGR,15597,-23.435556,-46.473056,Guarulhos,SP,1
5,SBCT,13066,-25.531667,-49.176111,São José dos Pinhais,PR,4
6,SBCF,12496,-19.624444,-43.971944,Confins,MG,3
7,SBRF,12435,-8.126389,-34.922778,Recife,PE,2
8,SBPA,9938,-29.994722,-51.171111,Porto alegre,RS,3
9,SBFZ,9028,-3.775833,-38.532222,Fortaleza,CE,2


In [None]:
# flights 2006

In [405]:
flights_2006 = pd.read_csv("./assets/csv/temp/flights_2006.csv", sep=",", index_col=0).drop(columns="index")

In [406]:
len(flights_2006)

231324

In [407]:
len(flights_2006[flights_2006.icao_origin == flights_2006.icao_dest])

7

In [408]:
# Keep only legs that actually go somewhere: rows whose origin and destination
# codes are identical are discarded.
flights_2006 = flights_2006[flights_2006["icao_origin"] != flights_2006["icao_dest"]]

In [409]:
# Renumber rows 0..n-1 after the filter. Rebinding the name (instead of
# inplace=True) avoids mutating the filtered frame in place; pandas may
# otherwise flag later assignments on it with SettingWithCopyWarning.
flights_2006 = flights_2006.reset_index(drop=True)

In [410]:
len(flights_2006)

231317

In [411]:
flights_2006.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata
0,GLO,G3,1755,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2006-09-06,13:58:00,2006-09-06,2006,9,6,3,SETEMBRO,SBSP,CGH,Congonhas,São Paulo,SP,SUDESTE,14:54:00,2006-09-06,2006,9,6,B737,BOEING 737-700 (WINGLETS) PAX,2235.0,15947,331.0,0.93,356.31,144,131,9,1515,0,885,0,0,12900,2979.0,292935,0,0,501465,0,47664,43361,5278,5278,1755_2,SBCT-SBSP,CWB-CGH
1,GLO,G3,1956,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2006-09-03,00:06:00,2006-09-04,2006,9,4,1,SETEMBRO,SBFI,IGU,Cataratas,Foz Do Iguaçu,PR,SUL,01:22:00,2006-09-04,2006,9,4,B738,BOEING 737-800 (WINGLETS) PAX,3246.0,19800,533.0,1.27,419.32,177,164,2,2050,0,40,0,0,14540,1066.0,21320,0,0,1092650,0,94341,87412,10553,10553,1956_2,SBCT-SBFI,CWB-IGU


In [375]:
# Maringá appears under two ICAO codes (SBMH and SBMG). Normalise every SBMH
# reference to SBMG so the flights agree with the single SBMG row kept in
# clusters_2006. Both endpoints are remapped: fixing only the origin would
# leave SBMH behind on the destination side.
# This cell must run after flights_2006 has been (re)loaded, otherwise the
# remap is lost.
# NOTE(review): route_icao is not rebuilt here and may still contain SBMH;
# confirm whether it should be regenerated from the two endpoint columns.
for endpoint in ("icao_origin", "icao_dest"):
    flights_2006.loc[flights_2006[endpoint] == "SBMH", endpoint] = "SBMG"

In [413]:
len(flights_2006[flights_2006.sched_date.isnull()])

0

In [414]:
len(flights_2006[flights_2006.dep_date.isnull()])

0

In [415]:
# Split the YYYY-MM-DD sched_date string into year / month / day substrings and
# place them directly after sched_date (column positions 11, 12 and 13).
# NOTE(review): these are text slices, not numbers; confirm whether they should
# be cast to match dep_year / dep_month / dep_day.
flights_2006.insert(11, "sched_year", flights_2006["sched_date"].str.slice(0, 4))
flights_2006.insert(12, "sched_month", flights_2006["sched_date"].str.slice(5, 7))
flights_2006.insert(13, "sched_day", flights_2006["sched_date"].str.slice(8, 10))

In [416]:
sorted(flights_2006.dep_month.unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [417]:
# creating cluster column for origin airport

In [418]:
# Look up every flight's origin airport in `temp` (left join, so all flights
# are kept) to bring in lat_dd, long_dd and cluster; the join key and the
# descriptive city/state columns are discarded again.
flights_2006 = pd.merge(
    flights_2006,
    temp,
    how="left",
    left_on="icao_origin",
    right_on="icao_code",
).drop(["icao_code", "city_primary", "state"], axis=1)

In [419]:
len(flights_2006[flights_2006.cluster.isnull()])

0

In [420]:
# Mark the columns that came from the airport lookup as origin-side values so
# the destination lookup can reuse the plain lat_dd / long_dd / cluster names.
flights_2006 = flights_2006.rename(
    columns={
        "lat_dd": "lat_dd_origin",
        "long_dd": "long_dd_origin",
        "cluster": "cluster_origin",
    }
)

In [421]:
sorted(flights_2006.cluster_origin.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [422]:
# creating cluster column for destination airport

In [423]:
# Same airport lookup as for the origin, keyed on the destination code this
# time. Assumes the origin columns were already renamed, otherwise lat_dd /
# long_dd / cluster would collide.
flights_2006 = pd.merge(
    flights_2006,
    temp,
    how="left",
    left_on="icao_dest",
    right_on="icao_code",
).drop(["icao_code", "city_primary", "state"], axis=1)

In [424]:
len(flights_2006[flights_2006.cluster.isnull()])

0

In [425]:
# Mark the columns added by the destination lookup as destination-side values.
flights_2006 = flights_2006.rename(
    columns={
        "lat_dd": "lat_dd_dest",
        "long_dd": "long_dd_dest",
        "cluster": "cluster_dest",
    }
)

In [426]:
sorted(flights_2006.cluster_dest.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [427]:
flights_2006.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,sched_year,sched_month,sched_day,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata,lat_dd_origin,long_dd_origin,cluster_origin,lat_dd_dest,long_dd_dest,cluster_dest
0,GLO,G3,1755,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2006-09-06,2006,9,6,13:58:00,2006-09-06,2006,9,6,3,SETEMBRO,SBSP,CGH,Congonhas,São Paulo,SP,SUDESTE,14:54:00,2006-09-06,2006,9,6,B737,BOEING 737-700 (WINGLETS) PAX,2235.0,15947,331.0,0.93,356.31,144,131,9,1515,0,885,0,0,12900,2979.0,292935,0,0,501465,0,47664,43361,5278,5278,1755_2,SBCT-SBSP,CWB-CGH,-25.531667,-49.176111,4,-23.626111,-46.656389,2
1,GLO,G3,1956,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2006-09-03,2006,9,3,00:06:00,2006-09-04,2006,9,4,1,SETEMBRO,SBFI,IGU,Cataratas,Foz Do Iguaçu,PR,SUL,01:22:00,2006-09-04,2006,9,4,B738,BOEING 737-800 (WINGLETS) PAX,3246.0,19800,533.0,1.27,419.32,177,164,2,2050,0,40,0,0,14540,1066.0,21320,0,0,1092650,0,94341,87412,10553,10553,1956_2,SBCT-SBFI,CWB-IGU,-25.531667,-49.176111,4,-25.600278,-54.485,4


In [429]:
# 2007 clusters 

In [430]:
clusters_2007 = pd.read_csv('./assets/csv/count/df2007.csv', sep=',')

In [431]:
clusters_2007

Unnamed: 0,icao_code,count
0,SBGL,24752
1,SBSP,22148
2,SBBR,21647
3,SBGR,18131
4,SBSV,14343
...,...,...
58,SJTC,2
59,SSZR,1
60,SBME,1
61,SBNM,1


In [432]:
clusters_2007 = clusters_2007.merge(temp, how='left', on="icao_code")

In [433]:
clusters_2007

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBGL,24752,-22.810000,-43.250556,Rio de janeiro,RJ,2
1,SBSP,22148,-23.626111,-46.656389,São paulo,SP,2
2,SBBR,21647,-15.871111,-47.918611,Brasília,DF,1
3,SBGR,18131,-23.435556,-46.473056,Guarulhos,SP,1
4,SBSV,14343,-12.908611,-38.322500,Salvador,BA,2
...,...,...,...,...,...,...,...
58,SJTC,2,-22.157778,-49.068333,Arealva,SP,8
59,SSZR,1,-27.908889,-54.522222,Santa rosa,RS,8
60,SBME,1,-22.342778,-41.763889,Macaé,RJ,8
61,SBNM,1,-28.281667,-54.168333,Santo ângelo,RS,8


In [434]:
clusters_2007.city_primary.duplicated().any()

False

In [435]:
clusters_2007[clusters_2007.city_primary.duplicated()]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster


In [436]:
# Persist the 2007 airport table (count, coordinates, city, state, cluster);
# judging by the file name it is presumably consumed by the map step.
# index=False is not passed, so the row index is written as the first,
# unnamed column of the CSV.
clusters_2007.to_csv("./assets/csv/cluster/cluster_2007_map.csv", sep=",")

In [437]:
# flights 2007

In [461]:
flights_2007 = pd.read_csv("./assets/csv/temp/flights_2007.csv", sep=",", index_col=0).drop(columns="index")

In [462]:
len(flights_2007)

215642

In [463]:
len(flights_2007[flights_2007.icao_origin == flights_2007.icao_dest])

11

In [464]:
# Keep only legs that actually go somewhere: rows whose origin and destination
# codes are identical are discarded.
flights_2007 = flights_2007[flights_2007["icao_origin"] != flights_2007["icao_dest"]]

In [465]:
# Renumber rows 0..n-1 after the filter. Rebinding the name (instead of
# inplace=True) avoids mutating the filtered frame in place and keeps the
# cell's input and output explicit.
flights_2007 = flights_2007.reset_index(drop=True)

In [466]:
len(flights_2007)

215631

In [467]:
flights_2007.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata
0,GLO,G3,1833,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2007-07-09,07:22:00,2007-07-09,2007,7,9,1,JULHO,SBPA,POA,Salgado Filho,Porto Alegre,RS,SUL,08:33:00,2007-07-09,2007,7,9,B737,BOEING 737-700 (WINGLETS) PAX,3026.0,15947,534.0,1.18,452.17,144,130,1,1379,0,0,0,0,11204,534.0,0,0,0,736386,0,76896,69420,8515,8515,1833_2,SBCT-SBPA,CWB-POA
1,GLO,G3,1836,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2007-07-09,10:36:00,2007-07-09,2007,7,9,1,JULHO,SBBR,BSB,Presidente Juscelino Kubitschek,Brasília,DF,CENTRO-OESTE,12:19:00,2007-07-09,2007,7,9,B737,BOEING 737-700 (WINGLETS) PAX,5116.0,15947,1082.0,1.72,629.24,144,115,14,415,0,0,0,0,10090,15148.0,0,0,0,449030,0,155808,124430,17254,17254,1836_2,SBCT-SBBR,CWB-BSB


In [468]:
len(flights_2007[flights_2007.sched_date.isnull()])

0

In [469]:
len(flights_2007[flights_2007.dep_date.isnull()])

0

In [470]:
# Split the YYYY-MM-DD sched_date string into year / month / day substrings and
# place them directly after sched_date (column positions 11, 12 and 13).
# NOTE(review): these are text slices, not numbers; confirm whether they should
# be cast to match dep_year / dep_month / dep_day.
flights_2007.insert(11, "sched_year", flights_2007["sched_date"].str.slice(0, 4))
flights_2007.insert(12, "sched_month", flights_2007["sched_date"].str.slice(5, 7))
flights_2007.insert(13, "sched_day", flights_2007["sched_date"].str.slice(8, 10))

In [471]:
sorted(flights_2007.dep_month.unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [472]:
# creating cluster column for origin airport

In [473]:
# Look up every flight's origin airport in `temp` (left join, so all flights
# are kept) to bring in lat_dd, long_dd and cluster; the join key and the
# descriptive city/state columns are discarded again.
flights_2007 = pd.merge(
    flights_2007,
    temp,
    how="left",
    left_on="icao_origin",
    right_on="icao_code",
).drop(["icao_code", "city_primary", "state"], axis=1)

In [474]:
len(flights_2007[flights_2007.cluster.isnull()])

0

In [475]:
# Mark the columns that came from the airport lookup as origin-side values so
# the destination lookup can reuse the plain lat_dd / long_dd / cluster names.
flights_2007 = flights_2007.rename(
    columns={
        "lat_dd": "lat_dd_origin",
        "long_dd": "long_dd_origin",
        "cluster": "cluster_origin",
    }
)

In [476]:
sorted(flights_2007.cluster_origin.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [477]:
# creating cluster column for destination airport

In [478]:
# Same airport lookup as for the origin, keyed on the destination code this
# time. Assumes the origin columns were already renamed, otherwise lat_dd /
# long_dd / cluster would collide.
flights_2007 = pd.merge(
    flights_2007,
    temp,
    how="left",
    left_on="icao_dest",
    right_on="icao_code",
).drop(["icao_code", "city_primary", "state"], axis=1)

In [479]:
len(flights_2007[flights_2007.cluster.isnull()])

1

In [480]:
# Keep only flights whose destination was found in `temp`; the null check
# above reported a single unmatched row, which is removed here.
# NOTE(review): consider inspecting that row's icao_dest before discarding it.
flights_2007 = flights_2007[flights_2007["cluster"].notnull()]

In [481]:
# Renumber rows 0..n-1 after the unmatched destination row was removed.
# Rebinding the name (instead of inplace=True) avoids mutating the filtered
# frame in place; pandas may otherwise flag the later cluster_dest assignment
# on it with SettingWithCopyWarning.
flights_2007 = flights_2007.reset_index(drop=True)

In [482]:
len(flights_2007[flights_2007.cluster.isnull()])

0

In [483]:
# Mark the columns added by the destination lookup as destination-side values.
flights_2007 = flights_2007.rename(
    columns={
        "lat_dd": "lat_dd_dest",
        "long_dd": "long_dd_dest",
        "cluster": "cluster_dest",
    }
)

In [484]:
sorted(flights_2007.cluster_dest.unique())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]

In [485]:
# The unmatched destination left a NaN in this column after the left join,
# which turned it into floats (1.0, 2.0, ...). That row has since been
# dropped, so the column is cast back to integers.
# A plain astype(int) is used on purpose: if a NaN were still present it
# raises, whereas np.nan_to_num would silently turn it into the invalid
# cluster id 0.
flights_2007["cluster_dest"] = flights_2007["cluster_dest"].astype(int)

In [486]:
sorted(flights_2007.cluster_dest.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [487]:
flights_2007.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,sched_year,sched_month,sched_day,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata,lat_dd_origin,long_dd_origin,cluster_origin,lat_dd_dest,long_dd_dest,cluster_dest
0,GLO,G3,1833,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2007-07-09,2007,7,9,07:22:00,2007-07-09,2007,7,9,1,JULHO,SBPA,POA,Salgado Filho,Porto Alegre,RS,SUL,08:33:00,2007-07-09,2007,7,9,B737,BOEING 737-700 (WINGLETS) PAX,3026.0,15947,534.0,1.18,452.17,144,130,1,1379,0,0,0,0,11204,534.0,0,0,0,736386,0,76896,69420,8515,8515,1833_2,SBCT-SBPA,CWB-POA,-25.531667,-49.176111,4,-29.994722,-51.171111,3
1,GLO,G3,1836,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2007-07-09,2007,7,9,10:36:00,2007-07-09,2007,7,9,1,JULHO,SBBR,BSB,Presidente Juscelino Kubitschek,Brasília,DF,CENTRO-OESTE,12:19:00,2007-07-09,2007,7,9,B737,BOEING 737-700 (WINGLETS) PAX,5116.0,15947,1082.0,1.72,629.24,144,115,14,415,0,0,0,0,10090,15148.0,0,0,0,449030,0,155808,124430,17254,17254,1836_2,SBCT-SBBR,CWB-BSB,-25.531667,-49.176111,4,-15.871111,-47.918611,1


In [489]:
# 2008 clusters

In [490]:
clusters_2008 = pd.read_csv('./assets/csv/count/df2008.csv', sep=',')

In [491]:
clusters_2008

Unnamed: 0,icao_code,count
0,SBSP,29004
1,SBGL,28965
2,SBBR,26403
3,SBGR,19004
4,SBSV,16018
5,SBCF,13790
6,SBRF,12955
7,SBCT,12371
8,SBPA,10367
9,SBFZ,9306


In [492]:
clusters_2008 = clusters_2008.merge(temp, how='left', on="icao_code")

In [493]:
clusters_2008

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBSP,29004,-23.626111,-46.656389,São paulo,SP,2
1,SBGL,28965,-22.81,-43.250556,Rio de janeiro,RJ,2
2,SBBR,26403,-15.871111,-47.918611,Brasília,DF,1
3,SBGR,19004,-23.435556,-46.473056,Guarulhos,SP,1
4,SBSV,16018,-12.908611,-38.3225,Salvador,BA,2
5,SBCF,13790,-19.624444,-43.971944,Confins,MG,3
6,SBRF,12955,-8.126389,-34.922778,Recife,PE,2
7,SBCT,12371,-25.531667,-49.176111,São José dos Pinhais,PR,4
8,SBPA,10367,-29.994722,-51.171111,Porto alegre,RS,3
9,SBFZ,9306,-3.775833,-38.532222,Fortaleza,CE,2


In [494]:
clusters_2008.city_primary.duplicated().any()

True

In [495]:
clusters_2008[clusters_2008.city_primary.duplicated()]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
24,SDAM,1480,-22.863246,-47.104051,Campinas,SP,8
27,SBRJ,1278,-22.91,-43.1625,Rio de janeiro,RJ,3


In [496]:
clusters_2008[clusters_2008['city_primary'] == 'Rio de janeiro']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
1,SBGL,28965,-22.81,-43.250556,Rio de janeiro,RJ,2
27,SBRJ,1278,-22.91,-43.1625,Rio de janeiro,RJ,3


In [497]:
clusters_2008[clusters_2008['city_primary'] == 'Campinas']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
16,SBKP,3231,-23.006944,-47.134444,Campinas,SP,3
24,SDAM,1480,-22.863246,-47.104051,Campinas,SP,8


In [498]:
# Renumber the rows 0..n-1 and display the table.
# NOTE(review): no rows are removed from clusters_2008 in the cells shown
# above (both Campinas and both Rio de Janeiro airports are kept), so this
# reset looks like a no-op; confirm it is still needed.
clusters_2008 = clusters_2008.reset_index(drop=True)
clusters_2008

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBSP,29004,-23.626111,-46.656389,São paulo,SP,2
1,SBGL,28965,-22.81,-43.250556,Rio de janeiro,RJ,2
2,SBBR,26403,-15.871111,-47.918611,Brasília,DF,1
3,SBGR,19004,-23.435556,-46.473056,Guarulhos,SP,1
4,SBSV,16018,-12.908611,-38.3225,Salvador,BA,2
5,SBCF,13790,-19.624444,-43.971944,Confins,MG,3
6,SBRF,12955,-8.126389,-34.922778,Recife,PE,2
7,SBCT,12371,-25.531667,-49.176111,São José dos Pinhais,PR,4
8,SBPA,10367,-29.994722,-51.171111,Porto alegre,RS,3
9,SBFZ,9306,-3.775833,-38.532222,Fortaleza,CE,2


In [500]:
# flights 2008

In [501]:
flights_2008 = pd.read_csv("./assets/csv/temp/flights_2008.csv", sep=",", index_col=0).drop(columns="index")

In [502]:
len(flights_2008)

249368

In [503]:
len(flights_2008[flights_2008.icao_origin == flights_2008.icao_dest])

11

In [504]:
# Keep only legs that actually go somewhere: rows whose origin and destination
# codes are identical are discarded.
flights_2008 = flights_2008[flights_2008["icao_origin"] != flights_2008["icao_dest"]]

In [505]:
# Renumber rows 0..n-1 after the filter. Rebinding the name (instead of
# inplace=True) avoids mutating the filtered frame in place and keeps the
# cell's input and output explicit.
flights_2008 = flights_2008.reset_index(drop=True)

In [506]:
len(flights_2008)

249357

In [507]:
flights_2008.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata
0,GLO,G3,1945,1,SBPA,POA,Salgado Filho,Porto Alegre,RS,5,2008-09-30,13:40:00,2008-09-30,2008,9,30,2,SETEMBRO,SBGR,GRU,Guarulhos - Governador André Franco Montoro,Guarulhos,SP,SUDESTE,15:27:00,2008-09-30,2008,9,30,B738,BOEING 737-800,4021.0,11620,866.0,1.78,486.24,184,132,4,3440,0,0,0,0,13640,3464.0,0,0,0,2979040,0,159344,114312,10062,10062,1945_1,SBPA-SBGR,POA-GRU
1,GLO,G3,1945,1,SBPA,POA,Salgado Filho,Porto Alegre,RS,5,2008-09-29,13:42:00,2008-09-29,2008,9,29,1,SETEMBRO,SBGR,GRU,Guarulhos - Governador André Franco Montoro,Guarulhos,SP,SUDESTE,15:33:00,2008-09-29,2008,9,29,B738,BOEING 737-800,4248.0,15340,866.0,1.85,467.85,184,129,5,3154,0,0,0,0,13204,4330.0,0,0,0,2731360,0,159344,111714,13284,13284,1945_1,SBPA-SBGR,POA-GRU


In [509]:
len(flights_2008[flights_2008.sched_date.isnull()])

0

In [510]:
len(flights_2008[flights_2008.dep_date.isnull()])

14

In [511]:
# Split the YYYY-MM-DD sched_date string into year / month / day substrings and
# place them directly after sched_date (column positions 11, 12 and 13).
# NOTE(review): these are text slices, not numbers; confirm whether they should
# be cast to match dep_year / dep_month / dep_day.
flights_2008.insert(11, "sched_year", flights_2008["sched_date"].str.slice(0, 4))
flights_2008.insert(12, "sched_month", flights_2008["sched_date"].str.slice(5, 7))
flights_2008.insert(13, "sched_day", flights_2008["sched_date"].str.slice(8, 10))

In [512]:
# Sanity check of the departure months present in 2008.
# NOTE(review): the result includes month 0, which is not a valid month. It
# presumably belongs to the 14 rows with a null dep_date counted above; these
# rows are not filtered or repaired anywhere in the cells shown, so confirm
# how they should be handled.
sorted(flights_2008.dep_month.unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [513]:
# creating cluster column for origin airport

In [514]:
# Look up every flight's origin airport in `temp` (left join, so all flights
# are kept) to bring in lat_dd, long_dd and cluster; the join key and the
# descriptive city/state columns are discarded again.
flights_2008 = pd.merge(
    flights_2008,
    temp,
    how="left",
    left_on="icao_origin",
    right_on="icao_code",
).drop(["icao_code", "city_primary", "state"], axis=1)

In [515]:
len(flights_2008[flights_2008.cluster.isnull()])

0

In [516]:
# Mark the columns that came from the airport lookup as origin-side values so
# the destination lookup can reuse the plain lat_dd / long_dd / cluster names.
flights_2008 = flights_2008.rename(
    columns={
        "lat_dd": "lat_dd_origin",
        "long_dd": "long_dd_origin",
        "cluster": "cluster_origin",
    }
)

In [517]:
sorted(flights_2008.cluster_origin.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [518]:
# creating cluster column for destination airport

In [519]:
# Same airport lookup as for the origin, keyed on the destination code this
# time. Assumes the origin columns were already renamed, otherwise lat_dd /
# long_dd / cluster would collide.
flights_2008 = pd.merge(
    flights_2008,
    temp,
    how="left",
    left_on="icao_dest",
    right_on="icao_code",
).drop(["icao_code", "city_primary", "state"], axis=1)

In [520]:
len(flights_2008[flights_2008.cluster.isnull()])

0

In [521]:
# Mark the columns added by the destination lookup as destination-side values.
flights_2008 = flights_2008.rename(
    columns={
        "lat_dd": "lat_dd_dest",
        "long_dd": "long_dd_dest",
        "cluster": "cluster_dest",
    }
)

In [522]:
sorted(flights_2008.cluster_dest.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [523]:
flights_2008.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,sched_year,sched_month,sched_day,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata,lat_dd_origin,long_dd_origin,cluster_origin,lat_dd_dest,long_dd_dest,cluster_dest
0,GLO,G3,1945,1,SBPA,POA,Salgado Filho,Porto Alegre,RS,5,2008-09-30,2008,9,30,13:40:00,2008-09-30,2008,9,30,2,SETEMBRO,SBGR,GRU,Guarulhos - Governador André Franco Montoro,Guarulhos,SP,SUDESTE,15:27:00,2008-09-30,2008,9,30,B738,BOEING 737-800,4021.0,11620,866.0,1.78,486.24,184,132,4,3440,0,0,0,0,13640,3464.0,0,0,0,2979040,0,159344,114312,10062,10062,1945_1,SBPA-SBGR,POA-GRU,-29.994722,-51.171111,3,-23.435556,-46.473056,1
1,GLO,G3,1945,1,SBPA,POA,Salgado Filho,Porto Alegre,RS,5,2008-09-29,2008,9,29,13:42:00,2008-09-29,2008,9,29,1,SETEMBRO,SBGR,GRU,Guarulhos - Governador André Franco Montoro,Guarulhos,SP,SUDESTE,15:33:00,2008-09-29,2008,9,29,B738,BOEING 737-800,4248.0,15340,866.0,1.85,467.85,184,129,5,3154,0,0,0,0,13204,4330.0,0,0,0,2731360,0,159344,111714,13284,13284,1945_1,SBPA-SBGR,POA-GRU,-29.994722,-51.171111,3,-23.435556,-46.473056,1


In [525]:
# 2009 clusters 

In [526]:
clusters_2009 = pd.read_csv('./assets/csv/count/df2009.csv', sep=',')

In [527]:
clusters_2009

Unnamed: 0,icao_code,count
0,SBBR,34999
1,SBGL,32172
2,SBSP,27546
3,SBGR,22233
4,SBSV,21423
5,SBCF,19884
6,SBCT,17775
7,SBRF,17495
8,SBKP,16203
9,SBPA,14213


In [528]:
clusters_2009 = clusters_2009.merge(temp, how='left', on="icao_code")

In [529]:
clusters_2009

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBBR,34999,-15.871111,-47.918611,Brasília,DF,1
1,SBGL,32172,-22.81,-43.250556,Rio de janeiro,RJ,2
2,SBSP,27546,-23.626111,-46.656389,São paulo,SP,2
3,SBGR,22233,-23.435556,-46.473056,Guarulhos,SP,1
4,SBSV,21423,-12.908611,-38.3225,Salvador,BA,2
5,SBCF,19884,-19.624444,-43.971944,Confins,MG,3
6,SBCT,17775,-25.531667,-49.176111,São José dos Pinhais,PR,4
7,SBRF,17495,-8.126389,-34.922778,Recife,PE,2
8,SBKP,16203,-23.006944,-47.134444,Campinas,SP,3
9,SBPA,14213,-29.994722,-51.171111,Porto alegre,RS,3


In [530]:
clusters_2009.city_primary.duplicated().any()

True

In [531]:
clusters_2009[clusters_2009.city_primary.duplicated()]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
16,SBRJ,6504,-22.91,-43.1625,Rio de janeiro,RJ,3
32,SDAM,1237,-22.863246,-47.104051,Campinas,SP,8


In [532]:
clusters_2009[clusters_2009['city_primary'] == 'Rio de janeiro']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
1,SBGL,32172,-22.81,-43.250556,Rio de janeiro,RJ,2
16,SBRJ,6504,-22.91,-43.1625,Rio de janeiro,RJ,3


In [533]:
clusters_2009[clusters_2009['city_primary'] == 'Campinas']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
8,SBKP,16203,-23.006944,-47.134444,Campinas,SP,3
32,SDAM,1237,-22.863246,-47.104051,Campinas,SP,8


In [535]:
# flights 2009

In [553]:
flights_2009 = pd.read_csv("./assets/csv/temp/flights_2009.csv", sep=",", index_col=0).drop(columns="index")

In [554]:
len(flights_2009)

333130

In [555]:
len(flights_2009[flights_2009.icao_origin == flights_2009.icao_dest])

0

In [556]:
flights_2009.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata
0,AZU,AD,4090,1,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2009-02-24,18:57:00,2009-02-24,2009,2,24,2,FEVEREIRO,SBKP,VCP,Viracopos,Campinas,SP,SUDESTE,20:04:00,2009-02-24,2009,2,24,E190,EMBRAER 190,2500.0,11622,349.0,1.12,311.39,106,73,0,590,27,0,0,0,6092,0.0,0,0,0,205910,9423,36994,25477,4056,4056,4090_1,SBCT-SBKP,CWB-VCP
1,AZU,AD,4082,1,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2009-02-25,08:16:00,2009-02-25,2009,2,25,3,FEVEREIRO,SBKP,VCP,Viracopos,Campinas,SP,SUDESTE,09:19:00,2009-02-25,2009,2,25,E190,EMBRAER 190,2500.0,11651,349.0,1.05,332.15,106,133,1,567,116,0,0,0,10733,349.0,0,0,0,197883,40484,36994,46417,4066,4066,4082_1,SBCT-SBKP,CWB-VCP


In [557]:
len(flights_2009[flights_2009.sched_date.isnull()])

0

In [558]:
len(flights_2009[flights_2009.dep_date.isnull()])

0

In [559]:
# Split the YYYY-MM-DD sched_date string into year / month / day substrings and
# place them directly after sched_date (column positions 11, 12 and 13).
# NOTE(review): these are text slices, not numbers; confirm whether they should
# be cast to match dep_year / dep_month / dep_day.
flights_2009.insert(11, "sched_year", flights_2009["sched_date"].str.slice(0, 4))
flights_2009.insert(12, "sched_month", flights_2009["sched_date"].str.slice(5, 7))
flights_2009.insert(13, "sched_day", flights_2009["sched_date"].str.slice(8, 10))

In [560]:
sorted(flights_2009.dep_month.unique())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [561]:
# creating cluster column for origin airport

In [562]:
# Look up every flight's origin airport in `temp` (left join, so all flights
# are kept) to bring in lat_dd, long_dd and cluster; the join key and the
# descriptive city/state columns are discarded again.
flights_2009 = pd.merge(
    flights_2009,
    temp,
    how="left",
    left_on="icao_origin",
    right_on="icao_code",
).drop(["icao_code", "city_primary", "state"], axis=1)

In [563]:
len(flights_2009[flights_2009.cluster.isnull()])

0

In [564]:
# Mark the columns that came from the airport lookup as origin-side values so
# the destination lookup can reuse the plain lat_dd / long_dd / cluster names.
flights_2009 = flights_2009.rename(
    columns={
        "lat_dd": "lat_dd_origin",
        "long_dd": "long_dd_origin",
        "cluster": "cluster_origin",
    }
)

In [565]:
sorted(flights_2009.cluster_origin.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [566]:
# creating cluster column for destination airport

In [567]:
# Same airport lookup as for the origin, keyed on the destination code this
# time. Assumes the origin columns were already renamed, otherwise lat_dd /
# long_dd / cluster would collide.
flights_2009 = pd.merge(
    flights_2009,
    temp,
    how="left",
    left_on="icao_dest",
    right_on="icao_code",
).drop(["icao_code", "city_primary", "state"], axis=1)

In [568]:
len(flights_2009[flights_2009.cluster.isnull()])

0

In [569]:
# Mark the columns added by the destination lookup as destination-side values.
flights_2009 = flights_2009.rename(
    columns={
        "lat_dd": "lat_dd_dest",
        "long_dd": "long_dd_dest",
        "cluster": "cluster_dest",
    }
)

In [570]:
sorted(flights_2009.cluster_dest.unique())

[1, 2, 3, 4, 5, 6, 7, 8]

In [571]:
flights_2009.head(2)

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,sched_year,sched_month,sched_day,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata,lat_dd_origin,long_dd_origin,cluster_origin,lat_dd_dest,long_dd_dest,cluster_dest
0,AZU,AD,4090,1,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2009-02-24,2009,2,24,18:57:00,2009-02-24,2009,2,24,2,FEVEREIRO,SBKP,VCP,Viracopos,Campinas,SP,SUDESTE,20:04:00,2009-02-24,2009,2,24,E190,EMBRAER 190,2500.0,11622,349.0,1.12,311.39,106,73,0,590,27,0,0,0,6092,0.0,0,0,0,205910,9423,36994,25477,4056,4056,4090_1,SBCT-SBKP,CWB-VCP,-25.531667,-49.176111,4,-23.006944,-47.134444,3
1,AZU,AD,4082,1,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2009-02-25,2009,2,25,08:16:00,2009-02-25,2009,2,25,3,FEVEREIRO,SBKP,VCP,Viracopos,Campinas,SP,SUDESTE,09:19:00,2009-02-25,2009,2,25,E190,EMBRAER 190,2500.0,11651,349.0,1.05,332.15,106,133,1,567,116,0,0,0,10733,349.0,0,0,0,197883,40484,36994,46417,4066,4066,4082_1,SBCT-SBKP,CWB-VCP,-25.531667,-49.176111,4,-23.006944,-47.134444,3


In [168]:
# 2010 clusters

In [573]:
clusters_2010 = pd.read_csv('./assets/csv/count/df2010.csv', sep=',')

In [574]:
clusters_2010

Unnamed: 0,icao_code,count
0,SBSP,40418
1,SBBR,39656
2,SBGL,32100
3,SBGR,31650
4,SBRJ,25358
...,...,...
99,SSZR,1
100,SSER,1
101,SBSM,1
102,SBNM,1


In [575]:
clusters_2010 = clusters_2010.merge(temp, how='left', on="icao_code")

In [576]:
clusters_2010

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBSP,40418,-23.626111,-46.656389,São paulo,SP,2
1,SBBR,39656,-15.871111,-47.918611,Brasília,DF,1
2,SBGL,32100,-22.810000,-43.250556,Rio de janeiro,RJ,2
3,SBGR,31650,-23.435556,-46.473056,Guarulhos,SP,1
4,SBRJ,25358,-22.910000,-43.162500,Rio de janeiro,RJ,3
...,...,...,...,...,...,...,...
99,SSZR,1,-27.908889,-54.522222,Santa rosa,RS,8
100,SSER,1,-27.660000,-52.276111,Erechim,RS,8
101,SBSM,1,-29.710833,-53.692222,Santa maria,RS,8
102,SBNM,1,-28.281667,-54.168333,Santo ângelo,RS,8


In [577]:
clusters_2010.city_primary.duplicated().any()

True

In [578]:
clusters_2010[clusters_2010.city_primary.duplicated()]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
4,SBRJ,25358,-22.91,-43.1625,Rio de janeiro,RJ,3
65,SSDO,87,-16.753889,-39.316389,Porto seguro,BA,8


In [579]:
clusters_2010[clusters_2010['city_primary'] == 'Rio de janeiro']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
2,SBGL,32100,-22.81,-43.250556,Rio de janeiro,RJ,2
4,SBRJ,25358,-22.91,-43.1625,Rio de janeiro,RJ,3


In [580]:
clusters_2010[clusters_2010['city_primary'] == 'Porto seguro']

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
32,SBPS,1559,-16.438056,-39.077778,Porto seguro,BA,5
65,SSDO,87,-16.753889,-39.316389,Porto seguro,BA,8


In [581]:
# Persist the 2010 airport table (count, coordinates, city, state, cluster);
# judging by the file name it is presumably consumed by the map step.
# Both Rio de Janeiro airports (SBGL, SBRJ) and both Porto Seguro airports
# (SBPS, SSDO) are kept as separate rows; their coordinates differ.
# index=False is not passed, so the row index is written as the first,
# unnamed column of the CSV.
clusters_2010.to_csv("./assets/csv/cluster/cluster_2010_map.csv", sep=",")

In [None]:
# flights 2010

In [582]:
flights_2010 = pd.read_csv("./assets/csv/temp/flights_2010.csv", sep=",", index_col=0).drop(columns="index")

In [583]:
len(flights_2010)

429164

In [584]:
# Count records whose origin and destination airport are the same.
int((flights_2010["icao_origin"] == flights_2010["icao_dest"]).sum())

0

In [585]:
# Peek at the first two records to check the column layout.
flights_2010.iloc[:2]

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata
0,AZU,AD,4092,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2010-08-01,15:14:00,2010-08-01,2010,8,1,7,AGOSTO,SBMG,MGF,Sílvio Name Júnior,Maringá,PR,SUL,16:19:00,2010-08-01,2010,8,1,E195,EMBRAER 195,1506.0,15784,367.0,1.08,339.45,118,96,1,2236,45,0,0,0,9556,367.0,0,0,0,820612,16515,43306,35232,5792,5792,4092_2,SBCT-SBMG,CWB-MGF
1,AZU,AD,4082,1,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2010-08-02,07:59:00,2010-08-02,2010,8,2,1,AGOSTO,SBKP,VCP,Viracopos,Campinas,SP,SUDESTE,09:00:00,2010-08-02,2010,8,2,E190,EMBRAER 190,1380.0,16662,349.0,1.02,341.92,106,96,1,2027,0,0,0,0,9302,349.0,0,0,0,707423,0,36994,33504,5815,5815,4082_1,SBCT-SBKP,CWB-VCP


In [586]:
# Records with a missing scheduled date.
int(flights_2010["sched_date"].isna().sum())

0

In [587]:
# Records with a missing departure date.
int(flights_2010["dep_date"].isna().sum())

0

In [588]:
# Split the sched_date string into year / month / day text columns, placed at
# column positions 11-13. The values stay strings (character slices of sched_date).
flights_2010.insert(
    loc=11, column="sched_year", value=flights_2010["sched_date"].str.slice(0, 4)
)
flights_2010.insert(
    loc=12, column="sched_month", value=flights_2010["sched_date"].str.slice(5, 7)
)
flights_2010.insert(
    loc=13, column="sched_day", value=flights_2010["sched_date"].str.slice(8, 10)
)

In [589]:
# Distinct departure months present in the data, in ascending order.
sorted(pd.unique(flights_2010["dep_month"]))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [590]:
# creating cluster column for origin airport

In [591]:
# Look up each flight's origin airport in `temp` to pull in its coordinates
# and cluster; the lookup key and descriptive columns are not needed afterwards.
flights_2010 = pd.merge(
    flights_2010,
    temp,
    how="left",
    left_on="icao_origin",
    right_on="icao_code",
)
flights_2010 = flights_2010.drop(["icao_code", "city_primary", "state"], axis=1)

In [592]:
# Flights whose origin airport found no match in `temp` (cluster left null).
int(flights_2010["cluster"].isna().sum())

0

In [593]:
# Tag the freshly merged columns as belonging to the origin airport so the
# destination merge can reuse the plain names.
flights_2010 = flights_2010.rename(
    columns={
        "lat_dd": "lat_dd_origin",
        "long_dd": "long_dd_origin",
        "cluster": "cluster_origin",
    }
)

In [594]:
# Distinct origin clusters present, in ascending order.
sorted(pd.unique(flights_2010["cluster_origin"]))

[1, 2, 3, 4, 5, 6, 7, 8]

In [595]:
# creating cluster column for destination airport

In [596]:
# Same lookup as for the origin, this time keyed on the destination airport.
flights_2010 = pd.merge(
    flights_2010,
    temp,
    how="left",
    left_on="icao_dest",
    right_on="icao_code",
)
flights_2010 = flights_2010.drop(["icao_code", "city_primary", "state"], axis=1)

In [597]:
# Flights whose destination airport found no match in `temp` (cluster left null).
int(flights_2010["cluster"].isna().sum())

0

In [598]:
# Tag the second set of merged columns as belonging to the destination airport.
flights_2010 = flights_2010.rename(
    columns={
        "lat_dd": "lat_dd_dest",
        "long_dd": "long_dd_dest",
        "cluster": "cluster_dest",
    }
)

In [599]:
# Distinct destination clusters present, in ascending order.
sorted(pd.unique(flights_2010["cluster_dest"]))

[1, 2, 3, 4, 5, 6, 7, 8]

In [600]:
# Final check of the enriched frame: the sched_* columns plus origin and
# destination coordinates and clusters should now be present.
flights_2010.iloc[:2]

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,sched_year,sched_month,sched_day,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata,lat_dd_origin,long_dd_origin,cluster_origin,lat_dd_dest,long_dd_dest,cluster_dest
0,AZU,AD,4092,2,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2010-08-01,2010,8,1,15:14:00,2010-08-01,2010,8,1,7,AGOSTO,SBMG,MGF,Sílvio Name Júnior,Maringá,PR,SUL,16:19:00,2010-08-01,2010,8,1,E195,EMBRAER 195,1506.0,15784,367.0,1.08,339.45,118,96,1,2236,45,0,0,0,9556,367.0,0,0,0,820612,16515,43306,35232,5792,5792,4092_2,SBCT-SBMG,CWB-MGF,-25.531667,-49.176111,4,-23.479444,-52.012222,6
1,AZU,AD,4082,1,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2010-08-02,2010,8,2,07:59:00,2010-08-02,2010,8,2,1,AGOSTO,SBKP,VCP,Viracopos,Campinas,SP,SUDESTE,09:00:00,2010-08-02,2010,8,2,E190,EMBRAER 190,1380.0,16662,349.0,1.02,341.92,106,96,1,2027,0,0,0,0,9302,349.0,0,0,0,707423,0,36994,33504,5815,5815,4082_1,SBCT-SBKP,CWB-VCP,-25.531667,-49.176111,4,-23.006944,-47.134444,3


In [602]:
# 2011 clusters – Arealva (SBAE) and Minaçu (SWIQ) repeated

In [603]:
# Per-airport counts for 2011 (comma is read_csv's default separator).
clusters_2011 = pd.read_csv("./assets/csv/count/df2011.csv")

In [604]:
# Preview the raw 2011 counts (columns: icao_code, count).
clusters_2011

Unnamed: 0,icao_code,count
0,SBBR,41565
1,SBKP,38258
2,SBSP,37961
3,SBGR,37615
4,SBGL,34613
...,...,...
117,SWOB,22
118,SWIQ,20
119,SBLE,13
120,SBZM,7


In [605]:
# Attach coordinates, city, state and cluster from `temp`; the left join keeps
# every airport present in the 2011 counts.
clusters_2011 = pd.merge(clusters_2011, temp, on="icao_code", how="left")

In [606]:
# Preview the counts after joining the columns of `temp`.
clusters_2011

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBBR,41565,-15.871111,-47.918611,Brasília,DF,1
1,SBKP,38258,-23.006944,-47.134444,Campinas,SP,3
2,SBSP,37961,-23.626111,-46.656389,São paulo,SP,2
3,SBGR,37615,-23.435556,-46.473056,Guarulhos,SP,1
4,SBGL,34613,-22.810000,-43.250556,Rio de janeiro,RJ,2
...,...,...,...,...,...,...,...
117,SWOB,22,-2.533611,-66.067222,Fonte boa,AM,8
118,SWIQ,20,-13.550556,-48.200556,Minaçu,GO,8
119,SBLE,13,-12.482222,-41.276944,Lençóis,BA,8
120,SBZM,7,-21.513056,-43.173056,Goianá,MG,8


In [607]:
# True when at least one city name appears on more than one row.
clusters_2011.duplicated(subset="city_primary").any()

True

In [608]:
# Rows whose city already appeared earlier in the frame (first occurrences are not flagged).
clusters_2011.loc[clusters_2011.duplicated(subset="city_primary")]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
7,SBRJ,23863,-22.91,-43.1625,Rio de janeiro,RJ,3
83,SJTC,145,-22.157778,-49.068333,Arealva,SP,8
87,SSDO,132,-16.753889,-39.316389,Porto seguro,BA,8
118,SWIQ,20,-13.550556,-48.200556,Minaçu,GO,8


In [609]:
# All airports listed under Rio de janeiro.
clusters_2011.loc[clusters_2011["city_primary"].eq("Rio de janeiro")]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
4,SBGL,34613,-22.81,-43.250556,Rio de janeiro,RJ,2
7,SBRJ,23863,-22.91,-43.1625,Rio de janeiro,RJ,3


In [610]:
# All rows listed under Arealva.
clusters_2011.loc[clusters_2011["city_primary"].eq("Arealva")]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
77,SBAE,173,-22.157778,-49.068333,Arealva,SP,8
83,SJTC,145,-22.157778,-49.068333,Arealva,SP,8


In [611]:
# SBAE (row 77, count 173) and SJTC (count 145) share the same coordinates,
# so SJTC's count is folded into the SBAE row.
clusters_2011.loc[77, "count"] = 173 + 145

In [612]:
# Remove the SJTC row, whose count has been folded into SBAE (both are the
# Arealva airport). The row is selected by ICAO code rather than by a
# hard-coded label: SJTC sits at label 83 in the listing, so dropping label 88
# would delete an unrelated airport and leave the duplicate in place.
clusters_2011.drop(
    clusters_2011.index[clusters_2011["icao_code"] == "SJTC"], inplace=True
)

In [613]:
# All airports listed under Porto seguro.
clusters_2011.loc[clusters_2011["city_primary"].eq("Porto seguro")]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
29,SBPS,2871,-16.438056,-39.077778,Porto seguro,BA,5
87,SSDO,132,-16.753889,-39.316389,Porto seguro,BA,8


In [614]:
# All rows listed under Minaçu.
clusters_2011.loc[clusters_2011["city_primary"].eq("Minaçu")]

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
110,SBMC,37,-13.550556,-48.200556,Minaçu,GO,8
118,SWIQ,20,-13.550556,-48.200556,Minaçu,GO,8


In [615]:
# SWIQ (row 118, count 20) and SBMC (row 110, count 37) share the same
# coordinates, so SBMC's count is folded into the SWIQ row.
clusters_2011.loc[118, "count"] = 37 + 20

In [616]:
# Row 110 is SBMC, the Minaçu duplicate whose count now lives in the SWIQ row.
clusters_2011 = clusters_2011.drop(index=110)

In [617]:
# Renumber the rows 0..n-1 after the drops (the old labels are discarded),
# then display the cleaned table.
clusters_2011 = clusters_2011.reset_index(drop=True)
clusters_2011

Unnamed: 0,icao_code,count,lat_dd,long_dd,city_primary,state,cluster
0,SBBR,41565,-15.871111,-47.918611,Brasília,DF,1
1,SBKP,38258,-23.006944,-47.134444,Campinas,SP,3
2,SBSP,37961,-23.626111,-46.656389,São paulo,SP,2
3,SBGR,37615,-23.435556,-46.473056,Guarulhos,SP,1
4,SBGL,34613,-22.810000,-43.250556,Rio de janeiro,RJ,2
...,...,...,...,...,...,...,...
115,SWOB,22,-2.533611,-66.067222,Fonte boa,AM,8
116,SWIQ,57,-13.550556,-48.200556,Minaçu,GO,8
117,SBLE,13,-12.482222,-41.276944,Lençóis,BA,8
118,SBZM,7,-21.513056,-43.173056,Goianá,MG,8


In [618]:
# Persist the 2011 map table; the row index is written as the first column.
clusters_2011.to_csv(
    path_or_buf="./assets/csv/cluster/cluster_2011_map.csv",
    sep=",",
    index=True,
)

In [619]:
# flights 2011

In [638]:
# Load the 2011 flights, using the first CSV column as the row index, then
# discard the leftover "index" column stored in the file.
flights_2011 = pd.read_csv("./assets/csv/temp/flights_2011.csv", index_col=0)
flights_2011 = flights_2011.drop("index", axis=1)

In [639]:
# Number of flight records loaded.
flights_2011.shape[0]

509719

In [640]:
# Count records whose origin and destination airport are the same.
int((flights_2011["icao_origin"] == flights_2011["icao_dest"]).sum())

2

In [641]:
# Keep only flights that actually go somewhere: origin differs from destination.
flights_2011 = flights_2011[flights_2011["icao_origin"] != flights_2011["icao_dest"]]

In [642]:
# Renumber rows 0..n-1 after the filter; the old labels are discarded.
flights_2011 = flights_2011.reset_index(drop=True)

In [643]:
# Record count after removing the same-airport flights.
flights_2011.shape[0]

509717

In [644]:
# Peek at the first two records to check the column layout.
flights_2011.iloc[:2]

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata
0,AZU,AD,4176,1,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2011-01-01,10:07:00,2011-01-01,2011,1,1,6,JANEIRO,SBVT,VIX,Eurico De Aguiar Salles,Vitória,ES,SUDESTE,11:48:00,2011-01-01,2011,1,1,E195,EMBRAER 195,2501.0,14574,1082.0,1.68,644.08,118,79,0,1982,6,0,0,0,7913,0.0,0,0,0,2144520,6492,127676,85478,15769,15769,4176_1,SBCT-SBVT,CWB-VIX
1,AZU,AD,4090,1,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2011-01-01,19:37:00,2011-01-01,2011,1,1,6,JANEIRO,SBKP,VCP,Viracopos,Campinas,SP,SUDESTE,20:34:00,2011-01-01,2011,1,1,E190,EMBRAER 190,1420.0,16412,349.0,0.95,367.12,106,64,11,1718,1,0,0,0,7344,3839.0,0,0,0,599582,349,36994,22336,5727,5727,4090_1,SBCT-SBKP,CWB-VCP


In [645]:
# SJTC is a second ICAO code for the Arealva airport (same coordinates as SBAE
# in the cluster table), so it is rewritten to SBAE. Both ends of the flight are
# normalised: rewriting only the origin would leave SJTC as a separate airport
# whenever it is the destination.
# NOTE(review): route_icao is not rewritten here and may still contain SJTC —
# confirm whether downstream code relies on it.
for _endpoint in ("icao_origin", "icao_dest"):
    flights_2011.loc[flights_2011[_endpoint] == "SJTC", _endpoint] = "SBAE"

In [646]:
# SBMC is a second ICAO code for the Minaçu airport (same coordinates as SWIQ
# in the cluster table), so it is rewritten to SWIQ. Both ends of the flight are
# normalised: rewriting only the origin would leave SBMC as a separate airport
# whenever it is the destination.
# NOTE(review): route_icao is not rewritten here and may still contain SBMC —
# confirm whether downstream code relies on it.
for _endpoint in ("icao_origin", "icao_dest"):
    flights_2011.loc[flights_2011[_endpoint] == "SBMC", _endpoint] = "SWIQ"

In [647]:
# Records with a missing scheduled date.
int(flights_2011["sched_date"].isna().sum())

0

In [648]:
# Records with a missing departure date.
int(flights_2011["dep_date"].isna().sum())

2

In [649]:
# Split the sched_date string into year / month / day text columns, placed at
# column positions 11-13. The values stay strings (character slices of sched_date).
flights_2011.insert(
    loc=11, column="sched_year", value=flights_2011["sched_date"].str.slice(0, 4)
)
flights_2011.insert(
    loc=12, column="sched_month", value=flights_2011["sched_date"].str.slice(5, 7)
)
flights_2011.insert(
    loc=13, column="sched_day", value=flights_2011["sched_date"].str.slice(8, 10)
)

In [650]:
# Distinct departure months present in the data, in ascending order.
# NOTE(review): a month of 0 shows up for 2011 — presumably the records with a
# missing dep_date; confirm before using dep_month downstream.
sorted(pd.unique(flights_2011["dep_month"]))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [651]:
# creating cluster column for origin airport

In [652]:
# Look up each flight's origin airport in `temp` to pull in its coordinates
# and cluster; the lookup key and descriptive columns are not needed afterwards.
flights_2011 = pd.merge(
    flights_2011,
    temp,
    how="left",
    left_on="icao_origin",
    right_on="icao_code",
)
flights_2011 = flights_2011.drop(["icao_code", "city_primary", "state"], axis=1)

In [653]:
# Flights whose origin airport found no match in `temp` (cluster left null).
int(flights_2011["cluster"].isna().sum())

0

In [654]:
# Tag the freshly merged columns as belonging to the origin airport so the
# destination merge can reuse the plain names.
flights_2011 = flights_2011.rename(
    columns={
        "lat_dd": "lat_dd_origin",
        "long_dd": "long_dd_origin",
        "cluster": "cluster_origin",
    }
)

In [655]:
# Distinct origin clusters present, in ascending order.
sorted(pd.unique(flights_2011["cluster_origin"]))

[1, 2, 3, 4, 5, 6, 7, 8]

In [656]:
# creating cluster column for destination airport

In [657]:
# Same lookup as for the origin, this time keyed on the destination airport.
flights_2011 = pd.merge(
    flights_2011,
    temp,
    how="left",
    left_on="icao_dest",
    right_on="icao_code",
)
flights_2011 = flights_2011.drop(["icao_code", "city_primary", "state"], axis=1)

In [658]:
# Flights whose destination airport found no match in `temp` (cluster left null).
int(flights_2011["cluster"].isna().sum())

0

In [659]:
# Tag the second set of merged columns as belonging to the destination airport.
flights_2011 = flights_2011.rename(
    columns={
        "lat_dd": "lat_dd_dest",
        "long_dd": "long_dd_dest",
        "cluster": "cluster_dest",
    }
)

In [660]:
# Distinct destination clusters present, in ascending order.
sorted(pd.unique(flights_2011["cluster_dest"]))

[1, 2, 3, 4, 5, 6, 7, 8]

In [661]:
# Final check of the enriched frame: the sched_* columns plus origin and
# destination coordinates and clusters should now be present.
flights_2011.iloc[:2]

Unnamed: 0,icao_carrier,iata_carrier,flight,flight_step,icao_origin,iata_origin,dep_airport_name,dep_city,dep_state,dep_region,sched_date,sched_year,sched_month,sched_day,dep_time,dep_date,dep_year,dep_month,dep_day,dep_week_day,reference_month,icao_dest,iata_dest,arr_airport_name,arr_city,arr_state,arr_region,arr_time,arr_date,arr_year,arr_month,arr_day,icao_aircraft_type,aircraft_model,fuel_consumption_l,payload_kg,flight_distance_km,flight_hrs,flight_speed_avg,seats_available,seats_sold,seats_free,pax_baggage_free_kg,pax_baggage_paid_kg,cargo_paid_kg,cargo_mail_kg,cargo_free_kg,aircraft_freight_kg,pax_free_km,cargo_paid_km,cargo_free_km,cargo_mail_km,baggage_free_km,baggage_paid_km,available_seat_km,revenue_pax_km,available_tonne_km,revenue_tonne_km,flight_unique_nr,route_icao,route_iata,lat_dd_origin,long_dd_origin,cluster_origin,lat_dd_dest,long_dd_dest,cluster_dest
0,AZU,AD,4176,1,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2011-01-01,2011,1,1,10:07:00,2011-01-01,2011,1,1,6,JANEIRO,SBVT,VIX,Eurico De Aguiar Salles,Vitória,ES,SUDESTE,11:48:00,2011-01-01,2011,1,1,E195,EMBRAER 195,2501.0,14574,1082.0,1.68,644.08,118,79,0,1982,6,0,0,0,7913,0.0,0,0,0,2144520,6492,127676,85478,15769,15769,4176_1,SBCT-SBVT,CWB-VIX,-25.531667,-49.176111,4,-20.258056,-40.286389,4
1,AZU,AD,4090,1,SBCT,CWB,Afonso Pena,São José Dos Pinhais,PR,5,2011-01-01,2011,1,1,19:37:00,2011-01-01,2011,1,1,6,JANEIRO,SBKP,VCP,Viracopos,Campinas,SP,SUDESTE,20:34:00,2011-01-01,2011,1,1,E190,EMBRAER 190,1420.0,16412,349.0,0.95,367.12,106,64,11,1718,1,0,0,0,7344,3839.0,0,0,0,599582,349,36994,22336,5727,5727,4090_1,SBCT-SBKP,CWB-VCP,-25.531667,-49.176111,4,-23.006944,-47.134444,3
