Import packages

In [1]:
import pandas as pd
import psycopg2
import sqlalchemy
import matplotlib as plt

Import flights dataset

In [2]:
df_flights = pd.read_csv('df_flights.csv')

Fill in null(Nan) values

In [3]:
df_flights['carrier_delay'] = df_flights['carrier_delay'].fillna(0.0)
df_flights['weather_delay']	= df_flights['weather_delay'].fillna(0.0)
df_flights['nas_delay']	= df_flights['nas_delay'].fillna(0.0)
df_flights['security_delay'] = df_flights['security_delay'].fillna(0.0)
df_flights['late_aircraft_delay'] = df_flights['late_aircraft_delay'].fillna(0.0)

In [4]:
# no difference in scheduled and departure time, so 0 delay
df_flights['dep_delay'].fillna(0, inplace=True)

In [5]:
# add unknown to tail numbers if not there
df_flights['tail_num']=df_flights['tail_num'].fillna('Unknown')

Drop columns and rows with a lot of missing values

In [6]:
df_flights.drop(['first_dep_time', 'cancellation_code', 'total_add_gtime', 
                 'longest_add_gtime', 'no_name'], axis=1, inplace=True)

In [7]:
for index, row in df_flights.iterrows():  # get rid of cancelled flights
    if row['cancelled'] == 1.0:
        df_flights.drop(index=index, inplace=True)

In [8]:
df_flights['arr_delay'] = df_flights['arr_delay'].fillna('none')
for index, row in df_flights.iterrows():   # drop rows missing target variable
    if row['arr_delay'] == 'none':
        df_flights.drop(index=index, inplace=True)

In [9]:
df_flights['wheels_off'] = df_flights['wheels_off'].fillna('none')
for index, row in df_flights.iterrows():   # dropping rows missing datetime information
    if row['wheels_off'] == 'none':
        df_flights.drop(index=index, inplace=True)

In [10]:
df_flights['carrier_name'] = ''

my_dict ={
    'UA':'United Airlines',
    'PT':'Piedmont Airlines',
    'G7':'Lindbergh Airlines',
    'CP':'Compass Airlines',
    'QX':'Horizon Airlines',
    'AX':'Trans States Airlines',
    'ZW':'Air Wisconsin',
    'C5':'CommutAir',
    'EM':'Anderson Aviation',
    'KS':'Peninsula Airways',
    'AS':'Alaska Airlines',
    '9E':'Endeavor Air',
    'B6':'JetBlue Airways',
    'EV':'ExpressJet',
    'F9':'Frontier Airlines',
    'G4':'Allegiant Air',
    'HA':'Hawaiian Airlines',
    'MQ':'Envoy Air',
    'NK':'Spirit Airlines',
    'OH':'PSA Airlines',
    'OO':'SkyWest Airlines',
    'VX':'Virgin America',
    'WN':'Southwest Airlines',
    'YV':'Mesa Airline',
    'YX':'Republic Airways',
    'AA':'American Airlines',
    'DL':'Delta Airlines'
}

for word in my_dict.keys():
    df_flights.loc[df_flights['op_unique_carrier'].str.contains(word, na=False), 
                   ['carrier_name']] = my_dict[word]

Add 'delayed' column

In [11]:
df_flights['arr_delay'] = df_flights['arr_delay'].astype(int) #Change 'arr_delay' dtype to int


df_flights['delayed'] = 0 #Add new column to categorize delayed flights

df_flights.loc[df_flights['arr_delay'] > 0, 'delayed'] = 1

In [12]:
df_flights

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,carrier_name,delayed
0,2018-01-01,WN,WN,WN,1997,WN,N733SA,1997,13232,MDW,...,66.0,1.0,405.0,10.0,0.0,0.0,0.0,26.0,Southwest Airlines,1
1,2018-01-01,WN,WN,WN,181,WN,N8633A,181,13232,MDW,...,131.0,1.0,990.0,11.0,0.0,0.0,0.0,30.0,Southwest Airlines,1
2,2018-01-01,WN,WN,WN,185,WN,N8625A,185,13232,MDW,...,130.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,1
3,2018-01-01,WN,WN,WN,403,WN,N564WN,403,13232,MDW,...,127.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,0
4,2018-01-01,WN,WN,WN,1229,WN,N8305E,1229,13232,MDW,...,128.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31020,2019-07-31,DL,DL_CODESHARE,DL,3290,9E,N914XJ,3290,15412,TYS,...,91.0,1.0,648.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0
31021,2019-07-31,DL,DL_CODESHARE,DL,3291,9E,N311PQ,3291,10721,BOS,...,73.0,1.0,468.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0
31022,2019-07-31,DL,DL_CODESHARE,DL,3292,9E,N297PQ,3292,11193,CVG,...,106.0,1.0,812.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0
31023,2019-07-31,DL,DL_CODESHARE,DL,3292,9E,N297PQ,3292,11298,DFW,...,111.0,1.0,812.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0


In [13]:
df_flights

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,carrier_name,delayed
0,2018-01-01,WN,WN,WN,1997,WN,N733SA,1997,13232,MDW,...,66.0,1.0,405.0,10.0,0.0,0.0,0.0,26.0,Southwest Airlines,1
1,2018-01-01,WN,WN,WN,181,WN,N8633A,181,13232,MDW,...,131.0,1.0,990.0,11.0,0.0,0.0,0.0,30.0,Southwest Airlines,1
2,2018-01-01,WN,WN,WN,185,WN,N8625A,185,13232,MDW,...,130.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,1
3,2018-01-01,WN,WN,WN,403,WN,N564WN,403,13232,MDW,...,127.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,0
4,2018-01-01,WN,WN,WN,1229,WN,N8305E,1229,13232,MDW,...,128.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31020,2019-07-31,DL,DL_CODESHARE,DL,3290,9E,N914XJ,3290,15412,TYS,...,91.0,1.0,648.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0
31021,2019-07-31,DL,DL_CODESHARE,DL,3291,9E,N311PQ,3291,10721,BOS,...,73.0,1.0,468.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0
31022,2019-07-31,DL,DL_CODESHARE,DL,3292,9E,N297PQ,3292,11193,CVG,...,106.0,1.0,812.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0
31023,2019-07-31,DL,DL_CODESHARE,DL,3292,9E,N297PQ,3292,11298,DFW,...,111.0,1.0,812.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0


Import and join Chicago weather data to flights database

In [14]:
df_chicago = pd.read_csv('chicago.csv') # Import weather data for Chicago

In [15]:
df_chicago['origin_city_name'] = 'Chicago, IL' # Add origin city column in Chicago database

# Change dates column name to the same name as the flights database
df_chicago.rename(columns={"dates_list": "fl_date"}, inplace=True) 

In [16]:
df_flights[df_flights['origin_city_name'] == 'Atlanta, GA'].shape

(1570, 39)

In [17]:
df_flights[df_flights['origin_city_name'] == 'Chicago, IL'].head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,carrier_name,delayed
0,2018-01-01,WN,WN,WN,1997,WN,N733SA,1997,13232,MDW,...,66.0,1.0,405.0,10.0,0.0,0.0,0.0,26.0,Southwest Airlines,1
1,2018-01-01,WN,WN,WN,181,WN,N8633A,181,13232,MDW,...,131.0,1.0,990.0,11.0,0.0,0.0,0.0,30.0,Southwest Airlines,1
2,2018-01-01,WN,WN,WN,185,WN,N8625A,185,13232,MDW,...,130.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,1
3,2018-01-01,WN,WN,WN,403,WN,N564WN,403,13232,MDW,...,127.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,0
4,2018-01-01,WN,WN,WN,1229,WN,N8305E,1229,13232,MDW,...,128.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,1


Import and join Atlanta weather data to flights database

In [18]:
df_atlanta = pd.read_csv('atlanta.csv') # Import weather data for Atlanta

In [19]:
df_atlanta['origin_city_name'] = 'Atlanta, GA' # Add origin city column in Atlanta database

# Change dates column name to the same name as the flights database
df_atlanta.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Baltimore weather data to flights database

In [20]:
df_baltimore = pd.read_csv('baltimore.csv') # Import weather data for Baltimore

In [21]:
df_baltimore['origin_city_name'] = 'Baltimore, MD' # Add origin city column in Baltomore database

# Change dates column name to the same name as the flights database
df_baltimore.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Boston weather data to flights database

In [22]:
df_boston = pd.read_csv('boston.csv') # Import weather data for Boston

In [23]:
df_boston['origin_city_name'] = 'Boston, MA' # Add origin city column in Boston database

# Change dates column name to the same name as the flights database
df_boston.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Charlotte weather data to flights database

In [24]:
df_charlotte = pd.read_csv('charlotte.csv') # Import weather data for Charlotte

In [25]:
df_charlotte['origin_city_name'] = 'Charlotte, NC' # Add origin city column in Charlotte database

# Change dates column name to the same name as the flights database
df_charlotte.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Dallas weather data to flights database

In [26]:
df_dallas = pd.read_csv('dallas.csv') # Import weather data for Dallas

In [27]:
df_dallas['origin_city_name'] = 'Dallas, TX' # Add origin city column in Dallas database

# Change dates column name to the same name as the flights database
df_dallas.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Dallas weather data to flights database

In [28]:
df_fortworth = pd.read_csv('dallas.csv') # Import weather data for Fort Worth

In [29]:
# Add origin city column in Fort Worth database
df_fortworth['origin_city_name'] = 'Dallas/Fort Worth, TX'

# Change dates column name to the same name as the flights database
df_fortworth.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Denver weather data to flights database

In [30]:
df_denver = pd.read_csv('denver.csv') # Import weather data for Denver

In [31]:
# Add origin city column in Denver database
df_denver['origin_city_name'] = 'Denver, CO'

# Change dates column name to the same name as the flights database
df_denver.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Detroit weather data to flights database

In [32]:
df_detroit = pd.read_csv('detroit.csv') # Import weather data for Detroit

In [33]:
# Add origin city column in Detroit database
df_detroit['origin_city_name'] = 'Detroit, MI'

# Change dates column name to the same name as the flights database
df_detroit.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Durham weather data to flights database

In [34]:
df_durham = pd.read_csv('durham.csv') # Import weather data for Durham

In [35]:
# Add origin city column in Durham database
df_durham['origin_city_name'] = 'Raleigh/Durham, NC'

# Change dates column name to the same name as the flights database
df_durham.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Fort Lauderdale weather data to flights database

In [36]:
df_lauderdale = pd.read_csv('Fort_Lauderdale.csv') # Import weather data for Fort Lauderdale

In [37]:
# Add origin city column in Fort Lauderdale database
df_lauderdale['origin_city_name'] = 'Fort Lauderdale, FL'

# Change dates column name to the same name as the flights database
df_lauderdale.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Houston weather data to flights database

In [38]:
df_houston = pd.read_csv('houston.csv') # Import weather data for Houston

In [39]:
# Add origin city column in Houston database
df_houston['origin_city_name'] = 'Houston, TX'

# Change dates column name to the same name as the flights database
df_houston.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Los Angeles weather data to flights database

In [40]:
df_angeles = pd.read_csv('losangeles.csv') # Import weather data for Los Angeles

In [41]:
# Add origin city column in Los Angeles database
df_angeles['origin_city_name'] = 'Los Angeles, CA'

# Change dates column name to the same name as the flights database
df_angeles.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Miami weather data to flights database

In [42]:
df_miami = pd.read_csv('Miami.csv') # Import weather data for Miami

In [43]:
# Add origin city column in Los Angeles database
df_miami['origin_city_name'] = 'Miami, FL'

# Change dates column name to the same name as the flights database
df_miami.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Minneapolis weather data to flights database

In [44]:
df_minneapolis = pd.read_csv('Minneapolis.csv') # Import weather data for Minneapolis

In [45]:
# Add origin city column in Minneapolis database
df_minneapolis['origin_city_name'] = 'Minneapolis, MN'

# Change dates column name to the same name as the flights database
df_minneapolis.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join New Orleans weather data to flights database

In [46]:
df_orleans = pd.read_csv('New_Orleans.csv') # Import weather data for Minneapolis

In [47]:
# Add origin city column in Minneapolis database
df_orleans['origin_city_name'] = 'New Orleans, LA'

# Change dates column name to the same name as the flights database
df_orleans.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Newark weather data to flights database

In [48]:
df_newark = pd.read_csv('newark.csv') # Import weather data for Minneapolis

In [49]:
# Add origin city column in Minneapolis database
df_newark['origin_city_name'] = 'Newark, NJ'

# Change dates column name to the same name as the flights database
df_newark.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join New York weather data to flights database

In [50]:
df_newyork = pd.read_csv('newyork.csv') # Import weather data for Minneapolis

In [51]:
# Add origin city column in Minneapolis database
df_newyork['origin_city_name'] = 'New York, NY'

# Change dates column name to the same name as the flights database
df_newyork.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Orlando weather data to flights database

In [52]:
df_orlando = pd.read_csv('orlando.csv') # Import weather data for Minneapolis

In [53]:
# Add origin city column in Minneapolis database
df_orlando['origin_city_name'] = 'Orlando, FL'

# Change dates column name to the same name as the flights database
df_orlando.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Philadelphia weather data to flights database

In [54]:
df_philadelphia = pd.read_csv('philadelphia.csv') # Import weather data for Minneapolis

In [55]:
# Add origin city column in Minneapolis database
df_philadelphia['origin_city_name'] = 'Philadelphia, PA'

# Change dates column name to the same name as the flights database
df_philadelphia.rename(columns={"dates_list": "fl_date"}, inplace=True) 

### df_flights[df_flights['origin_city_name'] == 'Philadelphia, PA']

Import and join Phoenix weather data to flights database

In [56]:
df_phoenix = pd.read_csv('phoenix.csv') # Import weather data for Minneapolis

In [57]:
# Add origin city column in Minneapolis database
df_phoenix['origin_city_name'] = 'Phoenix, AZ'

# Change dates column name to the same name as the flights database
df_phoenix.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Portland weather data to flights database

In [58]:
df_portland = pd.read_csv('portland.csv') # Import weather data for Minneapolis

In [59]:
# Add origin city column in Minneapolis database
df_portland['origin_city_name'] = 'Portland, OR'

# Change dates column name to the same name as the flights database
df_portland.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Salt Lake City weather data to flights database

In [60]:
df_salt = pd.read_csv('Salt_Lake.csv') # Import weather data for Minneapolis

In [61]:
# Add origin city column in Minneapolis database
df_salt['origin_city_name'] = 'Salt Lake City, UT'

# Change dates column name to the same name as the flights database
df_salt.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join San Diego weather data to flights database

In [62]:
df_sandiego = pd.read_csv('San_Diego.csv') # Import weather data for Minneapolis

In [63]:
# Add origin city column in Minneapolis database
df_sandiego['origin_city_name'] = 'San Diego, CA'

# Change dates column name to the same name as the flights database
df_sandiego.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join San Jose weather data to flights database

In [64]:
df_sanjose = pd.read_csv('San_Jose.csv') # Import weather data for Minneapolis

In [65]:
# Add origin city column in Minneapolis database
df_sanjose['origin_city_name'] = 'San Jose, CA'

# Change dates column name to the same name as the flights database
df_sanjose.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join San Francisco weather data to flights database

In [66]:
df_francisco = pd.read_csv('sanfrancisco.csv') # Import weather data for Minneapolis

In [67]:
# Add origin city column in Minneapolis database
df_francisco['origin_city_name'] = 'San Francisco, CA'

# Change dates column name to the same name as the flights database
df_francisco.rename(columns={"dates_list": "fl_date"}, inplace=True) 

### df_flights[df_flights['origin_city_name'] == 'San Francisco, CA']

Import and join Seattle weather data to flights database

In [68]:
df_seattle = pd.read_csv('seattle.csv') # Import weather data for Minneapolis

In [69]:
# Add origin city column in Minneapolis database
df_seattle['origin_city_name'] = 'Seattle, WA'

# Change dates column name to the same name as the flights database
df_seattle.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join St. Louis weather data to flights database

In [70]:
df_louis = pd.read_csv('St_Louis.csv')

In [71]:
# Add origin city column in Minneapolis database
df_louis['origin_city_name'] = 'St. Louis, MO'

# Change dates column name to the same name as the flights database
df_louis.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Las Vegas weather data to flights database

In [72]:
df_vegas = pd.read_csv('vegas.csv')

In [73]:
# Add origin city column in Minneapolis database
df_vegas['origin_city_name'] = 'Las Vegas, NV'

# Change dates column name to the same name as the flights database
df_vegas.rename(columns={"dates_list": "fl_date"}, inplace=True) 

Import and join Washington weather data to flights database

In [74]:
df_washington = pd.read_csv('washington.csv')

In [75]:
# Add origin city column in Minneapolis database
df_washington['origin_city_name'] = 'Washington, DC'

# Change dates column name to the same name as the flights database
df_washington.rename(columns={"dates_list": "fl_date"}, inplace=True) 

In [76]:
frames = [df_chicago, df_atlanta, df_baltimore, df_boston, df_charlotte, df_dallas,
         df_denver, df_detroit, df_durham, df_lauderdale, df_houston, df_angeles, df_miami,
         df_minneapolis, df_orleans, df_newark, df_newyork, df_orlando, df_philadelphia, df_phoenix,
         df_portland, df_salt, df_sandiego, df_sanjose, df_francisco, df_seattle, df_louis, 
          df_vegas, df_washington, df_fortworth]
df_merged = pd.concat(frames)

df_merged = df_merged.drop_duplicates()

In [90]:
df_merged.to_csv('weather_dataframe.csv', index=False)

In [77]:
# Merge dataframes
df_flights1 = pd.merge(df_flights, df_merged, how='left', on=['fl_date','origin_city_name'])

In [78]:
df_flights1
df_flights1.to_csv('flights_merged.csv', index=False)

In [79]:
print(df_chicago.shape)
print(df_atlanta.shape)
print(df_baltimore.shape)
print(df_boston.shape)
print(df_charlotte.shape)
print(df_dallas.shape)
print(df_denver.shape)
print(df_detroit.shape)
print(df_durham.shape)
print(df_lauderdale.shape)
print(df_houston.shape)
print(df_angeles.shape)
print(df_miami.shape)
print(df_minneapolis.shape)
print(df_orleans.shape)
print(df_newark.shape)
print(df_newyork.shape)
print(df_orlando.shape)
print(df_philadelphia.shape)
print(df_phoenix.shape)


print(df_portland.shape)
print(df_salt.shape)
print(df_sandiego.shape)
print(df_sanjose.shape)
print(df_francisco.shape)
print(df_seattle.shape)
print(df_louis.shape)
print(df_vegas.shape)
print(df_washington.shape)
print(df_fortworth.shape)

print(df_merged.shape)

(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(938, 8)
(27360, 8)


In [80]:
df_flights1

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,security_delay,late_aircraft_delay,carrier_name,delayed,temp_night_list,temp_day_list,speed_night,speed_day,desc_night,desc_day
0,2018-01-01,WN,WN,WN,1997,WN,N733SA,1997,13232,MDW,...,0.0,26.0,Southwest Airlines,1,-17.0,-17.0,21.0,15.0,Clear,Patchy light snow
1,2018-01-01,WN,WN,WN,181,WN,N8633A,181,13232,MDW,...,0.0,30.0,Southwest Airlines,1,-17.0,-17.0,21.0,15.0,Clear,Patchy light snow
2,2018-01-01,WN,WN,WN,185,WN,N8625A,185,13232,MDW,...,0.0,0.0,Southwest Airlines,1,-17.0,-17.0,21.0,15.0,Clear,Patchy light snow
3,2018-01-01,WN,WN,WN,403,WN,N564WN,403,13232,MDW,...,0.0,0.0,Southwest Airlines,0,-17.0,-17.0,21.0,15.0,Clear,Patchy light snow
4,2018-01-01,WN,WN,WN,1229,WN,N8305E,1229,13232,MDW,...,0.0,0.0,Southwest Airlines,1,-17.0,-17.0,21.0,15.0,Clear,Patchy light snow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30371,2019-07-31,DL,DL_CODESHARE,DL,3290,9E,N914XJ,3290,15412,TYS,...,0.0,0.0,Endeavor Air,0,,,,,,
30372,2019-07-31,DL,DL_CODESHARE,DL,3291,9E,N311PQ,3291,10721,BOS,...,0.0,0.0,Endeavor Air,0,25.0,35.0,13.0,12.0,Clear,Sunny
30373,2019-07-31,DL,DL_CODESHARE,DL,3292,9E,N297PQ,3292,11193,CVG,...,0.0,0.0,Endeavor Air,0,,,,,,
30374,2019-07-31,DL,DL_CODESHARE,DL,3292,9E,N297PQ,3292,11298,DFW,...,0.0,0.0,Endeavor Air,0,32.0,35.0,12.0,11.0,Clear,Sunny


In [81]:
df_flights

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,carrier_name,delayed
0,2018-01-01,WN,WN,WN,1997,WN,N733SA,1997,13232,MDW,...,66.0,1.0,405.0,10.0,0.0,0.0,0.0,26.0,Southwest Airlines,1
1,2018-01-01,WN,WN,WN,181,WN,N8633A,181,13232,MDW,...,131.0,1.0,990.0,11.0,0.0,0.0,0.0,30.0,Southwest Airlines,1
2,2018-01-01,WN,WN,WN,185,WN,N8625A,185,13232,MDW,...,130.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,1
3,2018-01-01,WN,WN,WN,403,WN,N564WN,403,13232,MDW,...,127.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,0
4,2018-01-01,WN,WN,WN,1229,WN,N8305E,1229,13232,MDW,...,128.0,1.0,990.0,0.0,0.0,0.0,0.0,0.0,Southwest Airlines,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31020,2019-07-31,DL,DL_CODESHARE,DL,3290,9E,N914XJ,3290,15412,TYS,...,91.0,1.0,648.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0
31021,2019-07-31,DL,DL_CODESHARE,DL,3291,9E,N311PQ,3291,10721,BOS,...,73.0,1.0,468.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0
31022,2019-07-31,DL,DL_CODESHARE,DL,3292,9E,N297PQ,3292,11193,CVG,...,106.0,1.0,812.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0
31023,2019-07-31,DL,DL_CODESHARE,DL,3292,9E,N297PQ,3292,11298,DFW,...,111.0,1.0,812.0,0.0,0.0,0.0,0.0,0.0,Endeavor Air,0


In [82]:
origin_city = df_flights.groupby('origin_city_name').count().sort_values(by='arr_delay', 
                            ascending=False).head(30)

origin_city

Unnamed: 0_level_0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,carrier_name,delayed
origin_city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Chicago, IL",1717,1717,1717,1717,1717,1717,1717,1717,1717,1717,...,1717,1717,1717,1717,1717,1717,1717,1717,1717,1717
"Atlanta, GA",1570,1570,1570,1570,1570,1570,1570,1570,1570,1570,...,1570,1570,1570,1570,1570,1570,1570,1570,1570,1570
"New York, NY",1209,1209,1209,1209,1209,1209,1209,1209,1209,1209,...,1209,1209,1209,1209,1209,1209,1209,1209,1209,1209
"Dallas/Fort Worth, TX",1065,1065,1065,1065,1065,1065,1065,1065,1065,1065,...,1065,1065,1065,1065,1065,1065,1065,1065,1065,1065
"Charlotte, NC",900,900,900,900,900,900,900,900,900,900,...,900,900,900,900,900,900,900,900,900,900
"Houston, TX",897,897,897,897,897,897,897,897,897,897,...,897,897,897,897,897,897,897,897,897,897
"Los Angeles, CA",895,895,895,895,895,895,895,895,895,895,...,895,895,895,895,895,895,895,895,895,895
"Denver, CO",889,889,889,889,889,889,889,889,889,889,...,889,889,889,889,889,889,889,889,889,889
"Washington, DC",853,853,853,853,853,853,853,853,853,853,...,853,853,853,853,853,853,853,853,853,853
"Phoenix, AZ",810,810,810,810,810,810,810,810,810,810,...,810,810,810,810,810,810,810,810,810,810


In [83]:
df_flights1.groupby('origin_city_name').count().sort_values(by='arr_delay', 
                            ascending=False).head(30)

Unnamed: 0_level_0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,security_delay,late_aircraft_delay,carrier_name,delayed,temp_night_list,temp_day_list,speed_night,speed_day,desc_night,desc_day
origin_city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Chicago, IL",1717,1717,1717,1717,1717,1717,1717,1717,1717,1717,...,1717,1717,1717,1717,1717,1717,1717,1717,1717,1717
"Atlanta, GA",1570,1570,1570,1570,1570,1570,1570,1570,1570,1570,...,1570,1570,1570,1570,1570,1570,1570,1570,1570,1570
"New York, NY",1209,1209,1209,1209,1209,1209,1209,1209,1209,1209,...,1209,1209,1209,1209,1209,1209,1209,1209,1209,1209
"Dallas/Fort Worth, TX",1065,1065,1065,1065,1065,1065,1065,1065,1065,1065,...,1065,1065,1065,1065,1065,1065,1065,1065,1065,1065
"Charlotte, NC",900,900,900,900,900,900,900,900,900,900,...,900,900,900,900,900,900,900,900,900,900
"Houston, TX",897,897,897,897,897,897,897,897,897,897,...,897,897,897,897,897,897,897,897,897,897
"Los Angeles, CA",895,895,895,895,895,895,895,895,895,895,...,895,895,895,895,895,895,895,895,895,895
"Denver, CO",889,889,889,889,889,889,889,889,889,889,...,889,889,889,889,889,889,889,889,889,889
"Washington, DC",853,853,853,853,853,853,853,853,853,853,...,853,853,853,853,853,853,853,853,853,853
"Phoenix, AZ",810,810,810,810,810,810,810,810,810,810,...,810,810,810,810,810,810,810,810,810,810


In [84]:
df_filtered = df_flights[df_flights['origin_city_name'] == 'Raleigh/Durham, NC']
df_filtered.groupby(['dest_city_name', 'crs_dep_time', 'carrier_name']).count().sort_values(by='fl_date', ascending=False).head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,delayed
dest_city_name,crs_dep_time,carrier_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
"New York, NY",1000,ExpressJet,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Minneapolis, MN",1658,Delta Airlines,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"New York, NY",1505,Endeavor Air,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"New York, NY",1700,Endeavor Air,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"New York, NY",715,Endeavor Air,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Nashville, TN",645,Lindbergh Airlines,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Newark, NJ",600,Republic Airways,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Dallas/Fort Worth, TX",1559,American Airlines,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Atlanta, GA",630,Delta Airlines,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Charlotte, NC",700,American Airlines,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [85]:
df_filtered1 = df_flights1[df_flights1['origin_city_name'] == 'Raleigh/Durham, NC']

df_filtered1.groupby(['dest_city_name', 'crs_dep_time', 'carrier_name']).count().sort_values(by='fl_date', ascending=False).head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,nas_delay,security_delay,late_aircraft_delay,delayed,temp_night_list,temp_day_list,speed_night,speed_day,desc_night,desc_day
dest_city_name,crs_dep_time,carrier_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
"New York, NY",1000,ExpressJet,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Minneapolis, MN",1658,Delta Airlines,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"New York, NY",1505,Endeavor Air,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"New York, NY",1700,Endeavor Air,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"New York, NY",715,Endeavor Air,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Nashville, TN",645,Lindbergh Airlines,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Newark, NJ",600,Republic Airways,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Dallas/Fort Worth, TX",1559,American Airlines,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Atlanta, GA",630,Delta Airlines,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"Charlotte, NC",700,American Airlines,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [86]:
df_merged_filter = (df_merged['origin_city_name'] == 'Raleigh/Durham, NC') & (df_merged['fl_date'] == '2018-11-03')
df_merged[df_merged_filter]

Unnamed: 0,fl_date,temp_night_list,temp_day_list,speed_night,speed_day,desc_night,desc_day,origin_city_name
314,2018-11-03,14,15,11,10,Cloudy,Sunny,"Raleigh/Durham, NC"


In [87]:
df_filtered_final = (df_flights1['origin_city_name'] == 'Raleigh/Durham, NC') & (df_flights1['dest_city_name'] == 'Charlotte, NC') & (df_flights1['crs_dep_time'] == 518) & (df_flights1['carrier_name'] == 'American Airlines')
df_flights1[df_filtered_final].iloc[:, 36:47]

Unnamed: 0,late_aircraft_delay,carrier_name,delayed,temp_night_list,temp_day_list,speed_night,speed_day,desc_night,desc_day
3952,0.0,American Airlines,0,14.0,15.0,11.0,10.0,Cloudy,Sunny


In [88]:
df_filtered_fudge = (df_flights['origin_city_name'] == 'Raleigh/Durham, NC') & (df_flights['dest_city_name'] == 'Charlotte, NC') & (df_flights['crs_dep_time'] == 518) & (df_flights['carrier_name'] == 'American Airlines')
df_flights[df_filtered_fudge]

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,carrier_name,delayed
3990,2018-11-03,AA,AA,AA,580,AA,N126UW,580,14492,RDU,...,36.0,1.0,130.0,0.0,0.0,0.0,0.0,0.0,American Airlines,0


In [89]:
print(len(df_flights['origin_city_name'].unique()))
print(len(df_flights['fl_date'].unique()))

print(len(df_merged.groupby(['origin_city_name', 'fl_date']).shape))


339
555


AttributeError: 'DataFrameGroupBy' object has no attribute 'shape'

In [None]:
df_flights.columns