# Weather Exploration Notebook

In [59]:
import pandas as pd
import numpy as np

import os.path
import requests
import io

import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.ensemble import GradientBoostingClassifier

import wrangle
import model
from preprocessing import main_modeling_prep

In [2]:
# Load the raw weather-events CSV from the notebook's working directory.
weather_data = pd.read_csv(filepath_or_buffer="US_WeatherEvents_2016-2019.csv")

In [11]:
# Distinct city names that appear on North Carolina rows.
weather_data.loc[weather_data["State"] == "NC", "City"].unique()

array(['Davis', 'Sanford', 'Goldsboro', 'Salisbury', 'Greensboro',
       'Southport', 'Louisburg', 'Oxford', 'Stoneville', 'Elm City',
       'Smithfield', 'Statesville', 'Franklin', 'Concord', 'Monroe',
       'Richlands', 'Rockingham', 'Whiteville', 'Pope Army Airfield',
       'Washington', 'Erwin', 'Pikeville', 'Tarboro', 'Wilmington',
       'New London', 'Morrisville', 'Mount Airy', 'Jefferson',
       'Engelhard', 'Frisco', 'Andrews', 'Elizabeth City', 'New Bern',
       'Winston Salem', 'Lexington', 'Elizabethtown', 'Iron Station',
       'Rutherfordton', 'Aulander', 'Asheboro', 'Timberlake',
       'Burlington', 'Maxton', 'Wadesboro', 'North Wilkesboro', 'Hoffman',
       'Jacksonville', 'Carthage', 'Edenton', 'Kinston', 'Lumberton',
       'Clinton', 'Fayetteville', 'Manteo', 'Newport', 'Halifax',
       'Kenansville', 'Beaufort', 'Havelock', 'Cleveland', 'Gastonia',
       'Morganton', 'Greenville', 'Maple', 'New River', 'Fletcher',
       'Township 2 Berryhill', 'Kill Devi

In [13]:
# All events recorded for a city called "Dallas" (the name is not unique across states).
weather_data.loc[weather_data["City"] == "Dallas"]

Unnamed: 0,EventId,Type,Severity,StartTime(UTC),EndTime(UTC),TimeZone,AirportCode,LocationLat,LocationLng,City,County,State,ZipCode
788917,W-789030,Rain,Light,2016-01-06 15:53:00,2016-01-06 16:53:00,US/Central,KRBD,32.6809,-96.8682,Dallas,Dallas,TX,75237.0
788918,W-789031,Rain,Light,2016-01-07 01:39:00,2016-01-07 02:00:00,US/Central,KRBD,32.6809,-96.8682,Dallas,Dallas,TX,75237.0
788919,W-789032,Rain,Light,2016-01-07 02:18:00,2016-01-07 02:41:00,US/Central,KRBD,32.6809,-96.8682,Dallas,Dallas,TX,75237.0
788920,W-789033,Rain,Heavy,2016-01-07 02:41:00,2016-01-07 03:06:00,US/Central,KRBD,32.6809,-96.8682,Dallas,Dallas,TX,75237.0
788921,W-789034,Rain,Moderate,2016-01-07 03:06:00,2016-01-07 04:26:00,US/Central,KRBD,32.6809,-96.8682,Dallas,Dallas,TX,75237.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3805626,W-3806127,Rain,Light,2019-12-29 18:15:00,2019-12-29 18:55:00,US/Eastern,KPUJ,33.9120,-84.9410,Dallas,Paulding,GA,30157.0
3805627,W-3806128,Rain,Light,2019-12-29 20:35:00,2019-12-29 20:55:00,US/Eastern,KPUJ,33.9120,-84.9410,Dallas,Paulding,GA,30157.0
3805628,W-3806129,Rain,Light,2019-12-29 22:15:00,2019-12-29 22:55:00,US/Eastern,KPUJ,33.9120,-84.9410,Dallas,Paulding,GA,30157.0
3805629,W-3806130,Rain,Light,2019-12-30 01:55:00,2019-12-30 02:35:00,US/Eastern,KPUJ,33.9120,-84.9410,Dallas,Paulding,GA,30157.0


In [None]:
# we only want data for 2018

In [None]:
# We only want data for the following airports (IATA codes).

top_airports = [
    "ATL", "LAX", "ORD", "DFW", "DEN",
    "JFK", "SFO", "SEA", "LAS", "MCO",
    "EWR", "CLT", "PHX", "IAH", "MIA",
]

In [18]:
# Airport lookup table (name, city, country, IATA/ICAO codes, coordinates).
#
# NOTE(review): the remote link is a signed URL whose query string carries
# `Expires=1594952131`, which looks like a Unix timestamp in July 2020, so the
# download may no longer work. To keep the notebook re-runnable, a local copy
# named "airports.csv" (the file name embedded in the URL) is used when present;
# the original URL remains the fallback.
AIRPORTS_LOCAL_CSV = "airports.csv"
AIRPORTS_SIGNED_URL = "https://storage.googleapis.com/kagglesdsdata/datasets%2F626214%2F1116273%2Fairports.csv?GoogleAccessId=gcp-kaggle-com@kaggle-161607.iam.gserviceaccount.com&Expires=1594952131&Signature=CbuLdZR%2B9YkSTVhPslvV67WILvcTGa1EnhtSpZza10YkL2ccV6HIZLkikKrkF%2BEgwFySBM88hZ4Gp9aqZqA5s5BC%2FL7l71dpdyM8PkBWxtmvJkEYuPYxPvzwt2M10q7T4TzK1mFzUUwPz1CtvLcBdx%2F3GJgd5z4nSGepU3SkoalQHJ4JUQ0SpE5liZiCJ2SB%2FTA4fcz62TvAKv21Jwe7oq4q8CnrG6rvSjW5uRDkvKXDc1sXH2WG2CIZYyt%2FKz2B%2BJsi2iZ9AS9dAhbLO8GVOP6EhtIA9%2ForRmxuBStdXY%2BY1lojJaekb9agtbNS5wAIr%2Fkk0LUHmKPxt7b6jTiLtQ%3D%3D"

if os.path.exists(AIRPORTS_LOCAL_CSV):
    airport_codes = pd.read_csv(AIRPORTS_LOCAL_CSV)
else:
    airport_codes = pd.read_csv(AIRPORTS_SIGNED_URL)

In [22]:
# Keep only airports located in the United States.
airport_codes = airport_codes.loc[airport_codes["Country"].eq("United States")]

In [23]:
# Preview the first rows of the airport lookup table.
airport_codes.head()

Unnamed: 0,Name,City,Country,IATA,ICAO,Latitude,Longitude
3212,Barter Island LRRS Airport,Barter Island,United States,BTI,PABA,70.134003,-143.582001
3213,Wainwright Air Station,Fort Wainwright,United States,\N,PAWT,70.613403,-159.860001
3214,Cape Lisburne LRRS Airport,Cape Lisburne,United States,LUR,PALU,68.875099,-166.110001
3215,Point Lay LRRS Airport,Point Lay,United States,PIZ,PPIZ,69.732903,-163.005005
3216,Hilo International Airport,Hilo,United States,ITO,PHTO,19.721399,-155.048004


In [27]:
# Attach airport details to every weather event by matching the event's
# AirportCode against the airport table's ICAO code. A left join keeps events
# that have no airport match.
weather_data = pd.merge(
    weather_data,
    airport_codes,
    how="left",
    left_on="AirportCode",
    right_on="ICAO",
)

In [32]:
# IATA codes of the airports the analysis is restricted to.
top_airports = ["ATL", "LAX", "ORD", "DFW", "DEN", "JFK", "SFO", "SEA", "LAS", "MCO", "EWR", "CLT", "PHX", "IAH", "MIA"]

# Flag rows whose IATA code is in the list. One vectorized membership test
# always creates a boolean `is_top` column: rows without an airport match
# (missing IATA after the left merge) become False, and the column exists even
# when no row matches at all.
weather_data["is_top"] = weather_data["IATA"].isin(top_airports)

In [37]:
# Keep only the events at the flagged airports. The explicit copy makes the
# result an independent frame, so column assignments in later cells act on it
# directly instead of on a slice of the merged data.
weather_data = weather_data[weather_data.is_top].copy()

In [43]:
# Convert the event start timestamps to pandas datetimes.
start_times = pd.to_datetime(weather_data["StartTime(UTC)"])
weather_data = weather_data.assign(**{"StartTime(UTC)": start_times})

In [45]:
# Move the event start time into the index so rows can be selected by date.
# The column is consumed by this step, so the cell cannot be run twice in a row.
weather_data = weather_data.set_index(keys="StartTime(UTC)")

In [47]:
# Keep only events that started in 2018. An explicit year mask on the datetime
# index is used because selecting rows with `frame["2018"]` is no longer
# supported by current pandas releases; the mask also works on an unsorted index.
weather_data = weather_data[weather_data.index.year == 2018]

In [150]:
# Display the filtered events ordered by the airport table's city name.
weather_data.sort_values(by="City_y")

Unnamed: 0_level_0,EventId,Type,Severity,EndTime(UTC),TimeZone,AirportCode,LocationLat,LocationLng,City_x,County,State,ZipCode,Name,City_y,Country,IATA,ICAO,Latitude,Longitude,is_top
StartTime(UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2018-12-01 17:58:00,W-3204675,Rain,Light,2018-12-01 18:13:00,US/Eastern,KATL,33.6301,-84.4418,Atlanta,Clayton,GA,30320.0,Hartsfield Jackson Atlanta International Airport,Atlanta,United States,ATL,KATL,33.636700,-84.428101,True
2018-09-09 20:52:00,W-3204531,Rain,Light,2018-09-09 21:18:00,US/Eastern,KATL,33.6301,-84.4418,Atlanta,Clayton,GA,30320.0,Hartsfield Jackson Atlanta International Airport,Atlanta,United States,ATL,KATL,33.636700,-84.428101,True
2018-09-10 19:05:00,W-3204532,Rain,Light,2018-09-10 19:14:00,US/Eastern,KATL,33.6301,-84.4418,Atlanta,Clayton,GA,30320.0,Hartsfield Jackson Atlanta International Airport,Atlanta,United States,ATL,KATL,33.636700,-84.428101,True
2018-09-11 20:12:00,W-3204533,Rain,Light,2018-09-11 20:52:00,US/Eastern,KATL,33.6301,-84.4418,Atlanta,Clayton,GA,30320.0,Hartsfield Jackson Atlanta International Airport,Atlanta,United States,ATL,KATL,33.636700,-84.428101,True
2018-09-11 20:57:00,W-3204534,Rain,Light,2018-09-11 21:52:00,US/Eastern,KATL,33.6301,-84.4418,Atlanta,Clayton,GA,30320.0,Hartsfield Jackson Atlanta International Airport,Atlanta,United States,ATL,KATL,33.636700,-84.428101,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-06-24 14:12:00,W-3376399,Cold,Severe,2018-06-24 14:46:00,US/Pacific,KSEA,47.4447,-122.3144,Seattle,King,WA,98148.0,Seattle Tacoma International Airport,Seattle,United States,SEA,KSEA,47.449001,-122.308998,True
2018-06-24 15:53:00,W-3376400,Fog,Moderate,2018-06-24 17:22:00,US/Pacific,KSEA,47.4447,-122.3144,Seattle,King,WA,98148.0,Seattle Tacoma International Airport,Seattle,United States,SEA,KSEA,47.449001,-122.308998,True
2018-06-28 21:53:00,W-3376401,Rain,Light,2018-06-28 22:53:00,US/Pacific,KSEA,47.4447,-122.3144,Seattle,King,WA,98148.0,Seattle Tacoma International Airport,Seattle,United States,SEA,KSEA,47.449001,-122.308998,True
2018-06-11 01:53:00,W-3376391,Rain,Light,2018-06-11 02:53:00,US/Pacific,KSEA,47.4447,-122.3144,Seattle,King,WA,98148.0,Seattle Tacoma International Airport,Seattle,United States,SEA,KSEA,47.449001,-122.308998,True


---

In [152]:
# Prototype of the per-city hourly expansion; it is still a draft and only
# derives the first two end times for each city.
dataframe = pd.DataFrame()  # placeholder; nothing is written to it yet
unique_cities = weather_data.City_y.unique()

for city in unique_cities:
    # `city` is only the name string; the rows for that city live in `city_data`.
    city_data = weather_data[weather_data.City_y == city]
    r = pd.date_range(start=city_data.index.min(), end=city_data.index.max(), freq="H")

    # Reindex the city's frame (not the name string) onto the hourly grid, then
    # carry the last known row forward into hours without a matching timestamp.
    city_hourly = city_data.reindex(r, copy=True)
    city_hourly = city_hourly.ffill()

    # First two distinct EndTime(UTC) values of this city's hourly frame.
    end_times = city_hourly["EndTime(UTC)"].unique()
    if len(end_times) < 2:
        # Fewer than two distinct end times: there is no second value to read.
        continue
    day = end_times[0]
    day_2 = end_times[1]

In [157]:
# NOTE: depends on `fitbit`, which is created by a cell that appears later in the file.
end_time_values = fitbit["EndTime(UTC)"].unique()
(end_time_values[0], end_time_values[1])

('2018-01-09 00:52:00', '2018-01-11 08:52:00')

In [97]:
# Events for Charlotte only, selected by the airport table's city name.
charlotte = weather_data.loc[weather_data["City_y"] == "Charlotte"]

In [98]:
# Hourly timestamps spanning the first to the last Charlotte event start.
first_event, last_event = charlotte.index.min(), charlotte.index.max()
r = pd.date_range(start=first_event, end=last_event, freq="H")

In [124]:
# Put the Charlotte events on the hourly grid; hours with no event starting at
# that exact timestamp become all-NaN rows.
# NOTE(review): the name `fitbit` does not describe weather data — likely a leftover; other cells reference it.
fitbit = charlotte.reindex(index=r, copy=True)

In [125]:
# Forward-fill the empty hourly rows with the most recent event's values.
# `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling and
# produces the same result.
fitbit = fitbit.ffill()

In [126]:
# Preview the first rows of the hourly, forward-filled Charlotte frame.
fitbit.head()

Unnamed: 0,EventId,Type,Severity,EndTime(UTC),TimeZone,AirportCode,LocationLat,LocationLng,City_x,County,State,ZipCode,Name,City_y,Country,IATA,ICAO,Latitude,Longitude,is_top
2018-01-08 20:52:00,W-4437347,Rain,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-08 21:52:00,W-4437347,Rain,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-08 22:52:00,W-4437347,Rain,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-08 23:52:00,W-4437347,Rain,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-09 00:52:00,W-4437347,Rain,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True


In [131]:
# First and second distinct EndTime(UTC) values, in order of appearance.
end_times = fitbit["EndTime(UTC)"].unique()
day, day_2 = end_times[0], end_times[1]

In [142]:
# Event types present on the rows strictly between `day` and `day_2`.
in_window = (fitbit.index > day) & (fitbit.index < day_2)
type_fo = fitbit.loc[in_window, "Type"].unique()

In [147]:
# Rows strictly after `day` (first event's end time) and strictly before `day_2`.
between_events = (fitbit.index > day) & (fitbit.index < day_2)

# Relabel only the Type column for those hours. Assigning through .loc avoids
# running a value replacement over every column of the slice and computes the
# mask once.
# NOTE(review): `day_2` is an end time, so this window also covers the hours of
# the second event itself — confirm that is intended.
fitbit.loc[between_events, "Type"] = "Clear"

In [148]:
# Show the first 20 rows to inspect the hours relabelled as "Clear".
fitbit.head(20)

Unnamed: 0,EventId,Type,Severity,EndTime(UTC),TimeZone,AirportCode,LocationLat,LocationLng,City_x,County,State,ZipCode,Name,City_y,Country,IATA,ICAO,Latitude,Longitude,is_top
2018-01-08 20:52:00,W-4437347,Rain,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-08 21:52:00,W-4437347,Rain,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-08 22:52:00,W-4437347,Rain,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-08 23:52:00,W-4437347,Rain,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-09 00:52:00,W-4437347,Rain,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-09 01:52:00,W-4437347,Clear,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-09 02:52:00,W-4437347,Clear,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-09 03:52:00,W-4437347,Clear,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-09 04:52:00,W-4437347,Clear,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True
2018-01-09 05:52:00,W-4437347,Clear,Light,2018-01-09 00:52:00,US/Eastern,KCLT,35.2225,-80.9543,Township 2 Berryhill,Mecklenburg,NC,28278.0,Charlotte Douglas International Airport,Charlotte,United States,CLT,KCLT,35.214001,-80.9431,True


-----

In [79]:
# Load the flight data through the project's `wrangle` module.
# NOTE(review): the cells below expect the columns "Unnamed: 0", fl_date and
# crs_dep_time in the returned frame — confirm against wrangle.prep_flight_data.
df = wrangle.prep_flight_data()

In [80]:
# Remove the leftover index column. Rebinding instead of dropping in place, and
# ignoring a missing column, lets this cell be re-run without a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [81]:
# Left-pad the scheduled departure time with zeros to four characters
# (for example "630" becomes "0630") so it can be parsed as HHMM.
dep_time_text = df["crs_dep_time"].astype(str)
df["crs_dep_time"] = dep_time_text.map(lambda hhmm: hhmm.zfill(4))

In [82]:
# Build a "<flight date> <HHMM>" string for each flight.
flight_date_text = df["fl_date"].astype(str)
dep_time_text = df["crs_dep_time"].astype(str)
df["fl_datetime"] = flight_date_text.str.cat(dep_time_text, sep=" ")

In [83]:
# Inspect the combined date/time strings before parsing them.
df["fl_datetime"]

0          2018-01-01 1517
1          2018-01-01 1115
2          2018-01-01 0630
3          2018-01-01 2241
4          2018-01-01 0750
                ...       
2965755    2018-12-31 1955
2965756    2018-12-31 1321
2965757    2018-12-31 1751
2965758    2018-12-31 2015
2965759    2018-12-31 1300
Name: fl_datetime, Length: 2965760, dtype: object

In [86]:
# Parse the "YYYY-MM-DD HHMM" strings into datetimes using an explicit format.
FL_DATETIME_FORMAT = "%Y-%m-%d %H%M"
df["fl_datetime"] = pd.to_datetime(df["fl_datetime"], format=FL_DATETIME_FORMAT)

In [87]:
# Preview the flight data with the parsed fl_datetime column.
df.head()

Unnamed: 0,fl_date,op_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,...,actual_elapsed_time,air_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,Airline,fl_datetime
0,2018-01-01,UA,2429,EWR,DEN,1517,1512.0,-5.0,15.0,1527.0,...,250.0,225.0,1605.0,0.0,0.0,0.0,0.0,0.0,United Air Lines Inc.,2018-01-01 15:17:00
1,2018-01-01,UA,2427,LAS,SFO,1115,1107.0,-8.0,11.0,1118.0,...,83.0,65.0,414.0,0.0,0.0,0.0,0.0,0.0,United Air Lines Inc.,2018-01-01 11:15:00
2,2018-01-01,UA,2424,ORD,ALB,630,650.0,20.0,13.0,703.0,...,106.0,83.0,723.0,0.0,0.0,0.0,0.0,0.0,United Air Lines Inc.,2018-01-01 06:30:00
3,2018-01-01,UA,2422,ORD,OMA,2241,2244.0,3.0,15.0,2259.0,...,79.0,62.0,416.0,0.0,0.0,0.0,0.0,0.0,United Air Lines Inc.,2018-01-01 22:41:00
4,2018-01-01,UA,2421,IAH,LAS,750,747.0,-3.0,14.0,801.0,...,193.0,173.0,1222.0,0.0,0.0,0.0,0.0,0.0,United Air Lines Inc.,2018-01-01 07:50:00
