In [None]:
import os,sys

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import date
import seaborn as sns
import sklearn

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Relative path of the directory that holds the semicolon-separated CSV inputs.
data_folder = "../../data/"

In [None]:
def _load_table(file_name):
    """Read one semicolon-separated CSV file located in `data_folder`.

    Parameters
    ----------
    file_name : str
        File name (with extension) relative to `data_folder`.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(os.path.join(data_folder, file_name), sep=';')


# One frame per input file; every later cell works from these five tables.
df_airlines = _load_table("airlines.csv")
df_airports = _load_table("airports.csv")
df_flights = _load_table("flights.csv")
df_planes = _load_table("planes.csv")
df_weather = _load_table("weather.csv")

## a) Perform a left join with flights and airlines by carrier field

In [None]:
# Left join: keep every flight, even one whose carrier code is missing from
# the airlines table. DataFrame.merge defaults to how='inner', which would
# silently drop such flights, so the join type is stated explicitly.
df_flights = df_flights.merge(df_airlines, on='carrier', how='left')
# Preview the first rows only rather than rendering the whole frame.
df_flights.head()

## b) Create a histogram plot of air_time field of flights dataframe 

In [None]:
# Histogram of air_time over all flights that have a value for it.
# savefig does not create missing directories, so make sure "fig/" exists
# before writing the image.
os.makedirs("fig", exist_ok=True)
plt.figure(figsize=(10,10))
plt.title("histogram of air time")
plt.xlabel("air time in minutes")
plt.ylabel("number of flights")
# Missing air times are dropped before binning.
histo = plt.hist(df_flights.air_time.dropna(),bins=100)
plt.savefig("fig/hist_air_time.png")

## c) Create a plot of number of flights per day.

In [None]:
# Assemble the departure date from the year/month/day columns in a single
# vectorised call instead of running a Python lambda once per row.
# `.dt.date` keeps the values as datetime.date objects, which the later
# `.weekday()` calls rely on.
df_flights['dep_date'] = pd.to_datetime(df_flights[['year','month','day']]).dt.date

In [None]:
# Number of flights per departure date, as a Series indexed by 'dep_date'.
# The Series is named 'index' because the data-preparation cell further down
# renames that column to 'nb_flight'.
df_nb_flight_per_day = df_flights.groupby('dep_date').size().rename('index')

In [None]:
# Line plot of the daily flight count.
# savefig does not create missing directories, so make sure "fig/" exists
# before writing the image.
os.makedirs("fig", exist_ok=True)
plt.figure(figsize=(10,10))
plt.title("number of flight per departure date")
plt.xlabel("departure date")
plt.ylabel("number of flights")
plt.plot(df_nb_flight_per_day)
plt.savefig('fig/nb_flight_per_day.png')

In [None]:
# Weekday number of each departure date, as returned by date.weekday().
df_flights['dep_day_of_week'] = df_flights['dep_date'].map(date.weekday)

In [None]:
# Number of flights per departure date.
# Grouping with a callable on axis=1 calls it on the column labels, so
# x['dep_date'] failed there; group the rows by the column instead. size()
# counts rows directly and therefore does not need the helper column 'one',
# which is only created in a later cell.
df_flights.groupby('dep_date').size()

In [None]:
# Number of flights per weekday. size() is used instead of counting the helper
# column 'one', because that column is only created in a later cell and the
# notebook must run top to bottom on a fresh kernel.
flights_per_weekday = df_flights.groupby('dep_day_of_week').size()
plt.plot(flights_per_weekday)
plt.title("number of flights per day of week")
plt.xlabel("day of week (date.weekday() number)")
plt.ylabel("number of flights");

## (d) What features would you use to forecast volume? 

I would use the volume of the recent past itself, as in time-series forecasting (e.g. SARIMA), together with the day of the week and all the weather variables. The problem here is that we are clearly not under the usual time-series assumptions: flight volume is known to have a yearly seasonality, and our sample covers only one year, so that seasonality cannot be estimated from it.

## (e) Perform a logistic regression to model volume (do not worry on overfitting).

### Data preparation

In [None]:
# Origin-destination identifier: the two airport codes concatenated.
# Column-wise string concatenation replaces the per-row lambda.
df_flights['ond'] = df_flights['origin'] + df_flights['dest']

# Constant helper column, counted in later groupby(...)['one'].count() calls.
df_flights['one'] = 1

# Daily flight count as a two-column frame ('dep_date', 'nb_flight').
# It is rebuilt from df_flights rather than by reset_index()/rename() on the
# existing variable: that version was not idempotent, and re-running the cell
# produced a second 'nb_flight' column.
df_nb_flight_per_day = (
    df_flights.groupby('dep_date').size().rename('nb_flight').reset_index()
)

# Observation date of each weather row, as datetime.date objects, built in one
# vectorised call so that it can be matched against the flights' 'dep_date'.
df_weather['dep_date'] = pd.to_datetime(df_weather[['year','month','day']]).dt.date

In [None]:
# Long table: one row per (departure date, origin airport) with its flight count.
daily_counts_by_origin = (
    df_flights
    .groupby(['dep_date','origin'])['one']
    .count()
    .reset_index()
)
# Wide table: one row per departure date, one column per origin airport.
df_nb_flights_orig = pd.pivot_table(
    daily_counts_by_origin,
    values='one',
    index='dep_date',
    columns='origin',
)

In [None]:
# Target variable: total daily departures, summed over the three origin
# airports present as columns (EWR, JFK, LGA).
df_nb_flights_orig['nb_flight_ground_truth'] = (
    df_nb_flights_orig['EWR']
    + df_nb_flights_orig['JFK']
    + df_nb_flights_orig['LGA']
)

In [None]:
# Daily weather features: one row per date, one column per
# (weather variable, origin airport) pair, aggregated with pivot_table's
# default aggregation (the mean).
weather_key_cols = ['dep_date', 'origin']
# Only numeric columns can be averaged, so non-numeric ones are left out
# explicitly instead of relying on pandas to discard them.
weather_value_cols = [
    col for col in df_weather.select_dtypes(include='number').columns
    if col not in weather_key_cols
]
df_weather_orig_pivot = df_weather.pivot_table(
    values=weather_value_cols,
    index='dep_date',
    columns='origin',
)
# The pivot produces two-level columns (variable, origin). Flatten them to
# plain strings such as '<variable>_<origin>': merging a two-level frame with
# the single-level flights frame is rejected by recent pandas versions, and
# tuple column labels are not usable as feature names downstream.
df_weather_orig_pivot.columns = [
    '{}_{}'.format(variable, origin)
    for variable, origin in df_weather_orig_pivot.columns
]
df_weather_orig_pivot = df_weather_orig_pivot.reset_index()

In [None]:
# Feature table: daily flight counts joined with the daily weather table on
# the shared 'dep_date' column.
flights_by_date = df_nb_flights_orig.reset_index()
df_features = pd.merge(flights_by_date, df_weather_orig_pivot, on='dep_date')

In [None]:
# Weekday number of each date, as returned by date.weekday(), added as a feature.
df_features['dep_day_of_week'] = df_features['dep_date'].map(date.weekday)

### Volume Modelisation

In [None]:
# Classifier used to model the daily volume; its hyper-parameters are kept in
# one dict so they are easy to spot and tune.
logreg_params = {'C': 1e5}
logreg = LogisticRegression(**logreg_params)

In [None]:
# Columns that must not be used as inputs: the date itself, the per-airport
# counts and their sum (the target).
non_feature_cols = ('dep_date', 'EWR', 'JFK', 'LGA', 'nb_flight_ground_truth')
feature_cols = []
for col in df_features.columns:
    if col in non_feature_cols:
        continue
    feature_cols.append(col)

# Design matrix and target vector.
X = df_features[feature_cols]
y = df_features['nb_flight_ground_truth']

In [None]:
# Fit on the full data set; there is no train/test split because the exercise
# says not to worry about overfitting.
logreg.fit(X=X, y=y)

In [None]:
# In-sample predictions, stored next to the ground truth for comparison.
predicted_volume = logreg.predict(X)
df_features['predicted'] = predicted_volume

In [None]:
# Signed error per day: positive when the model over-predicts the volume.
df_features['modelisation_error'] = df_features['predicted'].sub(y)

In [None]:
# Predicted volume, actual volume and their difference over time.
# Each series is drawn separately with its own label, so that the legend
# identifies the lines; without it the three curves cannot be told apart.
plotted_cols = ['predicted','nb_flight_ground_truth','modelisation_error']
df_model_plot = df_features[['dep_date'] + plotted_cols].set_index('dep_date')

plt.figure(figsize=(20,20))
for col in plotted_cols:
    plt.plot(df_model_plot.index, df_model_plot[col], label=col)
plt.title("predicted vs actual number of flights per departure date")
plt.xlabel("departure date")
plt.ylabel("number of flights")
plt.legend();

In [None]:
# Attach weather observations to the flights.
# - `indicator=` had no value, which is a syntax error; indicator=True adds a
#   '_merge' column telling whether each flight found a weather match.
# - Joining on 'origin' alone would pair every flight with every weather row
#   of its airport, so the calendar columns present in both tables are added
#   to the key.
# - how='left' keeps flights that have no weather observation.
# NOTE(review): if both tables also share a finer time key (e.g. an hour
# column), add it to `on` to get one weather row per flight - TODO confirm
# against the CSV schemas.
df_flights.merge(
    df_weather,
    on=['origin', 'year', 'month', 'day'],
    how='left',
    indicator=True,
).head()