In [None]:
import os
import sys
import logging
# Raise the root logger to INFO so log messages emitted by imported modules are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Make the parent of the current working directory importable (the `utils` package is
# imported from there by the following cell); append it only once so re-running is harmless.
nb_dir = os.path.dirname(os.getcwd())
already_on_path = nb_dir in sys.path
if not already_on_path:
    sys.path.append(nb_dir)

In [None]:
from utils.files.file_helper import load_binary_file, save_binary_file
from utils.statistics.correlation import check_correlations

import utils.configuration

import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Instantiate the project's Configuration object; later cells read settings through its
# `.config` mapping.
config = utils.configuration.Configuration()

In [None]:
# Both settings live in the DEFAULT section of the project configuration; the pair is
# passed together to load_binary_file when the dataset is loaded.
default_section = config.config['DEFAULT']
MERGED_DATA_LOCATION = default_section['MERGED_DATA_LOCATION']
PREPROCESSED_DATA_FILE_BIN = default_section['PREPROCESSED_DATA_FILE_BIN']

In [None]:
# Load the dataset through the project helper, using the location and file name taken from
# the configuration.
# NOTE(review): the on-disk format is not visible here — if load_binary_file unpickles,
# only load files from a trusted source.
data = load_binary_file(MERGED_DATA_LOCATION, PREPROCESSED_DATA_FILE_BIN)

## Describing data

### Overall data info

In [None]:
# Summary statistics of the loaded data (assumes `data` is a pandas DataFrame, which is how
# the rest of the notebook uses it).
data.describe()

### Correlations

In [None]:
# Pairwise correlation of the numeric/boolean columns only.
# `data` also holds string columns (e.g. `status`, `op_unique_carrier`); since pandas 2.0
# DataFrame.corr no longer drops those silently and raises instead, so select the usable
# columns explicitly. This form behaves the same on older pandas versions as well.
data.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Run the project's correlation check (utils.statistics.correlation) on the full dataset.
# NOTE(review): the helper's thresholds and output format are not visible from this notebook.
check_correlations(data)

## Plotting data

In [None]:
# Grid of histograms, one per plottable column, on a large 20x20 canvas; the trailing ';'
# suppresses the repr of the returned axes array in the cell output.
data.hist(figsize=(20, 20));

### Plotting classes

In [None]:
# Bar chart of how many rows fall into each `status` class, to check class balance.
ax = plt.figure(figsize=(8, 8)).gca()
ax = sns.countplot(x=data['status'], ax=ax)
ax.set(title='Dataset class balance check', ylabel='Count')
plt.show()

### Day of week / Delay plot

In [None]:
# Departure delay of every flight, grouped by day of week; jitter spreads overlapping points.
sns.stripplot(
    data=data,
    x="day_of_week",
    y="dep_delay",
    jitter=True,
)

### Month / Delay plot

In [None]:
# Departure delay of every flight, grouped by month; jitter spreads overlapping points.
sns.stripplot(
    data=data,
    x="month",
    y="dep_delay",
    jitter=True,
)

In [None]:
# List the column names available in the loaded data (reference for the plots in this notebook).
data.columns

### Flights and delays per airline plot

In [None]:
# Total number of flights per carrier, indexed by carrier code.
airlines_group_num = data.groupby('op_unique_carrier').size()

# Flights whose status is anything other than 'no_delay'.
airlines_group = data.loc[data['status'] != 'no_delay', ['status', 'op_unique_carrier']]

# Delayed flights per carrier, aligned to the full carrier index. A carrier with no delayed
# flights is absent from the filtered group-by, so it is filled in with 0 instead of silently
# shifting (or breaking) the positional match against the flight counts.
airlines_group_delays_num = (
    airlines_group.groupby('op_unique_carrier').size()
    .reindex(airlines_group_num.index, fill_value=0)
)

# One row per carrier; all three columns share the same carrier ordering by construction.
delay_info = pd.DataFrame({
    'Carrier': airlines_group_num.index.values,
    'Number of flights': airlines_group_num.values,
    'Number of delays': airlines_group_delays_num.values,
})

In [None]:
# Grouped bar chart per carrier: one bar for each numeric column of delay_info.
delay_info.plot.bar(
    x='Carrier',
    title='Number of flights per carrier',
    figsize=(8, 8),
)

### Delay ratings

In [None]:
# Build the rating table from a copy of delay_info so this cell runs on a fresh kernel
# (delay_info_rating is not defined anywhere earlier) and leaves delay_info itself untouched.
delay_info_rating = delay_info.copy()
# Delay index = share of a carrier's flights that were counted as delays.
delay_info_rating['Delay index'] = delay_info['Number of delays'] / delay_info['Number of flights']

In [None]:
# Bar chart of the per-carrier delay index.
ax = plt.figure(figsize=(8, 8)).gca()
ax = sns.barplot(data=delay_info_rating, x="Carrier", y="Delay index", ax=ax)
ax.set(title='Airline delay index ')
plt.show()