In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('data/statuses/train_data.csv')
df2 = pd.read_csv('data/statuses/val_data.csv')

In [3]:
import folium
import numpy as np
from scipy import sparse
from math import sqrt
import pandas as pd

BOUNDARIES = [(21.9430, -67.5), (55.7765, -135)]
VIL_THRESHOLD_COLORS = [
    (10000, (0.63, 0.0, 0.01, 1.0)),
    (32.32, (0.87, 0.56, 0.0, 1.0)),
    (12.16, (0.95, 0.75, 0.0, 1.0)),
    (7.08, (0.93, 0.95, 0.0, 1.0)),
    (3.53, (0.38, 0.69, 0.0, 1.0)),
    (0.77, (0.63, 0.94, 0.0, 1.0)),
    (0.52, (0.9, 0.9, 0.9, 0.03)),
]


def matrix_to_weather_colormap(sparse_matrix: sparse.csr_matrix) -> np.ndarray:
    matrix = sparse_matrix.toarray()
    result = np.zeros(shape=matrix.shape + (4,))
    for thresh, color in VIL_THRESHOLD_COLORS:
        result[matrix <= thresh] = color
    return result


def plot_matrix(sparse_matrix: sparse.csr_matrix, points) -> folium.Map:
    fmap = folium.Map(location=[35, -100], zoom_start=6)

    colored_matrix = matrix_to_weather_colormap(sparse_matrix=sparse_matrix)

    folium.raster_layers.ImageOverlay(
        colored_matrix,
        pixelated=True,
        opacity=0.8,
        mercator_project=True,
        bounds=BOUNDARIES,
    ).add_to(fmap)

    # add points list to folium map
    for point in points:
        folium.CircleMarker(location=(point[0], point[1]),
                            radius=2,
                            weight=5).add_to(fmap)
    return fmap


def load_and_show_vil(file_path: str, points) -> folium.Map:
    sparse_matrix = sparse.load_npz(file_path)
    plot = plot_matrix(sparse_matrix=sparse_matrix, points=points)
    return plot


def distance(coords_1, coords_2):
    return sqrt((coords_1[0] - coords_2[0]) ** 2 + (coords_1[1] - coords_2[1]) ** 2)

def distance_fromlist(lst):
    all_distance = 0
    for i in range(len(lst) - 1):
        all_distance += distance(lst[i], lst[i + 1])
    return all_distance

def get_route(coords_1, coords_2, n):
    return [(coords_1[0] + (coords_2[0] - coords_1[0]) * i / n, coords_1[1] + (coords_2[1] - coords_1[1]) * i / n) for i
            in range(n)]


def all_routes(lst, c=500):
    all_points = []
    if c < len(lst):
        c = len(lst)

    distance_list = list()
    all_distance = 0
    for i in range(len(lst) - 1):
        distance_list.append(distance(lst[i], lst[i + 1]))
        all_distance += distance(lst[i], lst[i + 1])

    for i in range(len(lst) - 2):
        n = round(int(c * distance_list[i] / all_distance))
        if n == 0:
            n = 1
        all_points += get_route(lst[i], lst[i + 1], n)

    how_many_left = c - len(all_points)
    all_points += get_route(lst[-2], lst[-1], how_many_left - 1)
    all_points.append(lst[-1])
    return all_points


def make_map(path, points, c=500):
    return load_and_show_vil(path,
                             points=all_routes(points, c))


In [165]:
df["airport"].value_counts()

KSEA    21856
KIAH    15668
KDFW    14476
Name: airport, dtype: int64

In [13]:
def pipeline(df):
    df['distance'] = df['waypoints'].apply(lambda x: distance_fromlist(eval(x)))
    df['no_of_waypoints'] = df['waypoints'].apply(lambda x: len(eval(x)))
    df = df.drop(columns=['timestamp', 'waypoints', 'observation_id'])
    df['route_type'] = df['route_type'].map({"DEPARTURE": 0, "ARRIVAL": 1})
    df['timestamp_hour'] = df['timestamp_hour'].str.split(':').apply(lambda row: int(row[0]))
    # add year month day column from timestamp_date
    df['timestamp_date'] = pd.to_datetime(df['timestamp_date'])
    # df['year'] = df['timestamp_date'].dt.year
    df['month'] = df['timestamp_date'].dt.month
    df['month'].map({1:1, 2:1, 3:2, 4:2, 5:2, 6:3, 7:3, 8:3, 9:4, 10:4, 11:4, 12:1})

    # df['day'] = df['timestamp_date'].dt.day
    # get day from timestamp_date
    df['dayoftheweek'] = df['timestamp_date'].dt.dayofweek
    # drop timestamp_date
    df = df.drop(columns=['timestamp_date'])
    y = df['status']
    X = df.drop(columns=['status'])
    X = pd.get_dummies(X, columns=['airport', 'timestamp_hour', 'dayoftheweek'])
    X = X.drop(columns=['route_id'])
    y = y.map({"CLSD": 0, "OPEN": 1})

    return X, y

In [11]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score 
from sklearn.metrics import plot_roc_curve

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import plot_confusion_matrix 
import xgboost

In [158]:
# def evaluate_model(model, X, y):
#     scores = cross_val_score(model, X, y, scoring='accuracy', cv=20, n_jobs=-1, error_score='raise')
#     return scores

In [14]:
X, y = pipeline(df)
modelDT = DecisionTreeClassifier()
modelDT.fit(X, y)
from sklearn.metrics import f1_score
print(f1_score(modelDT.predict(X), y))

X_t, y_t = pipeline(df2)
print(f1_score(modelDT.predict(X_t),y_t))

0.9381379952825514
0.844392523364486


In [None]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(modelDT, open(filename, 'wb'))

In [24]:
def model_all(route_id, timestamp_date, timestamp_hour, airport, route_type):
    df = pd.read_csv('data/statuses/train_data.csv')
    df2 = pd.DataFrame()
    df2.columns = df.columns
    
    new_row = {'observation_id':-1, 'route_id':route_id, 'timestamp':timestamp_date + ' ' + timestamp_hour, 'status':'cokolwiek', 
    'airport': airport, 'route_type': route_type, 'waypoints':df.loc[df['route_id'] == route_id, 'waypoints']}
    df2 = df2.append(new_row, ignore_index=True)
    X_test, Y_test = pipeline(df2)
    print(f1_score(modelDT.predict(X_t),y_t))
    
    

    

    loaded_model = pickle.load(open('finalized_model.sav', 'rb'))
    result = loaded_model.score(X_test, Y_test)
    print(result)


Unnamed: 0,observation_id,route_id,timestamp,status,airport,route_type,waypoints,timestamp_date,timestamp_hour,distance,no_of_waypoints
