# Course Section K2. Daycurve 1-NN Classification with 10-fold CV

In [19]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dtaidistance.dtw import distance as dtw_dist

# Filtering warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
%matplotlib inline

In [20]:
import Fred as fred
import math

# Create own Distance Measures and import from Fred-Frechet

# Discrete Frechet
def disc_frechet(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_frechet(a, b)
    return dist.value

# Discrete Dynamic Time Warping
def disc_dtw(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_dynamic_time_warping(a, b)
    return dist.value

# Discrete Dynamic Time Warping with traversal constraint
def window_dtw(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dtw[i, j] = cost + min(dtw[i-1, j],
                                   dtw[i, j-1],
                                   dtw[i-1, j-1])

    return math.sqrt(dtw[n-1, m-1])

# Discrete Frechet with traversal constraint
def window_disc_frechet(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dfre = np.full((n, m), math.inf)
    dfre[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dfre[i, j] = max(cost, min(dfre[i-1, j],
                                       dfre[i, j-1],
                                       dfre[i-1, j-1]))

    return math.sqrt(dfre[n-1, m-1])

def window_df(x, y, w=4, p=2):
    n = len(x) 
    m = len(y) 

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0
    
    for i in range(0,n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i, j] = 0
    
    
    # all possible paths filled with zeros
    for i in range(n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i,j] = abs(x[i] - y[j]) ** p
            if i > 0 or j > 0:
                dtw[i, j] = max(dtw[i,j], min(dtw[i-1, j] if i > 0 else math.inf,
                                              dtw[i, j-1] if j > 0 else math.inf,
                                              dtw[i-1, j-1] if (i > 0 and j > 0) else math.inf
                                             )
                               )
     
    return (dtw[n-1, m-1]) ** (1/p)

# k-greatest-distances-Mahnhattandistanz
def k_greatest_manhattan(x, y, w=6):
    dists = np.abs(x - y)
    return np.sum(np.sort(dists)[-w:][::-1])

# Earth mover's distance
from scipy.stats import wasserstein_distance

def emd(u, v):
    # create bin-vectors
    bin_u = [i for i in range(0, len(u))]
    bin_v = [i for i in range(0, len(v))]
    
    # normalize input vectors to unit-vectors
    u_norm = u / np.linalg.norm(u, ord=1)
    v_norm = v / np.linalg.norm(v, ord=1)
    
    return wasserstein_distance(bin_u, bin_v, u_norm, v_norm)

def dtw_ai(x, y, w=None):
    return dtw_dist(x, y, window=w, use_c=True)

In [21]:
ofEx = pd.read_csv("")

In [22]:
# -97 rausschmeissen
ofEx = ofEx.drop(ofEx[ofEx.final_result < 0].index)

In [23]:
y = ofEx['final_result']
X = ofEx.drop(['id', 'final_result'], axis=1)

In [24]:
X.head()

Unnamed: 0,day1,day2,day3,day4,day5,day6,day7,day8,day9,day10,...,day71,day72,day73,day74,day75,day76,day77,day78,day79,day80
0,31,0,83,2,2,0,0,287,0,24,...,33,3,76,0,18,0,2,0,38,0
1,119,0,6,0,76,187,21,35,3,19,...,10,0,57,0,46,0,17,47,35,0
2,274,8,0,7,0,100,5,52,112,0,...,63,0,82,0,62,6,150,0,79,0
3,0,0,0,0,0,0,0,29,0,0,...,0,23,95,0,34,0,3,3,12,0
5,0,17,41,0,0,23,29,31,11,23,...,0,35,14,0,38,37,45,2,33,17


## A vs. C bis F

In [25]:
X = X.drop(X[(y > 6) & (y < 10)].index)
y = y[X.index]

In [26]:
X.head()

Unnamed: 0,day1,day2,day3,day4,day5,day6,day7,day8,day9,day10,...,day71,day72,day73,day74,day75,day76,day77,day78,day79,day80
0,31,0,83,2,2,0,0,287,0,24,...,33,3,76,0,18,0,2,0,38,0
1,119,0,6,0,76,187,21,35,3,19,...,10,0,57,0,46,0,17,47,35,0
2,274,8,0,7,0,100,5,52,112,0,...,63,0,82,0,62,6,150,0,79,0
5,0,17,41,0,0,23,29,31,11,23,...,0,35,14,0,38,37,45,2,33,17
7,99,19,6,8,28,64,67,45,0,0,...,48,9,32,0,22,2,46,0,67,0


In [27]:
len(X)

205

In [28]:
len(y)

205

In [29]:
# Logarithmieren
X_l = np.log1p(X) 

In [30]:
# Zu binärem Problem machen
y[y < 10] = 0
y[y > 0] = 1

In [31]:
y.value_counts()

0    103
1    102
Name: final_result, dtype: int64

In [14]:
start = time.time()
distances = {'Manhattan':'cityblock', 'Euclidean': 'euclidean', 'Maximum': 'chebyshev',
             'DF': disc_frechet, 'DTW': dtw_ai, 'WDF': window_df, 'WDTW': window_dtw,
             'k_g_Manhattan': k_greatest_manhattan, 'EMD': emd
                     }
nachbar = 1
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

df = pd.DataFrame(columns = ['distance', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8])
 
df['distance'] = distances

row = 0
for key, dist in distances.items():
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dist)
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [key, np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]

    df.loc[row, 0:] = score

    row = row + 1

print(df.to_latex(index=False))
df

end = time.time()
total = end - start
print(total)

\begin{tabular}{llllll}
\toprule
     distance &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
    Manhattan &  0.755952 &  0.755455 &  0.805476 &  0.674545 &  0.730988 \\
    Euclidean &   0.75619 &  0.756364 &  0.804632 &  0.676364 &  0.731209 \\
      Maximum &  0.702143 &  0.703636 &  0.683235 &  0.765455 &  0.716011 \\
           DF &  0.611429 &  0.614545 &  0.599475 &  0.699091 &  0.637014 \\
          DTW &  0.730238 &  0.731364 &  0.824921 &  0.598182 &  0.676679 \\
          WDF &  0.656429 &  0.658636 &  0.627534 &  0.769091 &  0.686336 \\
         WDTW &  0.698571 &  0.699545 &  0.719621 &  0.685455 &  0.692763 \\
k\_g\_Manhattan &   0.74119 &  0.741364 &  0.717075 &  0.785455 &  0.748112 \\
          EMD &  0.681905 &  0.682727 &  0.647964 &  0.783636 &  0.705549 \\
\bottomrule
\end{tabular}

652.6815092563629


In [15]:
start = time.time()

df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,81):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dtw_ai, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    #df.loc[i, 0:] = score
    print(score)
    
print('WDTW')
print(df.to_latex(index=False))
df

end = time.time()
total = end - start
print(total)

[0, 0.7302380952380951, 0.7313636363636364, 0.824920634920635, 0.5981818181818181, 0.6766787065703475]
[1, 0.7561904761904763, 0.7563636363636365, 0.8046320346320346, 0.6763636363636364, 0.7312093960391174]
[2, 0.7604761904761904, 0.7604545454545454, 0.7623989898989899, 0.7645454545454546, 0.7613678795143326]
[3, 0.6685714285714286, 0.6699999999999999, 0.6937012987012987, 0.6281818181818182, 0.6531719673515338]
[4, 0.6783333333333335, 0.6786363636363636, 0.6977758352758351, 0.6463636363636364, 0.6645130035233009]
[5, 0.6766666666666666, 0.6777272727272728, 0.6832792207792207, 0.6845454545454546, 0.6758910533910535]
[6, 0.6383333333333334, 0.6381818181818183, 0.6553282828282828, 0.6063636363636363, 0.6216024910761753]
[7, 0.7071428571428571, 0.7081818181818182, 0.737069597069597, 0.6772727272727272, 0.6920419014287635]
[8, 0.7066666666666667, 0.7072727272727273, 0.7462121212121211, 0.6463636363636364, 0.6783988380041011]
[9, 0.7461904761904762, 0.7472727272727273, 0.7937950937950938, 0.

[79, 0.7302380952380951, 0.7313636363636364, 0.824920634920635, 0.5981818181818181, 0.6766787065703475]
[80, 0.7302380952380951, 0.7313636363636364, 0.824920634920635, 0.5981818181818181, 0.6766787065703475]
WDTW
\begin{tabular}{rlllll}
\toprule
 window & accuracy & roc\_auc & precision & recall &  f1 \\
\midrule
      0 &      NaN &     NaN &       NaN &    NaN & NaN \\
      1 &      NaN &     NaN &       NaN &    NaN & NaN \\
      2 &      NaN &     NaN &       NaN &    NaN & NaN \\
      3 &      NaN &     NaN &       NaN &    NaN & NaN \\
      4 &      NaN &     NaN &       NaN &    NaN & NaN \\
      5 &      NaN &     NaN &       NaN &    NaN & NaN \\
      6 &      NaN &     NaN &       NaN &    NaN & NaN \\
      7 &      NaN &     NaN &       NaN &    NaN & NaN \\
      8 &      NaN &     NaN &       NaN &    NaN & NaN \\
      9 &      NaN &     NaN &       NaN &    NaN & NaN \\
     10 &      NaN &     NaN &       NaN &    NaN & NaN \\
     11 &      NaN &     NaN &      

Aus vorherigen Versuchen mit 5-CV ist klar, dass für DPD ab ca. w=10 keine Verbesserung eintritt, daher nur bis 12 berechnen sonst dauert es > 11 h. Siehe Click 'n' Cluster-Präsentationen.

In [16]:
start = time.time()
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):    
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=window_df, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('WDF')
print(df.to_latex(index=False))
df

end = time.time()
total = end - start
print(total)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

WDF
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.502381 &       0.5 &       0.0 &       0.0 &       0.0 \\
      1 &      0.63 &  0.629545 &  0.631841 &  0.638182 &  0.631994 \\
      2 &  0.669762 &  0.670455 &  0.662069 &  0.707273 &  0.679172 \\
      3 &   0.60119 &  0.602727 &  0.583419 &  0.679091 &   0.62254 \\
      4 &  0.656429 &  0.658636 &  0.627534 &  0.769091 &  0.686336 \\
      5 &  0.605476 &  0.606364 &  0.594466 &  0.686364 &  0.632807 \\
      6 &  0.566667 &  0.568182 &  0.555609 &      0.69 &  0.611119 \\
      7 &  0.590714 &  0.592727 &  0.576978 &  0.699091 &  0.625367 \\
      8 &   0.58119 &  0.583182 &  0.568904 &  0.660909 &  0.606222 \\
      9 &   0.59119 &  0.592273 &  0.575015 &  0.658182 &  0.609605 \\
     10 &  0.625476 &  0.626364 &  0.600101 &  0.726364 &  0.653044 \\
     11 &  0.630238 &  0.631364 &  0.613998 &  0.716364 &  0.654056 \\
\bottomrule
\end{tabular}

565

In [32]:
start = time.time()

nachbar=1
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,81):
    knn = KNeighborsClassifier(n_neighbors=nachbar,metric=k_greatest_manhattan, metric_params={'w':i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=5, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    #df.loc[i, 0:] = score
    print(score)

print('kgMan')
print(df.to_latex(index=False))
df  

end = time.time()
total = end - start
print(total)

[0, 0.751219512195122, 0.7504761904761905, 0.7977923976608187, 0.6766666666666665, 0.730989010989011]
[1, 0.6731707317073171, 0.6728571428571429, 0.6559191919191919, 0.7152380952380952, 0.6821512691525191]
[2, 0.7414634146341463, 0.7414285714285714, 0.7118787878787879, 0.8038095238095238, 0.7541269841269841]
[3, 0.7365853658536585, 0.7361904761904763, 0.7116666666666667, 0.7828571428571429, 0.7445454545454544]
[4, 0.7560975609756098, 0.7554761904761905, 0.7287212787212788, 0.8128571428571428, 0.7677791780957324]
[5, 0.7268292682926829, 0.7259523809523809, 0.699989343989344, 0.7828571428571429, 0.7380067279726584]
[6, 0.7317073170731707, 0.7314285714285715, 0.7043888575192924, 0.7933333333333333, 0.7458727066253762]
[7, 0.7365853658536585, 0.7361904761904763, 0.7186607823907595, 0.7738095238095238, 0.7442695024771031]
[8, 0.7609756097560976, 0.7604761904761904, 0.737720823798627, 0.8028571428571428, 0.76724536333232]
[9, 0.7609756097560976, 0.76, 0.7450143460658335, 0.7923809523809524, 

[80, 0.751219512195122, 0.7504761904761905, 0.7977923976608187, 0.6766666666666665, 0.730989010989011]
kgMan
\begin{tabular}{rlllll}
\toprule
 window & accuracy & roc\_auc & precision & recall &  f1 \\
\midrule
      0 &      NaN &     NaN &       NaN &    NaN & NaN \\
      1 &      NaN &     NaN &       NaN &    NaN & NaN \\
      2 &      NaN &     NaN &       NaN &    NaN & NaN \\
      3 &      NaN &     NaN &       NaN &    NaN & NaN \\
      4 &      NaN &     NaN &       NaN &    NaN & NaN \\
      5 &      NaN &     NaN &       NaN &    NaN & NaN \\
      6 &      NaN &     NaN &       NaN &    NaN & NaN \\
      7 &      NaN &     NaN &       NaN &    NaN & NaN \\
      8 &      NaN &     NaN &       NaN &    NaN & NaN \\
      9 &      NaN &     NaN &       NaN &    NaN & NaN \\
     10 &      NaN &     NaN &       NaN &    NaN & NaN \\
     11 &      NaN &     NaN &       NaN &    NaN & NaN \\
\bottomrule
\end{tabular}

110.59106421470642
