# Course Section K3. Daycurve 1-NN Classification with 10-fold CV

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dtaidistance.dtw import distance as dtw_dist

# Filtering warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
%matplotlib inline

In [2]:
import Fred as fred
import math

# Create own Distance Measures and import from Fred-Frechet

# Discrete Frechet
def disc_frechet(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_frechet(a, b)
    return dist.value

# Discrete Dynamic Time Warping
def disc_dtw(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_dynamic_time_warping(a, b)
    return dist.value

# Discrete Dynamic Time Warping with traversal constraint
def window_dtw(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dtw[i, j] = cost + min(dtw[i-1, j],
                                   dtw[i, j-1],
                                   dtw[i-1, j-1])

    return math.sqrt(dtw[n-1, m-1])

# Discrete Frechet with traversal constraint
def window_disc_frechet(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dfre = np.full((n, m), math.inf)
    dfre[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dfre[i, j] = max(cost, min(dfre[i-1, j],
                                       dfre[i, j-1],
                                       dfre[i-1, j-1]))

    return math.sqrt(dfre[n-1, m-1])

def window_df(x, y, w=4, p=2):
    n = len(x) 
    m = len(y) 

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0
    
    for i in range(0,n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i, j] = 0
    
    
    # all possible paths filled with zeros
    for i in range(n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i,j] = abs(x[i] - y[j]) ** p
            if i > 0 or j > 0:
                dtw[i, j] = max(dtw[i,j], min(dtw[i-1, j] if i > 0 else math.inf,
                                              dtw[i, j-1] if j > 0 else math.inf,
                                              dtw[i-1, j-1] if (i > 0 and j > 0) else math.inf
                                             )
                               )
     
    return (dtw[n-1, m-1]) ** (1/p)

# k-greatest-distances-Mahnhattandistanz
def k_greatest_manhattan(x, y, w=6):
    dists = np.abs(x - y)
    return np.sum(np.sort(dists)[-w:][::-1])

# Earth mover's distance
from scipy.stats import wasserstein_distance

def emd(u, v):
    # create bin-vectors
    bin_u = [i for i in range(0, len(u))]
    bin_v = [i for i in range(0, len(v))]
    
    # normalize input vectors to unit-vectors
    u_norm = u / np.linalg.norm(u, ord=1)
    v_norm = v / np.linalg.norm(v, ord=1)
    
    return wasserstein_distance(bin_u, bin_v, u_norm, v_norm)

def dtw_ai(x, y, w=None):
    return dtw_dist(x, y, window=w, use_c=True)

In [3]:
ofEx = pd.read_csv("")

In [4]:
# -97 rausschmeissen
ofEx = ofEx.drop(ofEx[ofEx.final_result < 0].index)

In [5]:
y = ofEx['final_result']
X = ofEx.drop(['id', 'final_result'], axis=1)

In [6]:
X.head()

Unnamed: 0,day1,day2,day3,day4,day5,day6,day7,day8,day9,day10,...,day71,day72,day73,day74,day75,day76,day77,day78,day79,day80
0,44,60,33,12,53,50,56,5,6,0,...,10,0,208,0,0,0,0,9,4,0
1,58,87,0,24,66,39,67,42,58,0,...,64,22,67,0,80,40,13,83,131,0
2,0,102,4,12,66,36,2,20,24,2,...,38,16,315,0,26,17,3,4,15,4
3,154,149,0,0,109,34,113,0,81,0,...,57,21,16,0,71,5,0,22,18,0
4,71,305,24,61,28,0,41,0,23,0,...,7,31,43,0,31,0,4,14,13,0


## A vs. C bis F

In [7]:
X = X.drop(X[(y > 6) & (y < 10)].index)
y = y[X.index]

In [8]:
X.head()

Unnamed: 0,day1,day2,day3,day4,day5,day6,day7,day8,day9,day10,...,day71,day72,day73,day74,day75,day76,day77,day78,day79,day80
1,58,87,0,24,66,39,67,42,58,0,...,64,22,67,0,80,40,13,83,131,0
8,13,71,0,0,27,0,14,18,10,5,...,28,0,195,0,5,4,5,5,5,0
10,0,0,0,0,0,0,28,100,39,20,...,113,16,58,0,21,54,25,47,16,0
11,44,48,0,0,92,6,16,33,22,0,...,54,11,0,0,0,11,0,0,2,0
12,63,61,4,7,87,57,14,38,0,2,...,8,53,273,0,55,29,14,54,48,0


In [9]:
len(X)

213

In [10]:
len(y)

213

In [11]:
# Logarithmieren
X_l = np.log1p(X) 

In [12]:
# Zu binärem Problem machen
y[y < 10] = 0
y[y > 0] = 1

In [13]:
y.value_counts()

0    119
1     94
Name: final_result, dtype: int64

In [None]:
start =time.time()

distances = {'Manhattan':'cityblock', 'Euclidean': 'euclidean', 'Maximum': 'chebyshev',
             'DF': disc_frechet, 'DTW': dtw_ai, 'WDF': window_df, 'WDTW': window_dtw,
             'k_g_Manhattan': k_greatest_manhattan, 'EMD': emd
                     }
nachbar = 1
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

df = pd.DataFrame(columns = ['distance', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8])
 
df['distance'] = distances

row = 0
for key, dist in distances.items():
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dist)
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [key, np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]

    df.loc[row, 0:] = score

    row = row + 1

print(df.to_latex(index=False))
df

end = time.time()
total = end - start
print(total)

In [None]:
start =time.time()

df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,81):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dtw_ai, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    #df.loc[i, 0:] = score
    print(score)

print('WDTW')
print(df.to_latex(index=False))
df

end = time.time()
total = end - start
print(total)

In [15]:
start =time.time()

nachbar = 1
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=window_df, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score
    print(i)

print('WDF')
print(df.to_latex(index=False))
df

end = time.time()
total = end - start
print(total)

0
1
2
3
4
5
6
7
8
9
10
11
WDF
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.441126 &       0.5 &  0.441126 &       1.0 &  0.612016 \\
      1 &   0.59697 &  0.586187 &    0.5425 &  0.482222 &  0.503109 \\
      2 &  0.601732 &   0.60053 &  0.554053 &  0.586667 &  0.563411 \\
      3 &  0.562771 &  0.563838 &  0.513037 &  0.582222 &  0.540654 \\
      4 &  0.531169 &  0.530429 &  0.468232 &  0.522222 &  0.491257 \\
      5 &  0.534848 &  0.535884 &  0.477762 &  0.542222 &  0.501667 \\
      6 &  0.516017 &  0.516237 &   0.43053 &  0.521111 &  0.464614 \\
      7 &  0.525758 &  0.526995 &  0.458366 &  0.541111 &  0.487137 \\
      8 &  0.525541 &  0.526061 &  0.454515 &  0.523333 &   0.48176 \\
      9 &  0.529654 &  0.527828 &  0.466818 &  0.501111 &  0.479263 \\
     10 &  0.538961 &  0.533384 &    0.4775 &  0.478889 &  0.475478 \\
     11 &  0.553247 &  0.548662 &  0.488434 &  0.501111 &  0.491727 \\
\bot

In [16]:
start =time.time()

nachbar=1
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,81):
    knn = KNeighborsClassifier(n_neighbors=nachbar,metric=k_greatest_manhattan, metric_params={'w':i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=5, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    #df.loc[i, 0:] = score
    print(score)

print('kgMan')
print(df.to_latex(index=False))
df 

end = time.time()
total = end - start
print(total)

[0, 0.6479512735326688, 0.6392988812611238, 0.6138888888888889, 0.5637426900584795, 0.5858445796526291]
[1, 0.6713178294573644, 0.6650330536486143, 0.6376754385964911, 0.6076023391812866, 0.6173277205288149]
[2, 0.6483942414174972, 0.6405797101449275, 0.6066267942583732, 0.5666666666666667, 0.5813135780628041]
[3, 0.6105204872646735, 0.5966819221967963, 0.5738193969772917, 0.4701754385964912, 0.5115459844871609]
[4, 0.6342192691029901, 0.6193777015001272, 0.6206060606060606, 0.4818713450292398, 0.5298476903148761]
[5, 0.6248062015503877, 0.6074624968217646, 0.6128282828282827, 0.4497076023391813, 0.5064524948735475]
[6, 0.601328903654485, 0.5800502161200102, 0.5766511266511266, 0.38654970760233914, 0.4443650793650793]
[7, 0.587375415282392, 0.5630530129672007, 0.5769230769230769, 0.34385964912280703, 0.40970248898107336]
[8, 0.6057585825027686, 0.5827167556572592, 0.5988888888888889, 0.37485380116959066, 0.44788863147858515]
[9, 0.5869324473975637, 0.5614829646580219, 0.556825396825396

[79, 0.6479512735326688, 0.6392988812611238, 0.6138888888888889, 0.5637426900584795, 0.5858445796526291]
[80, 0.6479512735326688, 0.6392988812611238, 0.6138888888888889, 0.5637426900584795, 0.5858445796526291]
kgMan
\begin{tabular}{rlllll}
\toprule
 window & accuracy & roc\_auc & precision & recall &  f1 \\
\midrule
      0 &      NaN &     NaN &       NaN &    NaN & NaN \\
      1 &      NaN &     NaN &       NaN &    NaN & NaN \\
      2 &      NaN &     NaN &       NaN &    NaN & NaN \\
      3 &      NaN &     NaN &       NaN &    NaN & NaN \\
      4 &      NaN &     NaN &       NaN &    NaN & NaN \\
      5 &      NaN &     NaN &       NaN &    NaN & NaN \\
      6 &      NaN &     NaN &       NaN &    NaN & NaN \\
      7 &      NaN &     NaN &       NaN &    NaN & NaN \\
      8 &      NaN &     NaN &       NaN &    NaN & NaN \\
      9 &      NaN &     NaN &       NaN &    NaN & NaN \\
     10 &      NaN &     NaN &       NaN &    NaN & NaN \\
     11 &      NaN &     NaN &   