In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import folium
from matplotlib import pyplot as plt
from matplotlib import colors
from os import listdir
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from snippets import *

%matplotlib inline

In [2]:
# Root data folder, relative to the notebook's working directory.
DATA_PATH = "../../data"

# Input files and the folder of per-model prediction CSVs used by the cells below.
GPS_PATH = f"{DATA_PATH}/gps_data_raw.csv"
ACCEL_PATH = f"{DATA_PATH}/accel_data_raw.csv"
PREDS_PATH = f"{DATA_PATH}/preds"

# os.listdir returns entries in arbitrary order; sorting keeps the displayed
# listing stable from one run (and one machine) to the next.
sorted(listdir(PREDS_PATH))

['result.csv',
 'baseline_gps_proba.csv',
 'proba_rf.csv',
 'baseline_gps.csv',
 'lstm_pred.csv',
 'baseline_accel_proba.csv',
 'submission_rf.csv',
 'submission_catboost.csv',
 'sub_Isolation_forest.csv',
 'baseline_accel.csv',
 'k-means.csv',
 'proba_catboost.csv']

In [3]:
# Driver table: one row per distinct (driver_hash, fraud) pair in the GPS file.
# The group-by is used only for its keys; the resulting MultiIndex is turned
# back into a two-column frame with a fresh 0..n-1 index.
df_preds = (
    pd.read_csv(GPS_PATH)
    .groupby(["driver_hash", "fraud"])
    .size()
    .index
    .to_frame(index=False)
)
df_preds

Unnamed: 0,driver_hash,fraud
0,-9218579406240981296,-1
1,-9148378939849570226,-1
2,-9090289600630456229,-1
3,-9073383204587901813,-1
4,-9067392653059450527,-1
...,...,...
312,8979134000488028450,-1
313,8979322487342770688,-1
314,9028001926696043346,-1
315,9142516420276355425,-1


In [4]:
# LSTM predictions: a `class` value per driver_hash.
lstm = pd.read_csv(f"{PREDS_PATH}/lstm_pred.csv")
display(lstm.head(1))
display(lstm["class"].value_counts())

# Left-join so every driver already in df_preds is kept. Drivers that are
# absent from the LSTM file get class 1.0. The fill is restricted to the LSTM
# column: a bare `.fillna(1.0)` would also overwrite missing values in every
# other column of the frame.
df_preds = (
    df_preds
    .merge(lstm, how="left", on="driver_hash")
    .rename(columns={"class": "LSTM"})
    .fillna({"LSTM": 1.0})
)
df_preds.sort_values(by="LSTM")

Unnamed: 0,driver_hash,class
0,4633179079544742811,0.0


0.0    272
1.0     42
Name: class, dtype: int64

Unnamed: 0,driver_hash,fraud,LSTM
0,-9218579406240981296,-1,0.0
207,2532182704261981122,-1,0.0
206,2475962150892434238,-1,0.0
205,2423204062901461132,-1,0.0
204,2377516333004407886,-1,0.0
...,...,...,...
69,-5629789915372058845,-1,1.0
188,1293535473153840726,1,1.0
260,5960642249321857991,0,1.0
189,1366943225879240163,-1,1.0


In [5]:
# Isolation Forest submission; only driver_hash and fraud_to_sub are used below.
isfor = pd.read_csv(f"{PREDS_PATH}/sub_Isolation_forest.csv")
display(isfor.head(1))
display(isfor["fraud_to_sub"].value_counts())

# Left-join the verdict onto the driver table under its display name.
df_preds = (
    df_preds
    .merge(isfor[["driver_hash", "fraud_to_sub"]], how="left", on="driver_hash")
    .rename(columns={"fraud_to_sub": "IsolForest"})
)
# Drivers the file does not cover fall back to the value in `fraud`.
df_preds["IsolForest"] = df_preds["IsolForest"].fillna(df_preds["fraud"])
df_preds.sort_values(by="IsolForest")

Unnamed: 0.1,Unnamed: 0,driver_hash,fraud_to_sub,percent_anomaly_point
0,0,-9218579406240981296,0,0.009778


0    277
1      3
Name: fraud_to_sub, dtype: int64

Unnamed: 0,driver_hash,fraud,LSTM,IsolForest
0,-9218579406240981296,-1,0.0,0.0
213,2739762428580541796,-1,0.0,0.0
212,2710587379068018652,-1,0.0,0.0
211,2675529565097588969,-1,0.0,0.0
210,2651284740077073610,-1,1.0,0.0
...,...,...,...,...
200,2025312357143911724,1,1.0,1.0
82,-4687681644253937266,1,1.0,1.0
113,-2988946964244641879,1,0.0,1.0
12,-8729475955540025841,1,1.0,1.0


In [6]:
# k-means predictions: a `class` value per driver_hash.
kmeans = pd.read_csv(f"{PREDS_PATH}/k-means.csv")
display(kmeans.head(1))
display(kmeans["class"].value_counts())

# Left-join the verdict onto the driver table under its display name.
df_preds = (
    df_preds
    .merge(kmeans, how="left", on="driver_hash")
    .rename(columns={"class": "k-Means"})
)
# Drivers the file does not cover fall back to the value in `fraud`.
df_preds["k-Means"] = df_preds["k-Means"].fillna(df_preds["fraud"])
df_preds.sort_values(by="k-Means")

Unnamed: 0,driver_hash,class
0,-9218579406240981296,0


0    278
1      2
Name: class, dtype: int64

Unnamed: 0,driver_hash,fraud,LSTM,IsolForest,k-Means
0,-9218579406240981296,-1,0.0,0.0,0.0
213,2739762428580541796,-1,0.0,0.0,0.0
212,2710587379068018652,-1,0.0,0.0,0.0
211,2675529565097588969,-1,0.0,0.0,0.0
210,2651284740077073610,-1,1.0,0.0,0.0
...,...,...,...,...,...
33,-7745607963253656189,1,1.0,1.0,1.0
12,-8729475955540025841,1,1.0,1.0,1.0
214,2765355583032989810,1,1.0,1.0,1.0
53,-6715898809375179473,1,1.0,1.0,1.0


In [7]:
# Random Forest: both files expose an `is_fraud` column, so the merge suffixes
# them -- `_x` comes from submission_rf.csv, `_y` from proba_rf.csv.
proba_rf = pd.read_csv(f"{PREDS_PATH}/proba_rf.csv")
subm_gps = (
    pd.read_csv(f"{PREDS_PATH}/submission_rf.csv")
    .merge(proba_rf, on="track")
    .rename(columns={"is_fraud_x": "subm_rf", "is_fraud_y": "rf_proba"})
)

# CatBoost: the two files are merged one at a time so each incoming `is_fraud`
# column is renamed before the next one arrives.
subm_cb = pd.read_csv(f"{PREDS_PATH}/submission_catboost.csv")
subm_gps = subm_gps.merge(subm_cb, on="track").rename(columns={"is_fraud": "subm_cb"})

proba_cb = pd.read_csv(f"{PREDS_PATH}/proba_catboost.csv")
subm_gps = subm_gps.merge(proba_cb, on="track").rename(columns={"is_fraud": "cb_proba"})

# Re-key on driver_hash and right-join onto the driver table so that every
# driver in df_preds has a row, even when no model file mentions it.
subm_gps = (
    subm_gps
    .rename(columns={"track": "driver_hash"})
    .merge(df_preds[["driver_hash", "fraud"]], how="right", on="driver_hash")
)

# Gap filling, in this order: a missing label falls back to `fraud`, then a
# missing probability falls back to the (already filled) label.
subm_gps["subm_rf"] = subm_gps["subm_rf"].fillna(subm_gps["fraud"])
subm_gps["rf_proba"] = subm_gps["rf_proba"].fillna(subm_gps["subm_rf"])
subm_gps["subm_cb"] = subm_gps["subm_cb"].fillna(subm_gps["fraud"])
subm_gps["cb_proba"] = subm_gps["cb_proba"].fillna(subm_gps["subm_cb"])

display(subm_gps["subm_rf"].value_counts())
display(subm_gps["subm_cb"].value_counts())


1.0    183
0.0    134
Name: subm_rf, dtype: int64

1.0    197
0.0    120
Name: subm_cb, dtype: int64

In [8]:
# Reset the Random Forest label to 0.0 wherever rf_proba is below 0.95.
subm_gps.loc[subm_gps["rf_proba"].lt(0.95), "subm_rf"] = 0.0
display(subm_gps["subm_rf"].value_counts())

0.0    286
1.0     31
Name: subm_rf, dtype: int64

In [9]:
# Reset the CatBoost label to 0.0 wherever cb_proba is below 0.98.
subm_gps.loc[subm_gps["cb_proba"].lt(0.98), "subm_cb"] = 0.0
display(subm_gps["subm_cb"].value_counts())

0.0    268
1.0     49
Name: subm_cb, dtype: int64

In [10]:
# Bring the thresholded Random Forest / CatBoost labels into the driver table
# under their display names.
df_preds = (
    df_preds
    .merge(subm_gps[["driver_hash", "subm_rf", "subm_cb"]], on="driver_hash")
    .rename(columns={"subm_rf": "RandForest", "subm_cb": "CatBoost"})
)
df_preds

Unnamed: 0,driver_hash,fraud,LSTM,IsolForest,k-Means,RandForest,CatBoost
0,-9218579406240981296,-1,0.0,0.0,0.0,0.0,0.0
1,-9148378939849570226,-1,0.0,0.0,0.0,0.0,0.0
2,-9090289600630456229,-1,0.0,0.0,0.0,0.0,1.0
3,-9073383204587901813,-1,0.0,0.0,0.0,0.0,0.0
4,-9067392653059450527,-1,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...
312,8979134000488028450,-1,0.0,0.0,0.0,0.0,0.0
313,8979322487342770688,-1,0.0,0.0,0.0,0.0,0.0
314,9028001926696043346,-1,1.0,0.0,0.0,0.0,0.0
315,9142516420276355425,-1,1.0,0.0,0.0,0.0,0.0


In [11]:
# Baseline predictions: four files joined per driver_hash. The columns used
# below are fraud_gps / gps_proba and fraud_tm / tm_proba.
proba_gps = pd.read_csv(f"{PREDS_PATH}/baseline_gps_proba.csv")
baseline = pd.read_csv(f"{PREDS_PATH}/baseline_gps.csv")
baseline = baseline.merge(proba_gps, on="driver_hash")

# Kept under its own name so the CatBoost submission frame `subm_cb` is not
# overwritten by an unrelated table.
baseline_accel = pd.read_csv(f"{PREDS_PATH}/baseline_accel.csv")
baseline = baseline.merge(baseline_accel, on="driver_hash")

proba_accel = pd.read_csv(f"{PREDS_PATH}/baseline_accel_proba.csv")
baseline = baseline.merge(proba_accel, on="driver_hash")

# Right-join onto the driver table so every driver in df_preds has a row.
baseline = baseline.merge(df_preds[["driver_hash", "fraud"]], how="right", on="driver_hash")

# Gap filling, in this order: a missing label falls back to `fraud`, then a
# missing probability falls back to the (already filled) label.
baseline["fraud_gps"] = baseline["fraud_gps"].fillna(baseline["fraud"])
baseline["gps_proba"] = baseline["gps_proba"].fillna(baseline["fraud_gps"])
baseline["fraud_tm"] = baseline["fraud_tm"].fillna(baseline["fraud"])
baseline["tm_proba"] = baseline["tm_proba"].fillna(baseline["fraud_tm"])

display(baseline["fraud_gps"].value_counts())
display(baseline["fraud_tm"].value_counts())


0.0    190
1.0    127
Name: fraud_gps, dtype: int64

0.0    199
1.0    118
Name: fraud_tm, dtype: int64

In [12]:
# Reset the GPS baseline label to 0.0 wherever gps_proba is below 0.99.
baseline.loc[baseline["gps_proba"].lt(0.99), "fraud_gps"] = 0.0
display(baseline["fraud_gps"].value_counts())

0.0    236
1.0     81
Name: fraud_gps, dtype: int64

In [13]:
# Reset the fraud_tm baseline label to 0.0 wherever tm_proba is below 0.99.
baseline.loc[baseline["tm_proba"].lt(0.99), "fraud_tm"] = 0.0
display(baseline["fraud_tm"].value_counts())

0.0    220
1.0     97
Name: fraud_tm, dtype: int64

In [14]:
# Bring the thresholded baseline labels into the driver table under their
# display names.
df_preds = (
    df_preds
    .merge(baseline[["driver_hash", "fraud_gps", "fraud_tm"]], on="driver_hash")
    .rename(columns={"fraud_gps": "Baseline GPS", "fraud_tm": "Baseline Accel"})
)
df_preds

Unnamed: 0,driver_hash,fraud,LSTM,IsolForest,k-Means,RandForest,CatBoost,Baseline GPS,Baseline Accel
0,-9218579406240981296,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-9148378939849570226,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-9090289600630456229,-1,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-9073383204587901813,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-9067392653059450527,-1,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
312,8979134000488028450,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
313,8979322487342770688,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
314,9028001926696043346,-1,1.0,0.0,0.0,0.0,0.0,0.0,0.0
315,9142516420276355425,-1,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Columns shown in the combined table: the driver key followed by one verdict
# column per model.
columns = ["driver_hash", "Baseline GPS", "Baseline Accel", "RandForest", "CatBoost", "IsolForest", "k-Means", "LSTM"]
# Cast with an explicit 64-bit type. driver_hash values need the full int64
# range, while plain `int` can resolve to a 32-bit integer on some platforms
# (e.g. Windows with NumPy < 2).
df_preds[columns].astype("int64")

Unnamed: 0,driver_hash,Baseline GPS,Baseline Accel,RandForest,CatBoost,IsolForest,k-Means,LSTM
0,-9218579406240981296,0,0,0,0,0,0,0
1,-9148378939849570226,0,0,0,0,0,0,0
2,-9090289600630456229,0,0,0,1,0,0,0
3,-9073383204587901813,0,0,0,0,0,0,0
4,-9067392653059450527,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...
312,8979134000488028450,0,0,0,0,0,0,0
313,8979322487342770688,0,0,0,0,0,0,0
314,9028001926696043346,0,0,0,0,0,0,1
315,9142516420276355425,0,0,0,0,0,0,1


In [16]:
#df_preds[columns].astype(int).to_csv(f"{PREDS_PATH}/result.csv", index=False)

In [17]:
#!cat "{PREDS_PATH}/result.csv"

In [26]:
# Model columns ordered so that LSTM is the outermost level of the resulting
# index and RandForest the innermost.
columns = ["LSTM", "k-Means", "IsolForest", "CatBoost", "RandForest"]
# How often each combination of model verdicts occurs, ordered by the
# combination itself.
df_preds[columns].value_counts().sort_index()

LSTM  k-Means  IsolForest  CatBoost  RandForest
0.0   0.0      0.0         0.0       0.0           229
                                     1.0             3
                           1.0       0.0            22
                                     1.0            12
               1.0         0.0       0.0             2
                                     1.0             1
      1.0      0.0         0.0       0.0             1
               1.0         1.0       1.0             2
1.0   0.0      0.0         0.0       0.0            30
                                     1.0             1
                           1.0       0.0             1
      1.0      0.0         0.0       0.0             1
               1.0         1.0       1.0            12
dtype: int64