# Problemstellung

3 bekannte Fahrer (Florian, Matthias, Fabian) und 1 unbekannter Fahrer. Der unbekannte Datensatz soll einem bekannten Fahrer zugeordnet werden.
Multiclass Classification mit Confidence Score

# Lösungsansatz



In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.decomposition import LatentDirichletAllocation
import seaborn as sns
import featuretools as ft
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as mp
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter

non_feature_cols = {
    "driver_id",
    "window_id",
    "timestamp",
    "sample_id"
}

  import pkg_resources


In [2]:
def readData(location : str):
    return pd.read_csv(location, header=0)

def prepareDriverData(dataframe : pd.DataFrame, driverId : int) -> pd.DataFrame:
    dataframe = dataframe.apply(pd.to_numeric, errors="coerce").drop([0,1]).interpolate().dropna(axis=1, how="all")
    dataframe = dataframe.loc[:, (dataframe != dataframe.iloc[0]).any()]
    dataframe["driver_id"] = driverId
    return dataframe

def segmentData(data : pd.DataFrame, windowSize : int, overlap : int) -> pd.DataFrame:
    if windowSize <= 0:
        raise ValueError("windowSize muss > 0 sein")

    data = data.sort_values("timestamp").reset_index(drop=True)
    data["window_id"] = -1

    windowId = 0
    for start in range(0, len(data), windowSize):
        end = min(start + windowSize, len(data))
        data.loc[start:end - 1, "window_id"] = windowId
        windowId += 1

    return data

def readPrepareAndSegmentData(locations : list[str], windowSize : int, overlap : int) -> list[pd.DataFrame]:
    dataframes = []
    for i, location in enumerate(locations):
        dataframes.append(segmentData(prepareDriverData(readData(location), i), windowSize, overlap))
    return dataframes

def joinAndReduceData(dataframes : list[pd.DataFrame]) -> pd.DataFrame:
    jointData = pd.concat(dataframes, ignore_index=True).fillna(0)
    print(jointData[["window_id","driver_id"]])

    rotationVelocities = jointData[["car0_wheel0_rot_vel", "car0_wheel1_rot_vel", "car0_wheel2_rot_vel", "car0_wheel3_rot_vel"]]
    jointData = jointData.drop(columns=["car0_rpm", "car0_engine_rpm", "car0_velocity_vehicle", "car0_wheel0_rot_vel", "car0_wheel1_rot_vel", "car0_wheel2_rot_vel", "car0_wheel3_rot_vel"])
    jointData["car0_wheel_avg_rot_vel"] = rotationVelocities.mean(axis=1)
    return jointData

def generateFeaturesFeatureTools(data : pd.DataFrame):

    drivers = (
        data[["driver_id"]]
        .drop_duplicates()
        .set_index("driver_id")
    )

    windows = (
        data[["window_id", "driver_id"]]
        .drop_duplicates()
        .set_index("window_id")
    )

    samples = data.copy()
    samples["sample_id"] = range(len(samples))
    samples = samples.set_index("sample_id")


    es = ft.EntitySet(id="sim_data")

    es = es.add_dataframe(
        dataframe_name="drivers",
        dataframe=drivers,
        index="driver_id"
    )

    es = es.add_dataframe(
        dataframe_name="windows",
        dataframe=windows,
        index="window_id"
    )

    es = es.add_dataframe(
        dataframe_name="samples",
        dataframe=samples,
        index="sample_id",
        time_index="timestamp"
    )

    
    es = es.add_relationship(
        "drivers", "driver_id",
        "windows", "driver_id"
    )

    es = es.add_relationship(
        "windows", "window_id",
        "samples", "window_id"
    )

    cutoff_times = (
        data.groupby("window_id")["timestamp"]
        .max()
        .reset_index()
        .rename(columns={
            "window_id": "instance_id",
            "timestamp": "time"
        })
    )

    feature_matrix, feature_defs = ft.dfs(
        entityset=es,
        target_dataframe_name="windows",
        agg_primitives=["mean", "min", "max", "std"],
        cutoff_time=cutoff_times,
        max_depth=1
    )


    return feature_matrix, feature_defs

def generateFeatures(data: pd.DataFrame) -> pd.DataFrame:
    data = data[data["window_id"] != -1]

    feature_cols = [
        c for c in data.columns
        if c not in non_feature_cols
        and pd.api.types.is_numeric_dtype(data[c])
    ]

    features = (
        data
        .groupby(["driver_id", "window_id"])[feature_cols]
        .agg(["mean", "min", "max", "std"])
    )

    features.columns = [
        f"{col}_{stat}" for col, stat in features.columns
    ]

    return features.reset_index()

def groupFeaturesByName(df: pd.DataFrame) -> list[pd.DataFrame]:
    groups = defaultdict(list)

    for col in df.columns:
        if "_" not in col:
            continue

        base_name = col.rsplit("_", 1)[0]
        groups[base_name].append(col)

    grouped_dfs = [
        df[cols].copy()
        for cols in groups.values()
    ]

    return grouped_dfs

def clusterGroups(
    groups: list[pd.DataFrame],
    eps: float = 0.5,
    min_samples: int = 3
) -> pd.DataFrame:

    clusters = pd.DataFrame(index=groups[0].index)

    for i, group in enumerate(groups):
        X = StandardScaler().fit_transform(group.values)

        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X)

        clusters[f"group_{i}_cluster"] = labels

    return clusters

In [3]:
recordings = [
    "./recordings/recording_fabian_2.csv",
    "./recordings/recording_florian_2.csv",
    "./recordings/recording_matthias_2.csv"
]

dataframes = readPrepareAndSegmentData(recordings, 50, 25)
data = joinAndReduceData(dataframes)

  return pd.read_csv(location, header=0)
  return pd.read_csv(location, header=0)
  return pd.read_csv(location, header=0)


        window_id  driver_id
0               0          0
1               0          0
2               0          0
3               0          0
4               0          0
...           ...        ...
128532        918          2
128533        918          2
128534        918          2
128535        918          2
128536        918          2

[128537 rows x 2 columns]


In [4]:
featureMatrix = generateFeatures(data)

In [None]:
groups = groupFeaturesByName(featureMatrix.drop(columns=["driver_id","window_id"]))

In [None]:
clusters = clusterGroups(groups, eps=0.7, min_samples=1)

In [None]:
clustersClean = clusters.replace(-1, 0)
lda = LatentDirichletAllocation(n_components=29)
topics = lda.fit_transform(clustersClean)

In [10]:
trainData = pd.read_csv("./preprocessed/JointRecordingsAggregations.csv")
X = trainData.drop(columns=["driver_id", "window_id"])
TRAIN_FEATURES = X.columns.tolist()
y = trainData["driver_id"]

rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier()
svm = SVC(probability=True)
knn = KNeighborsClassifier(n_neighbors=7)

ensemble = VotingClassifier(
    estimators=[
        ("rf", rf),
        ("gb", gb),
        ("svm", svm),
        ("knn", knn)
    ],
    voting="soft"
)

ensemble.fit(X, y)


0,1,2
,estimators,"[('rf', ...), ('gb', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,n_neighbors,7
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [None]:
unknown = readPrepareAndSegmentData(["./recordings/recording_fabian_1.csv"], 50, 25)
unknown_df = joinAndReduceData(unknown)
unknown_features = generateFeatures(unknown_df)

X_unknown = unknown_features.drop(columns=["driver_id", "window_id"], errors="ignore")
X_unknown = X_unknown.reindex(columns=TRAIN_FEATURES, fill_value=0)


probs = ensemble.predict_proba(X_unknown)
preds = ensemble.classes_[np.argmax(probs, axis=1)]
final_driver = Counter(preds).most_common(1)[0]
print(final_driver)

  return pd.read_csv(location, header=0)


        window_id  driver_id
0               0          0
1               0          0
2               0          0
3               0          0
4               0          0
...           ...        ...
112491       2249          0
112492       2249          0
112493       2249          0
112494       2249          0
112495       2249          0

[112496 rows x 2 columns]
(np.int64(0), 2225)


In [None]:
data.to_csv("./preprocessed/JointData.csv")

In [9]:
featureMatrix.to_csv("./preprocessed/JointRecordingsAggregations.csv")