In [6]:
# Importing libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # Example model
from sklearn.metrics import make_scorer, accuracy_score

# MERGE ALL THE DATA 


In [7]:
# Importing datasets
filepaths_left={
    "pvs1_gps_mpu" : r"archive/PVS 1/dataset_gps_mpu_left.csv",
    "pvs2_gps_mpu" : r"archive/PVS 2/dataset_gps_mpu_left.csv",
    "pvs3_gps_mpu" : r"archive/PVS 3/dataset_gps_mpu_left.csv",
    "pvs4_gps_mpu" : r"archive/PVS 4/dataset_gps_mpu_left.csv",
    "pvs5_gps_mpu" : r"archive/PVS 5/dataset_gps_mpu_left.csv",
    "pvs6_gps_mpu" : r"archive/PVS 6/dataset_gps_mpu_left.csv",
    "pvs7_gps_mpu" : r"archive/PVS 7/dataset_gps_mpu_left.csv",
    "pvs8_gps_mpu" : r"archive/PVS 8/dataset_gps_mpu_left.csv",
    "pvs9_gps_mpu" : r"archive/PVS 9/dataset_gps_mpu_left.csv"
}

filepaths_right={
    "pvs1_gps_mpu" : r"archive/PVS 1/dataset_gps_mpu_right.csv",
    "pvs2_gps_mpu" : r"archive/PVS 2/dataset_gps_mpu_right.csv",
    "pvs3_gps_mpu" : r"archive/PVS 3/dataset_gps_mpu_right.csv",
    "pvs4_gps_mpu" : r"archive/PVS 4/dataset_gps_mpu_right.csv",
    "pvs5_gps_mpu" : r"archive/PVS 5/dataset_gps_mpu_right.csv",
    "pvs6_gps_mpu" : r"archive/PVS 6/dataset_gps_mpu_right.csv",
    "pvs7_gps_mpu" : r"archive/PVS 7/dataset_gps_mpu_right.csv",
    "pvs8_gps_mpu" : r"archive/PVS 8/dataset_gps_mpu_right.csv",
    "pvs9_gps_mpu" : r"archive/PVS 9/dataset_gps_mpu_right.csv"


}

# Left and right corresponds to sensordata from either side of the cars
datasets_left = {name:pd.read_csv(path) for name,path in filepaths_left.items()}
datasets_right = {name:pd.read_csv(path) for name,path in filepaths_right.items()}

In [8]:
def merge(left, right, name ):
    pvs= pd.concat([left, right], axis=1)
    length= len(left)

    # Renaming labels
    pvs.columns = [
    'timestamp', 'acc_x_dashboard_l', 'acc_y_dashboard_l', 'acc_z_dashboard_l',
    'acc_x_above_suspension_l', 'acc_y_above_suspension_l', 'acc_z_above_suspension_l', 
    'acc_x_below_suspension_l', 'acc_y_below_suspension_l', 'acc_z_below_suspension_l', 
    'gyro_x_dashboard_l', 'gyro_y_dashboard_l', 'gyro_z_dashboard_l', 'gyro_x_above_suspension_l', 
    'gyro_y_above_suspension_l', 'gyro_z_above_suspension_l', 'gyro_x_below_suspension_l', 
    'gyro_y_below_suspension_l', 'gyro_z_below_suspension_l', 'mag_x_dashboard_l', 'mag_y_dashboard_l', 
    'mag_z_dashboard_l', 'mag_x_above_suspension_l', 'mag_y_above_suspension_l', 'mag_z_above_suspension_l', 
    'temp_dashboard_l', 'temp_above_suspension_l', 'temp_below_suspension_l', 'timestamp_gps', 
    'latitude', 'longitude', 'speed', 'timestamp', 'acc_x_dashboard_r', 'acc_y_dashboard_r', 
    'acc_z_dashboard_r', 'acc_x_above_suspension_r', 'acc_y_above_suspension_r', 
    'acc_z_above_suspension_r', 'acc_x_below_suspension_r', 'acc_y_below_suspension_r', 
    'acc_z_below_suspension_r', 'gyro_x_dashboard_r', 'gyro_y_dashboard_r', 'gyro_z_dashboard_r', 
    'gyro_x_above_suspension_r', 'gyro_y_above_suspension_r', 'gyro_z_above_suspension_r', 
    'gyro_x_below_suspension_r', 'gyro_y_below_suspension_r', 'gyro_z_below_suspension_r', 
    'mag_x_dashboard_r', 'mag_y_dashboard_r', 'mag_z_dashboard_r', 'mag_x_above_suspension_r', 
    'mag_y_above_suspension_r', 'mag_z_above_suspension_r', 'temp_dashboard_r', 'temp_above_suspension_r', 
    'temp_below_suspension_r', 'timestamp_gps', 'latitude', 'longitude', 'speed'
    ]
    

    # Remove duplicate columns (those with the same name, e.g., 'timestamp_gps', 'latitude', etc.)
    pvs_removed = pvs.loc[:, ~pvs.columns.duplicated()]

    # Columns to keep. We try these first
    to_keep=["timestamp",
             "acc_x_dashboard_l",
             "acc_y_dashboard_l",
             "acc_z_dashboard_l",
             "speed",
             "gyro_x_dashboard_l",
             "gyro_y_dashboard_l",
             "gyro_z_dashboard_l"
            ]
    
    pvs_removed=pvs_removed[to_keep]

    #create the driver column ( target )
    if name in ("pvs1_gps_mpu","pvs4_gps_mpu","pvs7_gps_mpu"):
        pvs_removed['Driver'] = 1
    elif name in ("pvs2_gps_mpu","pvs5_gps_mpu","pvs8_gps_mpu"):
        pvs_removed['Driver'] = 2
    else: 
        pvs_removed['Driver'] = 3

    return pvs_removed

### CREATE TARGETS
We want to predict the driver, and we know that each PVS contains data regarding one single driver. 
As a conseguence, we can simply create a dataframe containing the same values repeated n times for each PVS.

The datasets refer to the following drivers: 

PVS1--> 1

PVS2--> 2

PVS3--> 3

PVS4--> 1

PVS5--> 2

PVS6--> 3

PVS7--> 1

PVS8--> 2

PVS9--> 3


In [9]:
# Using the merge function to create the new versions of the datasets
datasets = {name: merge(datasets_left[name],datasets_right[name], name) for name in datasets_right.keys()}

In [10]:
#print all the features 
columns=set()
for name,dataset in datasets.items():
    columns.update(list(dataset.columns))

for el in columns:
    print(el)

gyro_y_dashboard_l
gyro_z_dashboard_l
timestamp
acc_x_dashboard_l
acc_z_dashboard_l
acc_y_dashboard_l
gyro_x_dashboard_l
Driver
speed


In [11]:
datasets["pvs1_gps_mpu"].head()

Unnamed: 0,timestamp,acc_x_dashboard_l,acc_y_dashboard_l,acc_z_dashboard_l,speed,gyro_x_dashboard_l,gyro_y_dashboard_l,gyro_z_dashboard_l,Driver
0,1577219000.0,0.365116,0.167893,9.793961,0.009128,-0.133896,-0.018883,0.138092,1
1,1577219000.0,0.392649,0.176273,9.771216,0.009128,-0.027084,-0.003624,0.000763,1
2,1577219000.0,0.409408,0.181062,9.732909,0.009128,0.125504,-0.186729,-0.09079,1
3,1577219000.0,0.371101,0.164302,9.749668,0.009128,-0.08812,-0.034142,0.046539,1
4,1577219000.0,0.390255,0.159514,9.869378,0.009128,-0.179672,0.118446,-0.182343,1


# WORK ON CORRELATION (OPTIONAL SO FAR)

In [None]:
def compute_correlation(df,plot=False):

    # Only compute using the numeric data types in the dataset
    numeric_df = df.select_dtypes(include=['number'])
    
    #print(numeric_df.columns)
    correlation_matrix=numeric_df.corr()

    if not plot:
        return correlation_matrix
        
    # Create the heatmap
    plt.figure(figsize=(18, 14))  # Increase the size of the figure
    
    # Plot the heatmap with adjusted figure size
    sns.heatmap(correlation_matrix, annot=False, fmt='.2f', cmap='coolwarm', linewidths=0.5,vmin=-1, vmax=1)
    
    # Rotate the tick labels for better readability and shrink font size
    plt.xticks(rotation=90, fontsize=8)  # Rotate x-axis labels by 90 degrees and adjust font size
    plt.yticks(rotation=0, fontsize=8)   # Keep y-axis labels horizontal and adjust font size
    
    # Add a title with smaller font size
    plt.title("Correlation Matrix", fontsize=12)
    
    # Show the plot with a tight layout
    plt.tight_layout()  # Adjust layout to prevent clipping
    plt.show()

    return correlation_matrix

In [22]:
datasets_correlation={name: compute_correlation(datasets[name]) for name in datasets.keys()}

## REDUCE THE DATA
Try to reduce the data by merging k consecutive lines ( reduce by a factor of k )

In [13]:
#reduce the number of rows of the dataset
def reduce_df(df,ratio):
    new = pd.DataFrame()
    for i in range(0, len(df) - 1, ratio):  # Step by 2 to get pairs

        if i%10000==0:
            print(i)
            
        row1 = df.iloc[i]
        row2 = df.iloc[i + 1]
        # Calculate the mean of the pair (assuming numeric data)
        merged_row = (row1 + row2) / 2
        
        # Append the result as a new row to `new`
        new = pd.concat([new, merged_row.to_frame().T], ignore_index=True)
        
    return new

ratio=20
datasets_reduced={name:reduce_df(datasets[name],ratio) for name in datasets.keys()}




0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
0
10000
20000
30000
40000
50000
60000
70000
80000
90000


## WORK ON A WINDOW

In [14]:
def delete_timestamp(df):
    to_remove=["timestamp_max","timestamp_min","timestamp_mean","timestamp_STD"]
    new= df.drop(columns=to_remove, axis=1, errors='ignore')

    return new

def update_name(df):
    #print(df.columns)
    #df = df.loc[:, ~df.columns.str.contains('timestamp')]  # Remove columns containing 'timestamp'
    new_column_names = [ 'timestamp_max','acc_x_dashboard_l_max', 'acc_y_dashboard_l_max', 
        'acc_z_dashboard_l_max', 'speed_max', 'gyro_x_dashboard_l_max', 
        'gyro_y_dashboard_l_max', 'gyro_z_dashboard_l_max', 
        'timestamp_min', 'acc_x_dashboard_l_min', 'acc_y_dashboard_l_min', 
        'acc_z_dashboard_l_min', 'speed_min', 'gyro_x_dashboard_l_min', 
        'gyro_y_dashboard_l_min', 'gyro_z_dashboard_l_min', 
        'timestamp_mean', 'acc_x_dashboard_l_mean', 'acc_y_dashboard_l_mean', 
        'acc_z_dashboard_l_mean', 'speed_mean', 'gyro_x_dashboard_l_mean', 
        'gyro_y_dashboard_l_mean', 'gyro_z_dashboard_l_mean', 
        'timestamp_STD_', 'acc_x_dashboard_l_STD_', 'acc_y_dashboard_l_STD_', 
        'acc_z_dashboard_l_STD_', 'speed_STD_', 'gyro_x_dashboard_l_STD_', 
        'gyro_y_dashboard_l_STD_', 'gyro_z_dashboard_l_STD_',
         'acc_x_dashboard_l_jerk', 'acc_y_dashboard_l_jerk', 
        'acc_z_dashboard_l_jerk', 'speed_jerk', 'gyro_x_dashboard_l_jerk', 
        'gyro_y_dashboard_l_jerk', 'gyro_z_dashboard_l_jerk'
    ]
    df.columns = new_column_names
    return df



#let's define a window
def computeWindow(name,df, windowSize=100):
    print("--------------------------")
    print(f"computing dataset {name}")
    
    #define empty dataframe  

    X= df.iloc[:, :-1]
    Y= df.iloc[:len(df)-windowSize+1, -1]
    windowed_df=pd.DataFrame()

    #define starting and ending index
    for start_idx in range(len(df)-windowSize+1):
        end_idx=start_idx+windowSize

        #extract rows belonging to the window
        window=X.iloc[start_idx:end_idx]

        #take beginning timestamp and ending timestamp
        start_timestamp=window.iloc[0,0]
        end_timestamp=window.iloc[-1,0]

        
        #keep only sensor data
        sensor_data_window=window.iloc[:,1:]

        #compute metrics for the specific window
        max_values=window.max()
        min_values=window.min()
        mean_values=window.mean()
        std_values=window.std()
        jerk_values=(sensor_data_window.iloc[-1]-sensor_data_window.iloc[0])/(end_timestamp-start_timestamp)

        #concate them (place side by side)
        new_row=pd.concat([max_values,min_values,mean_values,std_values,jerk_values])
        new_row = new_row.to_frame().T 

        windowed_df = pd.concat([windowed_df, new_row], ignore_index=True)

        if start_idx%10000==0:
            print(start_idx)

    #place aside again  X and Y
    final=pd.concat([delete_timestamp(update_name(windowed_df)),Y], axis=1)
            
            
    return final


In [15]:
def create_model(type):
    if type=="RandomForest":
        return RandomForestClassifier(n_estimators=100, random_state=42)

def evaluate_model(dfs,window_size):
    #create the window based on the window size 
    #window the datasets
    dfs_windowed={name:computeWindow(name,dfs[name]) for name in dfs.keys()}

    #create a single df out of the windowed dfs (append them)
    df = pd.concat(dfs_windowed.values(), axis=0, ignore_index=True)

    #take X and y
    X=df.iloc[:,:-1]
    y=df.iloc[:,-1]


    #split data into train and test set 
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=42)


    #create and fit the model 
    model_type="RandomForest"
    model=create_model(model_type)

    #fit the model 
    model.fit(X_train,y_train) #check again this 

    #evaluate the model
    y_pred=model.predict(X_test)
    print(type(y_pred))
    print(type(y_testy_test.to_numpy()))

    #compute accuracy
    accuracy=accuracy_score( y_test.to_numpy(), y_pred)
    return accuracy

def tuning_and_evaluation(dfs,window_sizes):

    #initialize optimal results
    best_accuracy=0
    best_window_size=0

    for window_size in window_sizes:
        print(f"--------------------")
        print(f"EVALUATE window_size: {window_size}")
        
        accuracy=evaluate_model(dfs,window_size)
        
        #update optimal results if needed
        if accuracy>best_accuracy:
            best_accuracy=accuracy
            best_window_size=window_size
            
        print(f"Best window size: {best_window_size} with accuracy: {best_accuracy}")

    

In [16]:
def create_final(dfs,window_size):
    dfs_windowed={name:computeWindow(name,dfs[name]) for name in dfs.keys()}

    #create a single df out of the windowed dfs (append them)
    df = pd.concat(dfs_windowed.values(), axis=0, ignore_index=True)

    #take X and y
    X=df.iloc[:,:-1]
    y=df.iloc[:,-1]

    return X,y
    
def evaluate(X,y,num_folds):
    #split data into train and test set 
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=42)

    print(type(X_train))
    #create and fit the model 
    model_type="RandomForest"
    model=create_model(model_type)

    fold_accuracies = []
    
    for train_index, val_index in kf.split(X_train):
        X_ttrain, X_val = X_train[train_index], X_train[val_index] 
        y_ttrain, y_val = y_train[train_index], y_train[val_index]

        
            
        model.fit(X_ttrain, y_ttrain) 
        y_pred = model.predict(X_val) 
        accuracy = accuracy_score(y_val, y_pred) 
        print([[y_val==y_pred]==True].count())
        print('accuracy:', accuracy)
        fold_accuracies.append(accuracy)

    average_accuracy = sum(fold_accuracies) / num_folds
    print('average of fold',average_accuracy)

    return average_accuracy

    

In [12]:
#window the datasets
#datasets_windowed=evaluate_model(datasets_reduced,100)
window_sizes=[100]


X,y=create_final(datasets_reduced,100)


--------------------------
computing dataset pvs1_gps_mpu
0
10000
20000
30000
--------------------------
computing dataset pvs2_gps_mpu
0
10000
20000
30000
--------------------------
computing dataset pvs3_gps_mpu
0
10000
20000
--------------------------
computing dataset pvs4_gps_mpu
0
10000
20000
30000
--------------------------
computing dataset pvs5_gps_mpu
0
10000
20000
30000
--------------------------
computing dataset pvs6_gps_mpu
0
10000
20000
--------------------------
computing dataset pvs7_gps_mpu
0
10000
20000
30000
--------------------------
computing dataset pvs8_gps_mpu
0
10000
20000
30000
--------------------------
computing dataset pvs9_gps_mpu
0
10000
20000


In [15]:
from sklearn.model_selection import KFold
num_folds = 5
kf = KFold(n_splits=num_folds)

#print(X.to_numpy().shape)
evaluate(X.to_numpy(),y.to_numpy(),num_folds)

<class 'numpy.ndarray'>


KeyboardInterrupt: 

In [55]:
#CONCATENATE ALL OF THEM AND FIND THE FINAL DATASET 
combined_df = pd.concat(datasets_windowed.values(), axis=0, ignore_index=True)
combined_df.head()
#combined_df.size()
print(type(combined_df))
print(combined_df.shape)

<class 'pandas.core.frame.DataFrame'>
(53157, 37)


## WORK AGAIN ON CORRELATION (NOW INTERESTING)

In [15]:
corr_matrix_final=compute_correlation(combined_df,False)

In [18]:
def remove_highly_correlated(df,corr,threshold=0.9):
    #take absoulute values
    corr_matrix_abs=corr.abs()
    
    #take only upper diagonal 
    mask = np.triu(np.ones_like(corr, dtype=bool), k=1)
    
    highly_correlated_features=np.where(corr_matrix_abs*mask>threshold)
    
    #print and store highly correlated features
    
    to_drop=set()
    for i,j in zip(*highly_correlated_features):
        #print( f"highly correlated pair: {corr_matrix_abs.columns[i]}-{corr_matrix_abs.columns[j]},(correlation: {corr_matrix_abs.iloc[i, j]:.2f})")
        to_drop.add(corr_matrix_abs.columns[j])
        #print(corr_matrix_abs.columns[j])
    
    #remove to_drop columns
    cleaned_df=df.drop(columns=to_drop)
    print(f"droopped columns {to_drop}")
    #print(f"removed {count} columns")
    print(f"new dataframe has shape {cleaned_df.shape}")

    return cleaned_df,to_drop

In [30]:
cleaned_final_df, dropped_columns = remove_highly_correlated(combined_df, corr_matrix_final, 0.6)

final_metrics_correlation = {name: compute_correlation(cleaned_final_df[name]) for name in cleaned_final_df.columns}


droopped columns {'speed_min', 'gyro_y_dashboard_l_max', 'gyro_y_dashboard_l_STD_', 'acc_z_dashboard_l_STD_', 'gyro_x_dashboard_l_max', 'gyro_z_dashboard_l_mean', 'acc_z_dashboard_l_max', 'acc_x_dashboard_l_STD_', 'acc_y_dashboard_l_STD_', 'acc_y_dashboard_l_max', 'acc_x_dashboard_l_min', 'gyro_x_dashboard_l_min', 'speed_mean', 'acc_z_dashboard_l_min', 'acc_y_dashboard_l_min', 'gyro_y_dashboard_l_min', 'gyro_x_dashboard_l_STD_'}
new dataframe has shape (269336, 19)
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\bruker\venv\lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\bruker\AppData\Local\Temp\ipykernel_38728\2447120228.py", line 3, in <module>
    final_metrics_correlation = {name: compute_correlation(cleaned_final_df[name]) for name in cleaned_final_df.columns}
  File "C:\Users\bruker\AppData\Local\Temp\ipykernel_38728\2447120228.py", line 3, in <dictcomp>
    final_metrics_correlation = {name: compute_correlation(cleaned_final_df[name]) for name in cleaned_final_df.columns}
  File "C:\Users\bruker\AppData\Local\Temp\ipykernel_38728\3691109450.py", line 2, in compute_correlation
    numeric_df = df.select_dtypes(include=['number'])
  File "c:\Users\bruker\venv\lib\site-packages\pandas\core\generic.py", line 5575, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'select_dtypes'

D

In [17]:
from sklearn.model_selection import KFold
num_folds = 5
kf = KFold(n_splits=num_folds,shufX_train,y_train,X_test,y_testfle=True,random_state=42)



#APPLY CROSS-FOLDER EVALUATION
X_train,y_train,X_test,y_test=create_final_bis(datasets_reduced,100)

model=create_model(model_type)
for window_size in window_sizes:
        print(f"--------------------")
        print(f"EVALUATE window_size: {window_size}")
        X_train,y_train,X_test,y_test=create_final_bis(datasets_reduced,window_size)
        
        accuracy=evaluate(X_train,y_train,X_test,y_test,num_folds,"SVM")
        tot_accuracies.append(accuracy)
        
        print(f"accuracy is: {accuracy}")
        
        #update optimal results if needed
        if accuracy>best_accuracy:
            best_accuracy=accuracy
            best_window_size=window_size


#print(X.to_numpy().shape)
#evaluate(X_train.to_numpy(),y_train.to_numpy(),X_test.to_numpy(),y_test.to_numpy(),num_folds,"SVM")

SyntaxError: positional argument follows keyword argument (1572955169.py, line 3)

In [None]:
# Plotting correlation matrix function

def plot_corr_mat(numeric_df):

    #print(numeric_df.columns)
    correlation_matrix=numeric_df.corr()
        
    # Create the heatmap
    plt.figure(figsize=(18, 14))  # Increase the size of the figure
    
    # Plot the heatmap with adjusted figure size
    sns.heatmap(correlation_matrix, annot=False, fmt='.2f', cmap='coolwarm', linewidths=0.5,vmin=-1, vmax=1)
    
    # Rotate the tick labels for better readability and shrink font size
    plt.xticks(rotation=90, fontsize=8)  # Rotate x-axis labels by 90 degrees and adjust font size
    plt.yticks(rotation=0, fontsize=8)   # Keep y-axis labels horizontal and adjust font size
    
    # Add a title with smaller font size
    plt.title("Correlation Matrix", fontsize=12)
    
    # Show the plot with a tight layout
    plt.tight_layout()  # Adjust layout to prevent clipping
    # Save to a file
    plt.savefig('features_correlation.png')
    
    # Optionally, show the plot if you want to see it in Colab
    plt.show()