In [1]:
import os
from datetime import datetime

import dask.dataframe as dd
import pandas as pd
from sqlalchemy import create_engine, text


# NOTE(review): 'ignore' with no category silences every warning for the rest
# of the session, including deprecation notices raised by the cells below.
from warnings import filterwarnings 
filterwarnings('ignore')

In [2]:
%%time

# set to your own desktop name
# NOTE(review): this looks up an environment variable literally named
# 'MANGOSTEEN' and the result is not used anywhere in this cell; the server
# name is hard-coded below instead. Kept so the name stays defined.
PC_name = os.environ.get('MANGOSTEEN')

# connect to database in mssql
# Raw string: '\S' is not a valid escape sequence, so the backslash that
# separates host and instance must not go through escape processing.
server = r'MANGOSTEEN\SQLEXPRESS'  # SQL Server name
database = "PAI" # database name
con_string = 'mssql+pyodbc://{}/{}?driver=SQL Server'.format(server, database)
engine = create_engine(con_string)

# retrieve data from the database
# The context manager closes the connection even when a query raises, and
# text() keeps the raw SQL strings executable on SQLAlchemy 1.4 and 2.x.
with engine.connect() as connection:
    # driver data
    driver = connection.execute(text('SELECT * FROM Tempdriver'))
    driver_data = pd.DataFrame(data=driver.fetchall(), columns=driver.keys())

    # trip data
    safety = connection.execute(text('SELECT * FROM Tempsafety'))
    safety_data = pd.DataFrame(data=safety.fetchall(), columns=safety.keys())

CPU times: total: 31.2 ms
Wall time: 659 ms


In [3]:
%%time

# get sensor data by chunksize
SENSOR_CHUNK_ROWS = 10**5  # rows fetched from the database per chunk

engine = create_engine('mssql+pyodbc://{}/{}?driver=SQL Server'.format(server, database))

# Read through the streaming connection itself. Previously this connection was
# opened but the URL string was handed to pandas instead, so the streaming
# option was never applied and the connection was never closed.
with engine.connect() as base_connection:
    connection = base_connection.execution_options(stream_results=True)
    sensor_data_generator = pd.read_sql_query(
        'SELECT * FROM TempSensor', connection, chunksize=SENSOR_CHUNK_ROWS
    )
    # The chunks must be consumed while the connection is still open.
    sensor_data = pd.concat(sensor_data_generator)

CPU times: total: 13.5 s
Wall time: 1min 8s


In [4]:
# Remove exact duplicate rows from each source table before merging.
driver_data = driver_data.drop_duplicates()
# The deduplicated trips are written back to `safety_data` because that is the
# name the merge step reads. Previously the result was stored only in
# `trip_data`, so the merge used the table with duplicates still in it.
safety_data = safety_data.drop_duplicates()
trip_data = safety_data  # alias kept for any code that refers to trip_data
sensor_data = sensor_data.drop_duplicates()

In [5]:
%%time

# Attach driver attributes to each safety record; the left join keeps every
# row of safety_data.
driver_safety = pd.merge(safety_data, driver_data, how='left', on='driver_id')

# Attach the combined trip/driver record to each sensor reading; the left join
# keeps every row of sensor_data.
driver_safety_sensor = pd.merge(sensor_data, driver_safety, how='left', on='bookingID')

CPU times: total: 1.42 s
Wall time: 3.43 s


In [6]:
# Preview the first rows of the merged sensor/trip/driver frame.
driver_safety_sensor.head()


Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,speed,driver_id,label,driver_name,date_of_birth,years_of_exp,gender,car_brand,car_model_year,driver_rating
0,1176821039233,12.0,254.91,-0.087,-9.521,2.28,-0.0775,0.128,-0.0018,653.0,20.2,173,1,Colin Loudiane,1978-08-04,18,Male,Mazda,1992,4.3
1,712964571194,8.0,156.62,-0.265,-9.44,-1.28,-0.111,-0.1137,-0.0608,314.0,17.54,211,0,Hymie Hunnicutt,1981-05-20,17,Male,Hyundai,1994,3.7
2,678604832838,3.9,67.0,0.165,10.082,2.112,0.0317,0.0435,-0.0142,454.0,8.03,449,0,Caitrin Ardling,1979-02-28,14,Female,Audi,2004,4.4
3,558345748481,3.98,93.0,-0.329,9.648,-2.441,-0.0076,-0.0021,-0.0002,519.0,0.02,477,1,Sarita Mapples,1974-05-18,21,Female,BMW,2012,4.9
4,738734374915,3.9,4.0,0.947,9.694,2.628,0.0207,-0.0019,-0.0011,608.0,14.02,94,1,Boony Keat,1982-03-05,14,Male,Volkswagen,1999,2.6


In [7]:
# Show dtypes and non-null counts per column of the merged frame.
driver_safety_sensor.info(show_counts=True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7346552 entries, 0 to 7346551
Data columns (total 20 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   bookingID       7346552 non-null  object 
 1   Accuracy        7206507 non-null  float64
 2   Bearing         7157675 non-null  float64
 3   acceleration_x  7152447 non-null  float64
 4   acceleration_y  7136515 non-null  float64
 5   acceleration_z  7267506 non-null  float64
 6   gyro_x          7189579 non-null  float64
 7   gyro_y          7209823 non-null  float64
 8   gyro_z          7180187 non-null  float64
 9   second          7346552 non-null  float64
 10  speed           7234726 non-null  float64
 11  driver_id       7346552 non-null  int64  
 12  label           7346552 non-null  int64  
 13  driver_name     7346552 non-null  object 
 14  date_of_birth   7346552 non-null  object 
 15  years_of_exp    7346552 non-null  int64  
 16  gender          7346552 non-null  ob

# **ETL Above, Data Cleansing Starts Here**

In [8]:
# Columns carried into the cleansing stage, in their output order.
taxi_columns = [
    'bookingID', 'Accuracy', 'Bearing',
    'acceleration_x', 'acceleration_y', 'acceleration_z',
    'gyro_x', 'gyro_y', 'gyro_z',
    'second', 'speed',
    'driver_id', 'label', 'driver_name', 'date_of_birth', 'years_of_exp',
    'gender', 'car_brand', 'car_model_year', 'driver_rating',
]
taxi_data = driver_safety_sensor[taxi_columns]
taxi_data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7346552 entries, 0 to 7346551
Data columns (total 20 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   bookingID       7346552 non-null  object 
 1   Accuracy        7206507 non-null  float64
 2   Bearing         7157675 non-null  float64
 3   acceleration_x  7152447 non-null  float64
 4   acceleration_y  7136515 non-null  float64
 5   acceleration_z  7267506 non-null  float64
 6   gyro_x          7189579 non-null  float64
 7   gyro_y          7209823 non-null  float64
 8   gyro_z          7180187 non-null  float64
 9   second          7346552 non-null  float64
 10  speed           7234726 non-null  float64
 11  driver_id       7346552 non-null  int64  
 12  label           7346552 non-null  int64  
 13  driver_name     7346552 non-null  object 
 14  date_of_birth   7346552 non-null  object 
 15  years_of_exp    7346552 non-null  int64  
 16  gender          7346552 non-null  ob

In [None]:
# Keep only the readings that have a value in 'second'.
taxi_data = taxi_data[taxi_data['second'].notna()]

In [11]:
# Number of missing values in each column of taxi_data.
null_counts = taxi_data.isna().sum()
print(null_counts)

bookingID              0
Accuracy          140045
Bearing           188877
acceleration_x    194105
acceleration_y    210037
acceleration_z     79046
gyro_x            156973
gyro_y            136729
gyro_z            166365
second                 0
speed             111826
driver_id              0
label                  0
driver_name            0
date_of_birth          0
years_of_exp           0
gender                 0
car_brand              0
car_model_year         0
driver_rating          0
dtype: int64


In [9]:
# Work on an independent copy so the steps below leave taxi_data untouched.
custom_copy = taxi_data.copy()

In [10]:
# Order the readings by trip and then by 'second' within each trip, and
# renumber the rows 0..n-1 in that order.
custom_copy = (
    custom_copy
    .sort_values(by=['bookingID', 'second'])
    .reset_index(drop=True)
)

In [11]:
# Show dtypes and non-null counts after sorting and re-indexing.
custom_copy.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7346552 entries, 0 to 7346551
Data columns (total 20 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   bookingID       7346552 non-null  object 
 1   Accuracy        7206507 non-null  float64
 2   Bearing         7157675 non-null  float64
 3   acceleration_x  7152447 non-null  float64
 4   acceleration_y  7136515 non-null  float64
 5   acceleration_z  7267506 non-null  float64
 6   gyro_x          7189579 non-null  float64
 7   gyro_y          7209823 non-null  float64
 8   gyro_z          7180187 non-null  float64
 9   second          7346552 non-null  float64
 10  speed           7234726 non-null  float64
 11  driver_id       7346552 non-null  int64  
 12  label           7346552 non-null  int64  
 13  driver_name     7346552 non-null  object 
 14  date_of_birth   7346552 non-null  object 
 15  years_of_exp    7346552 non-null  int64  
 16  gender          7346552 non-null  ob

In [12]:
# main program
# custom_copy = custom_copy.sort_values(['bookingID', 'second'])
# custom_copy = custom_copy.reset_index(drop=True)

# Sensor channels whose gaps are filled within each trip.
SENSOR_COLUMNS = (
    'Accuracy', 'Bearing',
    'acceleration_x', 'acceleration_y', 'acceleration_z',
    'gyro_x', 'gyro_y', 'gyro_z',
    'speed',
)


def interpolate_missing(group, columns=SENSOR_COLUMNS):
    """Fill the missing sensor readings of one trip.

    For every column in ``columns``, a missing value in the first or last row
    is replaced with 0 so that both ends are anchored, and the remaining gaps
    are filled with ``Series.interpolate()`` using its default settings.

    Only the listed columns are modified. The earlier version zero-filled
    every column of the first and last rows, so a missing non-sensor value
    there would have become 0 as well. It also modified the frame handed in
    by ``groupby.apply``; this version works on a copy.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single trip, in the order they should be interpolated.
    columns : iterable of str, optional
        Columns to fill. Defaults to the nine sensor channels.

    Returns
    -------
    pandas.DataFrame
        A frame with the same index and columns as ``group``. An empty
        ``group`` is returned unchanged.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``group``.
    """
    if group.empty:
        # No first/last row to anchor and nothing to fill.
        return group

    filled = group.copy()
    for column in columns:
        readings = filled[column].copy()
        # Anchor both ends so no leading or trailing gap is left unfilled.
        for edge in (0, -1):
            if pd.isna(readings.iloc[edge]):
                readings.iloc[edge] = 0
        filled[column] = readings.interpolate()
    return filled

# Run the gap filling independently for every trip, then renumber the rows.
per_booking = custom_copy.groupby('bookingID')
df_interpolated = per_booking.apply(interpolate_missing).reset_index(drop=True)

In [14]:
# Show dtypes and non-null counts after the per-trip interpolation.
df_interpolated.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7346552 entries, 0 to 7346551
Data columns (total 20 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   bookingID       7346552 non-null  object 
 1   Accuracy        7346552 non-null  float64
 2   Bearing         7346552 non-null  float64
 3   acceleration_x  7346552 non-null  float64
 4   acceleration_y  7346552 non-null  float64
 5   acceleration_z  7346552 non-null  float64
 6   gyro_x          7346552 non-null  float64
 7   gyro_y          7346552 non-null  float64
 8   gyro_z          7346552 non-null  float64
 9   second          7346552 non-null  float64
 10  speed           7346552 non-null  float64
 11  driver_id       7346552 non-null  int64  
 12  label           7346552 non-null  int64  
 13  driver_name     7346552 non-null  object 
 14  date_of_birth   7346552 non-null  object 
 15  years_of_exp    7346552 non-null  int64  
 16  gender          7346552 non-null  ob

In [15]:
# Number of missing values left in each column after interpolation.
null_counts = df_interpolated.isna().sum()
print(null_counts)

bookingID         0
Accuracy          0
Bearing           0
acceleration_x    0
acceleration_y    0
acceleration_z    0
gyro_x            0
gyro_y            0
gyro_z            0
second            0
speed             0
driver_id         0
label             0
driver_name       0
date_of_birth     0
years_of_exp      0
gender            0
car_brand         0
car_model_year    0
driver_rating     0
dtype: int64


In [15]:
# Parse the birth dates so their calendar components can be compared.
df_interpolated['date_of_birth'] = pd.to_datetime(df_interpolated['date_of_birth'])

# Calculate age in completed calendar years. Dividing elapsed days by 365
# ignores leap days, so over several decades the result can roll over to the
# next year before the birthday has actually been reached.
today = datetime.now()
birth = df_interpolated['date_of_birth']
# True where this year's birthday has not happened yet.
birthday_pending = (birth.dt.month > today.month) | (
    (birth.dt.month == today.month) & (birth.dt.day > today.day)
)
df_interpolated['age'] = today.year - birth.dt.year - birthday_pending.astype('int64')

df_interpolated.head(20)

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,...,driver_id,label,driver_name,date_of_birth,years_of_exp,gender,car_brand,car_model_year,driver_rating,age
0,0,8.0,143.3,-1.706,-9.271,-1.209,-0.029,-0.0327,0.0154,2.0,...,359,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53
1,0,8.0,143.3,-1.417,-9.548,-1.861,-0.0224,0.005,-0.0258,3.0,...,359,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53
2,0,8.0,143.3,-0.347,-9.533,-1.205,0.015,-0.05,0.0251,9.0,...,359,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53
3,0,8.0,143.3,-0.601,-9.452,-2.158,0.0045,-0.0117,-0.0041,11.0,...,359,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53
4,0,8.0,143.3,-0.598,-9.863,-1.673,-0.0004,0.0003,-0.0098,12.0,...,359,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53
5,0,8.0,143.3,-1.801,-9.196,-2.05,-0.00665,0.131,-0.0826,14.0,...,359,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53
6,0,8.0,143.3,0.045,-10.071,-1.198,-0.0129,-0.1066,0.0835,15.0,...,359,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53
7,0,8.0,143.3,0.499,-9.877,-2.003,-0.046,-0.0794,0.0368,16.0,...,359,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53
8,0,8.0,143.3,-1.324,-9.554,-1.951,0.0644,0.1137,-0.0836,17.0,...,359,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53
9,0,8.0,143.3,-0.493,-9.516,-1.826,0.0186,0.0,-0.0019,18.0,...,359,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53


In [16]:
# Vehicle age: current calendar year minus the model year.
current_year = datetime.today().year
df_interpolated['age_of_car'] = df_interpolated['car_model_year'].rsub(current_year)

df_interpolated.head(20)

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,...,label,driver_name,date_of_birth,years_of_exp,gender,car_brand,car_model_year,driver_rating,age,age_of_car
0,0,8.0,143.3,-1.706,-9.271,-1.209,-0.029,-0.0327,0.0154,2.0,...,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53,20
1,0,8.0,143.3,-1.417,-9.548,-1.861,-0.0224,0.005,-0.0258,3.0,...,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53,20
2,0,8.0,143.3,-0.347,-9.533,-1.205,0.015,-0.05,0.0251,9.0,...,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53,20
3,0,8.0,143.3,-0.601,-9.452,-2.158,0.0045,-0.0117,-0.0041,11.0,...,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53,20
4,0,8.0,143.3,-0.598,-9.863,-1.673,-0.0004,0.0003,-0.0098,12.0,...,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53,20
5,0,8.0,143.3,-1.801,-9.196,-2.05,-0.00665,0.131,-0.0826,14.0,...,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53,20
6,0,8.0,143.3,0.045,-10.071,-1.198,-0.0129,-0.1066,0.0835,15.0,...,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53,20
7,0,8.0,143.3,0.499,-9.877,-2.003,-0.046,-0.0794,0.0368,16.0,...,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53,20
8,0,8.0,143.3,-1.324,-9.554,-1.951,0.0644,0.1137,-0.0836,17.0,...,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53,20
9,0,8.0,143.3,-0.493,-9.516,-1.826,0.0186,0.0,-0.0019,18.0,...,0,Jemmie Cardew,1970-05-12,9,Female,Mercedes-Benz,2003,4.7,53,20


In [17]:
# Show dtypes and non-null counts after adding 'age' and 'age_of_car'.
df_interpolated.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7346552 entries, 0 to 7346551
Data columns (total 22 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   bookingID       7346552 non-null  object        
 1   Accuracy        7346552 non-null  float64       
 2   Bearing         7346552 non-null  float64       
 3   acceleration_x  7346552 non-null  float64       
 4   acceleration_y  7346552 non-null  float64       
 5   acceleration_z  7346552 non-null  float64       
 6   gyro_x          7346552 non-null  float64       
 7   gyro_y          7346552 non-null  float64       
 8   gyro_z          7346552 non-null  float64       
 9   second          7346552 non-null  float64       
 10  speed           7346552 non-null  float64       
 11  driver_id       7346552 non-null  int64         
 12  label           7346552 non-null  int64         
 13  driver_name     7346552 non-null  object        
 14  date_of_birth   73

In [18]:
# Remove the two columns that 'age_of_car' and 'age' were derived from.
df_interpolated = df_interpolated.drop(['car_model_year', 'date_of_birth'], axis='columns')

In [20]:
# Show dtypes and non-null counts after dropping the source columns.
df_interpolated.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7346552 entries, 0 to 7346551
Data columns (total 20 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   bookingID       7346552 non-null  object 
 1   Accuracy        7346552 non-null  float64
 2   Bearing         7346552 non-null  float64
 3   acceleration_x  7346552 non-null  float64
 4   acceleration_y  7346552 non-null  float64
 5   acceleration_z  7346552 non-null  float64
 6   gyro_x          7346552 non-null  float64
 7   gyro_y          7346552 non-null  float64
 8   gyro_z          7346552 non-null  float64
 9   second          7346552 non-null  float64
 10  speed           7346552 non-null  float64
 11  driver_id       7346552 non-null  int64  
 12  label           7346552 non-null  int64  
 13  driver_name     7346552 non-null  object 
 14  years_of_exp    7346552 non-null  int64  
 15  gender          7346552 non-null  object 
 16  car_brand       7346552 non-null  ob

In [19]:
# List the dtype of every column before bookingID is cast.
df_interpolated.dtypes

bookingID          object
Accuracy          float64
Bearing           float64
acceleration_x    float64
acceleration_y    float64
acceleration_z    float64
gyro_x            float64
gyro_y            float64
gyro_z            float64
second            float64
speed             float64
driver_id           int64
label               int64
driver_name        object
years_of_exp        int64
gender             object
car_brand          object
driver_rating     float64
age                 int64
age_of_car          int64
dtype: object

In [21]:
# Convert bookingID to int64 (it is displayed with object dtype up to here).
# NOTE(review): the earlier sort_values on bookingID ran before this cast, so
# it ordered the object-typed values - confirm that ordering is acceptable.
df_interpolated['bookingID'] = df_interpolated['bookingID'].astype('int64')

In [22]:
# Show dtypes and non-null counts after casting bookingID.
df_interpolated.info(show_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7346552 entries, 0 to 7346551
Data columns (total 20 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   bookingID       7346552 non-null  int64  
 1   Accuracy        7346552 non-null  float64
 2   Bearing         7346552 non-null  float64
 3   acceleration_x  7346552 non-null  float64
 4   acceleration_y  7346552 non-null  float64
 5   acceleration_z  7346552 non-null  float64
 6   gyro_x          7346552 non-null  float64
 7   gyro_y          7346552 non-null  float64
 8   gyro_z          7346552 non-null  float64
 9   second          7346552 non-null  float64
 10  speed           7346552 non-null  float64
 11  driver_id       7346552 non-null  int64  
 12  label           7346552 non-null  int64  
 13  driver_name     7346552 non-null  object 
 14  years_of_exp    7346552 non-null  int64  
 15  gender          7346552 non-null  object 
 16  car_brand       7346552 non-null  ob

In [21]:
# Preview the first rows of the cleaned frame.
df_interpolated.head(20)

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,speed,driver_id,label,driver_name,years_of_exp,gender,car_brand,driver_rating,age,age_of_car
0,0,8.0,143.3,-1.706,-9.271,-1.209,-0.029,-0.0327,0.0154,2.0,0.0,359,0,Jemmie Cardew,9,Female,Mercedes-Benz,4.7,53,20
1,0,8.0,143.3,-1.417,-9.548,-1.861,-0.0224,0.005,-0.0258,3.0,0.23,359,0,Jemmie Cardew,9,Female,Mercedes-Benz,4.7,53,20
2,0,8.0,143.3,-0.347,-9.533,-1.205,0.015,-0.05,0.0251,9.0,0.23,359,0,Jemmie Cardew,9,Female,Mercedes-Benz,4.7,53,20
3,0,8.0,143.3,-0.601,-9.452,-2.158,0.0045,-0.0117,-0.0041,11.0,0.23,359,0,Jemmie Cardew,9,Female,Mercedes-Benz,4.7,53,20
4,0,8.0,143.3,-0.598,-9.863,-1.673,-0.0004,0.0003,-0.0098,12.0,0.23,359,0,Jemmie Cardew,9,Female,Mercedes-Benz,4.7,53,20
5,0,8.0,143.3,-1.801,-9.196,-2.05,-0.00665,0.131,-0.0826,14.0,0.23,359,0,Jemmie Cardew,9,Female,Mercedes-Benz,4.7,53,20
6,0,8.0,143.3,0.045,-10.071,-1.198,-0.0129,-0.1066,0.0835,15.0,0.23,359,0,Jemmie Cardew,9,Female,Mercedes-Benz,4.7,53,20
7,0,8.0,143.3,0.499,-9.877,-2.003,-0.046,-0.0794,0.0368,16.0,0.23,359,0,Jemmie Cardew,9,Female,Mercedes-Benz,4.7,53,20
8,0,8.0,143.3,-1.324,-9.554,-1.951,0.0644,0.1137,-0.0836,17.0,0.23,359,0,Jemmie Cardew,9,Female,Mercedes-Benz,4.7,53,20
9,0,8.0,143.3,-0.493,-9.516,-1.826,0.0186,0.0,-0.0019,18.0,0.23,359,0,Jemmie Cardew,9,Female,Mercedes-Benz,4.7,53,20


In [22]:
# List the dtype of every column after bookingID was cast.
df_interpolated.dtypes

bookingID           int64
Accuracy          float64
Bearing           float64
acceleration_x    float64
acceleration_y    float64
acceleration_z    float64
gyro_x            float64
gyro_y            float64
gyro_z            float64
second            float64
speed             float64
driver_id           int64
label               int64
driver_name        object
years_of_exp        int64
gender             object
car_brand          object
driver_rating     float64
age                 int64
age_of_car          int64
dtype: object

In [23]:
# Final check of dtypes and non-null counts before writing the CSV.
df_interpolated.info(show_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7346552 entries, 0 to 7346551
Data columns (total 20 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   bookingID       7346552 non-null  int64  
 1   Accuracy        7346552 non-null  float64
 2   Bearing         7346552 non-null  float64
 3   acceleration_x  7346552 non-null  float64
 4   acceleration_y  7346552 non-null  float64
 5   acceleration_z  7346552 non-null  float64
 6   gyro_x          7346552 non-null  float64
 7   gyro_y          7346552 non-null  float64
 8   gyro_z          7346552 non-null  float64
 9   second          7346552 non-null  float64
 10  speed           7346552 non-null  float64
 11  driver_id       7346552 non-null  int64  
 12  label           7346552 non-null  int64  
 13  driver_name     7346552 non-null  object 
 14  years_of_exp    7346552 non-null  int64  
 15  gender          7346552 non-null  object 
 16  car_brand       7346552 non-null  ob

In [24]:
# Write the cleaned frame to taxi_data.csv in the working directory, without
# the index column.
# NOTE(review): a later cell writes the same frame to
# ../Datasets/cleaned/taxi_data.csv - confirm both copies are needed.
df_interpolated.to_csv('taxi_data.csv', index=False)

In [24]:
%%time
# Create the output directory if it is missing. exist_ok=True replaces the
# separate existence check, which could fail if the directory appeared between
# the check and the creation.
os.makedirs('../Datasets/cleaned/', exist_ok=True)

# save data to csv
df_interpolated.to_csv('../Datasets/cleaned/taxi_data.csv', index=False)

KeyboardInterrupt: 

In [25]:
# Read the exported CSV back from the working directory.
df1 = pd.read_csv('./taxi_data.csv')



In [26]:
# Show dtypes and non-null counts of the frame loaded from the CSV.
df1.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7346552 entries, 0 to 7346551
Data columns (total 20 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   bookingID       7346552 non-null  int64  
 1   Accuracy        7346552 non-null  float64
 2   Bearing         7346552 non-null  float64
 3   acceleration_x  7346552 non-null  float64
 4   acceleration_y  7346552 non-null  float64
 5   acceleration_z  7346552 non-null  float64
 6   gyro_x          7346552 non-null  float64
 7   gyro_y          7346552 non-null  float64
 8   gyro_z          7346552 non-null  float64
 9   second          7346552 non-null  float64
 10  speed           7346552 non-null  float64
 11  driver_id       7346552 non-null  int64  
 12  label           7346552 non-null  int64  
 13  driver_name     7346552 non-null  object 
 14  years_of_exp    7346552 non-null  int64  
 15  gender          7346552 non-null  object 
 16  car_brand       7346552 non-null  ob