In [12]:
# run `pip install category_encoders`

import pandas as pd
import category_encoders as ce
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler

## Data Import

In [13]:
diagnostic_df = pd.read_csv(f'data/diagnostic_100k_2.csv', low_memory=False)
warranty_df = pd.read_csv(f'data/claims_all.csv', low_memory=False)

## Feature Engineering: Consultation ID
# !Add description!

In [14]:
diagnostic_df['timestamp'] = pd.to_datetime(diagnostic_df['sessiontimestamp'], format="%Y-%m-%d %H:%M:%S.%f %Z")

# Create temporary 'date' column from 'timestamp'
diagnostic_df['date'] = diagnostic_df['timestamp'].dt.date
diagnostic_df.sort_values(['anonymised_vin', 'date'], kind='mergesort', inplace=True)

# Create 'consultationId' column
diagnostic_df['consultationid'] = (diagnostic_df['anonymised_vin'] != diagnostic_df['anonymised_vin'].shift()).astype(int)
diagnostic_df['consultationid'] += (diagnostic_df['date'] != diagnostic_df['date'].shift()).astype(int)
diagnostic_df['consultationid'] = diagnostic_df['consultationid'].cumsum()

## Data Restructuring: Derive Vehicle Data, Diagnostic Reads and Subsequent Diagnostic Actions
# !Add description!

In [15]:
# Create a DataFrame that only includes rows where 'otxSequence' is 'G2725772' (DTC Read)
cols_to_keep = [col for col in diagnostic_df.columns if col not in ['otxsequence', 'date', 'sessionid', 'timestamp', 'sessiontimestamp']]
vehicle_current_state_df = diagnostic_df.loc[diagnostic_df['otxsequence'] == 'G2725772', cols_to_keep].copy()

# Create a DataFrame that includes all diagnostic actions performed
cols_to_keep = ['anonymised_vin', 'consultationid', 'timestamp', 'otxsequence']
diagnostic_actions_df = diagnostic_df.loc[diagnostic_df['otxsequence'] != 'G2725772', cols_to_keep].copy()

# Merge the vehicle data and current state reads and diagnostic actions performed
diagnostic_df = pd.merge(vehicle_current_state_df, diagnostic_actions_df, how='inner', on=['anonymised_vin', 'consultationid'])
diagnostic_df

Unnamed: 0,anonymised_vin,model,modelyear,driver,plant,engine,transmission,module,dtcbase,dtcfull,faulttype,dtcdescription,odomiles,softwarepartnumber,hardwarepartnumber,builddate,warrantydate,consultationid,timestamp,otxsequence
0,SALEA6AW9M2-07f417faedca36ca324e0d7e5536a6bd,L663,2021,RHD,NITRA,DT306 - 3.0 i6 Diesel,Automatic,PSCMB,B1304,B1304-04,4,Electronic Power Assisted Steering System,15962.0,,M8B2-14F075-AA,10/02/2021,31/03/2021,5701,2023-04-12 14:06:13.675831+00:00,G3222026
1,SALEA6AW9M2-07f417faedca36ca324e0d7e5536a6bd,L663,2021,RHD,NITRA,DT306 - 3.0 i6 Diesel,Automatic,PSCMB,B1304,B1304-04,4,Electronic Power Assisted Steering System,15962.0,,M8B2-14F075-AA,10/02/2021,31/03/2021,5701,2023-04-12 14:06:13.675831+00:00,G3222026
2,SALEA6AW9M2-07f417faedca36ca324e0d7e5536a6bd,L663,2021,RHD,NITRA,DT306 - 3.0 i6 Diesel,Automatic,PSCMB,B1304,B1304-04,4,Electronic Power Assisted Steering System,15962.0,,M8B2-14F075-AA,10/02/2021,31/03/2021,5701,2023-04-12 14:08:10.196466+00:00,G3245520
3,SALEA6AW9M2-07f417faedca36ca324e0d7e5536a6bd,L663,2021,RHD,NITRA,DT306 - 3.0 i6 Diesel,Automatic,PSCMB,B1304,B1304-17,17,Electronic Power Assisted Steering System,15962.0,,M8B2-14F075-AA,10/02/2021,31/03/2021,5701,2023-04-12 14:06:13.675831+00:00,G3222026
4,SALEA6AW9M2-07f417faedca36ca324e0d7e5536a6bd,L663,2021,RHD,NITRA,DT306 - 3.0 i6 Diesel,Automatic,PSCMB,B1304,B1304-17,17,Electronic Power Assisted Steering System,15962.0,,M8B2-14F075-AA,10/02/2021,31/03/2021,5701,2023-04-12 14:06:13.675831+00:00,G3222026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,SALZA2BT7LH-e01788438fe6e8b201986d27e2f683f4,L551,2020,LHD,HALEWOOD,AJ20-P3 1.5 Petrol PHEV,Automatic,IMC,B14A4,B14A4-02,2,APIX Display Link - Instrument Pack,26539.0,MX53-19C204-BG,J9C3-19C210-PC,11/11/2020,28/01/2021,114919,2022-11-09 07:42:04.901384+00:00,G3204479
133,SALZA2BT7LH-e01788438fe6e8b201986d27e2f683f4,L551,2020,LHD,HALEWOOD,AJ20-P3 1.5 Petrol PHEV,Automatic,IMC,B14A4,B14A4-02,2,APIX Display Link - Instrument Pack,26539.0,MX53-19C204-BG,J9C3-19C210-PC,11/11/2020,28/01/2021,114919,2022-11-09 07:40:48.106001+00:00,G3204479
134,SALZP2FX6LH-65d8428c707a8a2b60866ee9afbcf6d1,L551,2020,LHD,HALEWOOD,2.0 AJ200P Petrol,Automatic,IMC,B137F,B137F-31,31,Steering Wheel Left Switch Pack,18165.0,MX53-19C204-BG,J9C3-19C210-SB,19/02/2019,31/05/2019,126567,2023-05-12 12:29:46.576106+00:00,G3355275
135,SALZP2FX6LH-65d8428c707a8a2b60866ee9afbcf6d1,L551,2020,LHD,HALEWOOD,2.0 AJ200P Petrol,Automatic,IMC,B1087,B1087-83,83,LIN Bus 'A',18165.0,MX53-19C204-BG,J9C3-19C210-SB,19/02/2019,31/05/2019,126567,2023-05-12 12:29:46.576106+00:00,G3355275


## Merge Warranty Data
# !Add description!

In [16]:
diagnostic_df['timestamp'] = pd.to_datetime(diagnostic_df['timestamp'], utc=True)
warranty_df['i_incident_date'] = pd.to_datetime(warranty_df['i_incident_date'], utc=True)

# Rename 'anonymised_vin' in warranty_df
warranty_df = warranty_df.rename(columns={'anonymised_vin': 'warranty_anonymised_vin'})

df_list = []
for idx, row in diagnostic_df.iterrows():
    vin = row['anonymised_vin']
    diag_time = row['timestamp']

    # Filter warranty records for this VIN and within 7 days after the diagnostic session
    mask = ((warranty_df['warranty_anonymised_vin'] == vin) &
            (warranty_df['i_incident_date'] >= diag_time) &
            (warranty_df['i_incident_date'] <= diag_time + pd.Timedelta(days=7)))

    temp_warranty_df = warranty_df[mask]

    # Add the diagnostic data to these warranty records
    for _, warranty_row in temp_warranty_df.iterrows():
        merged_row = pd.concat([row, warranty_row])
        df_list.append(merged_row)

# Combine all dataframes
merged_df = pd.concat(df_list, axis=1).T
merged_df

Unnamed: 0,anonymised_vin,model,modelyear,driver,plant,engine,transmission,module,dtcbase,dtcfull,...,ic_eng_part_number,ic_serv_part_number,ic_replaced_yn,ic_accepted_date,i_p_css_description,i_original_ccc_description,i_cpsc_description,i_css_description,ic_customer_verbatim,ic_technical_verbatim
0,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-04,...,,,Y,2023-04-11,"Steering, Wheels and Tyres",Vehicle Is Not Easy To Steer,Steering Gear,"Steering, Wheels and Tyres",Customers reported that the steering has no po...,V: The vehicle is towed to the store\t and the...
1,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-04,...,,,N,2023-04-14,"Steering, Wheels and Tyres",Vehicle Is Not Easy To Steer,Steering Gear,"Steering, Wheels and Tyres",Primary Warranty Claim No. 304067A Parts Inven...,Primary Warranty Claim No. 304067A Parts Inven...
2,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-04,...,,,Y,2023-04-11,"Steering, Wheels and Tyres",Vehicle Is Not Easy To Steer,Steering Gear,"Steering, Wheels and Tyres",Customers reported that the steering has no po...,V: The vehicle is towed to the store\t and the...
3,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-04,...,,,N,2023-04-14,"Steering, Wheels and Tyres",Vehicle Is Not Easy To Steer,Steering Gear,"Steering, Wheels and Tyres",Primary Warranty Claim No. 304067A Parts Inven...,Primary Warranty Claim No. 304067A Parts Inven...
4,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-17,...,,,Y,2023-04-11,"Steering, Wheels and Tyres",Vehicle Is Not Easy To Steer,Steering Gear,"Steering, Wheels and Tyres",Customers reported that the steering has no po...,V: The vehicle is towed to the store\t and the...
5,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-17,...,,,N,2023-04-14,"Steering, Wheels and Tyres",Vehicle Is Not Easy To Steer,Steering Gear,"Steering, Wheels and Tyres",Primary Warranty Claim No. 304067A Parts Inven...,Primary Warranty Claim No. 304067A Parts Inven...
6,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-17,...,,,Y,2023-04-11,"Steering, Wheels and Tyres",Vehicle Is Not Easy To Steer,Steering Gear,"Steering, Wheels and Tyres",Customers reported that the steering has no po...,V: The vehicle is towed to the store\t and the...
7,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-17,...,,,N,2023-04-14,"Steering, Wheels and Tyres",Vehicle Is Not Easy To Steer,Steering Gear,"Steering, Wheels and Tyres",Primary Warranty Claim No. 304067A Parts Inven...,Primary Warranty Claim No. 304067A Parts Inven...
8,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-04,...,,,Y,2023-04-11,"Steering, Wheels and Tyres",Vehicle Is Not Easy To Steer,Steering Gear,"Steering, Wheels and Tyres",Customers reported that the steering has no po...,V: The vehicle is towed to the store\t and the...
9,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-04,...,,,N,2023-04-14,"Steering, Wheels and Tyres",Vehicle Is Not Easy To Steer,Steering Gear,"Steering, Wheels and Tyres",Primary Warranty Claim No. 304067A Parts Inven...,Primary Warranty Claim No. 304067A Parts Inven...


## Feature Engineering: Extract Temporal Features

In this section, we perform feature engineering on the 'timestamp' field to extract valuable temporal information about each diagnostic activity and some additional features from the vehicle's 'warrantydate' and 'builddate' data fields. The temporal features we derive are:

1. **Year**: The year the diagnostic activity was performed. This can help detect yearly trends in the data.
2. **Month**: The month the diagnostic activity was performed. This can help identify any monthly patterns.
3. **Day of Week**: The day of the week the diagnostic activity was performed. This can reveal weekly trends, such as certain activities being more common on certain days of the week.
4. **Week of Year**: The ISO week number of the year the diagnostic activity was performed. This can provide a more granular view of yearly trends.
5. **Time Since Last Activity**: The time in seconds since the last diagnostic activity for each consultation. This can help gauge the frequency of activities.
6. **Elapsed Time**: The time in seconds since the first diagnostic activity in each consultation. This can provide insight into the duration of consultations.
7. **Season of the Year**: The season (Winter, Spring, Summer, Autumn) when the diagnostic activity was performed. This can help identify seasonal trends, such as certain activities being more common in certain seasons.
8. **Age of Vehicle at the Time of Diagnostic Session**: The vehicle's age in years at the time of each diagnostic session. This might be a useful feature because older vehicles might have different diagnostic needs than newer ones.
9. **Time in Days since Warranty Started**: This feature might be useful as vehicles might have different diagnostic needs before and after their warranty starts. Also, customers might behave differently before and after their warranty starts.

The resulting dataframe now contains several new features that provide additional temporal context about each diagnostic activity.

In [17]:
merged_df.sort_values(['consultationid', 'timestamp'], inplace=True)

merged_df['year'] = merged_df['timestamp'].dt.year
merged_df['month'] = merged_df['timestamp'].dt.month
merged_df['dayOfWeek'] = merged_df['timestamp'].dt.dayofweek
merged_df['weekOfYear'] = merged_df['timestamp'].dt.isocalendar().week
merged_df['timeSinceLastActivitySec'] = (merged_df.groupby('consultationid')['timestamp'].diff().dt.total_seconds()).fillna(0)
merged_df['elapsedTimeSec'] = merged_df.groupby('consultationid')['timestamp'].transform(lambda x: (x - x.min())).dt.total_seconds()

# Derive 'Season of the Year'
def month_to_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Autumn'

merged_df['season'] = merged_df['month'].apply(month_to_season)

merged_df['builddate'] = pd.to_datetime(merged_df['builddate'], format='%d/%m/%Y').dt.tz_localize('UTC')
merged_df['warrantydate'] = pd.to_datetime(merged_df['warrantydate'], format='%d/%m/%Y').dt.tz_localize('UTC')
merged_df['vehicleAgeAtSession'] = (merged_df['timestamp'] - merged_df['builddate']).dt.days / 365
merged_df['daysSinceWarrantyStart'] = (merged_df['timestamp'] - merged_df['warrantydate']).dt.days

# Drop non-required columns
merged_df.drop(columns=['builddate', 'warrantydate', 'dtcdescription', 'v_warr_date_event', 'i_p_css_description',
                            'i_original_ccc_description', 'i_cpsc_description', 'i_css_description', 'ic_customer_verbatim',
                            'ic_technical_verbatim', 'i_incident_date', 'ic_accepted_date', 'warranty_anonymised_vin'],
               inplace=True)
merged_df

Unnamed: 0,anonymised_vin,model,modelyear,driver,plant,engine,transmission,module,dtcbase,dtcfull,...,ic_replaced_yn,year,month,dayOfWeek,weekOfYear,timeSinceLastActivitySec,elapsedTimeSec,season,vehicleAgeAtSession,daysSinceWarrantyStart
0,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-04,...,Y,2023,3,5,12,0.0,0.0,Spring,1.309589,362
1,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-04,...,N,2023,3,5,12,0.0,0.0,Spring,1.309589,362
4,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-17,...,Y,2023,3,5,12,0.0,0.0,Spring,1.309589,362
5,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-17,...,N,2023,3,5,12,0.0,0.0,Spring,1.309589,362
8,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-04,...,Y,2023,3,5,12,0.0,0.0,Spring,1.309589,362
9,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-04,...,N,2023,3,5,12,0.0,0.0,Spring,1.309589,362
12,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-17,...,Y,2023,3,5,12,0.0,0.0,Spring,1.309589,362
13,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-17,...,N,2023,3,5,12,0.0,0.0,Spring,1.309589,362
16,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-17,...,Y,2023,3,5,12,0.0,0.0,Spring,1.309589,362
17,SALEA6RX0N2-83269d76be07ad191acf751fac0d4c46,L663,2022,LHD,NITRA,204PT - 2.0 AJ200P Petrol,Automatic,PSCMB,B1304,B1304-17,...,N,2023,3,5,12,0.0,0.0,Spring,1.309589,362


## Feature Engineering: Removing Outlier Diagnostic Activities

In our dataset, certain diagnostic activities performed by the technicians are extremely common and are recorded in virtually every consultation. While these activities are a routine part of the consultation process, they do not carry significant diagnostic information for our model, and therefore, may not be useful in predicting recommendations. For instance, the 'CONSULTATION_START' activity is logged in every consultation but doesn't contribute meaningful information towards diagnosing a specific vehicle issue.

To identify and remove these non-informative activities, we follow a statistical outlier detection approach:

1. **Calculate Commonality**: First, we calculate the commonality score for each activity, which is the frequency of the activity divided by the total number of activities.

2. **Calculate Mean and Standard Deviation**: We then calculate the mean and standard deviation of these commonality scores.

3. **Identify Outliers**: Any activity whose commonality score lies beyond two standard deviations from the mean is considered an outlier. This threshold is based on the empirical rule, which states that for a normal distribution, about 95% of the data lies within two standard deviations of the mean.

4. **Remove Outliers**: Finally, we remove these outlier activities from our dataset, leaving us with a set of activities that are varied enough to provide meaningful information for our model.

In [18]:
activity_commonality = merged_df.value_counts('otxsequence')/merged_df['otxsequence'].count()
activity_commonality = activity_commonality.reset_index()
activity_commonality.columns = ['otxsequence', 'commonalityScore']

mean = activity_commonality.commonalityScore.mean()
std = activity_commonality.commonalityScore.std()
print(f'MEAN: {mean}  STD: {std}')

lower = mean - (2 * std)
upper = mean + (2 * std)

# Identify the outliers by checking for commonality score less than or greater than lower and upper bounds respectively.
outliers_condition = (activity_commonality.commonalityScore < lower) | (upper < activity_commonality.commonalityScore)
most_common_activities = activity_commonality[outliers_condition]

print(f"Most common activities with their commonality score (activities to be removed):\n{most_common_activities}")

MEAN: 0.16666666666666666  STD: 0.10458250331675945
Most common activities with their commonality score (activities to be removed):
Empty DataFrame
Columns: [otxsequence, commonalityScore]
Index: []


In [19]:
# Remove identified outlier (the most common) activities
num_records_initial = len(merged_df)
merged_df = merged_df[~merged_df.otxsequence.isin(most_common_activities.otxsequence)]

print(f'Number of records removed: {num_records_initial - len(merged_df)}')

Number of records removed: 0


## Data Cleaning: Removing Duplicate Records

In this step of the data preprocessing, we aim to remove any duplicate entries in the dataset.

We utilize the `drop_duplicates()` function from pandas library for this purpose. The `inplace=True` parameter ensures that the operation is performed on the dataset directly, without the need to assign the result to a new variable.

In [20]:
num_records_initial = len(merged_df)
merged_df.drop_duplicates()

print(f'Number of duplicate records removed: {num_records_initial - len(merged_df)}')

Number of duplicate records removed: 0


## Data Cleaning: Setting Appropriate Data Types

As part of this step we ensure that the data is represented in the correct format, using appropriate data type for efficient data manipulation and accurate model training.

In this code block, we are setting the data types for each column as follows:

* The 'elapsedTimeSec', 'timeSinceLastActivitySec', 'odoMiles', 'vehicleAgeAtSession' and 'daysSinceWarrantyStart'  columns are set to float64 as they contain numerical continuous data with a floating point.
* The 'year', 'month', 'dayOfWeek', and 'weekOfYear' columns are set to CategoricalDtype(ordered=True) since they contain categorical temporal data that has a natural order
* The rest of the columns are set to CategoricalDtype(ordered=False) since they contain categorical data without a natural order

In [21]:
float_cols = ['elapsedTimeSec', 'timeSinceLastActivitySec', 'odomiles', 'vehicleAgeAtSession',
              'daysSinceWarrantyStart', 'i_mileage', 'i_time_in_service', 'i_months_in_service']
ordered_cat_cols = ['year', 'month', 'dayOfWeek', 'weekOfYear', 'season']
unordered_cat_cols = ['anonymised_vin', 'consultationid', 'otxsequence', 'model', 'modelyear', 'driver',
                      'plant', 'engine', 'transmission', 'module', 'dtcbase', 'faulttype', 'dtcfull',
                      'softwarepartnumber', 'hardwarepartnumber', 'i_p_css_code', 'i_original_ccc_code', 'i_original_vfg_code',
                      'i_original_function_code', 'i_original_vrt_code', 'i_current_vfg_code', 'i_current_function_code',
                      'i_current_vrt_code',	'i_cpsc_code', 'i_cpsc_vfg_code', 'i_css_code', 'v_transmission_code',
                      'v_drive_code', 'v_engine_code', 'ic_repair_dealer_id', 'ic_eng_part_number', 'ic_serv_part_number',
                      'ic_part_suffix', 'ic_part_base', 'ic_part_prefix', 'ic_causal_part_id', 'ic_repair_country_code']

# Change dtype of the columns
for col in float_cols:
    merged_df[col] = merged_df[col].astype('float64')

for col in ordered_cat_cols:
    merged_df[col] = merged_df[col].astype(CategoricalDtype(ordered=True))

for col in unordered_cat_cols:
    merged_df[col] = merged_df[col].astype(CategoricalDtype(ordered=False)).cat.add_categories(['Unknown']).fillna('Unknown')

## Data Cleaning: Identify and Handle Missing Values
# !UPDATE!
The isna().sum() call returns a pandas Series with column names as the index and the count of NaN values as the values. We then filter this series to only include columns with more than 0 NaN values and print them.

In [22]:
merged_df['daysSinceWarrantyStart'].fillna(0) # If NaN, then it is likely that warranty has not started on the vehicle, thus warrantydate is empty

na_counts = merged_df.isna().sum()
na_columns = na_counts[na_counts > 0]

if len(na_columns) > 0:
    print(f"Data fields with NaN values:\n{na_columns}")
    print(f'Total number of records in the DataFrame: {len(merged_df)}')
else:
    print('There are no missing values in the DataFrame.')

There are no missing values in the DataFrame.


## Data Normalisation: Standardise Numerical Data

In this step, we are standardising the values of the 'elapsedTimeSec', 'timeSinceLastActivitySec', 'odomiles', 'vehicleAgeAtSession', and 'daysSinceWarrantyStart' columns. These columns represent continuous numerical data (temporal data and odometer readings), which we expect to follow a normal-like distribution.

We are using sklearn's StandardScaler for this task. This method standardizes features by removing the mean and scaling to unit variance. This transformation helps to achieve properties of a standard normal distribution where the mean (average) of each feature is 0 and the standard deviation is 1.

By doing this, we are ensuring that these features have the same scale and thus contributing equally to the model's performance.

In [23]:
data_scaler = StandardScaler()
merged_df[float_cols] = data_scaler.fit_transform(merged_df[float_cols])

## Data Encoding: Encode Categorical Features

In this step, we are using binary encoding to convert all our categorical data variables into a form that can be provided to our model to improve its performance.

Binary encoding is a combination of Hash encoding and one-hot encoding. In binary encoding, first the categories are encoded as ordinal, then those integers are converted into binary code, then the digits from that binary string are split into separate columns. This makes binary encoding more space efficient than one-hot encoding, especially for high cardinality variables.

In [24]:
encoder = ce.BinaryEncoder(cols = ordered_cat_cols + unordered_cat_cols, return_df = True)
encoded_data = encoder.fit_transform(merged_df)

## Save Preprocessed Data
Here we are saving the preprocessed data into a CSV file in the 'data' directory. The index=True parameter ensures that the index column is saved in the CSV file.

In [25]:
merged_df.to_csv('data_out/prepared_data.csv', index=True)
encoded_data.to_csv('data_out/encoded_prepared_data.csv', index=True)