In [None]:
import pandas as pd

# Prepare data for continuous learning demo setup

In [None]:
# Read the full claims table and the second diagnostic chunk.
# NOTE(review): this cell reads from './data/' while the recommender section
# reads from '../data/' — confirm which relative location is intended.
claims_all = pd.read_csv('./data/claims_all.csv', low_memory=False)
diagnostic_chunk2 = pd.read_csv('./data/diagnostic_chunk2.csv', low_memory=False)

# Number of claim rows per VIN, then the VINs that have more than 30 of them.
vin_counts = claims_all['anonymised_vin'].value_counts()
vins_more_than_30 = vin_counts.loc[vin_counts.gt(30)].index

# Keep only the diagnostic rows that belong to one of those VINs.
has_many_claims = diagnostic_chunk2['anonymised_vin'].isin(vins_more_than_30)
filtered = diagnostic_chunk2.loc[has_many_claims]

# Write the reduced diagnostics table (without the index) to the working directory.
filtered.to_csv('continuous_learning_diagnostics.csv', index=False)

In [None]:
# Reload both continuous-learning CSVs and report how many distinct VINs each holds.
# NOTE(review): 'continuous_learning_claims.csv' is not written by any cell visible
# in this notebook — confirm where it is produced before running on a fresh checkout.
# Note that `filtered` is rebound here to the claims table.
claims_csv, diagnostics_csv = 'continuous_learning_claims.csv', 'continuous_learning_diagnostics.csv'
filtered = pd.read_csv(claims_csv, low_memory=False)
filtered2 = pd.read_csv(diagnostics_csv, low_memory=False)
print(f"There are {filtered['anonymised_vin'].nunique()} unique anonymised_vin values in the claims data.")
print(f"There are {filtered2['anonymised_vin'].nunique()} unique anonymised_vin values in the diag data.")

# Prepare data for recommender demo setup

In [None]:
# Read the third diagnostic chunk.
diagnostic_chunk3 = pd.read_csv('../data/diagnostic_chunk3.csv', low_memory=False)

# Parse the session timestamps once, then store both the parsed timestamp and
# its calendar date; 'day' is the per-vehicle grouping key used in later cells.
session_times = pd.to_datetime(diagnostic_chunk3['sessiontimestamp'])
diagnostic_chunk3 = diagnostic_chunk3.assign(
    sessiontimestamp=session_times,
    day=session_times.dt.date,
)

In [None]:
# Keep only the rows whose 'otxsequence' is the DTC Read sequence (G2725772).
filtered_otx = diagnostic_chunk3[diagnostic_chunk3['otxsequence'] == 'G2725772']
# Group the DTC Read rows per vehicle and day. The grouping must run on
# `filtered_otx` (not on the unfiltered `diagnostic_chunk3`), otherwise the
# otxsequence filter above has no effect on the result.
groups = filtered_otx.groupby(['anonymised_vin', 'day'])
# Keep only the (anonymised_vin, day) groups that contain more than one
# distinct 'dtcfull' value; all rows of a qualifying group are retained.
filtered_groups = groups.filter(lambda x: x['dtcfull'].nunique() > 1)

In [None]:
# Collapse repeated (vehicle, DTC, day) rows to a single row, then remove the
# helper 'day' column, which only served as a grouping/de-duplication key.
# Because 'day' is dropped from `filtered_groups` here, this cell cannot be
# re-run on its own; re-run the cell that builds `filtered_groups` first.
dedup_keys = ['anonymised_vin', 'dtcfull', 'day']
filtered_groups = (
    filtered_groups
    .drop_duplicates(subset=dedup_keys)
    .drop(columns='day')
)

# Export the recommender test set (without the index) and report its VIN count.
filtered_groups.to_csv('recommender_testing_data.csv', index=False)
print(f"There are {filtered_groups['anonymised_vin'].nunique()} unique anonymised_vin values in the data.")

In [None]:
# Randomly select 30 unique anonymised_vin values.
# A fixed seed makes the selection (and therefore the exported CSV) identical
# on every Restart & Run All; change SAMPLE_SEED to draw a different sample.
SAMPLE_SIZE = 30
SAMPLE_SEED = 42
unique_vins = filtered_groups['anonymised_vin'].drop_duplicates()
# Sampling is without replacement, so this raises ValueError if fewer than
# SAMPLE_SIZE distinct VINs are available.
selected_vins = unique_vins.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED).values
# Filter the dataset to keep only the rows of the selected vehicles
subset = filtered_groups[filtered_groups['anonymised_vin'].isin(selected_vins)]

In [None]:
# Write the sampled vehicles to CSV (without the index), report how many
# distinct VINs the subset contains, and display the subset as the cell output.
subset_csv = 'recommender_testing_data_30vehicles.csv'
subset.to_csv(subset_csv, index=False)
print(f"There are now {subset['anonymised_vin'].nunique()} unique anonymised_vin values in the subset.")
subset