<a href="https://colab.research.google.com/github/fjadidi2001/Artificial_Intelligence_Learning/blob/master/Copy_of_Copy_of_Copy_of_telematics_syn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Load the raw telematics dataset
df = pd.read_csv('telematics_syn.csv')

# 1. Handling Missing Values
# Report how many entries are missing in each column before imputing anything
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Partition the columns by dtype: numeric columns are median-imputed and
# standardized, all remaining columns are mode-imputed and one-hot encoded.
# NOTE(review): every numeric column is selected here, so any target/label
# column present in the CSV gets scaled as well -- confirm this is intended.
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Per-dtype pipelines: impute first, then scale / encode
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine both pipelines into a single preprocessor.
# sparse_threshold=0 forces a dense ndarray result; with the default (0.3) a
# mostly-zero one-hot block makes fit_transform return a SciPy sparse matrix,
# which cannot be wrapped in a labelled DataFrame below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)],
    sparse_threshold=0)

# Fit on the full frame and transform it.
# NOTE(review): the imputer/scaler/encoder statistics are learned from all
# rows, and the train/validation/test split is performed afterwards on the
# saved file, so hold-out rows influence the fitted statistics. Consider
# splitting first and fitting on the training rows only.
df_preprocessed = preprocessor.fit_transform(df)

# Output columns follow the transformer order: numeric columns keep their
# names, then come the one-hot columns generated by the encoder. The encoder
# is only fitted when there is at least one categorical column, so guard the
# lookup instead of querying an unfitted estimator.
if categorical_features:
    encoded_feature_names = list(
        preprocessor.named_transformers_['cat']['onehot']
        .get_feature_names_out(categorical_features))
else:
    encoded_feature_names = []

# Convert the result back to a DataFrame with readable column labels
df_preprocessed = pd.DataFrame(
    df_preprocessed,
    columns=numerical_features + encoded_feature_names)

# Check the shape of the preprocessed data
print("Shape of preprocessed data:", df_preprocessed.shape)

# Save the preprocessed DataFrame to a new CSV file (row index omitted)
df_preprocessed.to_csv('telematics_syn_preprocessed.csv', index=False)

print("Preprocessing completed and saved to 'telematics_syn_preprocessed.csv'")


Missing values in each column:
 Duration                  0
Insured.age               0
Insured.sex               0
Car.age                   0
Marital                   0
Car.use                   0
Credit.score              0
Region                    0
Annual.miles.drive        0
Years.noclaims            0
Territory                 0
Annual.pct.driven         0
Total.miles.driven        0
Pct.drive.mon             0
Pct.drive.tue             0
Pct.drive.wed             0
Pct.drive.thr             0
Pct.drive.fri             0
Pct.drive.sat             0
Pct.drive.sun             0
Pct.drive.2hrs            0
Pct.drive.3hrs            0
Pct.drive.4hrs            0
Pct.drive.wkday           0
Pct.drive.wkend           0
Pct.drive.rush am         0
Pct.drive.rush pm         0
Avgdays.week              0
Accel.06miles             0
Accel.08miles             0
Accel.09miles             0
Accel.11miles             0
Accel.12miles             0
Accel.14miles             0
Brake.06miles   

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the preprocessed dataset
df_preprocessed = pd.read_csv('telematics_syn_preprocessed.csv')

# Target proportions for the train (70%), validation (15%) and test (15%) sets
train_size = 0.7
val_size = 0.15
test_size = 0.15

# The two-stage split below only reproduces these proportions when they cover
# the whole dataset: the second stage divides whatever is left after the
# training rows are taken, using the ratio test_size / (val_size + test_size).
if abs(train_size + val_size + test_size - 1.0) > 1e-9:
    raise ValueError("train_size, val_size and test_size must sum to 1")

# Stage 1: carve out the training set; the remainder (temp) holds the rows
# that will become validation + test.
train_df, temp_df = train_test_split(df_preprocessed, train_size=train_size, random_state=42)

# Stage 2: divide the remainder into validation and test sets
val_df, test_df = train_test_split(temp_df, test_size=test_size/(val_size + test_size), random_state=42)

# Sanity check: the three partitions must account for every input row
assert len(train_df) + len(val_df) + len(test_df) == len(df_preprocessed)

# Print the sizes of each set
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

# Save the splits to CSV files (optional)
train_df.to_csv('telematics_train.csv', index=False)
val_df.to_csv('telematics_val.csv', index=False)
test_df.to_csv('telematics_test.csv', index=False)

print("Dataset splits saved to 'telematics_train.csv', 'telematics_val.csv', and 'telematics_test.csv'")


Training set size: 70000
Validation set size: 15000
Test set size: 15000
Dataset splits saved to 'telematics_train.csv', 'telematics_val.csv', and 'telematics_test.csv'
