In [1]:
import pandas as pd

# Load the cleaned dataset; the path is relative to the notebook's working directory.
# NOTE(review): the saved output echoes this read_csv line the way Python
# warnings do - presumably a mixed-dtype warning; consider passing an explicit
# dtype= mapping once the affected columns are confirmed.
data = pd.read_csv('../data/cleaned_data.csv')

# Inspect the data: first rows, column dtypes / non-null counts, summary statistics
print(data.head())
# info() writes its report directly and returns None, so it must not be wrapped
# in print() - doing so appends a stray "None" line to the output.
data.info()
print(data.describe())


  data = pd.read_csv('../data/cleaned_data.csv')


   UnderwrittenCoverID  PolicyID TransactionMonth  IsVATRegistered  \
0               145249     12827       2015-03-01             True   
1               145249     12827       2015-05-01             True   
2               145249     12827       2015-07-01             True   
3               145255     12827       2015-05-01             True   
4               145255     12827       2015-07-01             True   

  Citizenship          LegalType Title Language                 Bank  \
0     Unknown  Close Corporation    Mr  English  First National Bank   
1     Unknown  Close Corporation    Mr  English  First National Bank   
2     Unknown  Close Corporation    Mr  English  First National Bank   
3     Unknown  Close Corporation    Mr  English  First National Bank   
4     Unknown  Close Corporation    Mr  English  First National Bank   

       AccountType  ...            CoverGroup              Section  \
0  Current account  ...  Comprehensive - Taxi  Motor Comprehensive   
1  Cur

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separate the prediction target from the explanatory columns
target_column = 'TotalClaims'  # Replace with your target column name
y = data[target_column]
X = data.drop(columns=[target_column])

# Partition the feature columns by dtype: 64-bit integer/float columns are
# treated as numerical, object columns as categorical.
# NOTE(review): columns of any other dtype (e.g. bool) end up in neither list.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numerical branch: fill gaps with the column mean, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


In [5]:
# Check data types of columns
print(X.dtypes)

# Normalise categorical columns to strings so every value in a column has the
# same type. Missing entries are deliberately kept as NaN: a plain astype(str)
# would turn them into the literal text 'nan', which isnull() / fillna() no
# longer recognise as missing, so they could never be counted or filled.
for col in categorical_features:
    X[col] = X[col].astype(str).where(X[col].notna())

# Ensure numerical columns are numeric; values that cannot be parsed become NaN
for col in numerical_features:
    X[col] = pd.to_numeric(X[col], errors='coerce')


UnderwrittenCoverID           int64
PolicyID                      int64
TransactionMonth             object
IsVATRegistered                bool
Citizenship                  object
LegalType                    object
Title                        object
Language                     object
Bank                         object
AccountType                  object
MaritalStatus                object
Gender                       object
Country                      object
Province                     object
PostalCode                    int64
MainCrestaZone               object
SubCrestaZone                object
ItemType                     object
mmcode                       object
VehicleType                  object
RegistrationYear              int64
make                         object
Model                        object
Cylinders                   float64
cubiccapacity               float64
kilowatts                   float64
bodytype                     object
NumberOfDoors               

In [6]:
# Report how many values are missing in each column
print(X.isnull().sum())

# Numerical gaps: replace with the mean of the respective column
column_means = X[numerical_features].mean()
X[numerical_features] = X[numerical_features].fillna(column_means)

# Categorical gaps: replace with the most frequent value of the respective
# column (the first row of mode() holds one mode per column)
column_modes = X[categorical_features].mode().iloc[0]
X[categorical_features] = X[categorical_features].fillna(column_modes)


UnderwrittenCoverID              0
PolicyID                         0
TransactionMonth                 0
IsVATRegistered                  0
Citizenship                      0
LegalType                        0
Title                            0
Language                         0
Bank                             0
AccountType                      0
MaritalStatus                    0
Gender                           0
Country                          0
Province                         0
PostalCode                       0
MainCrestaZone                   0
SubCrestaZone                    0
ItemType                         0
mmcode                           0
VehicleType                      0
RegistrationYear                 0
make                             0
Model                            0
Cylinders                        0
cubiccapacity                    0
kilowatts                        0
bodytype                         0
NumberOfDoors                    0
VehicleIntroDate    

In [7]:
# Explicit list of the columns to treat as categorical from here on
# (replaces the dtype-derived selection used earlier in the notebook)
categorical_features = [
    'TransactionMonth', 'Citizenship', 'LegalType', 'Title',
    'Language', 'Bank', 'AccountType', 'MaritalStatus',
    'Gender', 'Country', 'Province', 'MainCrestaZone',
    'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType',
    'bodytype', 'AlarmImmobiliser', 'TrackingDevice', 'NewVehicle',
    'WrittenOff', 'Rebuilt', 'Converted', 'TermFrequency',
    'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup',
    'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType',
]

# Cast every listed column to string in one vectorised step
X[categorical_features] = X[categorical_features].astype(str)


In [8]:
from sklearn.impute import SimpleImputer

# Numerical columns used from here on (replaces the earlier dtype-derived list)
numerical_features = ['CustomValueEstimate', 'CapitalOutstanding']

# Mean-impute those columns and write the filled values back into X.
# NOTE(review): the means are computed over every row of X, i.e. before the
# train/test split performed later in the notebook.
num_imputer = SimpleImputer(strategy='mean')
imputed_values = num_imputer.fit_transform(X[numerical_features])
X[numerical_features] = imputed_values

# Confirm that no missing values remain
print(X.isnull().sum())


UnderwrittenCoverID         0
PolicyID                    0
TransactionMonth            0
IsVATRegistered             0
Citizenship                 0
LegalType                   0
Title                       0
Language                    0
Bank                        0
AccountType                 0
MaritalStatus               0
Gender                      0
Country                     0
Province                    0
PostalCode                  0
MainCrestaZone              0
SubCrestaZone               0
ItemType                    0
mmcode                      0
VehicleType                 0
RegistrationYear            0
make                        0
Model                       0
Cylinders                   0
cubiccapacity               0
kilowatts                   0
bodytype                    0
NumberOfDoors               0
VehicleIntroDate            0
CustomValueEstimate         0
AlarmImmobiliser            0
TrackingDevice              0
CapitalOutstanding          0
NewVehicle

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Column-wise preprocessing: standardise the numerical columns and one-hot
# encode the categorical ones (categories unseen during fitting are ignored)
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Full model: preprocessing followed by an ordinary least-squares regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the rows for evaluation, using a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit preprocessing + regression on the training rows only
pipeline.fit(X_train, y_train)

# Predict on both splits so training and test scores can be compared
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

# Score each split: R^2 and mean squared error
train_r2 = r2_score(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

# Emit the four metrics, one per line
print(
    f"Training R^2 score: {train_r2}",
    f"Test R^2 score: {test_r2}",
    f"Training MSE: {train_mse}",
    f"Test MSE: {test_mse}",
    sep="\n",
)


Training R^2 score: 0.0071635126624041146
Test R^2 score: 0.004492887130918666
Training MSE: 5841017.590901817
Test MSE: 4864433.100709145
