In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import xgboost as xgb
import math

from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

from sklearn.model_selection import train_test_split
import datetime


import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

# ***`Ouput from webpage form`***

In [2]:
output = {
    "Age"                            : 21,
    "Gender"                         : "Male - :material/male:",
    "Annual Income"                  : 5124,
    "Marital Status"                 : "Single - :material/man_4:",
    "Number of Dependents"           : 3,
    "Education Level"                : "Master's",
    "Occupation"                     : "Unemployed",
    "Health Score"                   : 784,
    "Location"                       : "Urban",
    "Policy Type"                    : "Basic",
    "Previous Claims"                : 2,
    "Vehicle Age"                    : 8,
    "Credit Score"                   : 7844.99,
    "Insurance Duration"             : 8,
    "Policy Start Date"              : datetime.date(2022, 8, 17),
    "Customer Feedback"              : "Good",
    "Smoking Status"                 : "No",
    "Exercise Frequency"             : "Weekly",
    "Property Type"                  : "House",
}

In [3]:
output["IsNull_Age"] = 0
output["IsNull_Annual Income"] = 0
output["IsNull_Marital Status"] = 0
output["IsNull_Number of Dependents"] = 0
output["IsNull_Occupation"] = 0
output["IsNull_Health Score"] = 0
output["IsNull_Previous Claims"] = 0
output["IsNull_Vehicle Age"] = 0
output["IsNull_Credit Score"] = 0
output["IsNull_Insurance Duration"] = 0
output["IsNull_Customer Feedback"] = 0

output["Gender"] = output["Gender"].split(" - ")[0]

output["Marital Status"] = output["Marital Status"].split(" - ")[0]

In [4]:
output

{'Age': 21,
 'Gender': 'Male',
 'Annual Income': 5124,
 'Marital Status': 'Single',
 'Number of Dependents': 3,
 'Education Level': "Master's",
 'Occupation': 'Unemployed',
 'Health Score': 784,
 'Location': 'Urban',
 'Policy Type': 'Basic',
 'Previous Claims': 2,
 'Vehicle Age': 8,
 'Credit Score': 7844.99,
 'Insurance Duration': 8,
 'Policy Start Date': datetime.date(2022, 8, 17),
 'Customer Feedback': 'Good',
 'Smoking Status': 'No',
 'Exercise Frequency': 'Weekly',
 'Property Type': 'House',
 'IsNull_Age': 0,
 'IsNull_Annual Income': 0,
 'IsNull_Marital Status': 0,
 'IsNull_Number of Dependents': 0,
 'IsNull_Occupation': 0,
 'IsNull_Health Score': 0,
 'IsNull_Previous Claims': 0,
 'IsNull_Vehicle Age': 0,
 'IsNull_Credit Score': 0,
 'IsNull_Insurance Duration': 0,
 'IsNull_Customer Feedback': 0}

In [5]:
df = pd.DataFrame(output, index=[0])

In [6]:
df

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback
0,21,Male,5124,Single,3,Master's,Unemployed,784,Urban,Basic,2,8,7844.99,8,2022-08-17,Good,No,Weekly,House,0,0,0,0,0,0,0,0,0,0,0


In [7]:
df.shape

(1, 30)

In [8]:
smoke = df["Smoking Status"].replace({"Yes" : 0, "No" : 1})


ex = df["Exercise Frequency"].replace({"Rarely" : 0, "Monthly" : 1, "Weekly" : 2, "Daily" : 3})


bins = [0, 30, 53, float('inf')]
labels = [0, 1, 2]
age = pd.cut(df['Age'], bins=bins, labels=labels, right=False)


bins = [0, 16.285503904803008, 33.959695457149195, float('inf')]
labels = [0, 1, 2]
health = pd.cut(df['Health Score'], bins=bins, labels=labels, right=False)

# ==============================================================================

df["Health Conscious Level"] = smoke + ex + age.astype('int') + health.astype('int')

# ==============================================================================

smoke = df["Smoking Status"].replace({"Yes" : 2, "No" : 4})


ex = df["Exercise Frequency"].replace({"Rarely" : 2, "Monthly" : 4, "Weekly" : 8, "Daily" : 16})


age = df['Age']


health = df['Health Score']

# ==============================================================================

df["Health Conscious Level1"] = smoke * ex * age * health

# ==============================================================================

df["Money Per Head"] = df["Annual Income"] / df["Number of Dependents"].where(df["Number of Dependents"] != 0, 1)

# ==============================================================================

df["Money Handling Level"] = df["Annual Income"] * df["Credit Score"]

# ==============================================================================

df["Money Handling Level1"] = df["Annual Income"] / df["Credit Score"]

# ==============================================================================

df["Growth"] = df["Education Level"].replace({"High School" : 1, "Bachelor's" : 2, "Master's" : 3, "PhD" : 4}) * df["Annual Income"]

# ==============================================================================

df["Growth1"] = df["Annual Income"] / df["Education Level"].replace({"High School" : 1, "Bachelor's" : 2, "Master's" : 3, "PhD" : 4})

# ==============================================================================

df["Determinstic"] = df["Annual Income"] * (1 / df["Age"])

# ==============================================================================

df["Day_Name"] = df["Policy Start Date"][0].strftime("%A")

# ==============================================================================

df["Credit by Score"] = df["Credit Score"]/df["Previous Claims"].where(df["Previous Claims"] != 0, 1)

# ==============================================================================

df['CreditInsurance'] = df['Credit Score'] * df['Insurance Duration']

# ==============================================================================

df['Health_Risk_Score'] = df['Smoking Status'].apply(lambda x: 1 if x == 'Smoker' else 0) + df['Exercise Frequency'].apply(lambda x: 1 if x == 'Low' else (0.5 if x == 'Medium' else 0)) + (100 - df['Health Score']) / 20

# ==============================================================================

df['Credit_Health_Score'] = df['Credit Score'] * df['Health Score']

# ==============================================================================

df['Health_Age_Interaction'] = df['Health Score'] * df['Age']

# ==============================================================================

df["Feedback1"] = df["Annual Income"] * df["Customer Feedback"].replace({"Poor" : 2, "Average" : 4, "Good" : 8})

# ==============================================================================

df["Feedback2"] = df["Credit Score"] * df["Customer Feedback"].replace({"Poor" : 2, "Average" : 4, "Good" : 8})

# ==============================================================================

df["Feedback3"] = df["Previous Claims"] * df["Customer Feedback"].replace({"Poor" : 2, "Average" : 4, "Good" : 8})

# ==============================================================================

df["Feedback4"] = df["Health Score"] * df["Customer Feedback"].replace({"Poor" : 2, "Average" : 4, "Good" : 8})

# ==============================================================================

df['Total Nulls'] = 0


In [9]:
df.dtypes

Age                              int64
Gender                          object
Annual Income                    int64
Marital Status                  object
Number of Dependents             int64
Education Level                 object
Occupation                      object
Health Score                     int64
Location                        object
Policy Type                     object
Previous Claims                  int64
Vehicle Age                      int64
Credit Score                   float64
Insurance Duration               int64
Policy Start Date               object
Customer Feedback               object
Smoking Status                  object
Exercise Frequency              object
Property Type                   object
IsNull_Age                       int64
IsNull_Annual Income             int64
IsNull_Marital Status            int64
IsNull_Number of Dependents      int64
IsNull_Occupation                int64
IsNull_Health Score              int64
IsNull_Previous Claims   

In [10]:
df.shape

(1, 49)

In [11]:
df["Policy Start Date - Day"] = df["Policy Start Date"][0].day
df["Policy Start Date - Month"] = df["Policy Start Date"][0].month
df["Policy Start Date - Year"] = df["Policy Start Date"][0].year

In [12]:
df["Policy Start Date - Quarter"] = str(df["Policy Start Date"][0].year) + " Q" + str(math.ceil(df["Policy Start Date"][0].month/3))

In [13]:
df["Sin_Year"] = np.sin(2 * np.pi * df["Policy Start Date - Year"].astype('int64'))
df["Cos_Year"] = np.cos(2 * np.pi * df["Policy Start Date - Year"].astype('int64'))

In [14]:
df["Sin_Month"] = np.sin(2 * np.pi * df["Policy Start Date - Month"].astype('int64'))
df["Cos_Month"] = np.cos(2 * np.pi * df["Policy Start Date - Month"].astype('int64'))

In [15]:
df.drop(columns="Policy Start Date", inplace=True)

In [16]:
df["Health Conscious Level"] = df["Health Conscious Level"].astype("O")

In [17]:
magics = [
        "Number of Dependents_MIN_Premium Amount", 
        "Number of Dependents_MEAN_Premium Amount",
        "Number of Dependents_Q1_Premium Amount",               
        "Number of Dependents_MEDIAN_Premium Amount",           
        "Number of Dependents_Q3_Premium Amount",               
        "Number of Dependents_STD_Premium Amount",              
        "Number of Dependents_MAX_Premium Amount",              
        "Occupation_MIN_Premium Amount",                        
        "Occupation_MEAN_Premium Amount",                       
        "Occupation_Q1_Premium Amount",                         
        "Occupation_MEDIAN_Premium Amount",                     
        "Occupation_Q3_Premium Amount",                         
        "Occupation_STD_Premium Amount",                        
        "Occupation_MAX_Premium Amount",                        
        "Education Level_MIN_Premium Amount",                   
        "Education Level_MEAN_Premium Amount",                  
        "Education Level_Q1_Premium Amount",                    
        "Education Level_MEDIAN_Premium Amount",                
        "Education Level_Q3_Premium Amount",                    
        "Education Level_STD_Premium Amount",                   
        "Education Level_MAX_Premium Amount",                   
        "Previous Claims_MIN_Premium Amount",                   
        "Previous Claims_MEAN_Premium Amount",                  
        "Previous Claims_Q1_Premium Amount",                    
        "Previous Claims_MEDIAN_Premium Amount",                
        "Previous Claims_Q3_Premium Amount",                    
        "Previous Claims_STD_Premium Amount",                   
        "Previous Claims_MAX_Premium Amount",                   
        "Health Conscious Level_MIN_Premium Amount",            
        "Health Conscious Level_MEAN_Premium Amount",           
        "Health Conscious Level_Q1_Premium Amount",             
        "Health Conscious Level_MEDIAN_Premium Amount",         
        "Health Conscious Level_Q3_Premium Amount",             
        "Health Conscious Level_STD_Premium Amount",            
        "Health Conscious Level_MAX_Premium Amount",            
        "Insurance Duration_MIN_Premium Amount",                
        "Insurance Duration_MEAN_Premium Amount",               
        "Insurance Duration_Q1_Premium Amount",                 
        "Insurance Duration_MEDIAN_Premium Amount",             
        "Insurance Duration_Q3_Premium Amount",                 
        "Insurance Duration_STD_Premium Amount",                
        "Insurance Duration_MAX_Premium Amount"   
]


In [18]:
for i in magics:
    column, operation, target = i.split("_")
    dummy = pd.read_csv(f"do_magics/{i}.csv")
    df[i] = dummy[dummy[column] == df[column][0]][target].values[0]

In [19]:
df

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4,Total Nulls,Policy Start Date - Day,Policy Start Date - Month,Policy Start Date - Year,Policy Start Date - Quarter,Sin_Year,Cos_Year,Sin_Month,Cos_Month,Number of Dependents_MIN_Premium Amount,Number of Dependents_MEAN_Premium Amount,Number of Dependents_Q1_Premium Amount,Number of Dependents_MEDIAN_Premium Amount,Number of Dependents_Q3_Premium Amount,Number of Dependents_STD_Premium Amount,Number of Dependents_MAX_Premium Amount,Occupation_MIN_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_Q1_Premium Amount,Occupation_MEDIAN_Premium Amount,Occupation_Q3_Premium Amount,Occupation_STD_Premium Amount,Occupation_MAX_Premium Amount,Education Level_MIN_Premium Amount,Education Level_MEAN_Premium Amount,Education Level_Q1_Premium Amount,Education Level_MEDIAN_Premium Amount,Education Level_Q3_Premium Amount,Education Level_STD_Premium Amount,Education Level_MAX_Premium Amount,Previous Claims_MIN_Premium Amount,Previous Claims_MEAN_Premium Amount,Previous Claims_Q1_Premium Amount,Previous Claims_MEDIAN_Premium Amount,Previous Claims_Q3_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims_MAX_Premium Amount,Health Conscious Level_MIN_Premium Amount,Health Conscious Level_MEAN_Premium Amount,Health Conscious Level_Q1_Premium Amount,Health Conscious Level_MEDIAN_Premium Amount,Health Conscious Level_Q3_Premium Amount,Health Conscious Level_STD_Premium Amount,Health Conscious Level_MAX_Premium Amount,Insurance Duration_MIN_Premium Amount,Insurance Duration_MEAN_Premium Amount,Insurance Duration_Q1_Premium Amount,Insurance Duration_MEDIAN_Premium Amount,Insurance Duration_Q3_Premium Amount,Insurance Duration_STD_Premium Amount,Insurance Duration_MAX_Premium Amount
0,21,Male,5124,Single,3,Master's,Unemployed,784,Urban,Basic,2,8,7844.99,8,Good,No,Weekly,House,0,0,0,0,0,0,0,0,0,0,0,5,526848,1708.0,40197728.76,0.653156,15372,1708.0,244.0,Wednesday,3922.495,62759.92,-34.2,6150472.16,16464,40992,62759.92,16,6272,0,17,8,2022,2022 Q3,-1.305266e-12,1.0,-1.959435e-15,1.0,20.0,1104.006551,514.0,875.0,1513.0,864.955881,4997.0,20.0,1103.361209,514.0,872.0,1508.0,867.02349,4997.0,20.0,1102.113989,513.0,871.0,1512.0,866.235322,4997.0,20.0,1151.583106,526.0,907.0,1606.0,898.40295,4988.0,20.0,1102.591279,514.0,872.0,1509.0,864.171931,4994.0,20.0,1105.876809,514.0,876.0,1514.0,868.009584,4994.0


In [20]:
df.shape

(1, 98)

# Unpack Pickle

In [21]:
import gzip
import pickle

def unpack_pickle(foldername, filename, columnname):
    with gzip.open(f"{foldername}/{filename}.pkl.gz", 'rb') as f:
        model = pickle.load(f)
        return model.transform(df[[columnname]]).flatten()


# Encoding

In [22]:
df["Policy Start Date - Year"] = df["Policy Start Date - Year"].astype("O")

############################################
df["ENCODED_Policy Start Date - Year"] = unpack_pickle(foldername="do_encodings", filename="ENCODED_Policy Start Date - Year", columnname="Policy Start Date - Year")
df.drop(columns="Policy Start Date - Year", inplace=True)

In [23]:
############################################
df["ENCODED_Policy Start Date - Quarter"] = unpack_pickle(foldername="do_encodings", filename="ENCODED_Policy Start Date - Quarter", columnname="Policy Start Date - Quarter")
df.drop(columns="Policy Start Date - Quarter", inplace=True)

In [24]:
############################################
df["ENCODED_Customer Feedback"] = unpack_pickle(foldername="do_encodings", filename="ENCODED_Customer Feedback", columnname="Customer Feedback")
df.drop(columns="Customer Feedback", inplace=True)

In [25]:
with gzip.open("do_encodings/ENCODED_Occupation.pkl.gz", 'rb') as f:
        model = pickle.load(f)

b = pd.DataFrame(
        model.transform(df[["Occupation"]]),
        columns="ENCODED_" + model.get_feature_names_out()
    )

df = pd.concat([df, b], axis=1)
df.drop(columns="Occupation", inplace=True)

In [26]:
with gzip.open("do_encodings/ENCODED_Marital Status.pkl.gz", 'rb') as f:
        model = pickle.load(f)

b = pd.DataFrame(
        model.transform(df[["Marital Status"]]),
        columns="ENCODED_" + model.get_feature_names_out()
    )

df = pd.concat([df, b], axis=1)
df.drop(columns="Marital Status", inplace=True)

In [27]:
df

Unnamed: 0,Age,Gender,Annual Income,Number of Dependents,Education Level,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Smoking Status,Exercise Frequency,Property Type,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4,Total Nulls,Policy Start Date - Day,Policy Start Date - Month,Sin_Year,Cos_Year,Sin_Month,Cos_Month,Number of Dependents_MIN_Premium Amount,Number of Dependents_MEAN_Premium Amount,Number of Dependents_Q1_Premium Amount,Number of Dependents_MEDIAN_Premium Amount,Number of Dependents_Q3_Premium Amount,Number of Dependents_STD_Premium Amount,Number of Dependents_MAX_Premium Amount,Occupation_MIN_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_Q1_Premium Amount,Occupation_MEDIAN_Premium Amount,Occupation_Q3_Premium Amount,Occupation_STD_Premium Amount,Occupation_MAX_Premium Amount,Education Level_MIN_Premium Amount,Education Level_MEAN_Premium Amount,Education Level_Q1_Premium Amount,Education Level_MEDIAN_Premium Amount,Education Level_Q3_Premium Amount,Education Level_STD_Premium Amount,Education Level_MAX_Premium Amount,Previous Claims_MIN_Premium Amount,Previous Claims_MEAN_Premium Amount,Previous Claims_Q1_Premium Amount,Previous Claims_MEDIAN_Premium Amount,Previous Claims_Q3_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims_MAX_Premium Amount,Health Conscious Level_MIN_Premium Amount,Health Conscious Level_MEAN_Premium Amount,Health Conscious Level_Q1_Premium Amount,Health Conscious Level_MEDIAN_Premium Amount,Health Conscious Level_Q3_Premium Amount,Health Conscious Level_STD_Premium Amount,Health Conscious Level_MAX_Premium Amount,Insurance Duration_MIN_Premium Amount,Insurance Duration_MEAN_Premium Amount,Insurance Duration_Q1_Premium Amount,Insurance Duration_MEDIAN_Premium Amount,Insurance Duration_Q3_Premium Amount,Insurance Duration_STD_Premium Amount,Insurance Duration_MAX_Premium Amount,ENCODED_Policy Start Date - Year,ENCODED_Policy Start Date - Quarter,ENCODED_Customer Feedback,ENCODED_Occupation_Self-Employed,ENCODED_Occupation_Unemployed,ENCODED_Marital Status_Married,ENCODED_Marital Status_Single
0,21,Male,5124,3,Master's,784,Urban,Basic,2,8,7844.99,8,No,Weekly,House,0,0,0,0,0,0,0,0,0,0,0,5,526848,1708.0,40197728.76,0.653156,15372,1708.0,244.0,Wednesday,3922.495,62759.92,-34.2,6150472.16,16464,40992,62759.92,16,6272,0,17,8,-1.305266e-12,1.0,-1.959435e-15,1.0,20.0,1104.006551,514.0,875.0,1513.0,864.955881,4997.0,20.0,1103.361209,514.0,872.0,1508.0,867.02349,4997.0,20.0,1102.113989,513.0,871.0,1512.0,866.235322,4997.0,20.0,1151.583106,526.0,907.0,1606.0,898.40295,4988.0,20.0,1102.591279,514.0,872.0,1509.0,864.171931,4994.0,20.0,1105.876809,514.0,876.0,1514.0,868.009584,4994.0,3.0,12.0,2.0,0.0,1.0,0.0,1.0


In [28]:
df = df[[
         'Annual Income',
         'Credit Score',
         'IsNull_Annual Income',
         'Money Handling Level',
         'Money Handling Level1',
         'Money Per Head',
         'Growth',
         'Credit by Score',
         'Determinstic',
         'Growth1',
         'Feedback1',
         'Previous Claims_MEDIAN_Premium Amount',
         'IsNull_Health Score',
         'Previous Claims_MEAN_Premium Amount',
         'Previous Claims',
         'Previous Claims_STD_Premium Amount',
         'Previous Claims_Q3_Premium Amount',
         'Previous Claims_Q1_Premium Amount',
         'IsNull_Customer Feedback',
         'Previous Claims_MAX_Premium Amount',
         'Feedback3',
         'IsNull_Previous Claims',
         'IsNull_Marital Status',
         'Health Score',
         'Health_Risk_Score',
         'Feedback2',
         'CreditInsurance',
         'Sin_Year',
         'IsNull_Credit Score',
         'Health_Age_Interaction',
         'Total Nulls',
         'ENCODED_Policy Start Date - Year',
         'ENCODED_Policy Start Date - Quarter',
         'Feedback4',
         'IsNull_Number of Dependents',
         'IsNull_Occupation',
         'Health Conscious Level1',
         'Sin_Month',
         'Policy Start Date - Month',
         'Health Conscious Level',
         'Health Conscious Level_Q1_Premium Amount',
         'Health Conscious Level_MEAN_Premium Amount',
         'Health Conscious Level_MEDIAN_Premium Amount',
         'Number of Dependents_MEAN_Premium Amount',
         'Number of Dependents_MEDIAN_Premium Amount',
         'Number of Dependents_Q1_Premium Amount',
         'Number of Dependents_Q3_Premium Amount',
         'Number of Dependents_STD_Premium Amount',
         'Health Conscious Level_Q3_Premium Amount',
         'Insurance Duration_MEAN_Premium Amount',
         'Insurance Duration_MEDIAN_Premium Amount',
         'Insurance Duration_Q1_Premium Amount',
         'Insurance Duration_Q3_Premium Amount',
         'Health Conscious Level_MAX_Premium Amount',
         'Credit_Health_Score',
         'Occupation_Q3_Premium Amount',
         'Occupation_MEAN_Premium Amount',
         'Occupation_MAX_Premium Amount',
         'Occupation_MEDIAN_Premium Amount',
         'Occupation_Q1_Premium Amount',
         'Previous Claims_MIN_Premium Amount',
         'Insurance Duration_MAX_Premium Amount',
         'ENCODED_Occupation_Self-Employed',
         'Age',
         'Insurance Duration_STD_Premium Amount',
         'Occupation_STD_Premium Amount'
   ]]

In [29]:
df.shape

(1, 66)

In [30]:
df

Unnamed: 0,Annual Income,Credit Score,IsNull_Annual Income,Money Handling Level,Money Handling Level1,Money Per Head,Growth,Credit by Score,Determinstic,Growth1,Feedback1,Previous Claims_MEDIAN_Premium Amount,IsNull_Health Score,Previous Claims_MEAN_Premium Amount,Previous Claims,Previous Claims_STD_Premium Amount,Previous Claims_Q3_Premium Amount,Previous Claims_Q1_Premium Amount,IsNull_Customer Feedback,Previous Claims_MAX_Premium Amount,Feedback3,IsNull_Previous Claims,IsNull_Marital Status,Health Score,Health_Risk_Score,Feedback2,CreditInsurance,Sin_Year,IsNull_Credit Score,Health_Age_Interaction,Total Nulls,ENCODED_Policy Start Date - Year,ENCODED_Policy Start Date - Quarter,Feedback4,IsNull_Number of Dependents,IsNull_Occupation,Health Conscious Level1,Sin_Month,Policy Start Date - Month,Health Conscious Level,Health Conscious Level_Q1_Premium Amount,Health Conscious Level_MEAN_Premium Amount,Health Conscious Level_MEDIAN_Premium Amount,Number of Dependents_MEAN_Premium Amount,Number of Dependents_MEDIAN_Premium Amount,Number of Dependents_Q1_Premium Amount,Number of Dependents_Q3_Premium Amount,Number of Dependents_STD_Premium Amount,Health Conscious Level_Q3_Premium Amount,Insurance Duration_MEAN_Premium Amount,Insurance Duration_MEDIAN_Premium Amount,Insurance Duration_Q1_Premium Amount,Insurance Duration_Q3_Premium Amount,Health Conscious Level_MAX_Premium Amount,Credit_Health_Score,Occupation_Q3_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_MAX_Premium Amount,Occupation_MEDIAN_Premium Amount,Occupation_Q1_Premium Amount,Previous Claims_MIN_Premium Amount,Insurance Duration_MAX_Premium Amount,ENCODED_Occupation_Self-Employed,Age,Insurance Duration_STD_Premium Amount,Occupation_STD_Premium Amount
0,5124,7844.99,0,40197728.76,0.653156,1708.0,15372,3922.495,244.0,1708.0,40992,907.0,0,1151.583106,2,898.40295,1606.0,526.0,0,4988.0,16,0,0,784,-34.2,62759.92,62759.92,-1.305266e-12,0,16464,0,3.0,12.0,6272,0,0,526848,-1.959435e-15,8,5,514.0,1102.591279,872.0,1104.006551,875.0,514.0,1513.0,864.955881,1509.0,1105.876809,876.0,514.0,1514.0,4994.0,6150472.16,1508.0,1103.361209,4997.0,872.0,514.0,20.0,4994.0,0.0,21,868.009584,867.02349


In [31]:
for i in df.columns:
    with gzip.open(f"do_scalings/SCALER_{i}.pkl.gz", 'rb') as f:
            model = pickle.load(f)

    df[f"SCALER_{i}"] = model.transform(df[[i]]).flatten()
    df.drop(columns=i, inplace=True)

In [32]:
df

Unnamed: 0,SCALER_Annual Income,SCALER_Credit Score,SCALER_IsNull_Annual Income,SCALER_Money Handling Level,SCALER_Money Handling Level1,SCALER_Money Per Head,SCALER_Growth,SCALER_Credit by Score,SCALER_Determinstic,SCALER_Growth1,SCALER_Feedback1,SCALER_Previous Claims_MEDIAN_Premium Amount,SCALER_IsNull_Health Score,SCALER_Previous Claims_MEAN_Premium Amount,SCALER_Previous Claims,SCALER_Previous Claims_STD_Premium Amount,SCALER_Previous Claims_Q3_Premium Amount,SCALER_Previous Claims_Q1_Premium Amount,SCALER_IsNull_Customer Feedback,SCALER_Previous Claims_MAX_Premium Amount,SCALER_Feedback3,SCALER_IsNull_Previous Claims,SCALER_IsNull_Marital Status,SCALER_Health Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_IsNull_Credit Score,SCALER_Health_Age_Interaction,SCALER_Total Nulls,SCALER_ENCODED_Policy Start Date - Year,SCALER_ENCODED_Policy Start Date - Quarter,SCALER_Feedback4,SCALER_IsNull_Number of Dependents,SCALER_IsNull_Occupation,SCALER_Health Conscious Level1,SCALER_Sin_Month,SCALER_Policy Start Date - Month,SCALER_Health Conscious Level,SCALER_Health Conscious Level_Q1_Premium Amount,SCALER_Health Conscious Level_MEAN_Premium Amount,SCALER_Health Conscious Level_MEDIAN_Premium Amount,SCALER_Number of Dependents_MEAN_Premium Amount,SCALER_Number of Dependents_MEDIAN_Premium Amount,SCALER_Number of Dependents_Q1_Premium Amount,SCALER_Number of Dependents_Q3_Premium Amount,SCALER_Number of Dependents_STD_Premium Amount,SCALER_Health Conscious Level_Q3_Premium Amount,SCALER_Insurance Duration_MEAN_Premium Amount,SCALER_Insurance Duration_MEDIAN_Premium Amount,SCALER_Insurance Duration_Q1_Premium Amount,SCALER_Insurance Duration_Q3_Premium Amount,SCALER_Health Conscious Level_MAX_Premium Amount,SCALER_Credit_Health_Score,SCALER_Occupation_Q3_Premium Amount,SCALER_Occupation_MEAN_Premium Amount,SCALER_Occupation_MAX_Premium Amount,SCALER_Occupation_MEDIAN_Premium Amount,SCALER_Occupation_Q1_Premium Amount,SCALER_Previous Claims_MIN_Premium Amount,SCALER_Insurance Duration_MAX_Premium Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Age,SCALER_Insurance Duration_STD_Premium Amount,SCALER_Occupation_STD_Premium Amount
0,-0.525946,31.094378,0.0,1.252015,-0.579165,-0.440886,-0.373098,11.004838,-0.349291,-0.438079,-0.280343,52.0,0.0,57.590036,1.0,30.472002,32.5,9.0,0.0,-4.5,1.5,0.0,0.0,42.951011,-42.951011,26.691388,22.418662,-1.0,0.0,18.439739,-0.5,0.0,0.2,56.54399,0.0,0.0,21.366473,-4.898587e-16,0.4,0.5,0.0,0.0,0.2,0.0,0.125,0.0,0.2,0.0,0.0,0.260301,1.0,-1.0,0.625,0.0,543.60887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.869565,0.822189,0.469401


#
---
#

In [43]:
new_cols = ['SCALER_Annual_Income', 'SCALER_Credit_Score', 'SCALER_IsNull_Annual_Income', 'SCALER_Money_Handling_Level', 'SCALER_Money_Handling_Level1', 'SCALER_Money_Per_Head', 'SCALER_Growth', 'SCALER_Credit_by_Score', 'SCALER_Determinstic', 'SCALER_Growth1', 'SCALER_Feedback1', 'SCALER_Previous_Claims_MEDIAN_Premium_Amount', 'SCALER_IsNull_Health_Score', 'SCALER_Previous_Claims_MEAN_Premium_Amount', 'SCALER_Previous_Claims', 'SCALER_Previous_Claims_STD_Premium_Amount', 'SCALER_Previous_Claims_Q3_Premium_Amount', 'SCALER_Previous_Claims_Q1_Premium_Amount', 'SCALER_IsNull_Customer_Feedback', 'SCALER_Previous_Claims_MAX_Premium_Amount', 'SCALER_Feedback3', 'SCALER_IsNull_Previous_Claims', 'SCALER_IsNull_Marital_Status', 'SCALER_Health_Score', 'SCALER_Health_Risk_Score', 'SCALER_Feedback2', 'SCALER_CreditInsurance', 'SCALER_Sin_Year', 'SCALER_IsNull_Credit_Score', 'SCALER_Health_Age_Interaction', 'SCALER_Total_Nulls', 'SCALER_ENCODED_Policy_Start_Date_-_Year', 'SCALER_ENCODED_Policy_Start_Date_-_Quarter', 'SCALER_Feedback4', 'SCALER_IsNull_Number_of_Dependents', 'SCALER_IsNull_Occupation', 'SCALER_Health_Conscious_Level1', 'SCALER_Sin_Month', 'SCALER_Policy_Start_Date_-_Month', 'SCALER_Health_Conscious_Level', 'SCALER_Health_Conscious_Level_Q1_Premium_Amount', 'SCALER_Health_Conscious_Level_MEAN_Premium_Amount', 'SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount', 'SCALER_Number_of_Dependents_MEAN_Premium_Amount', 'SCALER_Number_of_Dependents_MEDIAN_Premium_Amount', 'SCALER_Number_of_Dependents_Q1_Premium_Amount', 'SCALER_Number_of_Dependents_Q3_Premium_Amount', 'SCALER_Number_of_Dependents_STD_Premium_Amount', 'SCALER_Health_Conscious_Level_Q3_Premium_Amount', 'SCALER_Insurance_Duration_MEAN_Premium_Amount', 'SCALER_Insurance_Duration_MEDIAN_Premium_Amount', 'SCALER_Insurance_Duration_Q1_Premium_Amount', 'SCALER_Insurance_Duration_Q3_Premium_Amount', 'SCALER_Health_Conscious_Level_MAX_Premium_Amount', 'SCALER_Credit_Health_Score', 'SCALER_Occupation_Q3_Premium_Amount', 'SCALER_Occupation_MEAN_Premium_Amount', 'SCALER_Occupation_MAX_Premium_Amount', 'SCALER_Occupation_MEDIAN_Premium_Amount', 'SCALER_Occupation_Q1_Premium_Amount', 'SCALER_Previous_Claims_MIN_Premium_Amount', 'SCALER_Insurance_Duration_MAX_Premium_Amount', 'SCALER_ENCODED_Occupation_Self-Employed', 'SCALER_Age', 'SCALER_Insurance_Duration_STD_Premium_Amount', 'SCALER_Occupation_STD_Premium_Amount']
old_cols = ['SCALER_Annual Income', 'SCALER_Credit Score', 'SCALER_IsNull_Annual Income', 'SCALER_Money Handling Level', 'SCALER_Money Handling Level1', 'SCALER_Money Per Head', 'SCALER_Growth', 'SCALER_Credit by Score', 'SCALER_Determinstic', 'SCALER_Growth1', 'SCALER_Feedback1', 'SCALER_Previous Claims_MEDIAN_Premium Amount', 'SCALER_IsNull_Health Score', 'SCALER_Previous Claims_MEAN_Premium Amount', 'SCALER_Previous Claims', 'SCALER_Previous Claims_STD_Premium Amount', 'SCALER_Previous Claims_Q3_Premium Amount', 'SCALER_Previous Claims_Q1_Premium Amount', 'SCALER_IsNull_Customer Feedback', 'SCALER_Previous Claims_MAX_Premium Amount', 'SCALER_Feedback3', 'SCALER_IsNull_Previous Claims', 'SCALER_IsNull_Marital Status', 'SCALER_Health Score', 'SCALER_Health_Risk_Score', 'SCALER_Feedback2', 'SCALER_CreditInsurance', 'SCALER_Sin_Year', 'SCALER_IsNull_Credit Score', 'SCALER_Health_Age_Interaction', 'SCALER_Total Nulls', 'SCALER_ENCODED_Policy Start Date - Year', 'SCALER_ENCODED_Policy Start Date - Quarter', 'SCALER_Feedback4', 'SCALER_IsNull_Number of Dependents', 'SCALER_IsNull_Occupation', 'SCALER_Health Conscious Level1', 'SCALER_Sin_Month', 'SCALER_Policy Start Date - Month', 'SCALER_Health Conscious Level', 'SCALER_Health Conscious Level_Q1_Premium Amount', 'SCALER_Health Conscious Level_MEAN_Premium Amount', 'SCALER_Health Conscious Level_MEDIAN_Premium Amount', 'SCALER_Number of Dependents_MEAN_Premium Amount', 'SCALER_Number of Dependents_MEDIAN_Premium Amount', 'SCALER_Number of Dependents_Q1_Premium Amount', 'SCALER_Number of Dependents_Q3_Premium Amount', 'SCALER_Number of Dependents_STD_Premium Amount', 'SCALER_Health Conscious Level_Q3_Premium Amount', 'SCALER_Insurance Duration_MEAN_Premium Amount', 'SCALER_Insurance Duration_MEDIAN_Premium Amount', 'SCALER_Insurance Duration_Q1_Premium Amount', 'SCALER_Insurance Duration_Q3_Premium Amount', 'SCALER_Health Conscious Level_MAX_Premium Amount', 'SCALER_Credit_Health_Score', 'SCALER_Occupation_Q3_Premium Amount', 'SCALER_Occupation_MEAN_Premium Amount', 'SCALER_Occupation_MAX_Premium Amount', 'SCALER_Occupation_MEDIAN_Premium Amount', 'SCALER_Occupation_Q1_Premium Amount', 'SCALER_Previous Claims_MIN_Premium Amount', 'SCALER_Insurance Duration_MAX_Premium Amount', 'SCALER_ENCODED_Occupation_Self-Employed', 'SCALER_Age', 'SCALER_Insurance Duration_STD_Premium Amount', 'SCALER_Occupation_STD_Premium Amount']

In [46]:
df.rename(columns={old : new for old, new in zip(old_cols, new_cols)}, inplace=True)

In [47]:
with gzip.open('models.pkl.gz', 'rb') as f:
    ml_model = pickle.load(f)

In [49]:
avg_error = 0
for model in ml_model:
    avg_error += model.predict(df)

avg_error /= 10
avg_error

array([6.67539252])

In [50]:
np.expm1(avg_error)

array([791.65852442])

# ***`Input that ML Model needs`***

In [2]:
# Health Conscious Level                                 4.000000e+00
# SCALER_Annual_Income                                  -5.726975e-01
# SCALER_Credit_Score                                   -9.656652e-01
# SCALER_IsNull_Annual_Income                            0.000000e+00
# SCALER_Money_Handling_Level                           -6.168446e-01
# SCALER_Money_Handling_Level1                          -4.493340e-01
# SCALER_Money_Per_Head                                 -4.789053e-01
# SCALER_Growth                                         -4.271325e-01
# SCALER_Credit_by_Score                                -4.757282e-01
# SCALER_Determinstic                                   -4.750959e-01
# SCALER_Growth1                                        -4.692119e-01
# SCALER_Feedback1                                      -3.616695e-01
# SCALER_Previous_Claims_MEDIAN_Premium_Amount           0.000000e+00
# SCALER_IsNull_Health_Score                             0.000000e+00
# SCALER_Previous_Claims_MEAN_Premium_Amount             0.000000e+00
# SCALER_Previous_Claims                                 0.000000e+00
# SCALER_Previous_Claims_STD_Premium_Amount              0.000000e+00
# SCALER_Previous_Claims_Q3_Premium_Amount               0.000000e+00
# SCALER_Previous_Claims_Q1_Premium_Amount              -1.000000e+00
# SCALER_IsNull_Customer_Feedback                        0.000000e+00
# SCALER_Previous_Claims_MAX_Premium_Amount              0.000000e+00
# SCALER_Feedback3                                       5.000000e-01
# SCALER_IsNull_Previous_Claims                          1.000000e+00
# SCALER_IsNull_Marital_Status                           0.000000e+00
# SCALER_Health_Score                                   -3.007246e-01
# SCALER_Health_Risk_Score                               3.007246e-01
# SCALER_Feedback2                                       2.723254e-01
# SCALER_CreditInsurance                                -7.626168e-01
# SCALER_Sin_Year                                        1.507418e+00
# SCALER_IsNull_Credit_Score                             0.000000e+00
# SCALER_Health_Age_Interaction                         -4.408912e-01
# SCALER_Total_Nulls                                     5.000000e-01
# SCALER_ENCODED_Policy_Start_Date_-_Year               -1.000000e+00
# SCALER_ENCODED_Policy_Start_Date_-_Quarter            -9.000000e-01
# SCALER_Feedback4                                       5.869922e-01
# SCALER_IsNull_Number_of_Dependents                     0.000000e+00
# SCALER_IsNull_Occupation                               1.000000e+00
# SCALER_Health_Conscious_Level1                         1.654915e-01
# SCALER_Sin_Month                                      -8.330074e-15
# SCALER_Policy_Start_Date_-_Month                       1.000000e+00
# SCALER_Health_Conscious_Level                          0.000000e+00
# SCALER_Health_Conscious_Level_Q1_Premium_Amount        2.338313e-02
# SCALER_Health_Conscious_Level_MEAN_Premium_Amount      0.000000e+00
# SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount   -1.073045e+00
# SCALER_Number_of_Dependents_MEAN_Premium_Amount       -1.250000e+00
# SCALER_Number_of_Dependents_MEDIAN_Premium_Amount     -1.666667e+00
# SCALER_Number_of_Dependents_Q1_Premium_Amount         -8.000000e-01
# SCALER_Number_of_Dependents_Q3_Premium_Amount         -4.597718e-01
# SCALER_Number_of_Dependents_STD_Premium_Amount         2.857143e-01
# SCALER_Health_Conscious_Level_Q3_Premium_Amount        4.589986e-01
# SCALER_Insurance_Duration_MEAN_Premium_Amount          1.500000e+00
# SCALER_Insurance_Duration_MEDIAN_Premium_Amount        3.000000e+00
# SCALER_Insurance_Duration_Q1_Premium_Amount            6.250000e-01
# SCALER_Insurance_Duration_Q3_Premium_Amount           -5.000000e-01
# SCALER_Health_Conscious_Level_MAX_Premium_Amount      -6.006211e-01
# SCALER_Credit_Health_Score                             8.571429e-01
# SCALER_Occupation_Q3_Premium_Amount                    3.804556e-01
# SCALER_Occupation_MEAN_Premium_Amount                 -6.000000e-01
# SCALER_Occupation_MAX_Premium_Amount                   6.666667e-01
# SCALER_Occupation_MEDIAN_Premium_Amount                1.000000e+00
# SCALER_Occupation_Q1_Premium_Amount                    0.000000e+00
# SCALER_Previous_Claims_MIN_Premium_Amount              3.333333e-01
# SCALER_Insurance_Duration_MAX_Premium_Amount           0.000000e+00
# SCALER_ENCODED_Occupation_Self-Employed               -5.217391e-01
# SCALER_Age                                            -7.818143e-01
# SCALER_Insurance_Duration_STD_Premium_Amount          -5.305991e-01

#
---
#

In [2]:
df = pd.read_csv("cleaned_df.csv")

In [3]:
df

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Unemployed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,556.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,Employed,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0,0,0,0,0,1,0,0,0,0,0,0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,50.0,Female,38782.0,Married,1.0,Bachelor's,Unemployed,14.498639,Rural,Premium,1.0,8.0,309.0,2.0,2021-07-09 15:21:39.184157,Average,Yes,Daily,Condo,,0,0,0,0,1,0,1,0,0,0,0
1999996,34.0,Female,73462.0,Single,0.0,Master's,Self-Employed,8.145748,Rural,Basic,2.0,0.0,417.0,2.0,2023-03-28 15:21:39.250151,Good,No,Daily,Apartment,,1,0,0,0,1,0,0,0,1,0,0
1999997,26.0,Female,35178.0,Single,0.0,Master's,Employed,6.636583,Urban,Comprehensive,1.0,10.0,666.0,6.0,2019-09-30 15:21:39.132191,Poor,No,Monthly,Apartment,,0,0,0,0,0,0,1,0,1,0,0
1999998,34.0,Female,45661.0,Single,3.0,Master's,Unemployed,15.937248,Urban,Premium,2.0,17.0,467.0,7.0,2022-05-09 15:21:39.253660,Average,No,Weekly,Condo,,0,0,0,0,1,0,0,0,0,0,0


In [4]:
df.isnull().sum()

Age                                 0
Gender                              0
Annual Income                       0
Marital Status                      0
Number of Dependents                0
Education Level                     0
Occupation                          0
Health Score                        0
Location                            0
Policy Type                         0
Previous Claims                     0
Vehicle Age                         0
Credit Score                        0
Insurance Duration                  0
Policy Start Date                   0
Customer Feedback                   0
Smoking Status                      0
Exercise Frequency                  0
Property Type                       0
Premium Amount                 800000
IsNull_Age                          0
IsNull_Annual Income                0
IsNull_Marital Status               0
IsNull_Number of Dependents         0
IsNull_Occupation                   0
IsNull_Health Score                 0
IsNull_Previ

In [6]:
X = df.drop(columns="Premium Amount")
Y = df["Premium Amount"]

In [10]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=800000, shuffle=False)

In [13]:
xtrain.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,0,0,0,0,0,0,0,0,0,0,0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Unemployed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,0,0,0,0,1,0,0,0,0,0,0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,556.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,0,0,0,0,0,0,0,0,1,0,0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,Employed,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,0,0,0,0,1,0,0,0,0,0,0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,0,0,0,0,0,0,0,0,0,0,0


In [14]:
ytest.head()

1200000   NaN
1200001   NaN
1200002   NaN
1200003   NaN
1200004   NaN
Name: Premium Amount, dtype: float64

#
# Pipeline
---
###

## Feature Engineering

In [18]:
def smoke(df):
    # Assume X is a DataFrame or numpy array
    smo = df["Smoking Status"].replace({"Yes" : 0, "No" : 1})  # Interaction of first two columns
    return np.c_[df, smo]    # Append interaction as a new column

smoke_transformer = FunctionTransformer(smoke, validate=False)

In [19]:
preprocessor = ColumnTransformer(
    transformers=[
        ('smoke', smoke_transformer),  # Columns 0 and 1 interact
    ]
)

In [23]:
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

In [24]:
pipeline.fit(xtrain, ytrain)

ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
Pipeline(
    "Feature Engineering",
    "Encoding"
    "Scaling"
    "Model"
)