In [None]:
import pandas as pd
import numpy as np

# Build a synthetic policy dataset
np.random.seed(42)  # fixed seed so every run draws the same values

n_policies = 1000
policy_types = ["Auto", "Home", "Life"]

# The three draws run in this order (Age, PolicyType, PremiumAmount); keep it,
# because reordering would change the values produced by the seeded generator.
data = {
    # integer ages 18..79 (randint's upper bound is exclusive)
    "Age": np.random.randint(18, 80, size=n_policies),
    # one policy type per row, drawn from the three options
    "PolicyType": np.random.choice(policy_types, size=n_policies),
    # premium amounts drawn uniformly from [100, 1000)
    "PremiumAmount": np.random.uniform(100, 1000, size=n_policies),
}

# Assemble the generated columns into a DataFrame
df = pd.DataFrame(data)

df.head()  # preview the first rows

In [17]:
import pandas as pd

# Load the insurance dataset; edit this relative path if the file lives elsewhere
insurance_csv_path = '../data/insurance_data.csv'
df = pd.read_csv(insurance_csv_path)
print(df.head())


   index  PatientID   age gender   bmi  bloodpressure diabetic  children  \
0      0          1  39.0   male  23.2             91      Yes         0   
1      1          2  24.0   male  30.1             87       No         0   
2      2          3   NaN   male  33.3             82      Yes         0   
3      3          4   NaN   male  33.7             80       No         0   
4      4          5   NaN   male  34.1            100       No         0   

  smoker     region    claim  
0     No  southeast  1121.87  
1     No  southeast  1131.51  
2     No  southeast  1135.94  
3     No  northwest  1136.40  
4     No  northwest  1137.01  


In [18]:
# Keep only the rows that have an 'age' value
has_age = df['age'].notna()
df = df[has_age]

# Sanity check: number of missing ages left (expected 0)
print(df['age'].isna().sum())


0


In [19]:
# One-hot encode the listed categorical fields; drop_first=True drops the first
# level of each field, leaving a single indicator column per field here
categorical_columns = ['gender', 'smoker', 'diabetic']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Show the first rows of the encoded frame
print(df_encoded.head())


   index  PatientID   age   bmi  bloodpressure  children     region    claim  \
0      0          1  39.0  23.2             91         0  southeast  1121.87   
1      1          2  24.0  30.1             87         0  southeast  1131.51   
7      7          8  19.0  41.1            100         0  northwest  1146.80   
8      8          9  20.0  43.0             86         0  northwest  1149.40   
9      9         10  30.0  53.1             97         0  northwest  1163.46   

   gender_male  smoker_Yes  diabetic_Yes  
0         True       False          True  
1         True       False         False  
7         True       False         False  
8         True       False         False  
9         True       False         False  


In [21]:
# Cast every boolean column of the encoded frame to 0/1 integers.
# NOTE(review): the original note called this step optional, but a frame that mixes
# bool and numeric columns is likely cast to object dtype when converted to a
# NumPy array, which statsmodels rejects -- treat this as required; confirm.
bool_columns = [name for name in df_encoded.columns if df_encoded[name].dtype == 'bool']
df_encoded = df_encoded.astype({name: int for name in bool_columns})

df_encoded.head()


Unnamed: 0,index,PatientID,age,bmi,bloodpressure,children,region,claim,gender_male,smoker_Yes,diabetic_Yes
0,0,1,39.0,23.2,91,0,southeast,1121.87,1,0,1
1,1,2,24.0,30.1,87,0,southeast,1131.51,1,0,0
7,7,8,19.0,41.1,100,0,northwest,1146.8,1,0,0
8,8,9,20.0,43.0,86,0,northwest,1149.4,1,0,0
9,9,10,30.0,53.1,97,0,northwest,1163.46,1,0,0


In [25]:
# Predictors for the claim model: the numeric fields plus the 0/1 dummy columns
feature_columns = [
    'age', 'bmi', 'bloodpressure', 'children',
    'gender_male', 'smoker_Yes', 'diabetic_Yes',
]
target_column = 'claim'

# Split the encoded frame into the feature matrix (X) and the target vector (y)
X = df_encoded[feature_columns]
y = df_encoded[target_column]

X.head()  # quick visual check that the expected columns were selected


Unnamed: 0,age,bmi,bloodpressure,children,gender_male,smoker_Yes,diabetic_Yes
0,39.0,23.2,91,0,1,0,1
1,24.0,30.1,87,0,1,0,0
7,19.0,41.1,100,0,1,0,0
8,20.0,43.0,86,0,1,0,0
9,30.0,53.1,97,0,1,0,0


In [26]:
# The notebook's `import statsmodels.api as sm` only appears in a later cell, so on a
# fresh kernel (Restart & Run All) `sm` would be undefined here. Import it in this
# cell, where it is first used.
import statsmodels.api as sm

# Add the intercept column ('const') to the feature matrix
X = sm.add_constant(X)
X.head()

Unnamed: 0,const,age,bmi,bloodpressure,children,gender_male,smoker_Yes,diabetic_Yes
0,1.0,39.0,23.2,91,0,1,0,1
1,1.0,24.0,30.1,87,0,1,0,0
7,1.0,19.0,41.1,100,0,1,0,0
8,1.0,20.0,43.0,86,0,1,0,0
9,1.0,30.0,53.1,97,0,1,0,0


In [27]:
# Define and fit the GLM for the claim amount: Gamma family with a log link.
# Use the capitalised `Log` link class -- the lowercase `log` alias is deprecated in
# recent statsmodels releases and emits a warning when instantiated.
gamma_log_family = sm.families.Gamma(link=sm.families.links.Log())
model = sm.GLM(y, X, family=gamma_log_family)
result = model.fit()

# Print the summary
print(result.summary())




                 Generalized Linear Model Regression Results                  
Dep. Variable:                  claim   No. Observations:                 1335
Model:                            GLM   Df Residuals:                     1327
Model Family:                   Gamma   Df Model:                            7
Link Function:                    log   Scale:                         0.39997
Method:                          IRLS   Log-Likelihood:                -13446.
Date:                Sat, 26 Apr 2025   Deviance:                       506.26
Time:                        18:48:54   Pearson chi2:                     531.
No. Iterations:                    15   Pseudo R-squ. (CS):             0.6361
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             7.1286      0.190     37.496

In [None]:
# Calculate claim probability based on Age and PremiumAmount.
# `df` was rebound to the insurance CSV by the cells above, so rebuild the synthetic
# policy frame from the `data` dict created in the first cell; otherwise df['Age'] and
# df['PremiumAmount'] raise KeyError when the notebook is run top to bottom.
df = pd.DataFrame(data)

# Older individuals and lower premiums increase the probability of a claim.
# The two weights sum to 1, so with non-negative inputs the result stays in [0, 1].
AGE_WEIGHT = 0.6
PREMIUM_WEIGHT = 0.4
age_share = df['Age'] / df['Age'].max()
premium_share = df['PremiumAmount'] / df['PremiumAmount'].max()
df['ClaimProbability'] = age_share * AGE_WEIGHT + (1 - premium_share) * PREMIUM_WEIGHT

# Simulate claim outcomes: one Bernoulli draw per row using that row's probability
df['ClaimOutcome'] = np.random.binomial(1, df['ClaimProbability'])

# ClaimProbability is intentionally left on the frame for inspection; it is the
# generating probability of ClaimOutcome and must not be used as a model feature.

df.head()

In [None]:
# One-hot encode PolicyType; drop_first=True drops the first level so only the
# PolicyType_Home and PolicyType_Life indicators remain.
# dtype=int makes the dummies 0/1 integers: recent pandas returns bool dummies by
# default, and a design matrix mixing bool with numeric columns is cast to object
# dtype, which statsmodels' GLM rejects when the model is built.
df_encoded = pd.get_dummies(df, columns=['PolicyType'], drop_first=True, dtype=int)

# Feature matrix and binary target for the claim-outcome model
X = df_encoded[['Age', 'PremiumAmount', 'PolicyType_Home', 'PolicyType_Life']]
y = df_encoded['ClaimOutcome']


In [None]:
import statsmodels.api as sm

# Add the intercept column to the feature matrix; both keyword values are the
# library defaults, spelled out for the reader
X = sm.add_constant(X, prepend=True, has_constant='skip')


In [None]:
# Build a GLM for the binary claim outcome using the Binomial family
# (with that family's default link), then fit it
binomial_family = sm.families.Binomial()
model = sm.GLM(endog=y, exog=X, family=binomial_family)
result = model.fit()

# Show the fitted model's summary table
print(result.summary())

In [None]:
# Score every row of the design matrix with the fitted model and store the
# predicted claim probabilities alongside the encoded data
predicted_prob = result.predict(X)
df_encoded['Predicted_Prob'] = predicted_prob


In [None]:
# Inspect the dtype of every design-matrix column
column_dtypes = X.dtypes
print(column_dtypes)


In [None]:
# Cast the two policy-type dummy columns of the design matrix to integers
for dummy_col in ('PolicyType_Home', 'PolicyType_Life'):
    X[dummy_col] = X[dummy_col].astype(int)


In [None]:
import matplotlib.pyplot as plt

# Scatter of the predicted claim probability against age, one point per row
fig, ax = plt.subplots()
ax.scatter(df_encoded['Age'], df_encoded['Predicted_Prob'], alpha=0.5)
ax.set_xlabel('Age')
ax.set_ylabel('Predicted Claim Probability')
ax.set_title('Predicted Claim Probability vs Age')
plt.show()
