In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import warnings

In [2]:
# https://stackoverflow.com/questions/46173419/seaborn-change-bar-colour-according-to-hue-name

In [5]:
df = pd.read_csv('data/insurance.csv')

FileNotFoundError: File b'data/insurance.csv' does not exist

In [None]:
df.head()

In [None]:
fig, ax = plt.subplots(figsize=(10,5))
sns.countplot(x = 'region', data = df, palette = "PiYG", orient = 'h', ax = ax, edgecolor = '1')
for i in ax.patches:
    ax.text(i.get_x()+0.3,i.get_height()+3,str(round((i.get_height()/df.region.shape[0])*100))+'%',\
           fontsize = 12)
ax.set_xlabel("Region", fontsize = 13)
ax.set_title("Region Distribution", fontsize = 15)
ax.tick_params(length=5,labelsize=12,labelcolor = 'black')
x_axis = ax.axes.get_yaxis().set_visible(False)
sns.despine(left= True)
plt.show()

In [None]:
print("minimum age:",df['age'].min())
print("maximum age:",df['age'].max())

In [None]:
# classify age into 3 groups
# Young adult(18-25), Adult(26-50), Senior(51-64)
# convert continious variable 'age' to categorical variable  

cut_points = [17,25,50,64]
labels = ['Young adult', 'Adult', 'Senior']
df['age_category'] = pd.cut(df["age"], cut_points, labels =  labels)
set(list(df['age_category']))

In [None]:
#Age distribution by categories

f, (ax,ax2) = plt.subplots(2,1,figsize = (8,10))
sns.countplot(x = 'age_category',data = df, palette = 'Pastel2',orient = 'v',ax = ax, edgecolor = '1')
for i in ax.patches:
    ax.text(i.get_x()+0.3,i.get_height()+3,\
           str(round((i.get_height()/df.age_category.shape[0])*100))+'%',fontsize =12)
ax.set_xlabel("Age Categories",fontsize =13)
ax.tick_params(length=5, labelsize = 12, labelcolor = 'black')
ax.set_title("Age Distribution by Categories",fontsize =15)

ax2.hist('age',bins = 10,data = df, edgecolor = '0.1')
ax2.set_xlabel("Age",fontsize =13)
ax2.tick_params(length=5, labelsize = 12, labelcolor = 'black')
ax2.set_title("Age Distribution",fontsize =15)
x_axis = ax.axes.get_yaxis().set_visible(False)
f.subplots_adjust(hspace = 0.5)
sns.despine(left=True)
plt.show()


In [None]:
def gender_dist_plot(x_axis,title):
    f,ax = plt.subplots(figsize=(10,5))
    sns.countplot(x=x_axis, data = df, ax = ax,palette=['dodgerblue','lightpink']
                  ,hue='sex', hue_order=['male','female'] )

    for i in ax.patches:
        ax.text(i.get_x()+0.1, i.get_height()+3,\
                str(round((i.get_height()/df.region.shape[0])*100))+'%')
    ax.set_title(title+ ' Distribution by Gender', fontsize = 15)
    ax.set_xlabel(title, fontsize =12)
    ax.tick_params(length=5, labelsize= 12, labelcolor = 'black')
    x_axis = ax.axes.get_yaxis().set_visible(False)
    ax.legend(loc=[1,0.8],fontsize = 12, title = "Gender Type",ncol=2)
    sns.despine(left = True)
    plt.show()

gender_dist_plot("age_category",'Age Category')

In [None]:
gender_dist_plot("region",'Region')

In [None]:
f, ax = plt.subplots(figsize=(10,5))
sns.countplot(x='region', data = df, ax = ax , hue = "smoker", palette=["C7", "C9"])
for i in ax.patches:
    ax.text(i.get_x()+0.1, i.get_height()+3,
               str(round((i.get_height()/df.region.shape[0])*100))+'%')
ax.set_xlabel("Region",fontsize=13)
ax.set_title("Regional Distribution of Smokers",fontsize =15)
ax.tick_params(length =5, labelsize=12)
xaxis = ax.axes.get_yaxis().set_visible(False)
sns.despine(left = True)
plt.show()

In [None]:
from scipy import stats
from scipy.stats import norm, skew, kurtosis

def data_transform(data,input):
    f,(ax1,ax2,ax3) = plt.subplots(1,3,figsize=(8,8))
    
    sns.boxplot(x =input, data= data, ax = ax1, orient='v')
    sns.distplot(data[input],ax = ax2, color = 'blue', hist = False)
    res = stats.probplot(data[input], plot = ax3)
    
    axes = [ax1,ax2]
    kwargs = {'fontsize':14,'color':'black'}
    ax1.set_title(input+' Boxplot Analysis',**kwargs)
    ax1.set_xlabel('Box',**kwargs)
    ax1.set_ylabel('BMI Values',**kwargs)

    ax2.set_title(input+' Distribution',**kwargs)
    ax2.set_xlabel(input+' values',**kwargs)

    ax3.set_title('Probability Plot',**kwargs)
    ax3.set_xlabel('Theoretical Quantiles',**kwargs)
    ax3.set_ylabel('Ordered Values',**kwargs)
    f.subplots_adjust(wspace=0.22,right= 2)
    sns.despine()
    
    return plt.show()

    

data_transform(df,'bmi')
    

# Categorize BMI value

In [None]:
cut_points = [14,19,25,30,65]
label_names = ['Underweight',"normal","overweight","obese"]
df["bmi_cat"] = pd.cut(df['bmi'],cut_points,labels=label_names)
gender_dist_plot('bmi_cat','BMI')

### Charges feature analysis

In [None]:
data_transform(df,'charges')

In [None]:
df.charges = np.log1p(df.charges)
data_transform(df,'charges')

### Scatter Plot Analysis

In [None]:
sns.lmplot(x = "bmi", y= "charges", hue = "smoker",data = df, 
           size = 6, aspect = 1.3,
          scatter_kws={"s": 50, "alpha": 1,'edgecolor':'black'}
          ,fit_reg=True)
plt.title('Scatterplot Analysis',fontsize=14)
plt.xlabel('BMI',fontsize=12)
plt.ylabel('Charge',fontsize=12)
plt.show()

In [None]:
plt.figure(figsize=(12,8))
kwargs = {'fontsize':12,'color':'black'}
sns.heatmap(df.corr(),annot=True,robust=True)
plt.title('Correlation Analysis on the Dataset',**kwargs)
plt.tick_params(length=3,labelsize=12,color='black')
plt.yticks(rotation=0)
plt.show()

From the above Heatmap there is a strong correlation between age 
and charges. But from the correlation plot it is observed that there is 
correlation between age and bmi for smokers

Dividing the dataset for smokers and non smokers

### Smokers Dataset Analysis

In [None]:
df.drop(['age_category','bmi_cat'],axis=1,inplace=True)
df_smoker = df[df.smoker=='yes']
df_smoker.head()

In [None]:
#converts categorical to nummerical values
df_smoker = pd.get_dummies(df_smoker,drop_first=True)
df_smoker.head()

In [None]:
plt.figure(figsize=(12,8))
kwargs = {'fontsize':12,'color':'black'}
sns.heatmap(df_smoker.corr(),annot=True,robust=True)
plt.title('Correlation Analysis for Smoker',**kwargs)
plt.tick_params(length=3,labelsize=12,color='black')
plt.yticks(rotation=0)
plt.show()

The heatmap shoes strong correlation between age, bmi and charges for smokers

In [None]:
df_smoker.drop(['children','sex_male', 'region_northwest',
       'region_southeast', 'region_southwest'],axis=1,inplace=True)




In [None]:
sns.lmplot(x = 'bmi',y='charges',hue=None,data=df_smoker,size=6,aspect=1.5,
           scatter_kws={"s": 70, "alpha": 1,'edgecolor':'black'},legend=False,fit_reg=True)
plt.title('Scatterplot Analysis',fontsize=14)
plt.xlabel('BMI',fontsize=12)
plt.ylabel('Charge',fontsize=12)
plt.show()

## Multivariate Linear Regression Analysis for smoker

In [None]:
from sklearn.metrics import explained_variance_score,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor

In [None]:
X = df_smoker.drop('charges',axis=1)
y = df_smoker['charges']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
#Standardizing the values
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
model = LinearRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print('Intercept: {:.4f} \ncte1: {:.4f}\ncte2: {:.4f}'
      .format(model.intercept_,model.coef_[0],
      model.coef_[1]))
print('Model_Accuracy_Score (R Square): {:.4f} \nLoss(RMSE): {:.4f}'
      .format(r2_score(y_pred,y_test),
    np.sqrt(mean_squared_error(y_pred,y_test))))

#### Linear Regression visulaization for smokers

In [None]:
def model_scatter_plot(model):
    title = str(model)
    title = title.split('.')[3]
    title = title.replace("'>",'')
    lreg = model()
    lreg.fit(X_train,y_train)
    y_pred = lreg.predict(X_test)
    #model_table
    model_table = pd.DataFrame(y_pred,y_test).reset_index()
    model_table.columns=['y_test','y_pred']
    #Model Graph
    sns.lmplot(x = 'y_test',y='y_pred',data = model_table,size=6,aspect=1.5,
           scatter_kws={"s": 70, "alpha": 1,'edgecolor':'black'},fit_reg=True)
    plt.title(title +' Analysis',fontsize=14)
    plt.xlabel('y_test',fontsize=12)
    plt.ylabel('y_pred',fontsize=12)
    #plt.scatter(y_test,y_pred)
    return plt.show()

model_scatter_plot(LinearRegression)

#### Use the model

In [None]:
# how much would a person pay for insurance if he 
# is smoker with a specific and bmi value

def use_model(age, bmi_value):
    c = [[age,bmi_value]]
    c = sc.transform(c)
    charge_value = model.coef_[0]*(c[0][0]) + model.coef_[1]*(c[0][1]) + model.intercept_
    charge_value = np.exp(charge_value)
    x = ('The Insurrance Charges for a {:.1f} years old person who is a Smoker with an bmi = {:.1f} will be {:.4f}'.format(age,bmi_value,charge_value))
    # we use the np.exp() because we transformed the value of charge during the charge EDA earlier above
    return print(x)
    
    

In [None]:
#if you are a smoker of 23 yr old and bmi of 32 then what 
# insurrance would you be charged?
use_model(24,40)



In [None]:
def robust_model(input):
    #Model type to evaluate
    model_list = [ExtraTreesRegressor(),RandomForestRegressor(),GradientBoostingRegressor(),
            LinearRegression()]
    r_score = []
    loss = []
    for reg in model_list:
        reg.fit(X_train,y_train)
        y_pred = reg.predict(X_test)
        r_score.append(explained_variance_score(y_pred,y_test))
        loss.append(np.sqrt(mean_squared_error(y_pred,y_test)))
    ## Model score table
    model_str = ['ExtraTrees','Random Forest','Gradient Boosting',
            'Linear Regression']
    other_model = pd.DataFrame(r_score,model_str).reset_index()
    other_model.columns = ['Model','R(Square)']
    other_model['loss'] = loss
    other_model.sort_values('R(Square)',ascending=False,inplace=True)
    ## Model Graph
    ax = other_model[['R(Square)','loss']].plot(kind='bar',width=0.7,
                            figsize=(15,7), color=['slategray', 'darkred'], fontsize=13,edgecolor='0.2')
    for i in ax.patches:
        # get_x pulls left or right; get_height pushes up or down
        ax.text(i.get_x()+.1, i.get_height()+0.01, \
                str(round((i.get_height()), 3)), fontsize=12, color='black',)
    ax.set_title('Regression Model Evaluation For '+input,fontsize=14,color='black')
    ax.set_xticklabels(other_model.Model, rotation=0, fontsize=12)
    ax.set_xlabel('Model',**kwargs)
    x_axis = ax.axes.get_yaxis().set_visible(False)
    sns.despine(left=True)
    return plt.show()

robust_model('Smoker')

In [None]:
# Let's visualize the test data vs. the predicted data
model_scatter_plot(ExtraTreesRegressor)

model_scatter_plot(GradientBoostingRegressor)

model_scatter_plot(RandomForestRegressor)

### Non_smoker Dataset analysis

In [None]:
df_non_smoker = df[df.smoker=='no']
sns.lmplot(x = 'bmi',y='charges',hue=None,data=df_non_smoker,height=6,aspect=1.5,
           scatter_kws={"s": 70, "alpha": 1,'edgecolor':'black'},legend=False,fit_reg=True)
plt.title('Scatterplot Analysis',fontsize=14)
plt.xlabel('BMI',fontsize=12)
plt.ylabel('Charge',fontsize=12)
plt.show()


In [None]:
# Convert all categorical columns in the dataset to Numerical for the Analysis
df_non_smoker['children'] = df_non_smoker['children'].astype('category')
df_non_smoker = pd.get_dummies(df_non_smoker,drop_first=True)
#correlation Analysis
plt.figure(figsize=(12,8))
kwargs = {'fontsize':12,'color':'black'}
sns.heatmap(df_non_smoker.corr(),annot=True,robust=True)
plt.title('Correlation Analysis for Smoker',**kwargs)
plt.tick_params(length=3,labelsize=12,color='black')
plt.yticks(rotation=0)
plt.show()


In [None]:
# Let plot the age vs. charge scatter plot to see the correlation between them
sns.lmplot(x = 'age',y='charges',data=df_non_smoker,height=6,aspect=1.5,
           scatter_kws={"s": 70, "alpha": 1,'edgecolor':'black'},legend=False,fit_reg=True)
plt.title('Scatterplot Analysis',fontsize=14)
plt.show()