In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
import pickle
# Suppress every warning for the rest of the session; note that this also hides deprecation notices.
warnings.filterwarnings('ignore')

In [None]:
# Dataset download link:
# https://drive.google.com/file/d/1_-kRegOMJmIMG6-XnDwYKnADdNounmKm/view?usp=sharing
# The path is relative, so the CSV is read from the notebook's working directory.
csv_path = 'insurance.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# (rows, columns) of the loaded dataset.
df.shape

In [None]:
# Summary statistics; with its default arguments describe() reports on the numeric columns.
df.describe()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Object-dtype columns need to be converted to categorical data

In [None]:
# Count of missing values in each column.
df.isna().sum()

### **EDA**

In [None]:
# Density-scaled histogram with a KDE overlay of the raw `charges` column.
sns.set(style='whitegrid')
f, ax = plt.subplots(1, 1, figsize=(12, 8))
# `sns.distplot` is deprecated in seaborn; `histplot` is its replacement.
# stat='density' keeps the y-axis on a density scale, and kde_kws={'cut': 3}
# lets the KDE curve extend past the data range, as distplot's KDE did.
ax = sns.histplot(df['charges'], kde=True, stat='density',
                  kde_kws={'cut': 3}, color='c', ax=ax)
ax.set_title('Distribution of Charges')

**Apply Natural Log**

Distribusi data terlalu miring (skewed) sehingga tidak mencerminkan distribusi normal. Oleh karena itu, terapkan logaritma natural (natural log) agar data mendekati distribusi normal atau memiliki varians yang konstan.

In [None]:
# Distribution of the log-transformed charges.
# np.log is the natural logarithm, matching the "Apply Natural Log" section
# heading; np.log10 would be the base-10 logarithm instead.
f, ax = plt.subplots(1, 1, figsize=(12, 8))
# `sns.distplot` is deprecated in seaborn; `histplot` with stat='density' and a
# KDE overlay is its replacement.
ax = sns.histplot(np.log(df['charges']), kde=True, stat='density',
                  kde_kws={'cut': 3}, color='r', ax=ax)
# The trailing semicolon suppresses the Text repr in the cell output.
ax.set_title('Distribution of Log-Transformed Charges');

**Check per Region**

In [None]:
# Univariate view: total charges per region, sorted from smallest to largest.
charges = df.groupby('region')['charges'].sum().sort_values()
# head() keeps at most the first five entries of the sorted totals.
lowest_totals = charges.head()
f, ax = plt.subplots(figsize=(8, 6))
ax = sns.barplot(x=lowest_totals, y=lowest_totals.index, palette='Blues', ax=ax)

plt.show()

**Charges Per Region Based on Sex**

In [None]:
# Multivariate analysis: charges per region, with one bar per value of `sex`.
f, ax = plt.subplots(figsize=(12, 8))
ax = sns.barplot(data=df, x='region', y='charges', hue='sex',
                 palette='cool', ax=ax)

**Charges per Region Based on Smoker**

In [None]:
# Charges per region, with one bar per value of `smoker`.
f, ax = plt.subplots(figsize=(12, 8))
ax = sns.barplot(data=df, x='region', y='charges', hue='smoker',
                 palette='Reds_r', ax=ax)

**Charges per Region Based on Number of Children**

In [None]:
# Charges per region, with one bar per value of `children`.
f, ax = plt.subplots(figsize=(12, 8))
ax = sns.barplot(data=df, x='region', y='charges', hue='children',
                 palette='Set1', ax=ax)

# **Analisis Medical Charges berdasarkan faktor 'Smoker' dengan prediktor lain**

In [None]:
# One regression plot of charges against each predictor, coloured by `smoker`.
# Each predictor is paired with its own palette.
for predictor, colours in (('age', 'Set1'), ('bmi', 'Set2'), ('children', 'Set3')):
    # lmplot creates its own figure, so no subplots() call is needed here.
    ax = sns.lmplot(data=df, x=predictor, y='charges', hue='smoker', palette=colours)

In [None]:
# Vertical violins of charges for each `children` value, split by the `smoker` column.
f, ax = plt.subplots(figsize=(10, 10))
ax = sns.violinplot(data=df, x='children', y='charges', hue='smoker',
                    orient='v', palette='inferno', ax=ax)

# **Data Preprocessing**

In [None]:
# Cast the three text columns to pandas' category dtype, then show the resulting dtypes.
categorical_cols = ['sex', 'smoker', 'region']
df[categorical_cols] = df[categorical_cols].astype('category')
df.dtypes

In [None]:
# Replace each category label with an integer code using LabelEncoder.
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
# The single encoder is re-fitted for every column, so after the loop it only
# holds the class mapping of the last column processed ('region').
for column in ('sex', 'smoker', 'region'):
    df[column] = label.fit_transform(df[column])
df

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
f, ax = plt.subplots(figsize=(10, 10))
correlations = df.corr()
ax = sns.heatmap(correlations, annot=True, cmap="cool", ax=ax)

# **Algoritma**

**Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split as holdout
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Predictors: every column except the target.
x = df.drop(columns=['charges'])
# Label: the `charges` column.
y = df['charges']
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = holdout(x, y, random_state=0, test_size=0.2)

**Linear Regression**

In [None]:
# Fit the linear model on the training split (fit() returns the estimator itself).
Lin_reg = LinearRegression().fit(x_train, y_train)
# Predictions for both splits; y_test_pred is read again by the evaluation cell.
y_train_pred = Lin_reg.predict(x_train)
y_test_pred = Lin_reg.predict(x_test)
# Print the intercept, the coefficients and the test-split score (R^2), one per line.
for fitted_value in (Lin_reg.intercept_, Lin_reg.coef_, Lin_reg.score(x_test, y_test)):
    print(fitted_value)

**Lasso Regression**

In [None]:
from sklearn.linear_model import Lasso

# Estimator settings, collected in one place.
lasso_params = dict(
    alpha=0.2, fit_intercept=True, precompute=False, max_iter=1000, tol=0.0001,
    warm_start=False, positive=False, random_state=None, selection='cyclic',
)
# NOTE: this rebinds the name `Lasso` from the imported class to the fitted
# estimator; the import at the top of this cell restores the class on re-run.
Lasso = Lasso(**lasso_params).fit(x_train, y_train)
# Print the intercept, the coefficients and the test-split score, one per line.
for fitted_value in (Lasso.intercept_, Lasso.coef_, Lasso.score(x_test, y_test)):
    print(fitted_value)

**Ridge Regression**

In [None]:
from sklearn.linear_model import Ridge

# NOTE: this rebinds the name `Ridge` from the imported class to the fitted
# estimator; the import at the top of this cell restores the class on re-run.
Ridge = Ridge(alpha=0.5).fit(x_train, y_train)
# Print the intercept, the coefficients and the test-split score, one per line.
for fitted_value in (Ridge.intercept_, Ridge.coef_, Ridge.score(x_test, y_test)):
    print(fitted_value)

# **Evaluating Model**

In [None]:
# Error metrics for the linear-regression predictions on the test split.
# y_test_pred comes from the cell that fits Lin_reg.
mae = metrics.mean_absolute_error(y_test, y_test_pred)
mse = metrics.mean_squared_error(y_test, y_test_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
# RMSE is the square root of the MSE computed above.
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Predict the charges for the held-out test rows with the linear model.
y_test_pred = Lin_reg.predict(x_test)
# Actual and predicted values side by side. The table gets its own name so the
# feature frame `df` is not overwritten; overwriting it would break re-running
# any earlier cell that reads columns such as 'charges' or 'region' from `df`.
prediction_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
prediction_comparison

# **Menyimpan Model ke Pickle**

# **Mengintegrasikan Model ke dalam Flask**

In [None]:
!pip install pyngrok

In [None]:
!pip install flask