<a href="https://colab.research.google.com/github/valeria-edulabs/ai-experts/blob/main/meeting17/2_Linear_Regression_Advanced_Health_insurance_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and configurations

In [None]:
import datetime
from math import sqrt

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#  (high-level, simple to use)
import plotly.express as px
# (low-level, highly customizable)
import plotly.graph_objects as go

from scipy import stats



In [None]:
# All dataset variants live in the same public bucket; only the file name differs.
DATA_BASE_URL = "https://storage.googleapis.com/biosense-ml-data"

data_path = f"{DATA_BASE_URL}/insurance.csv"  # raw data
data_path_clean = f"{DATA_BASE_URL}/insurance_clean.csv"  # cleaned data
data_path_clean_no_outliers = f"{DATA_BASE_URL}/insurance_clean_no_outliers.csv"  # cleaned, outliers removed

In [None]:
# Route every pandas `.plot(...)` call in this notebook through Plotly
# instead of the default plotting backend.

pd.set_option("plotting.backend", "plotly")

# Read the data

In [None]:
# Explicit dtypes for read_csv: compact numeric types for the numeric columns,
# pandas categoricals for the string columns, and a plain bool for the smoker flag.
column_definitions = dict(
    age=np.int8,
    sex='category',
    bmi=np.float32,
    children=np.int8,
    smoker=bool,
    region='category',
    charges=np.float32,
)

In [None]:
# Load the cleaned dataset, applying the dtypes declared in column_definitions.
df = pd.read_csv(data_path_clean, dtype=column_definitions)

In [None]:
# Show the column labels of the loaded frame.
df.columns

# Train Linear Regression

In [None]:
# Target: the insurance charges. Features: every other column used by the model.
feature_columns = ['age', 'sex', 'bmi', 'smoker', 'region', 'children']
categorical_columns = ['region', 'sex']

y = df['charges']

# One-hot encode the categorical features; drop_first=True removes one level
# per feature so the dummy columns are not redundant with the intercept.
X = pd.get_dummies(df[feature_columns], columns=categorical_columns, drop_first=True)

In [None]:
# 70 % of the rows go to training; the remaining 30 % are split in half into
# validation and test sets. Both splits use the same fixed seed.
split_seed = 47

X_train, X_test_val, y_train, y_test_val = train_test_split(
    X, y, test_size=0.3, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val, y_test_val, test_size=0.5, random_state=split_seed
)

# Fit ordinary least squares on the training split only.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# R^2 of the fitted model on the training split.
model.score(X_train, y_train)

In [None]:
# R^2 on the validation split (rows the model was not fitted on).
model.score(X_val, y_val)

# Correlation between features and target variable vs hypothesis

In [None]:
# One-row table of the learned coefficients, one column per training feature,
# so a single weight can be looked up as weights[<feature>][0].
coefficient_row = model.coef_.reshape(1, -1)
weights = pd.DataFrame(coefficient_row, columns=X_train.columns)

In [None]:
# Display the coefficient table.
weights

In [None]:
# Intercept of the fitted model; used as the constant term of the hypothesis lines below.
bias = model.intercept_

In [None]:
# Training targets, used as the y-values of the scatter plots below.
charges = y_train

In [None]:
# Age vs. charges on the training split, with the line bias + weight * age
# overlaid (only the age term of the model; no other feature contributes).
age = X_train['age']
weight = weights['age'][0]

# Evaluate the hypothesis line at the integer ages 1..70.
line_x = np.linspace(1, 70, 70)
line_y = bias + weight * line_x

actual_trace = go.Scatter(x=age, y=charges, mode='markers', name='Actual')
hypothesis_trace = go.Scatter(x=line_x, y=line_y, mode='lines', name='Hypothesis')

fig = go.Figure(data=[actual_trace, hypothesis_trace])
fig.update_layout(title=f"Age vs hypothesis | weight: {weight:.2f} | bias: {bias:.2f}")
fig.show()

In [None]:
# BMI vs. charges on the training split, with the line bias + weight * bmi
# overlaid (only the BMI term of the model; no other feature contributes).
bmi = X_train['bmi']
weight = weights['bmi'][0]

# Draw the hypothesis line across the observed BMI range.
bmi_low, bmi_high = bmi.min(), bmi.max()
line_x = np.linspace(bmi_low, bmi_high, 50)
line_y = bias + weight * line_x

actual_trace = go.Scatter(x=bmi, y=charges, mode='markers', name='Actual')
hypothesis_trace = go.Scatter(x=line_x, y=line_y, mode='lines', name='Hypothesis')

fig = go.Figure(data=[actual_trace, hypothesis_trace])
fig.update_layout(title=f"BMI vs hypothesis | weight: {weight} | bias: {bias}")
fig.show()

In [None]:
# Number of children vs. charges on the training split, with the line
# bias + weight * children overlaid (only the children term of the model).
children = X_train['children']
weight = weights['children'][0]

# Draw the hypothesis line over the observed range of the feature (as the BMI
# cell does). A fixed 0..50 range stretches the x-axis far beyond the data and
# squashes the scatter against the left edge.
line_x = np.linspace(children.min(), children.max(), 50)
line_y = bias + weight * line_x

fig = go.Figure()

# Scatter plot
fig.add_trace(go.Scatter(
    x=children,
    y=charges,
    mode='markers',
    name='Actual'
))

# Line plot
fig.add_trace(go.Scatter(
    x=line_x,
    y=line_y,
    mode='lines',
    name='Hypothesis'
))

fig.update_layout(title=f"Children vs hypothesis | weight: {weight} | bias: {bias}" )
fig.show()

In [None]:
# Smoker flag vs. charges on the training split, with the line
# bias + weight * smoker overlaid (only the smoker term of the model).
smoker = X_train['smoker']
weight = weights['smoker'][0]

# The smoker flag is plotted as 0/1 (see astype(int) below), so the hypothesis
# line is only meaningful on [0, 1]. Extending it to 10 extrapolates to values
# the feature cannot take and distorts both axes.
line_x = np.linspace(0, 1, 2)
line_y = bias + weight * line_x

fig = go.Figure()

# Scatter plot
fig.add_trace(go.Scatter(
    x=smoker.astype(int),
    y=charges,
    mode='markers',
    name='Actual'
))

# Line plot
fig.add_trace(go.Scatter(
    x=line_x,
    y=line_y,
    mode='lines',
    name='Hypothesis'
))

fig.update_layout(title=f"Smoker vs hypothesis | weight: {weight} | bias: {bias}" )
fig.show()

# Linear Regression Assumptions

## No or little co-linearity

- check the features for collinearity
- keep only features that are not correlated with each other and train the model again
- check the results

## Linearity

- check whether there is a linear relationship between each feature and the target variable


## Normality of errors -  Residuals

In [None]:
# Validation errors, computed as prediction - actual.
# NOTE(review): this is the opposite sign of the usual residual convention
# (actual - prediction); the histogram is mirrored relative to that convention.
val_predictions = model.predict(X_val)
residuals = val_predictions - y_val

# Distribution of the errors; roughly bell-shaped if they are close to normal.
fig = px.histogram(residuals, nbins=100)
fig.show()

### Q-Q plot

In [None]:
# Inspect the residual values computed above.
residuals

In [None]:
def _dist_label(dist):
    """Return a printable name for `dist` (a distribution name or a scipy distribution object)."""
    if isinstance(dist, str):
        return dist
    # scipy distribution objects such as stats.norm expose `.name`; frozen
    # distributions are expected to keep the underlying one on `.dist`.
    name = getattr(dist, 'name', None) or getattr(getattr(dist, 'dist', None), 'name', None)
    return name or type(dist).__name__


def create_qq_plot(data, dist='norm', sparams=()):
    """
    Creates a Q-Q plot using scipy.stats.probplot and displays it with Plotly.

    Args:
        data (array-like): The sample to compare against the distribution.
        dist (str or scipy.stats distribution): The distribution to compare
            against, given by name (e.g., 'norm', 'uniform') or as a
            distribution object (e.g., stats.norm).
        sparams (tuple): Shape/location/scale parameters forwarded to
            scipy.stats.probplot for distributions that need them
            (e.g., (5,) for 't'). Defaults to no extra parameters.

    Returns:
        None. The figure is displayed via fig.show().
    """
    # probplot returns ((theoretical quantiles, ordered sample values),
    # (slope, intercept, r)) of a least-squares line through those points.
    (quantiles, ordered_vals), (slope, intercept, _r) = stats.probplot(
        data, sparams=sparams, dist=dist
    )
    best_fit_line = slope * quantiles + intercept

    fig = go.Figure()

    # Ordered sample values against the theoretical quantiles
    fig.add_trace(go.Scatter(
        x=quantiles,
        y=ordered_vals,
        mode='markers',
        name='Data Quantiles'
    ))

    # Best-fit line: points lying close to it indicate agreement with `dist`
    fig.add_trace(go.Scatter(
        x=quantiles,
        y=best_fit_line,
        mode='lines',
        name='Best Fit Line',
        line=dict(color='red')
    ))

    # Build the title from a label that works for both names and distribution
    # objects (calling .capitalize() on an object would raise AttributeError).
    fig.update_layout(
        title=f'Q-Q Plot ({_dist_label(dist).capitalize()} Distribution)',
        xaxis_title='Theoretical Quantiles',
        yaxis_title='Ordered Values'
    )

    fig.show()

In [None]:
# Q-Q plot of the residuals against the normal distribution (the default `dist`).
create_qq_plot(residuals)

# Feature Selection

- Select only features that don't have strong correlation between them, train the model again and see whether results improve
- Select only features that have linear correlation to target variable

# Feature Engineering

## Applying transformation to target variable

In [None]:
# Histogram of the raw target values (drawn through the Plotly pandas backend).
df['charges'].plot(kind='hist', nbins=150)

In [None]:
# Natural-log transform of the target, then the histogram of the transformed
# values for comparison with the raw histogram above.
transformed_y = np.log(y)
transformed_y.plot(kind='hist', nbins=150)

In [None]:
# Train the model again, this time on the log-transformed target.
# Same split sizes and seed as the first split. This cell rebinds X_train,
# y_train, X_val, y_val and model, so from here on the targets are log(charges).
split_seed = 47

X_train, X_test_val, y_train, y_test_val = train_test_split(
    X, transformed_y, test_size=0.3, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val, y_test_val, test_size=0.5, random_state=split_seed
)

model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# R^2 on the training split for the model fitted on the log-transformed target.
model.score(X_train, y_train)

In [None]:
# R^2 on the validation split for the model fitted on the log-transformed target.
model.score(X_val, y_val)

In [None]:
# Validation errors of the retrained model, computed as prediction - actual.
# y_val comes from the split of transformed_y, so these errors are on the log scale.
val_predictions = model.predict(X_val)
residuals = val_predictions - y_val

fig = px.histogram(residuals, nbins=100)
fig.show()

In [None]:
# Q-Q plot of the residuals of the model trained on the log-transformed target.
create_qq_plot(residuals)

### Predictions - don't forget to perform inverse transformation on the predictions!!!

In [None]:
# Predictions for the first two validation rows; these are on the log scale
# because the model was fitted on the log-transformed target.
model.predict(X_val[:2])

In [None]:
# np.exp inverts the np.log applied to the target, bringing the predictions
# back to the original scale of the charges.
np.exp(model.predict(X_val[:2]))

In [None]:
# Box-Cox should be fitted on the raw charges. At this point y_train holds
# log(charges) (it was re-split from the log-transformed target above), so
# transforming it directly would stack Box-Cox on top of the log transform.
# Look the raw values up in y by the training-row index instead; lambda is
# still estimated from the training rows only.
y_train_raw = y.loc[y_train.index]
transformed_y, lambda_value = stats.boxcox(y_train_raw)

## Applying transformation to features

**❓Exercise: adding transformation to a feature**

- check whether there are features that might need to be transformed for improved performance (features that don't show a linear correlation with the target variable)
- try applying transformation to age and see whether model performance improves