<a href="https://colab.research.google.com/github/valeria-edulabs/ai-experts/blob/main/meeting17/Linear-Regression-basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and configurations

In [None]:
import datetime
from math import sqrt

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#  (high-level, simple to use)
import plotly.express as px
# (low-level, highly customizable)
import plotly.graph_objects as go

from scipy import stats



In [None]:
data_path = "https://storage.googleapis.com/biosense-ml-data/insurance.csv"
data_path_clean = "https://storage.googleapis.com/biosense-ml-data/insurance_clean.csv"
data_path_clean_no_outliers = "https://storage.googleapis.com/biosense-ml-data/insurance_clean_no_outliers.csv"

In [None]:
# Set Plotly as Pandas plotting backend

pd.options.plotting.backend = "plotly"

# Read the data

In [None]:
# Explicit per-column dtypes handed to read_csv below: compact numeric types
# for the measurements, pandas categoricals for the string-valued columns,
# and a plain bool for the smoker flag.
column_definitions = dict(
    age=np.int8,
    sex='category',
    bmi=np.float32,
    children=np.int8,
    smoker=bool,
    region='category',
    charges=np.float32,
)

In [None]:
df = pd.read_csv(data_path_clean, dtype=column_definitions)

In [None]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900000,0,True,southwest,16884.923828
1,18,male,33.770000,1,False,southeast,1725.552246
2,28,male,33.000000,3,False,southeast,4449.461914
3,33,male,22.705000,0,False,northwest,21984.470703
4,32,male,28.879999,0,False,northwest,3866.855225
...,...,...,...,...,...,...,...
1334,50,male,30.969999,3,False,northwest,10600.547852
1335,18,female,31.920000,0,False,northeast,2205.980713
1336,18,female,36.849998,0,False,southeast,1629.833496
1337,21,female,25.799999,0,False,southwest,2007.944946


# Simple Linear Regression

Predict charges based on age

In [None]:
df['charges'].plot(kind="hist")

In [None]:
px.scatter(df, x="age", y="charges", color='smoker')


# Prepare features and labels - X, y

In [None]:
X = df[['age']]

In [None]:
y = df['charges']

In [None]:
y

# Create and fit the model

In [None]:
model = LinearRegression()

In [None]:
# y = wx + b

In [None]:
# Fit the model to the training data
model.fit(X, y)

# Review the model

In [None]:
# w
model.coef_

array([257.19762721])

In [None]:
# b
model.intercept_

3193.6031406237908

In [None]:

import plotly.graph_objects as go
import numpy as np

def plot_age_vs_price_with_hypothesis(ages, prices, model):
    """
    Plot the observed values against age together with the model's fitted line.

    Args:
        ages: Table-like object with an "age" column (for example the feature
            DataFrame); only ages["age"] is read, for the x-axis of the scatter.
        prices: Observed target values, one per entry of ages["age"]; plotted
            on the y-axis of the scatter.
        model: Fitted estimator. Its predict() is called with a DataFrame that
            has a single "age" column and must return one value per row.

    Returns:
        None. The figure is rendered with fig.show().
    """
    # Observed data points.
    observed_points = go.Scatter(
        x=ages["age"],
        y=prices,
        mode='markers',
        name='Actual Data',
        marker=dict(size=8),
    )

    # Evaluate the model on an evenly spaced grid of ages from 1 to 100.
    # The grid is wrapped in a DataFrame so predict() receives a named "age" feature.
    age_range = np.linspace(1, 100, 100)
    predicted_prices = model.predict(pd.DataFrame({'age': age_range}))

    # The fitted (hypothesis) line.
    hypothesis_line = go.Scatter(
        x=age_range,
        y=predicted_prices,
        mode='lines',
        name='Hypothesis Function',
        line=dict(color='red'),
    )

    fig = go.Figure(data=[observed_points, hypothesis_line])

    # Layout settings
    fig.update_layout(
        title='Age vs. Price with Hypothesis Function',
        xaxis_title='Age',
        yaxis_title='Price',
        template="plotly_white",  # clean white background for better visibility
    )

    fig.show()

plot_age_vs_price_with_hypothesis(X, y, model)

# Predict

In [None]:
predict_df = pd.DataFrame({
    "age": [10, 32, 90]
})
model.predict(predict_df)

array([ 5765.57941272, 11423.92721133, 26341.38958947])

In [None]:
model.predict([[10], [32], [90]])


X does not have valid feature names, but LinearRegression was fitted with feature names



array([ 5765.57941272, 11423.92721133, 26341.38958947])

### Plot predictions vs actual charges

In [None]:
# Compare every observed charge with the model's prediction for the same row.
actual = y
predicted = model.predict(X)

actual_vs_predictions = pd.DataFrame({'Actual': actual, 'Predicted': predicted})


fig = go.Figure(data=go.Scatter(
    x=actual_vs_predictions['Actual'],
    y=actual_vs_predictions['Predicted'],
    mode='markers',
    marker=dict(size=8),
    name='Predictions',
    hovertemplate='Actual: %{x:.2f}<br>Predicted: %{y:.2f}<extra></extra>' # Custom tooltip
))

# Span the "perfect predictions" diagonal over the full range of the plotted
# values (both actual and predicted) instead of a hard-coded interval, so the
# reference line covers every point whatever the data or model.
min_val = float(actual_vs_predictions.min().min())
max_val = float(actual_vs_predictions.max().max())

fig.add_trace(go.Scatter(
    x=[min_val, max_val],
    y=[min_val, max_val],
    mode='lines',
    line=dict(color='red', dash='dash'),
    name='Perfect Predictions'
))

fig.update_layout(
    title='Actual vs. Predicted Values',
    xaxis_title='Actual Values',
    yaxis_title='Predicted Values',
    # Same scale on both axes so the diagonal is a true 45-degree line.
    xaxis=dict(scaleanchor='y', scaleratio=1),
    yaxis=dict(scaleanchor='x', scaleratio=1)
)

fig.show()

# Evaluate

In [None]:
metrics.r2_score(y, predicted)

0.08906737336608239

In [None]:
# 1000
# 900

In [None]:
# 900
# 1000

In [None]:
# | 1000 - 900 | = 100
# |900 -1000 | = 100

In [None]:
metrics.mean_squared_error(y, predicted)

133382986.0370281

In [None]:
metrics.root_mean_squared_error(y, predicted)

11549.155208803288

In [None]:
metrics.mean_absolute_error(y, predicted)

9056.799373090153

In [None]:
metrics.mean_absolute_percentage_error(y, predicted) * 100

115.26463474131718

# Dataset Partitioning: Train, Validation, and Test Sets


In [None]:
# Two-stage split: first hold out 30% of the rows, then cut that hold-out in
# half to obtain the validation and test sets (roughly 70% / 15% / 15%).
# The fixed random_state makes both shuffles reproducible across runs.
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.3, random_state=47)
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=47)

In [None]:
X_train.shape

(937, 1)

In [None]:
X_val.shape

(201, 1)

In [None]:
X_test.shape

(201, 1)

# Bias - variance tradeoff

### Train the same linear regression model - now on partitioned dataset

- fit the model using train set only
- display RMSE and r2 score for train set
- display RMSE and r2 score for validation set

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)


In [None]:
metrics.r2_score(y_train, model.predict(X_train))

0.09271455941086171

In [None]:
metrics.r2_score(y_val, model.predict(X_val))

0.08928198415206434

In [None]:
metrics.root_mean_squared_error(y_train, model.predict(X_train))

11435.209659056172

In [None]:
metrics.root_mean_squared_error(y_val, model.predict(X_val))

11903.51448303287

# Multivariate Linear Regression

In [None]:
px.scatter(df, x="bmi", y="charges", color="smoker")

In [None]:
X = df[['age', 'bmi']]

In [None]:
y = df['charges']

In [None]:
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.3, random_state=47)
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=47)

In [None]:
model = LinearRegression()

In [None]:
# Fit the model to the training data
model.fit(X_train, y_train)

In [None]:
# charges = w1 * age + w2 * bmi + b   (this model uses only the age and bmi features)
# weights [w1, w2]
model.coef_

array([243.52843, 324.99713], dtype=float32)

In [None]:
model.intercept_

-6211.1543

In [None]:
model.score(X_train, y_train)

0.11999863386154175

In [None]:
model.score(X_val, y_val)

0.09623134136199951

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       1339 non-null   int8    
 1   sex       1339 non-null   category
 2   bmi       1339 non-null   float32 
 3   children  1339 non-null   int8    
 4   smoker    1339 non-null   bool    
 5   region    1339 non-null   category
 6   charges   1339 non-null   float32 
dtypes: bool(1), category(2), float32(2), int8(2)
memory usage: 17.3 KB


# Handling Categorical Variables

### Adding smoker feature

In [None]:
X = df[['age', 'bmi', 'smoker']]
y = df['charges']

X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.3, random_state=47)
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=47)

model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
model.coef_

array([  261.22983,   301.26373, 23557.229  ], dtype=float32)

In [None]:
model.score(X_train, y_train)

0.7443608045578003

In [None]:
model.score(X_val, y_val)

0.7286289930343628

In [None]:
metrics.mean_absolute_percentage_error(y_val, model.predict(X_val)) * 100

49.506184458732605

### Adding region feature - using one-hot encoding

In [None]:
X = df[['age', 'bmi', 'smoker', 'region']]
y = df['charges']

# One-hot encode the categorical region column. drop_first=True omits the
# indicator for the first category, leaving region_northwest, region_southeast
# and region_southwest; a row with all three False belongs to the omitted
# category. Dropping one indicator avoids perfectly collinear dummy columns.
X = pd.get_dummies(X, columns=['region'], drop_first=True)

In [None]:
# pd.get_dummies(X, columns=['region'], drop_first=True)

Unnamed: 0,age,bmi,smoker,region_northwest,region_southeast,region_southwest
0,19,27.900000,True,False,False,True
1,18,33.770000,False,False,True,False
2,28,33.000000,False,False,True,False
3,33,22.705000,False,True,False,False
4,32,28.879999,False,True,False,False
...,...,...,...,...,...,...
1334,50,30.969999,False,True,False,False
1335,18,31.920000,False,False,False,False
1336,18,36.849998,False,False,True,False
1337,21,25.799999,False,False,False,True


In [None]:
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.3, random_state=47)
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=47)

model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
model.score(X_train, y_train)

0.7455049753189087

In [None]:
model.score(X_val, y_val)

0.7277483940124512

# Normalization

### min-max

In [None]:
X = df[['age', 'bmi', 'smoker', 'region']]
y = df['charges']

X = pd.get_dummies(X, columns=['region'], drop_first=True)

In [None]:
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.3, random_state=47)
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=47)

In [None]:
minmax_scaler = MinMaxScaler()

In [None]:
minmax_scaler.fit(X_train)

In [None]:
minmax_scaler.transform(X_train)

array([[0.5217391 , 0.33225715, 1.        , 1.        , 0.        ,
        0.        ],
       [0.63043475, 0.55609363, 1.        , 0.        , 1.        ,
        0.        ],
       [0.78260875, 0.3143664 , 0.        , 1.        , 0.        ,
        0.        ],
       ...,
       [0.39130434, 0.26741993, 0.        , 0.        , 0.        ,
        1.        ],
       [0.5869565 , 0.55205804, 1.        , 1.        , 0.        ,
        0.        ],
       [0.04347828, 0.39359695, 0.        , 0.        , 0.        ,
        0.        ]], dtype=float32)

In [None]:
X_train_scaled = pd.DataFrame(minmax_scaler.transform(X_train), columns=X.columns)

In [None]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [None]:
model.score(X_train_scaled, y_train)

0.7455049753189087

In [None]:
X_val_scaled = pd.DataFrame(minmax_scaler.transform(X_val), columns=X.columns)

In [None]:
model.score(X_val_scaled, y_val)

0.727748453617096

### Z-score

In [None]:
X = df[['age', 'bmi', 'smoker', 'region']]
y = df['charges']

X = pd.get_dummies(X, columns=['region'], drop_first=True)

In [None]:
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.3, random_state=47)
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=47)

In [None]:
z_scaler = StandardScaler()

In [None]:
z_scaler.fit(X_train)

In [None]:
z_scaler.mean_

array([39.36392743, 30.55909287,  0.20384205,  0.24226254,  0.26360726,
        0.25080043])

In [None]:
z_scaler.var_

array([1.97469478e+02, 3.77456021e+01, 1.62290468e-01, 1.83571402e-01,
       1.94118471e-01, 1.87899573e-01])

In [None]:
X_train_scaled = pd.DataFrame(z_scaler.transform(X_train),  columns=X_train.columns)

In [None]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [None]:
model.score(X_train_scaled, y_train)

0.7455049753189087

In [None]:
X_val_scaled = pd.DataFrame(z_scaler.transform(X_val),  columns=X_val.columns)

In [None]:
model.score(X_val_scaled, y_val)

0.727748453617096

# Linear Regression Assumptions

### No or little co-linearity

- check features co-linearity
- keep only the uncorrelated features and train the model
- check the results

### Linearity

- check whether there is a linear relationship between the features and the target variable


### Normality of errors -  Residuals

In [None]:
# The current model was fitted on z-scored features, so it has to be evaluated
# on the z-scored validation set; feeding it the raw X_val would produce
# predictions on the wrong scale and distort the residuals.
# Residuals are kept as (predicted - actual).
residuals = model.predict(X_val_scaled) - y_val
fig = px.histogram(
    residuals,
    nbins=100,
)
fig.show()

### Q-Q plot

In [None]:
residuals

In [None]:
def create_qq_plot(data, dist='norm'):
    """
    Draw a Q-Q plot of `data` against a theoretical distribution with Plotly.

    Both the plotted points and the fitted reference line come from
    scipy.stats.probplot.

    Args:
        data (array-like): The sample to compare against the distribution.
        dist (str): Name of the reference distribution (e.g., 'norm', 'uniform');
            passed to probplot and shown, capitalized, in the plot title.
    """
    # probplot returns ((theoretical quantiles, ordered sample values),
    #                   (slope, intercept, r)) of the least-squares fit.
    (quantiles, ordered_vals), (slope, intercept, _r) = stats.probplot(data, dist=dist)

    # Ordered sample values plotted against the theoretical quantiles.
    sample_points = go.Scatter(
        x=quantiles,
        y=ordered_vals,
        mode='markers',
        name='Data Quantiles',
    )

    # Reference line evaluated at the same theoretical quantiles.
    reference_line = go.Scatter(
        x=quantiles,
        y=slope * quantiles + intercept,
        mode='lines',
        name='Best Fit Line',
        line=dict(color='red'),
    )

    fig = go.Figure(data=[sample_points, reference_line])

    # Title and axis labels.
    fig.update_layout(
        title=f'Q-Q Plot ({dist.capitalize()} Distribution)',
        xaxis_title='Theoretical Quantiles',
        yaxis_title='Ordered Values',
    )

    fig.show()

In [None]:
create_qq_plot(residuals)

# Feature and model engineering

### Can we improve the model by splitting one model into multiple models?

Do this:
- Train 2 models - one for smokers, one for non-smokers
- Check performance of both models
- Create an inference function that gets the data, checks which model to run, and returns the relevant prediction
- What is the performance of your inference that consists of 2 models?
- Try removing outliers (in charges) from the data before training

Do not do this:
- Add normalization and check whether it improves the model performance

### Can we improve the model by engineering non-linear features?