<a href="https://colab.research.google.com/github/valeria-edulabs/ai-experts/blob/main/meeting17/Normalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and configurations

In [None]:
import datetime
from math import sqrt

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Plotly Express (high-level, simple to use)
import plotly.express as px
# Plotly Graph Objects (low-level, highly customizable)
import plotly.graph_objects as go

from scipy import stats



In [None]:
# Remote CSV locations for the insurance dataset. Only data_path_clean is read
# in this section of the notebook; the other two are not referenced here.
# NOTE(review): judging by the file names, "clean" and "clean_no_outliers" are
# preprocessed variants of the raw file - confirm against the data source.
data_path = "https://storage.googleapis.com/biosense-ml-data/insurance.csv"
data_path_clean = "https://storage.googleapis.com/biosense-ml-data/insurance_clean.csv"
data_path_clean_no_outliers = "https://storage.googleapis.com/biosense-ml-data/insurance_clean_no_outliers.csv"

In [None]:
# Set Plotly as Pandas plotting backend.
# This is a global pandas option, so it applies to every later .plot() call
# on a DataFrame or Series in this session.

pd.options.plotting.backend = "plotly"

# Read the data

In [None]:
# Explicit per-column dtypes, passed as `dtype=` to pd.read_csv below so that
# pandas does not infer them.
column_definitions = {
    'age': np.int8,  # 8-bit signed integer
    'sex': 'category',  # categorical; one-hot encoded for the baseline model
    'bmi': np.float32,
    'children': np.int8,  # 8-bit signed integer
    # NOTE(review): bool requires the CSV to store this column as parseable
    # booleans - confirm against the file.
    'smoker': bool,
    'region': 'category',  # categorical; one-hot encoded before modelling
    'charges': np.float32  # used as the regression target below
}

In [None]:
# Load the CSV at data_path_clean, applying the dtypes from column_definitions.
df = pd.read_csv(data_path_clean, dtype=column_definitions)

In [None]:
# Display the column labels of the loaded frame (quick schema check).
df.columns

# Train Linear Regression

In [None]:
# Regression target.
y = df['charges']

# Feature matrix: six predictor columns, with the categorical ones
# ('region', 'sex') one-hot encoded. drop_first=True drops the first dummy
# level of each encoded column.
X = pd.get_dummies(
    df[['age', 'sex', 'bmi', 'smoker', 'region', 'children']],
    columns=['region', 'sex'],
    drop_first=True,
)

In [None]:
# Hold out 30% of the rows, then split that hold-out in half into validation
# and test sets (70/15/15 overall). Both splits use random_state=47 so the
# partition is reproducible.
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.3, random_state=47)
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=47)

# Baseline: linear regression fitted on the unscaled training features.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# R^2 of the baseline model on the training split.
model.score(X_train, y_train)

In [None]:
# R^2 of the baseline model on the validation split.
model.score(X_val, y_val)

# Normalization

### min-max

In [None]:
# Regression target.
y = df['charges']

# Feature matrix for the min-max experiment, with 'region' one-hot encoded.
# drop_first=True drops the first dummy level of the encoded column.
# NOTE(review): this feature set omits 'sex' and 'children', which the
# baseline model in "Train Linear Regression" includes, so the scores are not
# directly comparable - confirm this is intentional.
X = pd.get_dummies(
    df[['age', 'bmi', 'smoker', 'region']],
    columns=['region'],
    drop_first=True,
)

In [None]:
# Same 70/15/15 train/validation/test partition as the baseline: 30% held out,
# then halved, with random_state=47 on both splits.
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.3, random_state=47)
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=47)

In [None]:
# Min-max scaler with default settings (rescales each feature to [0, 1]).
minmax_scaler = MinMaxScaler()

In [None]:
# Learn the per-feature min and max from the training split only, so the
# validation and test data do not influence the scaling.
minmax_scaler.fit(X_train)

In [None]:
# Preview the scaled training features; the result is displayed, not stored.
minmax_scaler.transform(X_train)

In [None]:
# Wrap the scaled values back into a DataFrame. Reuse X_train's own column
# labels and row index: without index=..., the frame would get a fresh
# 0..n-1 index and no longer be label-aligned with y_train, which keeps the
# original row labels from train_test_split.
X_train_scaled = pd.DataFrame(
    minmax_scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

In [None]:
# Refit a fresh linear regression on the min-max-scaled training features
# (this rebinds `model`, replacing the previously fitted estimator).
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [None]:
# R^2 on the min-max-scaled training split.
model.score(X_train_scaled, y_train)

In [None]:
# Scale the validation split with the scaler fitted on the training data
# (transform only, no refit). Reuse X_val's own column labels and row index so
# the frame stays label-aligned with y_val instead of getting a fresh
# 0..n-1 index.
X_val_scaled = pd.DataFrame(
    minmax_scaler.transform(X_val),
    columns=X_val.columns,
    index=X_val.index,
)

In [None]:
# R^2 on the min-max-scaled validation split.
model.score(X_val_scaled, y_val)

### Z-score

In [None]:
# Regression target.
y = df['charges']

# Feature matrix for the Z-score experiment, with 'region' one-hot encoded.
# drop_first=True drops the first dummy level of the encoded column.
# NOTE(review): this feature set omits 'sex' and 'children', which the
# baseline model in "Train Linear Regression" includes, so the scores are not
# directly comparable - confirm this is intentional.
X = pd.get_dummies(
    df[['age', 'bmi', 'smoker', 'region']],
    columns=['region'],
    drop_first=True,
)

In [None]:
# Same 70/15/15 train/validation/test partition as before: 30% held out, then
# halved, with random_state=47 on both splits.
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.3, random_state=47)
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=47)

In [None]:
# Z-score scaler with default settings (centres each feature on its mean and
# scales it to unit variance).
z_scaler = StandardScaler()

In [None]:
# Learn the per-feature mean and variance from the training split only, so
# the validation and test data do not influence the scaling.
z_scaler.fit(X_train)

In [None]:
# Inspect the per-feature means learned from the training split.
z_scaler.mean_

In [None]:
# Inspect the per-feature variances learned from the training split.
z_scaler.var_

In [None]:
# Wrap the standardized values back into a DataFrame. Reuse X_train's column
# labels and row index: without index=..., the frame would get a fresh
# 0..n-1 index and no longer be label-aligned with y_train, which keeps the
# original row labels from train_test_split.
X_train_scaled = pd.DataFrame(
    z_scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

In [None]:
# Refit a fresh linear regression on the standardized training features
# (this rebinds `model`, replacing the previously fitted estimator).
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [None]:
# R^2 on the standardized training split.
model.score(X_train_scaled, y_train)

In [None]:
# Standardize the validation split with the scaler fitted on the training
# data (transform only, no refit). Reuse X_val's column labels and row index
# so the frame stays label-aligned with y_val instead of getting a fresh
# 0..n-1 index.
X_val_scaled = pd.DataFrame(
    z_scaler.transform(X_val),
    columns=X_val.columns,
    index=X_val.index,
)

In [None]:
# R^2 on the standardized validation split.
model.score(X_val_scaled, y_val)