# Let's look at a simple implementation of the linear regression algorithm from scratch
## We use the classic medical insurance dataset

In [1]:
import pandas as pd
import numpy as np

In [10]:
# Load the insurance dataset from a CSV path relative to the working directory
ldf_insurance = pd.read_csv("data/insurance.csv")

In [11]:
# Preview the loaded frame (long frames are truncated in the rendered output)
ldf_insurance

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


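Min-max normalization rescales each numeric feature to the range $[0, 1]$:

$$z_i = \frac{x_i - \min(x)}{\max(x) - \min(x)}$$

where:

- $z_i$: the $i$-th normalized value in the dataset
- $x_i$: the $i$-th value in the dataset
- $\min(x)$: the minimum value in the dataset
- $\max(x)$: the maximum value in the dataset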

In [14]:
def numeric_normalizer(age, pint_min_value, pint_max_value):
    """Min-max scale ``age`` using the given bounds.

    Computes ``(age - pint_min_value) / (pint_max_value - pint_min_value)``,
    which maps ``pint_min_value`` to 0 and ``pint_max_value`` to 1.

    Args:
        age: The value to scale. The name is historical: the function is
            generic and is also applied to the bmi and children columns.
        pint_min_value: Lower bound of the observed values.
        pint_max_value: Upper bound of the observed values.

    Returns:
        The scaled value. When both bounds are equal there is no spread to
        scale by, so a zero is returned instead of dividing by zero.
    """
    lnum_range = pint_max_value - pint_min_value
    if lnum_range == 0:
        # Constant feature: every value equals the minimum. Multiplying by 0.0
        # (rather than returning a literal) keeps the result a float and keeps
        # the shape of array-like input.
        return (age - pint_min_value) * 0.0
    return (age - pint_min_value) / lnum_range

In [33]:
def normalize_data(ldf_insurance):
    """Scale the numeric columns and integer-encode the categorical columns in place.

    ``age``, ``bmi`` and ``children`` are min-max scaled with
    ``numeric_normalizer``; ``sex``, ``smoker`` and ``region`` are replaced by
    their pandas category codes. All other columns are left untouched.

    Args:
        ldf_insurance: DataFrame containing the six columns named above. It is
            modified in place.

    Returns:
        None.
    """
    # Min-max scale each numeric column in one vectorized call.
    # Series.min()/max() skip NaN, whereas the builtin min()/max() give
    # order-dependent results as soon as a NaN is present in the column.
    for lstr_column in ("age", "bmi", "children"):
        lser_values = ldf_insurance[lstr_column]
        ldf_insurance[lstr_column] = numeric_normalizer(
            lser_values, lser_values.min(), lser_values.max())

    # Replace each categorical column by its integer category codes; codes
    # follow the sorted order of the labels (e.g. "no" -> 0, "yes" -> 1).
    # NOTE(review): region has no natural order, so integer codes impose one on
    # a linear model -- consider one-hot encoding if downstream code allows it.
    for lstr_column in ("sex", "smoker", "region"):
        ldf_insurance[lstr_column] = (
            ldf_insurance[lstr_column].astype("category").cat.codes)

In [35]:
# Now let's call normalize_data on our dataset. It rewrites the columns of
# ldf_insurance in place; re-run the read_csv cell to get the raw values back.
normalize_data(ldf_insurance)

In [41]:
# Let's check that normalize_data did what it was supposed to by showing the
# first 10 rows
ldf_insurance.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,0.021739,0,0.321227,0.0,1,3,16884.924
1,0.0,1,0.47915,0.2,0,2,1725.5523
2,0.217391,1,0.458434,0.6,0,2,4449.462
3,0.326087,1,0.181464,0.0,0,1,21984.47061
4,0.304348,1,0.347592,0.0,0,1,3866.8552
5,0.282609,0,0.263115,0.0,0,2,3756.6216
6,0.608696,0,0.470272,0.2,0,2,8240.5896
7,0.413043,0,0.316922,0.6,0,1,7281.5056
8,0.413043,1,0.37315,0.4,0,0,6406.4107
9,0.913043,0,0.265806,0.0,0,1,28923.13692
