In [19]:
import numpy as np
import pandas as pd

In [20]:
# load the raw insurance records and preview the first ten rows
df = pd.read_csv('insurance.csv')
df.head(n=10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [21]:
# Convert the two-level categorical columns into one 0/1 indicator column each.
catg_columns = ['sex', 'smoker']

for col in catg_columns:
  # dtype=int keeps the indicators numeric 0/1 on every pandas version
  # (pandas >= 2.0 returns boolean columns by default).
  dummies = pd.get_dummies(df[col], dtype=int)
  kept_level = dummies.columns[1]
  print(kept_level)
  # Drop the first level's indicator, replace the original text column with
  # what remains, and give the kept indicator the original column name.
  df = (
    pd.concat((df, dummies.iloc[:, 1:]), axis=1)
    .drop(columns=col)
    .rename(columns={kept_level: col})
  )

print(df.head())
# sex: 0 if female and 1 if male
# smoker: 0 if no and 1 if yes


male
yes
   age     bmi  children     region      charges  sex  smoker
0   19  27.900         0  southwest  16884.92400    0       1
1   18  33.770         1  southeast   1725.55230    1       0
2   28  33.000         3  southeast   4449.46200    1       0
3   33  22.705         0  northwest  21984.47061    1       0
4   32  28.880         0  northwest   3866.85520    1       0


In [22]:
# list the distinct region labels before encoding them
df.loc[:, 'region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [23]:
# region has four categories; replace each label with an integer code.
# NOTE(review): integer codes impose an arbitrary order on a nominal feature --
# one-hot encoding may suit a linear model better; confirm this is intended.
region_dict = {'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3}

# Series.map does the dictionary lookup without a Python-level loop.
# Popping the text column and assigning the codes afterwards appends the
# encoded column at the end of the frame.
region = df.pop('region').map(region_dict)
df['region'] = region

In [24]:
# preview the frame after encoding region
df.head(n=5)

Unnamed: 0,age,bmi,children,charges,sex,smoker,region
0,19,27.9,0,16884.924,0,1,0
1,18,33.77,1,1725.5523,1,0,1
2,28,33.0,3,4449.462,1,0,1
3,33,22.705,0,21984.47061,1,0,2
4,32,28.88,0,3866.8552,1,0,2


In [25]:
# restore the original column order (the encoded columns were appended at the end)
column_order = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
df = df.reindex(columns=column_order)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


In [26]:
# z-score normalization: centre every column on its mean and scale it by its
# standard deviation.
# NOTE(review): the mean/std are computed over all rows (before the train/test
# split) and over every column, including the last one used as y below --
# confirm this is intended.
df = df.sub(df.mean()).div(df.std())

# columns 0-5 form the feature matrix, column 6 is the target vector
x, y = df.values[:, :6], df.values[:, 6]

df


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,-1.438227,-1.010141,-0.453151,-0.908274,1.969850,-1.343402,0.298472
1,-1.509401,0.989221,0.509431,-0.078738,-0.507273,-0.438331,-0.953333
2,-0.797655,0.989221,0.383164,1.580335,-0.507273,-0.438331,-0.728402
3,-0.441782,0.989221,-1.305043,-0.908274,-0.507273,0.466741,0.719574
4,-0.512957,0.989221,-0.292447,-0.908274,-0.507273,0.466741,-0.776512
...,...,...,...,...,...,...,...
1333,0.768185,0.989221,0.050278,1.580335,-0.507273,0.466741,-0.220468
1334,-1.509401,-1.010141,0.206062,-0.908274,-0.507273,1.371813,-0.913661
1335,-1.509401,-1.010141,1.014499,-0.908274,-0.507273,-0.438331,-0.961237
1336,-1.295877,-1.010141,-0.797515,-0.908274,-0.507273,-1.343402,-0.930014


In [27]:
# 70:30 train/test split taken in row order: the first 70% of the rows are
# used for training, the remainder for testing
split_at = int(0.7 * len(x))
x_train, x_test = x[:split_at], x[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

In [28]:
#defining cost function
def cost_function(x, y, w, b):
  """Return the half mean-squared-error cost of the linear model ``x.dot(w) + b``.

  x: feature matrix, y: target values, w: weight vector, b: bias.
  The summed squared residuals are divided by ``2 * len(y)``.
  """
  residuals = x.dot(w) + b - y
  squared_error_total = np.sum(residuals ** 2)
  return squared_error_total / (2 * len(y))
  

In [29]:
#optimization by batch gradient descent
def optimize(x, y, w, b, alpha, iterations):
  """Fit weights and bias of a linear model with batch gradient descent.

  x: feature matrix, y: target values, w: initial weight vector,
  b: initial bias, alpha: learning rate, iterations: number of update steps.

  Every step uses all rows of ``x`` to compute the gradients. The cost after
  each update is recorded, and printed on every 200th step (starting with the
  first).

  Returns ``(w, b, cost_list)``: the final weights, the final bias and the
  list of per-step costs.
  """
  n_samples = len(y)
  history = []

  for step in range(iterations):
    # residual of the current model on every sample
    residual = x.dot(w) + b - y

    # gradient step; w is rebound to a new array, the caller's array is untouched
    w = w - alpha * (x.T.dot(residual) / n_samples)
    b = b - alpha * (np.sum(residual) / n_samples)

    # cost is evaluated with the freshly updated parameters
    current_cost = cost_function(x, y, w, b)
    history.append(current_cost)

    if step % 200 == 0:
      print(current_cost)

  return w, b, history



In [30]:
#predicting the y for given weights
def predict_y(x, w, b):
  """Return the linear-model predictions ``x.dot(w) + b`` for feature matrix x."""
  weighted_sum = x.dot(w)
  return weighted_sum + b


In [31]:
#calculating the R^2 score (coefficient of determination)
def r2(y_pred, y):
  """Return the coefficient of determination R^2 of predictions against targets.

  y_pred: predicted values, y: observed values.
  Computed as ``1 - RSS / TSS``, where RSS is the sum of squared prediction
  errors and TSS is the sum of squared deviations of ``y`` from its mean.
  """
  deviation_from_mean = y - y.mean()
  total_ss = np.sum(deviation_from_mean ** 2)
  residual_ss = np.sum((y_pred - y) ** 2)
  return 1 - (residual_ss / total_ss)


In [32]:
# train from zero weights and zero bias: learning rate 0.002, 2000 steps
w, b, cost_list = optimize(
  x_train,
  y_train,
  w=np.zeros(x.shape[1]),
  b=0,
  alpha=0.002,
  iterations=2000,
)

0.49191712388966163
0.2870741327446541
0.194299652419404
0.152159309698452
0.13296387015911715
0.12419587540689582
0.12018000744901539
0.1183357876521491
0.11748664768508009
0.11709467111475236


In [33]:
# predictions of the fitted model on the training and the test rows
y_pred_train, y_pred_test = (
  predict_y(features, w, b) for features in (x_train, x_test)
)


In [34]:
# R^2 on the training rows, then on the test rows (one value per line)
print(r2(y_pred_train, y_train), r2(y_pred_test, y_test), sep='\n')

0.7630060971412639
0.7217044502029757


In [35]:
!pip install sklearn



You should consider upgrading via the 'c:\python3.9\python.exe -m pip install --upgrade pip' command.


In [36]:
from sklearn.linear_model import LinearRegression

In [37]:
# scikit-learn linear regression with default settings, used as a reference model
reg = LinearRegression()

In [38]:
# fit the reference model on the same training rows
reg.fit(X=x_train, y=y_train)

LinearRegression()

In [39]:
# score of the reference model on the held-out test rows
print(reg.score(X=x_test, y=y_test))

0.7208798772569873
