In [5]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/linear-regression-project-tutorial/main/medical_insurance_cost.csv")
total_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [8]:
total_data["sex_n"] = pd.factorize(total_data["sex"])[0]
total_data["smoker_n"] = pd.factorize(total_data["smoker"])[0]
total_data["region_n"] = pd.factorize(total_data["region"])[0]
num_variables = ["age", "bmi", "children", "sex_n", "smoker_n", "region_n", "charges"]
num_database = pd.DataFrame(total_data[num_variables], index = total_data.index, columns = num_variables)

X = num_database
y = num_database["charges"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train.to_csv("clean_train.csv", index = False)
X_test.to_csv("clean_test.csv", index = False)

train_data = pd.read_csv("clean_train.csv")
test_data = pd.read_csv("clean_test.csv")

scaler = MinMaxScaler()
scal_features_test = scaler.fit_transform(test_data[num_variables])
total_data_scal_test = pd.DataFrame(scal_features_test, index = test_data.index, columns = num_variables)
total_data_scal_test.head()


Unnamed: 0,age,bmi,children,sex_n,smoker_n,region_n,charges
0,0.586957,0.233748,0.4,0.0,1.0,1.0,0.127134
1,0.391304,0.369216,0.0,0.0,1.0,0.666667,0.066104
2,1.0,0.28156,0.0,0.0,0.0,0.666667,0.450191
3,0.608696,0.249685,0.6,1.0,1.0,0.666667,0.130436
4,0.021739,0.42234,0.0,1.0,0.0,0.666667,0.520743


In [9]:
scal_features_train = scaler.fit_transform(train_data[num_variables])
total_data_scal_train = pd.DataFrame(scal_features_train, index = train_data.index, columns = num_variables)
total_data_scal_train.head()

Unnamed: 0,age,bmi,children,sex_n,smoker_n,region_n,charges
0,0.608696,0.107345,0.4,0.0,1.0,0.666667,0.131313
1,0.630435,0.224913,0.0,0.0,1.0,1.0,0.12059
2,0.73913,0.23944,0.0,0.0,1.0,0.333333,0.422901
3,0.456522,0.493947,1.0,0.0,1.0,0.333333,0.121601
4,0.782609,0.148238,0.6,0.0,1.0,0.666667,0.184696


In [10]:
X_train = total_data_scal_train.drop(["charges"], axis = 1)
y_train = total_data_scal_train["charges"]
X_test = total_data_scal_test.drop(["charges"], axis = 1)
y_test = total_data_scal_test["charges"]

model = LinearRegression()
model.fit(X_train, y_train)

print(f"Intercept (a): {model.intercept_}")
print(f"Coefficients (b1, b2): {model.coef_}")

Intercept (a): 0.3213152613903446
Coefficients (b1, b2): [ 1.92360435e-01  2.03038802e-01  3.45765858e-02 -3.05696295e-04
 -3.84698775e-01  1.32396221e-02]


In [11]:
y_pred = model.predict(X_test)
y_pred

array([ 1.24053884e-01,  9.56794915e-02,  5.79669753e-01,  1.33667950e-01,
        4.19769192e-01,  1.58219283e-01, -1.80163075e-02,  2.57226098e-01,
       -5.62981897e-03,  1.62932035e-01,  4.34996735e-01,  1.33349077e-01,
        6.64682405e-02,  6.08220520e-01,  6.37900902e-01,  5.86128295e-01,
        2.31330787e-01,  5.66404747e-01,  1.28815348e-01,  4.91853033e-01,
        4.03813385e-02,  1.44296980e-01,  1.76149242e-02,  9.70979091e-02,
        1.64183576e-01,  1.90465060e-01,  2.16054330e-01,  8.09469753e-02,
        1.40507142e-01,  1.63703564e-02,  1.27153882e-01,  1.93743242e-01,
        5.60736271e-02,  3.41673090e-02,  5.15316714e-02,  1.92972430e-01,
        1.04913345e-02,  1.22359391e-01,  5.20763883e-01,  5.11981670e-01,
        4.22785612e-02,  5.12950756e-02,  2.10216571e-01,  1.65666778e-01,
        1.24775330e-01,  1.75336812e-01,  6.72779577e-02,  2.91272105e-02,
        5.59455854e-01,  1.31399179e-01,  2.39825213e-01,  1.89143643e-02,
        1.84039889e-01,  

In [12]:
print(f"Mean squared error: {mean_squared_error(y_test, y_pred)}")
print(f"Coefficient of determination: {r2_score(y_test, y_pred)}")

Mean squared error: 0.008584313423077833
Coefficient of determination: 0.7830468064321546
