In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [4]:
# Load the car-price dataset into a DataFrame and preview its first rows
csv_path = 'datasets/CarPrice_Assignment.csv'
auto_df = pd.read_csv(csv_path)
auto_df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


# Model Serialization: JSON, Pickle and Joblib

## Exploring and cleaning data

In [5]:
# Remove the car_ID, symboling and CarName columns before encoding/modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once the columns are already gone no longer
# raises a KeyError.
auto_df = auto_df.drop(columns=['car_ID', 'symboling', 'CarName'], errors='ignore')

In [6]:
## Encode the string (categorical) columns as numeric indicator columns
auto_df = pd.get_dummies(data=auto_df)

## Training model

In [7]:
# Target: the price column; features: every remaining column
Y = auto_df['price']
X = auto_df.drop(columns='price')

In [8]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
# fit() hands back the estimator itself, so creation and fitting are chained
model = LinearRegression().fit(x_train, y_train)

In [9]:
# Predict prices for the held-out rows
y_pred = model.predict(X=x_test)

In [10]:
# Put the actual and the predicted prices side by side, then show 10 random rows
df_y = pd.DataFrame(data={'y_test': y_test, 'y_pred': y_pred})
df_y.sample(n=10)

Unnamed: 0,y_test,y_pred
161,8358.0,6996.769125
171,11549.0,13154.217552
181,15750.0,19818.609405
52,6795.0,5940.460004
80,9959.0,9353.320217
156,6938.0,7682.139138
92,6849.0,6229.391652
22,6377.0,5837.60513
193,12290.0,10811.588613
18,5151.0,356.336506


In [11]:
# Score on the training split (via the estimator) and on the held-out
# split (via r2_score on the predictions made earlier)
train_score = model.score(x_train, y_train)
score = r2_score(y_test, y_pred)

print("Training score : ", train_score)
print("Testing score : ", score)

Training score :  0.9510014290429848
Testing score :  0.8705629244079386


## JSON Serialization

In [12]:
import json

In [13]:
## Show the fitted parameters: the coefficient array, then the intercept
coefficients, intercept = model.coef_, model.intercept_
print(coefficients, intercept)

[ 8.57478757e+01 -6.24044033e+01  6.45494829e+02  7.72176065e+01
  4.55627580e+00  9.30272943e+01 -2.26810032e+03 -3.09893913e+03
 -6.60230718e+02 -5.38721435e+00  1.69493040e+00 -1.00476142e+02
  1.24308571e+02  3.14783471e+03 -3.14783471e+03 -8.69918810e+02
  8.69918810e+02  7.53235430e+01 -7.53235430e+01  3.02463849e+03
 -1.08305973e+03 -6.90543343e+02 -5.78297446e+00 -1.24525244e+03
 -1.03215926e+03 -2.20097670e+02  1.25225693e+03 -5.15883946e+03
  5.15883946e+03  3.17867601e+02  3.41060513e-12 -1.83505030e+03
  2.41465424e+03  1.89245449e+03 -4.97319336e+03  2.18326733e+03
  5.18951329e+03 -2.80496125e+03 -4.37485131e+03 -6.82097949e+02
  0.00000000e+00  4.89129887e+02  2.18326733e+03  6.85903772e+02
  3.35795690e+02 -8.61682649e+02  3.14783471e+03 -1.72386248e+03
  6.55371147e+02 -1.80637197e+03 -4.32988210e+02] -30231.805508318423


In [15]:
## For a linear regression model, the coefficients and the intercept are the
## only values stored here.
# .tolist() turns NumPy values into plain built-in Python numbers (and nested
# lists for multi-dimensional arrays). list(model.coef_) would instead keep
# NumPy scalar objects, which json can only encode when they happen to be
# float64; using .tolist() for both entries keeps them consistent and safe.
model_param = {
    'coef': model.coef_.tolist(),
    'intercept': model.intercept_.tolist(),
}

In [16]:
# Render the parameter dict as indented JSON text and display it
json_txt = json.dumps(obj=model_param, indent=4)
json_txt

'{\n    "coef": [\n        85.74787568553332,\n        -62.40440326866277,\n        645.4948289878965,\n        77.21760650062174,\n        4.5562758028051675,\n        93.0272942644537,\n        -2268.100319176822,\n        -3098.939130831859,\n        -660.2307179333836,\n        -5.387214345570328,\n        1.694930399263285,\n        -100.47614179284096,\n        124.30857132656683,\n        3147.8347085674145,\n        -3147.8347085673627,\n        -869.9188100300985,\n        869.9188100301453,\n        75.32354298698277,\n        -75.32354298696515,\n        3024.6384893029876,\n        -1083.0597299150506,\n        -690.5433430511237,\n        -5.782974458157241,\n        -1245.2524418787084,\n        -1032.159255350555,\n        -220.09767018719242,\n        1252.2569255376775,\n        -5158.8394583782265,\n        5158.83945837823,\n        317.8676006931753,\n        3.410605131648481e-12,\n        -1835.0502980684082,\n        2414.6542357959675,\n        1892.454493068901

In [17]:
# Persist the JSON text to disk
with open('models/regerssor_param.txt', mode='w') as param_file:
    param_file.write(json_txt)

In [22]:
# Read the saved parameters back from disk
with open('models/regerssor_param.txt', mode='r') as param_file:
    json_text = json.load(param_file)

# Rebuild an estimator by attaching the stored values to a fresh, unfitted
# LinearRegression; the lists/numbers from JSON are converted back to arrays.
json_model = LinearRegression()
json_model.intercept_ = np.array(json_text['intercept'])
json_model.coef_ = np.array(json_text['coef'])

In [23]:
# Predict with the model rebuilt from JSON and score it on the held-out split
y_pred = json_model.predict(X=x_test)
r2_score(y_true=y_test, y_pred=y_pred)



0.8705629244079386

## PICKLE Serialization

In [24]:
import pickle

In [26]:
# Serialize the fitted estimator to disk. The with-block closes the file
# handle as soon as the dump finishes; the previous bare open() call left the
# handle open with no reference to close it.
with open('models/model.pkl', 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

In [27]:
## Read the pickled model back so it can be used
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source (here: the file written by this notebook).
# The with-block closes the file handle once loading is done.
with open('models/model.pkl', 'rb') as pkl_file:
    pickle_model = pickle.load(pkl_file)

In [29]:
# Predict with the unpickled model and score it on the held-out split
y_pred = pickle_model.predict(X=x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8705629244079386

## JOBLIB Serialization

In [31]:
import joblib

In [30]:
# Path used by the joblib dump and load cells below
filename = "models/model.joblib"

In [32]:
# Write the fitted estimator to `filename`; the returned list of written
# paths is displayed as the cell output
joblib.dump(value=model, filename=filename)

['models/model.joblib']

In [33]:
## Read the joblib file back so the model can be used
joblib_model = joblib.load(filename=filename)

In [34]:
# Predict with the model loaded via joblib and score it on the held-out split
y_pred = joblib_model.predict(X=x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8705629244079386