## TEST X

In [73]:
# imports
import numpy as np
import pandas as pd
import joblib
from fastapi import FastAPI
from pydantic import BaseModel
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings("ignore")

In [74]:
# Read the preprocessed feature table from disk and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made relative or configurable.
X = pd.read_csv(
    filepath_or_buffer="/Users/student/Desktop/Tests/API/Data/preprocessed_X.csv"
)
X.head(n=5)

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,0,Citroën,140411,100,diesel,black,other,1,1,0,0,1,1,1
1,1,Citroën,13929,317,other,grey,other,1,1,0,0,0,1,1
2,2,Citroën,183297,120,diesel,white,other,0,0,0,0,1,0,1
3,3,Citroën,128035,135,diesel,other,other,1,1,0,0,1,1,1
4,4,Citroën,97097,160,diesel,other,other,1,1,0,0,0,1,1


In [75]:
# Drop the "Unnamed: 0" column (presumably a leftover index written by an
# earlier CSV export - confirm against the file that produced this CSV).
# Re-binding X instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError. `columns=` already implies the axis, so `axis=1` is not needed.
X = X.drop(columns="Unnamed: 0", errors="ignore")
X.head()


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,Citroën,140411,100,diesel,black,other,1,1,0,0,1,1,1
1,Citroën,13929,317,other,grey,other,1,1,0,0,0,1,1
2,Citroën,183297,120,diesel,white,other,0,0,0,0,1,0,1
3,Citroën,128035,135,diesel,other,other,1,1,0,0,1,1,1
4,Citroën,97097,160,diesel,other,other,1,1,0,0,0,1,1


In [76]:
# Inspect column dtypes and non-null counts of the loaded feature table.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4842 entries, 0 to 4841
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   model_key                  4842 non-null   object
 1   mileage                    4842 non-null   int64 
 2   engine_power               4842 non-null   int64 
 3   fuel                       4842 non-null   object
 4   paint_color                4842 non-null   object
 5   car_type                   4842 non-null   object
 6   private_parking_available  4842 non-null   int64 
 7   has_gps                    4842 non-null   int64 
 8   has_air_conditioning       4842 non-null   int64 
 9   automatic_car              4842 non-null   int64 
 10  has_getaround_connect      4842 non-null   int64 
 11  has_speed_regulator        4842 non-null   int64 
 12  winter_tires               4842 non-null   int64 
dtypes: int64(9), object(4)
memory usage: 491.9+ KB


In [77]:
# Cast each text column to pandas' categorical dtype, one column at a time.
for text_column in ("model_key", "fuel", "paint_color", "car_type"):
    X[text_column] = X[text_column].astype("category")

# Confirm the dtype change.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4842 entries, 0 to 4841
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   model_key                  4842 non-null   category
 1   mileage                    4842 non-null   int64   
 2   engine_power               4842 non-null   int64   
 3   fuel                       4842 non-null   category
 4   paint_color                4842 non-null   category
 5   car_type                   4842 non-null   category
 6   private_parking_available  4842 non-null   int64   
 7   has_gps                    4842 non-null   int64   
 8   has_air_conditioning       4842 non-null   int64   
 9   automatic_car              4842 non-null   int64   
 10  has_getaround_connect      4842 non-null   int64   
 11  has_speed_regulator        4842 non-null   int64   
 12  winter_tires               4842 non-null   int64   
dtypes: category(4), int64(9)
memory u

In [79]:
# preprocessing
# Split the columns by dtype. select_dtypes keeps the original column order and
# avoids comparing every dtype against a string by hand. "number" also picks up
# numeric columns that are not int64, which the previous int64-only check left
# out of both lists (ColumnTransformer then drops such columns by default).
cat_features = X.select_dtypes(include="category").columns.tolist()
num_features = X.select_dtypes(include="number").columns.tolist()

# Every numeric column, including the 0/1 flag columns, is standardised.
num_transformer = Pipeline(steps=[
    ("standardization", StandardScaler())
])

# Categorical columns are one-hot encoded; drop="first" removes the first level
# of each feature so the remaining indicator columns are not collinear.
# NOTE(review): with the default handle_unknown="error", a category that was
# not seen during fit raises at transform time - decide whether the API should
# tolerate unseen values.
cat_transformer = Pipeline(steps=[
    ("one hot encoding", OneHotEncoder(drop="first"))
])

# create preprocessor
# each entry is (name, transformer, columns the transformer is applied to)
preprocessor = ColumnTransformer(transformers=[
    ("numerical", num_transformer, num_features),
    ("categorical", cat_transformer, cat_features)
])

print(cat_features)
print(num_features)
display(preprocessor)

['model_key', 'fuel', 'paint_color', 'car_type']
['mileage', 'engine_power', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


In [81]:
# Fit the preprocessor on the full feature table and transform it in one step;
# the fitted `preprocessor` is reused further down for single-row inputs.
X_preprocessed = preprocessor.fit_transform(X)
# Peek at the first transformed row.
X_preprocessed[0]


array([-0.00965592, -0.74339496,  0.90532684,  0.51146318, -0.503096  ,
       -0.49761089,  1.08226635,  1.77256924,  0.27000094,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ])

In [89]:
# One hand-written car record used to exercise the preprocessing and the
# endpoint; keys follow the feature-table column order.
test_data = dict(
    model_key="Peugeot",
    mileage=20000,
    engine_power=135,
    fuel="diesel",
    paint_color="grey",
    car_type="estate",
    private_parking_available=1,
    has_gps=0,
    has_air_conditioning=1,
    automatic_car=1,
    has_getaround_connect=0,
    has_speed_regulator=0,
    winter_tires=1,
)

In [90]:
# Wrap the single test record in a one-row DataFrame (index 0), built the same
# way predict() builds its frame from a request.
# The stray chained `df =` assignment was removed: it only created an unused
# global named `df`.
test_data_df = pd.DataFrame(dict(test_data), index=[0])

test_data_df

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,Peugeot,20000,135,diesel,grey,estate,1,0,1,1,0,0,1


In [84]:
# Sanity check: run the already-fitted preprocessor on the one-row test frame.
preprocessor.transform(test_data_df)

array([[-2.01108183,  0.15481653,  0.90532684, -1.95517497,  1.98769219,
         2.00960234, -0.92398697, -0.56415286,  0.27000094,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ]])

In [85]:
# Keep the transformed test row in a variable and display it.
test_data_preprocessed = preprocessor.transform(test_data_df)
test_data_preprocessed

array([[-2.01108183,  0.15481653,  0.90532684, -1.95517497,  1.98769219,
         2.00960234, -0.92398697, -0.56415286,  0.27000094,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ]])

## API

In [86]:
# declare app
app = FastAPI()

# declare pred features
# Request-body schema for the /predict endpoint: one field per column of the
# feature table, declared in the same order.
# NOTE(review): on pydantic v2 a field whose name starts with "model_" emits a
# protected-namespace warning - confirm the installed pydantic version.
class PredictionFeatures(BaseModel):
    # Categorical features are sent as plain strings; numeric ones as integers.
    model_key: str
    mileage: int
    engine_power: int
    fuel: str
    paint_color: str
    car_type: str
    # Option flags: typed as int here; the sample rows only contain 0 or 1.
    private_parking_available: int
    has_gps: int
    has_air_conditioning: int
    automatic_car: int
    has_getaround_connect: int
    has_speed_regulator: int
    winter_tires: int

In [87]:
# create "predict" endpoint
_MODEL_PATH = "final_model_api"
_loaded_models = {}


def _get_model(path=_MODEL_PATH):
    """Return the joblib model stored at ``path``, reading the file only on first use."""
    if path not in _loaded_models:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; only point this at a trusted, locally produced artifact.
        _loaded_models[path] = joblib.load(path)
    return _loaded_models[path]


@app.post("/predict")
def predict(PredictionFeatures: PredictionFeatures):
    """Return the model's prediction for one car.

    Parameters
    ----------
    PredictionFeatures : PredictionFeatures
        Feature values of a single car. The argument is only passed to
        ``dict()``, so a plain dict with the same keys also works when the
        function is called directly.

    Returns
    -------
    dict
        ``{"prediction": value}`` where ``value`` is the first element of the
        model's prediction for the submitted row.
    """
    # The parameter name shadows the class inside this function, so copy the
    # values out immediately and use clearly named locals from here on.
    payload = dict(PredictionFeatures)

    # read features into a one-row DataFrame
    features = pd.DataFrame(payload, index=[0])

    # change objects to categories
    features = features.astype({
        "model_key": "category",
        "fuel": "category",
        "paint_color": "category",
        "car_type": "category",
    })

    # apply the preprocessor fitted earlier in the notebook
    features_preprocessed = preprocessor.transform(features)

    # load model (cached after the first request instead of re-read every call)
    model = _get_model()

    # make prediction
    prediction = model.predict(features_preprocessed)

    # return the single predicted value
    return {"prediction": prediction.tolist()[0]}

In [88]:
# Call the endpoint function directly (no HTTP request) with the raw dict;
# this works because predict() only passes its argument to dict().
predict(test_data)

{'prediction': 144.91}