## **Exercise 18.03**
###  Adding Data Processing Steps into a Web API

### Importing modules

In [1]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

### Loading data

In [10]:
# Read the glass CSV from the DSBook GitHub repository into a DataFrame.
csv_url = 'https://raw.githubusercontent.com/fenago/DSBook/main/Chapter%204/glass.csv'
data = pd.read_csv(csv_url)

In [None]:
# Preview the first five rows of the loaded data.
data.head(n=5)

In [12]:
# Replace every missing value in the table with 0.
# NOTE(review): because NaNs are zero-filled here, the later mean
# imputation of X_train finds nothing left to fill - confirm this is intended.
data = data.fillna(value=0)

### Extract the 'Type' response variable using the .pop() method:

In [13]:

# Remove the 'Type' column from the feature table and keep it as the target.
y = data.pop(item='Type')

### Dropping the RI column

In [14]:
# Discard the 'RI' feature; the remaining columns are used as model inputs.
data = data.drop(columns=['RI'])

In [None]:
# Preview the first five rows of the feature table as it stands now.
data.head(n=5)

### Create a variable called 'training_rows' that will contain the number of rows that correspond to 70% of the records:

In [17]:
# Number of records that make up 70% of the data, truncated to an integer.
total_rows = len(data)
training_rows = int(total_rows * 0.7)
training_rows

149

### Instantiate a RandomForestClassifier with random_state=1 and save it into a new variable called rf_model:

In [None]:

# Random forest classifier with a fixed random_state for reproducibility.
seed = 1
rf_model = RandomForestClassifier(random_state=seed)

### Split the data DataFrame and the y Series into training and test sets, with training_rows records in the training set:

In [19]:
# Shuffle before splitting: a purely positional cut puts only the first
# rows in the training set, so if the file is ordered (e.g. by 'Type' -
# TODO confirm) the model would never see the later classes.
# train_size=training_rows keeps the same training-set size as before and
# random_state=1 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data, y, train_size=training_rows, shuffle=True, random_state=1
)

### Calculate the number of missing values for each column by combining the .isna() and .sum() methods

In [None]:
# Count the NaN entries in each column of the training features.
X_train.isnull().sum()

### Extract the list of columns that are not of the object type and save the result in a variable called num_columns:

In [None]:
# Collect the names of all training columns whose dtype is not 'object'.
num_columns = []
for name, dtype in X_train.dtypes.items():
  if dtype != 'object':
    num_columns.append(name)
num_columns

### Create an empty dictionary called column_mean, iterate through the num_columns list, and for each column, add the column name and its average value to this dictionary and display its content:

In [None]:
# Map each numeric column name to its mean over the training set.
column_mean = {name: X_train[name].mean() for name in num_columns}
column_mean


In [23]:
import pickle

# Persist the training means to columns_mean.pkl so they can be reloaded later.
# The context manager closes the file (flushing the pickle to disk) even
# if dump() raises, instead of leaving the handle open.
with open("columns_mean.pkl", "wb") as means_file:
  pickle.dump(column_mean, means_file)

In [None]:
# Fill each numeric column's NaNs with that column's training mean.
# fillna is called on the whole frame and the result reassigned: calling
# X_train[col].fillna(..., inplace=True) operates on a temporary column
# object, which pandas warns about and which does not update X_train
# under copy-on-write.
X_train = X_train.fillna(value=column_mean)

### Fit a RandomForestClassifier on the training set and save it into a file called model.pkl with joblib (column_mean was saved into columns_mean.pkl with the pickle package above)

In [None]:
# Train a fresh random forest (random_state=1) on the training set, then
# persist the fitted estimator to model.pkl.
rf_model = RandomForestClassifier(random_state=1).fit(X_train, y_train)
joblib.dump(value=rf_model, filename="model.pkl")


### Import the socket, threading, requests, json, and numpy packages and the Flask class, as well as the jsonify and request functions from the flask package:

In [26]:
import socket
import threading
import requests
import json
from flask import Flask, jsonify, request
import numpy as np

### Create a new Flask app and save it into a variable called app

In [27]:

# Flask application object on which the API route is registered.
app = Flask(import_name=__name__)

In [28]:
# Load the artifacts written during training: the fitted model and the
# per-column training means used for imputation.
trained_model = joblib.load("model.pkl")
# NOTE: pickle.load can execute arbitrary code - only load files produced
# by this notebook. The context manager closes the file handle after
# loading instead of leaving it open.
with open("columns_mean.pkl", "rb") as means_file:
  var_means = pickle.load(means_file)

### Create an API endpoint for the 'api' path that accepts only POST requests and will call a function called predict().

In [29]:
@app.route('/api', methods=['POST'])
def predict():
  """Impute missing features in the posted record and return a prediction.

  Reads the JSON body of the POST request, loads it into a one-row
  DataFrame, replaces missing values with the training means stored in
  ``var_means`` and returns the output of ``trained_model.predict`` as a
  JSON-encoded string.
  """
  payload = request.get_json()
  df_test = pd.DataFrame(payload, index=[0])
  # Fill every column in a single call and reassign the result.
  # df_test[col].fillna(..., inplace=True) acts on a temporary column
  # (no effect under pandas copy-on-write) and raises KeyError whenever
  # the payload does not carry a column named in var_means; fillna with
  # a dict simply skips keys that are not columns of the frame.
  df_test = df_test.fillna(value=var_means)
  prediction = trained_model.predict(df_test)
  # array2string turns the NumPy result into plain text that jsonify can encode.
  str_pred = np.array2string(prediction)
  return jsonify(str_pred)

### Create a new thread for running your Flask app using the threading.Thread method with the following parameters: target=app.run and kwargs={'host':'0.0.0.0','port':80}:

In [None]:
# Run the Flask server in a separate thread so later cells can still execute.
# NOTE(review): binding to port 80 usually needs elevated privileges - confirm
# the environment allows it.
server_options = {'host':'0.0.0.0','port':80}
flask_thread = threading.Thread(target=app.run, kwargs=server_options)
flask_thread.start()

### Take the first record of X_test and prepare it to be sent to the API in JSON format

In [None]:
# Keep the column names together with the values: the API builds a
# DataFrame from the payload and fills missing values by column name,
# which a bare list of numbers cannot support.
record = X_test.iloc[0].to_dict()
record

In [36]:
# HTTP headers declaring a JSON body, plus this machine's own IP address
# (resolved from its hostname) to use as the API host.
headers = {
  'content-type': 'application/json',
  'Accept-Charset': 'UTF-8',
}
local_hostname = socket.gethostname()
ip_address = socket.gethostbyname(local_hostname)


In [39]:
# Wrap the record in a one-element list and serialise it as a JSON string.
payload = [record]
j_data = json.dumps(payload)

In [None]:
# POST the JSON payload to the /api endpoint and show the raw response body.
api_url = f"http://{ip_address}/api"
r = requests.post(api_url, data=j_data, headers=headers)
r.text