# Predicting whether an order should be sent to a technical approver

## Part 1: Load and examine the data

In [1]:
# Location of the source CSV: S3 bucket, key prefix inside it, and file name.
# These three values are combined into an s3:// URI when the data is loaded.
data_bucket = "mlforbusiness"
subfolder = "ch02"
dataset = "orders_with_predicted_value.csv"

In [2]:
import warnings
from time import sleep

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
# Assemble the full S3 URI from the configured bucket, prefix and file name,
# then load the CSV into a DataFrame and preview the first rows.
source_uri = f's3://{data_bucket}/{subfolder}/{dataset}'
df = pd.read_csv(source_uri)
df.head()

Unnamed: 0,tech_approval_required,requester_id,role,product,quantity,price,total
0,0,E2300,tech,Desk,1,664,664
1,0,E2300,tech,Keyboard,9,649,5841
2,0,E2374,non-tech,Keyboard,1,821,821
3,1,E2374,non-tech,Desktop Computer,24,655,15720
4,0,E2327,non-tech,Desk,1,758,758


In [4]:
# Report the dataset size and the class balance of the target column.
print(f'Number of rows in dataset: {df.shape[0]}')
# Select the target by name (as every later cell does) rather than by
# position, so the counts stay correct if the column order ever changes.
print(df['tech_approval_required'].value_counts())

Number of rows in dataset: 1000
0    807
1    193
Name: tech_approval_required, dtype: int64


## Part 2: Get the data into the right shape

In [5]:
# One-hot encode the categorical columns (requester_id, role, product) so that
# every feature is numeric; numeric columns pass through unchanged.
encoded_data = pd.get_dummies(data=df)
encoded_data.head()

Unnamed: 0,tech_approval_required,quantity,price,total,requester_id_E2300,requester_id_E2301,requester_id_E2302,requester_id_E2303,requester_id_E2304,requester_id_E2306,...,requester_id_E2400,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,1,664,664,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,0,9,649,5841,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0,1,821,821,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,1,24,655,15720,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,0,1,758,758,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [6]:
# Absolute correlation of every encoded column with the target.
target_corrs = encoded_data.corr()['tech_approval_required'].abs()

# Keep only columns whose |correlation| exceeds 0.1. The target itself
# (correlation 1.0) passes the threshold, so it stays in `columns` and is
# carried through to the reduced frame used for training.
is_relevant = target_corrs > .1
columns = target_corrs.loc[is_relevant].index
corrs = target_corrs.loc[columns]
corrs

tech_approval_required      1.000000
role_non-tech               0.122454
role_tech                   0.122454
product_Chair               0.134168
product_Cleaning            0.191539
product_Desk                0.292137
product_Desktop Computer    0.752144
product_Keyboard            0.242224
product_Laptop Computer     0.516693
product_Mouse               0.190708
Name: tech_approval_required, dtype: float64

In [7]:
# Restrict the encoded frame to the target plus the columns that passed the
# correlation threshold.
encoded_data = encoded_data.loc[:, columns]
encoded_data.head()

Unnamed: 0,tech_approval_required,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,0,1,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,1,0,0
3,1,1,0,0,0,0,1,0,0,0
4,0,1,0,0,0,1,0,0,0,0


## Part 3: Create training and combined validation/test data sets

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
train_df, val_and_test_data = train_test_split(
    encoded_data,
    test_size=0.3,
    random_state=0,
)

In [9]:
# Sanity check: (rows, columns) of the training split.
train_df.shape

(700, 10)

In [10]:
# Sanity check: (rows, columns) of the held-out split.
val_and_test_data.shape

(300, 10)

## Part 4: Train the model

In [11]:
# Suppress only FutureWarning (presumably the library deprecation notices this
# cell was written to hide -- confirm) instead of every warning, so genuine
# problems such as convergence or chained-assignment warnings remain visible.
warnings.filterwarnings("ignore", category=FutureWarning)

# Logistic-regression classifier using the L-BFGS solver; all other
# hyperparameters are left at the library defaults.
classifier = LogisticRegression(solver="lbfgs")

In [12]:
# Feature matrix: every column of the training frame except the target.
input_to_classifier = train_df.drop(columns=["tech_approval_required"]).values
# Label vector: the target column as a 1-D array.
output = train_df["tech_approval_required"].values

In [13]:
# Sanity check: one row per training order, one column per feature (target excluded).
input_to_classifier.shape

(700, 9)

In [14]:
# Sanity check: one label per training row.
output.shape

(700,)

In [15]:
# Train the classifier on the training features and labels.
classifier.fit(X=input_to_classifier, y=output)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Part 5: Test the model

In [16]:
# Take the feature columns from train_df so the held-out matrix has exactly the
# columns (and order) the classifier was fitted on; this also keeps a
# previously added "prediction" column out of the inputs when the cell is re-run.
feature_columns = train_df.columns[train_df.columns != "tech_approval_required"]
test_input = val_and_test_data[feature_columns].values
predictions = classifier.predict(test_input)

# .assign returns a new frame rather than writing a column in place into the
# frame produced by train_test_split, which avoids pandas' chained-assignment
# (SettingWithCopy) hazard and keeps the cell idempotent.
val_and_test_data = val_and_test_data.assign(prediction=predictions)

In [17]:
# Preview the held-out rows alongside the model's prediction column.
val_and_test_data.head()

Unnamed: 0,tech_approval_required,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse,prediction
993,0,1,0,0,1,0,0,0,0,0,0
859,0,1,0,0,0,1,0,0,0,0,0
298,0,1,0,0,1,0,0,0,0,0,0
553,0,1,0,0,0,1,0,0,0,0,0
672,0,1,0,0,0,0,0,1,0,0,0


In [18]:
# Accuracy on the held-out rows: the fraction where the prediction equals the
# actual label.
is_correct = val_and_test_data['prediction'].eq(val_and_test_data['tech_approval_required'])
is_correct.mean()

1.0