# Random Forest Regression

## Importing the libraries

In [24]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import redshift_connector
import keyring

## Importing the dataset

In [25]:
### Importing via Redshift
# Connection settings are defined once so the keyring lookup and the
# connection call cannot drift apart.
REDSHIFT_HOST = 'redshift-production.db.customink.com'
REDSHIFT_DATABASE = 'cink'
REDSHIFT_USER = 'dan.caley'
SQL_PATH = 'sql_code.sql'

# The password is fetched from the OS keyring instead of being written in the notebook.
pwd = keyring.get_password(REDSHIFT_HOST, REDSHIFT_USER)

# Reading SQL File
# The context manager closes the handle even if read() raises, and reading
# the query first means no connection is opened when the file is missing.
with open(SQL_PATH, 'r') as open_file:
    sql_file = open_file.read()

# Connecting to redshift
# https://docs.aws.amazon.com/redshift/latest/mgmt/python-connect-examples.html#python-connect-query
conn = redshift_connector.connect(
    host=REDSHIFT_HOST,
    database=REDSHIFT_DATABASE,
    user=REDSHIFT_USER,
    password=pwd
)

# Running Query from sql file
# The connection is released whether or not the query succeeds.
try:
    dataset = pd.read_sql_query(sql_file, conn)
finally:
    conn.close()

# Removing Binary headers
# Column names that arrive as bytes are decoded to text; names that are
# already text are kept as they are. (Stripping the literal "'b" from the
# stringified name cannot remove a b'...' wrapper.)
remove_binary = pd.Index(
    [col.decode() if isinstance(col, bytes) else str(col) for col in dataset.columns]
)
dataset.columns = remove_binary



In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
dataset.describe()

Unnamed: 0,days_deliverd,net_price,total_units,designs_prior_30,errors,zip_wealth,sales_bulk_following_365
count,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0
mean,11.172312,598.665785,48.496032,2.53646,0.134732,2407322000.0,1084.663598
std,6.040281,704.350811,92.846683,3.453797,0.341439,1872485000.0,2468.901342
min,1.0,0.2,0.0,0.0,0.0,0.0,1.53
25%,8.0,268.07,13.0,1.0,0.0,1025876000.0,264.78
50%,11.0,399.04,25.0,1.0,0.0,2038128000.0,493.0
75%,13.0,664.16,50.0,3.0,0.0,3338458000.0,1061.0
max,309.0,31079.0,5000.0,137.0,1.0,16080830000.0,90764.21


In [27]:
# Count the missing (NA) values in each column
dataset.isna().sum()

customer_order_id            0
date_placed                  0
days_deliverd                0
net_price                    0
total_units                  0
designs_prior_30             0
errors                       0
zip_wealth                   0
segment_name_uber            0
segment_name_ultra           0
style_uber_category          0
style_category_utlra         0
uber_sales_channel_attr      0
uber_sales_channel_placed    0
sales_bulk_following_365     0
dtype: int64

In [28]:
# Pick the modelling columns out of the queried frame.
# The last entry of `model_columns` is the target; every entry before it is a feature.
model_columns = [
    'days_deliverd',
    'net_price',
    'total_units',
    'designs_prior_30',
    'errors',
    'zip_wealth',
    'sales_bulk_following_365',
]
data = dataset[model_columns]

# Feature matrix (2-D) and target vector (1-D) as NumPy arrays
X = data[model_columns[:-1]].values
y = data[model_columns[-1]].values

## Splitting the dataset into the Training set and Test set

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Random Forest Regression model on the Training set

In [30]:
from sklearn.ensemble import RandomForestRegressor

# fit() returns the estimator itself, so construction and training chain into one expression.
# Ten trees with a fixed seed, trained on the training split only.
regressor = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_train, y_train)

# Echo the fitted estimator as the cell output
regressor

RandomForestRegressor(n_estimators=10, random_state=0)

## Predicting the Test set results

In [31]:
# Predict the target for the held-out rows
y_pred = regressor.predict(X_test)

# Show two decimals when printing arrays (this changes NumPy's global print settings)
np.set_printoptions(precision=2)

# Two-column view: predictions on the left, actual values on the right
pred_vs_actual = np.column_stack((y_pred, y_test))
print(pred_vs_actual)

[[ 954.67   37.58]
 [ 611.15  885.6 ]
 [ 739.92 2427.83]
 ...
 [ 420.44  432.52]
 [1148.91  263.04]
 [ 224.05 2279.18]]


## Evaluating the Model Performance

In [32]:
from sklearn.metrics import r2_score

# Coefficient of determination on the held-out split.
# A value near 0 means the model explains little of the target's variance.
r2_score(y_true=y_test, y_pred=y_pred)

0.023098762076678825