# All Regression Models

## Initial Setup

### Importing Libraries

In [42]:
import math
import redshift_connector
import pandas as pd
import keyring
import numpy as np
import matplotlib.pyplot as plt

### Importing Dataset

In [174]:
### Importing via CSV
### dataset = pd.read_csv('Data.csv')

### Importing via Redshift
pwd = keyring.get_password("redshift-production.db.customink.com", "dan.caley")

# Connecting to redshift
# https://docs.aws.amazon.com/redshift/latest/mgmt/python-connect-examples.html#python-connect-query
conn = redshift_connector.connect(
    host='redshift-production.db.customink.com',
    database='cink',
    user='dan.caley',
    password= pwd
)

# Reading SQL File
open_file = open('sql_code.sql','r')
sql_file = open_file.read()
open_file.close()

# Running Query from sql file
dataset = pd.read_sql_query(sql_file, conn)

# Removing Binary headers
remove_binary = dataset.columns.astype(str).str.replace("'b",'')
dataset.columns = remove_binary


In [175]:
dataset.describe()

Unnamed: 0,customer_account_id,days_deliverd,net_price,total_units,designs_prior_30,errors,median_household_income,population,segment_rank_ultra,uber_channel_attr_rank,purchase_days,palive,sales_90d,orders,sales_bulk_following_365
count,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0,70941.0
mean,27466780.0,11.172312,598.665785,48.496032,2.53646,0.134732,80466.111994,29655.57373,2.188269,1.0,19.273833,0.195373,1115.206883,0.801511,333.979086
std,5805846.0,6.040281,704.350811,92.846683,3.453797,0.341439,35566.018544,19316.695875,0.828699,0.0,26.094422,0.274609,9467.560621,1.704437,1896.062144
min,2082.0,1.0,0.2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,28133540.0,8.0,268.07,13.0,1.0,0.0,54653.0,15094.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,29358840.0,11.0,399.04,25.0,1.0,0.0,74187.0,27562.0,2.0,1.0,3.0,0.022,198.59,1.0,0.0
75%,30491920.0,13.0,664.16,50.0,3.0,0.0,100909.0,40770.0,3.0,1.0,33.0,0.322,646.17,1.0,0.0
max,41958860.0,309.0,31079.0,5000.0,137.0,1.0,250001.0,128294.0,3.0,1.0,90.0,1.0,369616.8,94.0,68180.05


In [177]:
# Check NA's
dataset.isna().sum()

customer_order_id            0
customer_account_id          0
date_placed                  0
days_deliverd                0
net_price                    0
total_units                  0
designs_prior_30             0
errors                       0
median_household_income      0
population                   0
segment_name_uber            0
segment_name_ultra           0
segment_rank_ultra           0
style_uber_category          0
style_category_utlra         0
uber_sales_channel_attr      0
uber_channel_attr_rank       0
uber_sales_channel_placed    0
purchase_days                0
palive                       0
sales_90d                    0
orders                       0
sales_bulk_following_365     0
dtype: int64

## Multiple Linear Regression

### Finding the Best Model

In [178]:
import statsmodels.api as sm

In [179]:
dataset['sales_bulk_following_365'] = dataset['sales_bulk_following_365']
dataset['net_price'] = dataset['net_price']
dataset['zip_wealth'] = dataset['population'] * dataset['median_household_income']

In [182]:
data = dataset[['total_units','net_price','designs_prior_30','errors','zip_wealth','sales_90d','palive','sales_bulk_following_365']]
#data = pd.get_dummies(data, columns=['segment_name_uber'], drop_first=True)

In [183]:
### Breaking the Dependant and Independant Variables
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [184]:
data.head()

Unnamed: 0,total_units,net_price,designs_prior_30,errors,zip_wealth,sales_90d,palive,sales_bulk_following_365
0,11.0,700.48,2,0,3053451940,700.48,0.711,597.26
1,34.0,494.06,1,0,323131066,494.06,0.589,0.0
2,6.0,209.52,5,0,699209720,209.52,0.022,0.0
3,20.0,532.8,7,0,114061272,532.8,0.389,2016.17
4,20.0,555.0,3,0,644308746,0.0,0.0,0.0


In [185]:
X_const = sm.add_constant(X) # adding a constant
model = sm.OLS(y, X).fit()
print(model.summary())

                                    OLS Regression Results                                   
Dep. Variable:     sales_bulk_following_365   R-squared (uncentered):                   0.260
Model:                                  OLS   Adj. R-squared (uncentered):              0.260
Method:                       Least Squares   F-statistic:                              3555.
Date:                      Sun, 06 Mar 2022   Prob (F-statistic):                        0.00
Time:                              21:24:26   Log-Likelihood:                     -6.2651e+05
No. Observations:                     70941   AIC:                                  1.253e+06
Df Residuals:                         70934   BIC:                                  1.253e+06
Df Model:                                 7                                                  
Covariance Type:                  nonrobust                                                  
                       coef    std err          t      P>|t|

  x = pd.concat(x[::order], 1)


In [186]:
ypred = model.predict(X)


In [187]:
from sklearn import metrics
import math

In [188]:

mse = metrics.mean_squared_error(y, ypred)
rmse = math.sqrt(mse)
print(rmse)

1656.4500128531063


In [189]:
rmse2 = np.exp(rmse)
rmse2

  rmse2 = np.exp(rmse)


inf

In [190]:
#dataset['prediction'] = np.exp(ypred)
#dataset['sales_bulk_following_365'] = np.exp(dataset['sales_bulk_following_365'])
#dataset['accuracy'] = np.where(abs(dataset['sales_bulk_following_365'] - dataset['prediction'])<=rmse2,1,0)


In [117]:
#dataset['variance'] = abs(dataset['sales_bulk_following_365'] - dataset['prediction']) / dataset['sales_bulk_following_365']

In [118]:
#dataset

### Dealing with Dummy Variables

In [125]:
data = dataset[['days_deliverd','net_price','total_units','designs_prior_30','errors','zip_wealth','segment_name_ultra','palive','sales_bulk_following_365']]

In [126]:
data = pd.get_dummies(data, columns=['segment_name_ultra'], drop_first=True)
data = data[['days_deliverd','net_price','total_units','designs_prior_30','errors','segment_name_ultra_Family, Org, & Athletics',
             'segment_name_ultra_Students & Schools','palive','sales_bulk_following_365']]
data

Unnamed: 0,days_deliverd,net_price,total_units,designs_prior_30,errors,"segment_name_ultra_Family, Org, & Athletics",segment_name_ultra_Students & Schools,palive,sales_bulk_following_365
0,9,151.26,6.0,1,0,0,0,0.0000,0.0
1,4,516.45,25.0,2,0,0,1,0.1333,0.0
2,10,41.99,1.0,3,0,0,1,0.0000,0.0
3,5,561.66,40.0,1,0,0,0,0.0000,0.0
4,15,439.00,50.0,2,0,0,0,0.0000,0.0
...,...,...,...,...,...,...,...,...,...
70936,14,248.00,100.0,1,0,0,0,0.8000,0.0
70937,13,324.22,18.0,2,0,0,1,0.4666,0.0
70938,12,243.36,6.0,1,0,0,0,0.0000,0.0
70939,3,279.74,7.0,1,1,1,0,0.0000,0.0


In [127]:
### Breaking the Dependant and Independant Variables
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
data.head()

Unnamed: 0,days_deliverd,net_price,total_units,designs_prior_30,errors,"segment_name_ultra_Family, Org, & Athletics",segment_name_ultra_Students & Schools,palive,sales_bulk_following_365
0,9,151.26,6.0,1,0,0,0,0.0,0.0
1,4,516.45,25.0,2,0,0,1,0.1333,0.0
2,10,41.99,1.0,3,0,0,1,0.0,0.0
3,5,561.66,40.0,1,0,0,0,0.0,0.0
4,15,439.0,50.0,2,0,0,0,0.0,0.0


In [128]:
X_const = sm.add_constant(X) # adding a constant
model = sm.OLS(y, X).fit()
print(model.summary())

                                    OLS Regression Results                                   
Dep. Variable:     sales_bulk_following_365   R-squared (uncentered):                   0.066
Model:                                  OLS   Adj. R-squared (uncentered):              0.066
Method:                       Least Squares   F-statistic:                              627.9
Date:                      Sun, 06 Mar 2022   Prob (F-statistic):                        0.00
Time:                              19:34:00   Log-Likelihood:                     -6.1232e+05
No. Observations:                     70941   AIC:                                  1.225e+06
Df Residuals:                         70933   BIC:                                  1.225e+06
Df Model:                                 8                                                  
Covariance Type:                  nonrobust                                                  
                                                  coef    st

  x = pd.concat(x[::order], 1)


### Linear Regression Sckit Learn

### Splitting the dataset into the Training set and Test set

In [24]:
### Adding Dummy Variables
#data = dataset[['days_deliverd','net_price','total_units','designs_prior_30','errors','segment_name_ultra','sales_bulk_following_365']]
#data = pd.get_dummies(data, columns=['segment_name_ultra'], drop_first=True)
#data = data[['days_deliverd','net_price','total_units','designs_prior_30','errors','segment_name_ultra_Family, Org, & Athletics',
#             'segment_name_ultra_Students & Schools','sales_bulk_following_365']]
#X = data.iloc[:, :-1]
#y = data.iloc[:, -1]

In [37]:
data = dataset[['days_deliverd','net_price','designs_prior_30','errors','zip_wealth','sales_bulk_following_365']]
#data['sales_bulk_following_365'] = np.log(data['sales_bulk_following_365'])
#data['net_price'] = np.log(data['net_price'])
### Breaking the Dependant and Independant Variables
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Training the Multiple Linear Regression model on the Training set

In [39]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

### Predicting the Test set results

In [40]:
y_pred = regressor.predict(X_test)
#np.set_printoptions(precision=2)

### Evaluating the Model Performance

In [41]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.07857580513623186