In [295]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import Lasso
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

from IPython.display import display # Allows the use of display() for DataFrames

# Pretty display for notebooks
%matplotlib inline

In [296]:
# Load the data 
full_data = pd.read_csv("/Users/derekwang/Desktop/Python/Sales Forecast/training_data_FY18_2H_FY19_All.csv")

In [297]:
# Define outcome
outcomes_raw = full_data[['bookings_pct_finish']]

# Define features = removing outcome, dt, and fy_quarter label
features_raw = full_data.drop(['bookings_pct_finish'], axis = 1)

# display(features_raw.head())
print(features_raw.columns)
# features_raw.describe()

Index(['dt', 'fy_quarter', 'day_of_qtr', 'sales_div', 'quota', 'stage_1_count',
       'stage_1_amount', 'stage_2_count', 'stage_2_amount', 'stage_3_count',
       'stage_3_amount', 'stage_4_count', 'stage_4_amount', 'stage_5_count',
       'stage_5_amount', 'bookings_qtd', 'direct_bookings', 'qtr_pct',
       's1_quota_ratio', 's2_quota_ratio', 's3_quota_ratio', 's4_quota_ratio',
       's5_quota_ratio', 's12345_bk_qtd_quota_ratio',
       's2345_bk_qtd_quota_ratio', 's345_bk_qtd_quota_ratio',
       's45_bk_qtd_quota_ratio', 's5_bk_qtd_quota_ratio', 'bookings_pct_qtd'],
      dtype='object')


In [298]:
# fill in NA's with 0 - FEATURES
features = features_raw.fillna(0.0)
# display(features.head())
print(features.columns)


# fill in NA's with 0 - OUTCOMES
outcomes = outcomes_raw.fillna(0.0)
# display(outcomes.head())
print(outcomes.columns)

Index(['dt', 'fy_quarter', 'day_of_qtr', 'sales_div', 'quota', 'stage_1_count',
       'stage_1_amount', 'stage_2_count', 'stage_2_amount', 'stage_3_count',
       'stage_3_amount', 'stage_4_count', 'stage_4_amount', 'stage_5_count',
       'stage_5_amount', 'bookings_qtd', 'direct_bookings', 'qtr_pct',
       's1_quota_ratio', 's2_quota_ratio', 's3_quota_ratio', 's4_quota_ratio',
       's5_quota_ratio', 's12345_bk_qtd_quota_ratio',
       's2345_bk_qtd_quota_ratio', 's345_bk_qtd_quota_ratio',
       's45_bk_qtd_quota_ratio', 's5_bk_qtd_quota_ratio', 'bookings_pct_qtd'],
      dtype='object')
Index(['bookings_pct_finish'], dtype='object')


In [299]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(features, outcomes, test_size=0.2, random_state=42)

In [300]:
X_train_raw2 = X_train_raw.drop(['dt', 'fy_quarter', 'day_of_qtr', 'quota', 'stage_1_count', 'stage_1_amount', 
                          'stage_2_count', 'stage_2_amount', 'stage_3_count', 'stage_3_amount', 'stage_4_count', 
                          'stage_4_amount', 'stage_5_count', 'stage_5_amount', 'bookings_qtd', 'direct_bookings'], 
                                axis = 1)

X_test_raw2 = X_test_raw.drop(['dt', 'fy_quarter', 'day_of_qtr', 'quota', 'stage_1_count', 'stage_1_amount', 
                          'stage_2_count', 'stage_2_amount', 'stage_3_count', 'stage_3_amount', 'stage_4_count', 
                          'stage_4_amount', 'stage_5_count', 'stage_5_amount', 'bookings_qtd', 'direct_bookings'], 
                               axis = 1)


In [301]:
# One-hot encoding
X_train = pd.get_dummies(X_train_raw2)
X_test = pd.get_dummies(X_test_raw2)

In [302]:
# get rid of blanks in headers
X_train.columns = X_train.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
print('X_train columns')
print(X_train.columns)
display(X_train.head())

X_test.columns = X_test.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
print('X_test columns')
print(X_test.columns)
display(X_test.head())

X_train columns
Index(['qtr_pct', 's1_quota_ratio', 's2_quota_ratio', 's3_quota_ratio',
       's4_quota_ratio', 's5_quota_ratio', 's12345_bk_qtd_quota_ratio',
       's2345_bk_qtd_quota_ratio', 's345_bk_qtd_quota_ratio',
       's45_bk_qtd_quota_ratio', 's5_bk_qtd_quota_ratio', 'bookings_pct_qtd',
       'sales_div_api', 'sales_div_comm', 'sales_div_comm-vast',
       'sales_div_ed', 'sales_div_ent', 'sales_div_gov',
       'sales_div_healthcare', 'sales_div_intl', 'sales_div_intl-anz',
       'sales_div_intl-apac', 'sales_div_intl-emea', 'sales_div_intl-uk',
       'sales_div_majors', 'sales_div_network_alliance', 'sales_div_smb',
       'sales_div_smb-vast'],
      dtype='object')


Unnamed: 0,qtr_pct,s1_quota_ratio,s2_quota_ratio,s3_quota_ratio,s4_quota_ratio,s5_quota_ratio,s12345_bk_qtd_quota_ratio,s2345_bk_qtd_quota_ratio,s345_bk_qtd_quota_ratio,s45_bk_qtd_quota_ratio,...,sales_div_healthcare,sales_div_intl,sales_div_intl-anz,sales_div_intl-apac,sales_div_intl-emea,sales_div_intl-uk,sales_div_majors,sales_div_network_alliance,sales_div_smb,sales_div_smb-vast
120,0.404494,0.004063,0.083274,0.101866,0.094908,0.08777,0.519234,0.515171,0.431897,0.330031,...,0,0,0,0,0,0,0,0,0,0
3849,0.505618,0.085689,0.183588,0.162858,0.087046,0.091856,1.183806,1.098117,0.91453,0.751672,...,0,0,0,0,0,0,0,0,0,1
4595,0.651685,0.041134,0.672432,0.357264,0.422547,0.021961,2.445669,2.404535,1.732103,1.374839,...,0,0,1,0,0,0,0,0,0,0
1780,0.516854,0.02269,1.466905,0.646286,0.131429,0.4065,3.13369,3.111,1.644095,0.997809,...,0,0,0,0,0,0,0,1,0,0
4304,0.467391,0.043576,0.319995,0.595848,0.324918,0.076139,1.919407,1.875831,1.555836,0.959988,...,0,1,0,0,0,0,0,0,0,0


X_test columns
Index(['qtr_pct', 's1_quota_ratio', 's2_quota_ratio', 's3_quota_ratio',
       's4_quota_ratio', 's5_quota_ratio', 's12345_bk_qtd_quota_ratio',
       's2345_bk_qtd_quota_ratio', 's345_bk_qtd_quota_ratio',
       's45_bk_qtd_quota_ratio', 's5_bk_qtd_quota_ratio', 'bookings_pct_qtd',
       'sales_div_api', 'sales_div_comm', 'sales_div_comm-vast',
       'sales_div_ed', 'sales_div_ent', 'sales_div_gov',
       'sales_div_healthcare', 'sales_div_intl', 'sales_div_intl-anz',
       'sales_div_intl-apac', 'sales_div_intl-emea', 'sales_div_intl-uk',
       'sales_div_majors', 'sales_div_network_alliance', 'sales_div_smb',
       'sales_div_smb-vast'],
      dtype='object')


Unnamed: 0,qtr_pct,s1_quota_ratio,s2_quota_ratio,s3_quota_ratio,s4_quota_ratio,s5_quota_ratio,s12345_bk_qtd_quota_ratio,s2345_bk_qtd_quota_ratio,s345_bk_qtd_quota_ratio,s45_bk_qtd_quota_ratio,...,sales_div_healthcare,sales_div_intl,sales_div_intl-anz,sales_div_intl-apac,sales_div_intl-emea,sales_div_intl-uk,sales_div_majors,sales_div_network_alliance,sales_div_smb,sales_div_smb-vast
1583,0.358696,0.107858,0.468289,0.564631,0.484016,0.229901,2.089514,1.981656,1.513368,0.948736,...,0,0,0,0,0,0,0,0,0,0
3199,0.054348,0.135994,0.627076,0.552847,0.213799,0.096475,1.65627,1.520276,0.893199,0.340353,...,0,0,0,0,0,1,0,0,0,0
3860,0.629213,0.106382,0.198174,0.161867,0.086443,0.07725,1.372473,1.266091,1.067916,0.906049,...,0,0,0,0,0,0,0,0,0,1
2554,0.978261,0.611555,0.745264,1.165982,0.350741,0.367706,4.052103,3.440547,2.695283,1.529301,...,1,0,0,0,0,0,0,0,0,0
168,0.94382,0.008586,0.08266,0.14301,0.103489,0.112966,0.844769,0.836183,0.753523,0.610513,...,0,0,0,0,0,0,0,0,0,0


In [303]:
## scaling & lasso feature selection

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train) 

# lasso_reg = Lasso()
# lasso_reg.fit(X_train_scaled, y_train)

# reg_coef = lasso_reg.coef_
# print(reg_coef)

In [304]:
display(y_train.head())

Unnamed: 0,bookings_pct_finish
120,0.415658
3849,1.197114
4595,1.407783
1780,0.856805
4304,1.29583


In [305]:
reg = LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
         normalize=False)
reg.fit(X_train,y_train)


LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
         normalize=False)

In [306]:
reg.score(X_train,y_train)

0.7415549151075775

In [307]:
print("reg.coef_")
print(reg.coef_)
print("")
print("reg.intercept_")
print(reg.intercept_)

reg.coef_
[[-8.59042286e-01 -1.81772833e+11 -5.64510119e+11  1.70317422e+12
   7.57322637e+11  1.49383902e+11  1.81772833e+11  3.82737287e+11
  -2.26768434e+12  9.45851579e+11  6.07938735e+11  1.49383902e+11
   7.34888667e-01  7.78218388e-01  8.87569836e-01  7.17530146e-01
   4.41441055e-01  5.66013707e-01  7.08094419e-01  1.05893467e+00
   9.99898478e-01  9.91741279e-01  8.28338999e-01  9.12595388e-01
   6.73644520e-01  7.01101144e-01  8.96488135e-01  1.02008346e+00]]

reg.intercept_
0.0


In [308]:
y_test_hat = reg.predict(X_test)

In [309]:
print(y_test_hat)

[[0.850028  ]
 [1.02976264]
 [1.18930167]
 ...
 [1.00018246]
 [1.14090818]
 [0.85978722]]


In [310]:
from sklearn.metrics import r2_score

r2_scoring = r2_score(y_test,y_test_hat)
print('r2_score')
print(r2_scoring)
print('')

def avg_abs_var_pct(y_test,y_test_hat):
    var_percent = np.subtract(y_test,y_test_hat)/y_test
    abs_var_percent = abs(var_percent)
    av_abs_var_percent = np.mean(abs_var_percent)
    print('av_abs_var_percent')
    print(av_abs_var_percent)
    print('')
    
def avg_abs_var_finish(y_test,y_test_hat):
    var_finish = np.subtract(y_test,y_test_hat)
    abs_var_finish = abs(var_finish)
    av_abs_var_finish = np.mean(abs_var_finish)
    print('avg_abs_var_finish')
    print(av_abs_var_finish)
    print('')
    
avg_abs_var_pct(y_test,y_test_hat)

avg_abs_var_finish(y_test,y_test_hat)

r2_score
0.7473333323693414

av_abs_var_percent
bookings_pct_finish    0.185663
dtype: float64

avg_abs_var_finish
bookings_pct_finish    0.129801
dtype: float64



In [311]:
y_test_hat = y_test_hat.flatten()
print(y_test_hat)

[0.850028   1.02976264 1.18930167 ... 1.00018246 1.14090818 0.85978722]


In [312]:
y_test_hat = y_test_hat.flatten()
y_test_hat = pd.DataFrame(y_test_hat)
type(y_test_hat)
display(y_test_hat.head())

Unnamed: 0,0
0,0.850028
1,1.029763
2,1.189302
3,0.883385
4,0.499077


In [313]:
display(y_test.head())

Unnamed: 0,bookings_pct_finish
1583,0.832024
3199,1.113925
3860,1.197114
2554,0.928775
168,0.415658


In [314]:
# display(X_test_raw.head()) 

In [315]:
df = pd.concat([X_test_raw.reset_index(), y_test.reset_index(), y_test_hat.reset_index()], axis=1)
df.to_csv("/Users/derekwang/Desktop/Python/Sales Forecast/results_compare.csv", sep=',')

In [316]:
#### predict for current quarter ####

# Load the current_quarter_data 
curr_data = pd.read_csv("/Users/derekwang/Desktop/Python/Sales Forecast/testing_data_FY20_Q1.csv")

# Define outcome
outcomes_raw_c = curr_data[['bookings_pct_finish']]

# Define features = removing outcome, dt, and fy_quarter label
features_raw_c1 = curr_data.drop(['direct_bookings', 'bookings_pct_finish'], axis = 1)

features_raw_c = features_raw_c1.drop(['dt', 
                                'fy_quarter', 
                                'day_of_qtr', 
                                'quota', 
                                'stage_1_count', 
                                'stage_1_amount', 
                                'stage_2_count', 
                                'stage_2_amount', 
                                'stage_3_count', 
                                'stage_3_amount', 
                                'stage_4_count', 
                                'stage_4_amount', 
                                'stage_5_count', 
                                'stage_5_amount', 
                                'bookings_qtd'], axis = 1)

# fill in NA's with 0 - FEATURES
features_c = features_raw_c.fillna(0.0)

# fill in NA's with 0 - OUTCOMES
outcomes_c = outcomes_raw.fillna(0.0)

# One-hot encoding
X_curr = pd.get_dummies(features_c)

# get rid of blanks in headers
X_curr.columns = X_curr.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

# predict
y_curr = reg.predict(X_curr)

# flatten results
y_curr = y_curr.flatten()
y_curr = pd.DataFrame(y_curr)

# export prediciton
dfc = pd.concat([features_raw_c1.reset_index(), y_curr.reset_index()], axis=1)
dfc.to_csv("/Users/derekwang/Desktop/Python/Sales Forecast/curr_quarter_prediction.csv", sep=',')

Other models to try:


https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
    
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lars.html#sklearn.linear_model.Lars

