In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression

from IPython.display import display # Allows the use of display() for DataFrames

# Pretty display for notebooks
%matplotlib inline

In [2]:
# Read the training CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
training_csv_path = "/Users/derekwang/Desktop/Python/Sales Forecast/training_data_FY18_2H_FY19_All.csv"
full_data = pd.read_csv(training_csv_path)

In [3]:
# Target: kept as a one-column DataFrame (note the double brackets), not a Series
target_column = 'bookings_pct_finish'
outcomes_raw = full_data[[target_column]]

# Features: every column except the target. Identifier columns such as dt and
# fy_quarter are still present here; they are dropped in a later cell.
features_raw = full_data.drop(columns=[target_column])

print(features_raw.columns)

Index(['dt', 'fy_quarter', 'day_of_qtr', 'sales_div', 'quota', 'stage_1_count',
       'stage_1_amount', 'stage_2_count', 'stage_2_amount', 'stage_3_count',
       'stage_3_amount', 'stage_4_count', 'stage_4_amount', 'stage_5_count',
       'stage_5_amount', 'bookings_qtd', 'direct_bookings', 'qtr_pct',
       's1_quota_ratio', 's2_quota_ratio', 's3_quota_ratio', 's4_quota_ratio',
       's5_quota_ratio', 's12345_bk_qtd_quota_ratio',
       's2345_bk_qtd_quota_ratio', 's345_bk_qtd_quota_ratio',
       's45_bk_qtd_quota_ratio', 's5_bk_qtd_quota_ratio', 'bookings_pct_qtd'],
      dtype='object')


In [4]:
# Replace every missing value with 0.0 in both the feature frame and the target frame.
# NOTE(review): this also fills missing targets with 0.0 - confirm that is intended.
features, outcomes = (frame.fillna(0.0) for frame in (features_raw, outcomes_raw))

print(features.columns)
print(outcomes.columns)

Index(['dt', 'fy_quarter', 'day_of_qtr', 'sales_div', 'quota', 'stage_1_count',
       'stage_1_amount', 'stage_2_count', 'stage_2_amount', 'stage_3_count',
       'stage_3_amount', 'stage_4_count', 'stage_4_amount', 'stage_5_count',
       'stage_5_amount', 'bookings_qtd', 'direct_bookings', 'qtr_pct',
       's1_quota_ratio', 's2_quota_ratio', 's3_quota_ratio', 's4_quota_ratio',
       's5_quota_ratio', 's12345_bk_qtd_quota_ratio',
       's2345_bk_qtd_quota_ratio', 's345_bk_qtd_quota_ratio',
       's45_bk_qtd_quota_ratio', 's5_bk_qtd_quota_ratio', 'bookings_pct_qtd'],
      dtype='object')
Index(['bookings_pct_finish'], dtype='object')


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features,
    outcomes,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Columns removed from the model inputs. The same list is applied to both splits
# so that train and test keep an identical set of raw columns.
unused_columns = [
    'dt', 'fy_quarter', 'day_of_qtr', 'quota',
    'stage_1_count', 'stage_1_amount',
    'stage_2_count', 'stage_2_amount',
    'stage_3_count', 'stage_3_amount',
    'stage_4_count', 'stage_4_amount',
    'stage_5_count', 'stage_5_amount',
    'bookings_qtd', 'direct_bookings',
    's1_quota_ratio', 's2_quota_ratio', 's3_quota_ratio', 's4_quota_ratio', 's5_quota_ratio',
]

X_train_raw2 = X_train_raw.drop(columns=unused_columns)
X_test_raw2 = X_test_raw.drop(columns=unused_columns)


In [7]:
# One-hot encoding
# The training split defines the dummy-column layout the model is fitted on.
X_train = pd.get_dummies(X_train_raw2)

# Encode the test split independently, then force it onto the training layout:
# a category absent from the test split becomes an all-zero column, a category
# unseen in training is discarded, and the column order matches X_train so the
# fitted coefficients line up positionally at predict time.
X_test = pd.get_dummies(X_test_raw2).reindex(columns=X_train.columns, fill_value=0)

In [8]:
# Normalise column headers: trim, lower-case, spaces -> underscores, drop parentheses.
# regex=False makes every replacement a literal substring match, so '(' and ')'
# are never interpreted as regex metacharacters regardless of the pandas default.
for header_label, frame in (('X_train columns', X_train), ('X_test columns', X_test)):
    frame.columns = (
        frame.columns.str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )
    print(header_label)
    print(frame.columns)
    display(frame.head())

X_train columns
Index(['qtr_pct', 's12345_bk_qtd_quota_ratio', 's2345_bk_qtd_quota_ratio',
       's345_bk_qtd_quota_ratio', 's45_bk_qtd_quota_ratio',
       's5_bk_qtd_quota_ratio', 'bookings_pct_qtd', 'sales_div_api',
       'sales_div_comm', 'sales_div_comm-vast', 'sales_div_ed',
       'sales_div_ent', 'sales_div_gov', 'sales_div_healthcare',
       'sales_div_intl', 'sales_div_intl-anz', 'sales_div_intl-apac',
       'sales_div_intl-emea', 'sales_div_intl-uk', 'sales_div_majors',
       'sales_div_network_alliance', 'sales_div_smb', 'sales_div_smb-vast'],
      dtype='object')


Unnamed: 0,qtr_pct,s12345_bk_qtd_quota_ratio,s2345_bk_qtd_quota_ratio,s345_bk_qtd_quota_ratio,s45_bk_qtd_quota_ratio,s5_bk_qtd_quota_ratio,bookings_pct_qtd,sales_div_api,sales_div_comm,sales_div_comm-vast,...,sales_div_healthcare,sales_div_intl,sales_div_intl-anz,sales_div_intl-apac,sales_div_intl-emea,sales_div_intl-uk,sales_div_majors,sales_div_network_alliance,sales_div_smb,sales_div_smb-vast
120,0.404494,0.519234,0.515171,0.431897,0.330031,0.235123,0.147354,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3849,0.505618,1.183806,1.098117,0.91453,0.751672,0.664627,0.572771,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4595,0.651685,2.445669,2.404535,1.732103,1.374839,0.952292,0.930331,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1780,0.516854,3.13369,3.111,1.644095,0.997809,0.866381,0.459881,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4304,0.467391,1.919407,1.875831,1.555836,0.959988,0.635071,0.558932,0,0,0,...,0,1,0,0,0,0,0,0,0,0


X_test columns
Index(['qtr_pct', 's12345_bk_qtd_quota_ratio', 's2345_bk_qtd_quota_ratio',
       's345_bk_qtd_quota_ratio', 's45_bk_qtd_quota_ratio',
       's5_bk_qtd_quota_ratio', 'bookings_pct_qtd', 'sales_div_api',
       'sales_div_comm', 'sales_div_comm-vast', 'sales_div_ed',
       'sales_div_ent', 'sales_div_gov', 'sales_div_healthcare',
       'sales_div_intl', 'sales_div_intl-anz', 'sales_div_intl-apac',
       'sales_div_intl-emea', 'sales_div_intl-uk', 'sales_div_majors',
       'sales_div_network_alliance', 'sales_div_smb', 'sales_div_smb-vast'],
      dtype='object')


Unnamed: 0,qtr_pct,s12345_bk_qtd_quota_ratio,s2345_bk_qtd_quota_ratio,s345_bk_qtd_quota_ratio,s45_bk_qtd_quota_ratio,s5_bk_qtd_quota_ratio,bookings_pct_qtd,sales_div_api,sales_div_comm,sales_div_comm-vast,...,sales_div_healthcare,sales_div_intl,sales_div_intl-anz,sales_div_intl-apac,sales_div_intl-emea,sales_div_intl-uk,sales_div_majors,sales_div_network_alliance,sales_div_smb,sales_div_smb-vast
1583,0.358696,2.089514,1.981656,1.513368,0.948736,0.46472,0.23482,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3199,0.054348,1.65627,1.520276,0.893199,0.340353,0.126554,0.030078,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3860,0.629213,1.372473,1.266091,1.067916,0.906049,0.819606,0.742356,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2554,0.978261,4.052103,3.440547,2.695283,1.529301,1.178561,0.810855,0,0,0,...,1,0,0,0,0,0,0,0,0,0
168,0.94382,0.844769,0.836183,0.753523,0.610513,0.507024,0.394058,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
## scaling & lasso feature selection

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train) 

# lasso_reg = Lasso()
# lasso_reg.fit(X_train_scaled, y_train)

# reg_coef = lasso_reg.coef_
# print(reg_coef)

In [10]:
# Preview the first five training targets
display(y_train.head(n=5))

Unnamed: 0,bookings_pct_finish
120,0.415658
3849,1.197114
4595,1.407783
1780,0.856805
4304,1.29583


In [11]:
# fit linear reg
# No separate intercept is fitted (fit_intercept=False); every one-hot sales_div
# column is kept, so each division receives its own coefficient instead.
# The `normalize` argument is intentionally not passed: it was removed from
# LinearRegression in newer scikit-learn releases, and omitting it is equivalent
# to the previous normalize=False on releases that still accept it.
reg = LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None)
reg.fit(X_train, y_train)


LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
         normalize=False)

In [12]:
# R^2 of the fitted model on the training split (shown as the cell output)
reg.score(X=X_train, y=y_train)

0.7416300270755511

In [13]:
# Show the fitted coefficients followed by the intercept, separated by a blank line
print("reg.coef_", reg.coef_, "", "reg.intercept_", reg.intercept_, sep="\n")

reg.coef_
[[-0.86571531 -0.01049891  0.06826813 -0.01767718  0.12568017  0.31049574
   0.38874141  0.73499701  0.77773496  0.88677307  0.71759307  0.4418267
   0.56666127  0.7081326   1.05851029  0.99990322  0.99178601  0.82823026
   0.91241649  0.67405066  0.7012832   0.89675307  1.02030458]]

reg.intercept_
0.0


In [14]:
# Predict the target for the held-out split
y_test_hat = reg.predict(X=X_test)

In [15]:
# Print the raw prediction array for a quick visual check
print(y_test_hat)

[[0.8484742 ]
 [1.02973759]
 [1.18567256]
 ...
 [1.00086298]
 [1.13922754]
 [0.8552825 ]]


In [16]:
# scoring

from sklearn.metrics import r2_score

# Coefficient of determination on the held-out split, followed by a blank line
r2_scoring = r2_score(y_true=y_test, y_pred=y_test_hat)
print('r2_score', r2_scoring, '', sep='\n')

def avg_abs_var_pct(y_test,y_test_hat):
    """Print the mean absolute relative deviation of predictions from actuals.

    Computes mean(|(y_test - y_test_hat) / y_test|) and prints it under the
    label 'av_abs_var_percent', followed by a blank line. Nothing is returned.

    NOTE(review): entries where y_test is 0 divide by zero and will produce
    inf/nan in the mean - confirm whether such rows can occur.
    """
    residual = np.subtract(y_test, y_test_hat)
    mean_abs_pct = np.mean(abs(residual / y_test))
    print('av_abs_var_percent', mean_abs_pct, '', sep='\n')
    
def avg_abs_var_finish(y_test,y_test_hat):
    """Print the mean absolute difference between actuals and predictions.

    Computes mean(|y_test - y_test_hat|) and prints it under the label
    'avg_abs_var_finish', followed by a blank line. Nothing is returned.
    """
    mean_abs_error = np.mean(abs(np.subtract(y_test, y_test_hat)))
    print('avg_abs_var_finish', mean_abs_error, '', sep='\n')
    
# Report both error summaries for the held-out split
avg_abs_var_pct(y_test=y_test, y_test_hat=y_test_hat)
avg_abs_var_finish(y_test=y_test, y_test_hat=y_test_hat)

r2_score
0.7475456322439586

av_abs_var_percent
bookings_pct_finish    0.185245
dtype: float64

avg_abs_var_finish
bookings_pct_finish    0.129836
dtype: float64



In [17]:
# Turn the predictions into a single-column DataFrame (column label 0) so they
# can be concatenated with the test rows later.
# np.asarray(...).ravel() accepts both the (n, 1) array returned by predict and
# an already-converted DataFrame, so re-running this cell no longer fails the
# way calling .flatten() on a DataFrame did.
y_test_hat = pd.DataFrame(np.asarray(y_test_hat).ravel())
display(y_test_hat.head())

Unnamed: 0,0
0,0.848474
1,1.029738
2,1.185673
3,0.879283
4,0.491933


In [18]:
# Preview the first five held-out targets for comparison with the predictions
display(y_test.head(n=5))

Unnamed: 0,bookings_pct_finish
1583,0.832024
3199,1.113925
3860,1.197114
2554,0.928775
168,0.415658


In [19]:
# display(X_test_raw.head()) 

In [20]:
# compare actual to y_hat
# Each piece is re-indexed to 0..n-1 so the rows line up side by side; reset_index()
# keeps the previous index of every piece as an extra 'index' column.
# y_test_hat must already be a DataFrame here (converted in an earlier cell).
comparison_parts = [frame.reset_index() for frame in (X_test_raw, y_test, y_test_hat)]
df = pd.concat(comparison_parts, axis=1)
df.to_csv("/Users/derekwang/Desktop/Python/Sales Forecast/results_compare__1_linear_reg.csv", sep=',')

In [21]:
#### predict for current quarter ####

# Load the current_quarter_data
curr_data = pd.read_csv("/Users/derekwang/Desktop/Python/Sales Forecast/testing_data_FY20_Q1.csv")

# Define outcome
outcomes_raw_c = curr_data[['bookings_pct_finish']]

# Everything except the outcome and direct_bookings; this wider frame is what
# gets exported next to the predictions at the end of the cell.
features_raw_c1 = curr_data.drop(['direct_bookings', 'bookings_pct_finish'], axis = 1)

# Drop the remaining columns that were also dropped from the training inputs
features_raw_c = features_raw_c1.drop(['dt',
                                'fy_quarter',
                                'day_of_qtr',
                                'quota',
                                'stage_1_count',
                                'stage_1_amount',
                                'stage_2_count',
                                'stage_2_amount',
                                'stage_3_count',
                                'stage_3_amount',
                                'stage_4_count',
                                'stage_4_amount',
                                'stage_5_count',
                                'stage_5_amount',
                                'bookings_qtd',
                                's1_quota_ratio', 's2_quota_ratio', 's3_quota_ratio', 's4_quota_ratio','s5_quota_ratio'
                                      ], axis = 1)

# fill in NA's with 0 - FEATURES
features_c = features_raw_c.fillna(0.0)

# fill in NA's with 0 - OUTCOMES
# Uses the current-quarter outcomes; the earlier code filled the *training*
# outcomes (outcomes_raw) here by mistake.
outcomes_c = outcomes_raw_c.fillna(0.0)

# One-hot encoding
X_curr = pd.get_dummies(features_c)

# get rid of blanks in headers (regex=False: parentheses are matched literally)
X_curr.columns = (
    X_curr.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Align to the exact columns (and order) the model was fitted on: categories
# missing from the current quarter become all-zero columns, and categories not
# seen in training are dropped, so coefficients line up positionally.
X_curr = X_curr.reindex(columns=X_train.columns, fill_value=0)

# predict
y_curr = reg.predict(X_curr)

# flatten results into a single-column DataFrame
y_curr = pd.DataFrame(y_curr.flatten())

# export prediction
dfc = pd.concat([features_raw_c1.reset_index(), y_curr.reset_index()], axis=1)
dfc.to_csv("/Users/derekwang/Desktop/Python/Sales Forecast/curr_quarter_prediction__1_linear_reg.csv", sep=',')