In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandasql as ps
from sklearn import datasets, linear_model, model_selection, metrics
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import random

In [None]:
# Global Variable
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
raw_data = pd.read_csv('LinearRegression_Raw.csv')

In [None]:
raw_data.head(10)

In [None]:
raw_data.columns

In [3]:
Renamed_Data = raw_data[['grass_date',
                      'traffic_a1',
                      'traffic_a1_new',
                      'traffic_a1_existing_new',
                      'traffic_a1_repeat',
                      'a1_new_orders',
                      'a1_existing_new_orders',
                      'a1_repeat_orders',
                      'gmv_usd',
                      'net_shopee_coin_rebate_usd',
                      'nmv_usd',
                      'net_orders']]

In [6]:
Agg_Data = ps.sqldf("SELECT grass_date, sum(traffic_a1) as 'traffic_a1', sum(traffic_a1_new) as 'traffic_a1_new', sum(traffic_a1_existing_new) as 'traffic_a1_existing_new', sum(traffic_a1_repeat) as 'traffic_a1_repeat', sum(a1_new_orders) as 'a1_new_orders', sum(a1_existing_new_orders) as 'a1_existing_new_orders', sum(a1_repeat_orders) as 'a1_repeat_orders', sum(gmv_usd) as 'gmv_usd', sum(nmv_usd) as 'nmv_usd', sum(net_orders) as 'net_orders', sum(net_shopee_coin_rebate_usd) as 'net_shopee_coin_rebate_usd' FROM Renamed_Data GROUP BY grass_date having sum(net_shopee_coin_rebate_usd)>100")

In [7]:
Agg_Data['grass_date'] = pd.to_datetime(Agg_Data['grass_date'], format = '%m/%d/%Y')

In [8]:
Filtered_Data = Agg_Data[Agg_Data['grass_date']>'2022-01-23']

In [9]:
Filtered_Data

Unnamed: 0,grass_date,traffic_a1,traffic_a1_new,traffic_a1_existing_new,traffic_a1_repeat,a1_new_orders,a1_existing_new_orders,a1_repeat_orders,gmv_usd,nmv_usd,net_orders,net_shopee_coin_rebate_usd
0,2022-01-24,200163,12213,56254,131696,3539,4645.0,19090,115676.6260,51027.01320,14147,442.08
1,2022-01-25,176614,11887,46923,117804,3673,4334.0,16841,111632.2970,45134.80620,12648,420.25
2,2022-01-26,203361,11497,56221,135643,3625,4909.0,19502,121791.5040,50528.56600,13989,471.22
3,2022-01-27,203347,12255,56289,134803,3838,5123.0,20580,118642.9270,49661.29910,14995,474.43
4,2022-01-28,179308,10942,46063,122303,3538,4589.0,18153,108016.8810,46718.24470,13271,453.88
...,...,...,...,...,...,...,...,...,...,...,...,...
78,2022-04-05,234581,9792,60608,164181,2694,4672.0,21299,149680.8120,78667.36302,16288,726.72
79,2022-04-06,214361,9506,53329,151526,2725,4439.0,19022,144614.7276,73941.19729,14524,666.01
80,2022-04-07,203549,8919,48316,146314,2447,3725.0,15841,122930.4281,62859.55761,12277,592.47
81,2022-04-08,183493,8319,42892,132282,2084,3238.0,14101,106830.6758,55166.64621,10659,511.81


In [10]:
x = Filtered_Data[['traffic_a1',
                'traffic_a1_new',
                'traffic_a1_existing_new',
                'traffic_a1_repeat',
                'a1_new_orders',
                'a1_existing_new_orders',
                'a1_repeat_orders',
                'gmv_usd',
                'nmv_usd',
                'net_orders']]
y = Filtered_Data[['net_shopee_coin_rebate_usd']]

In [13]:
x_train,x_test,y_train,y_test=model_selection.train_test_split(x,y,random_state=random.randint(1,99999))

In [15]:
LR = linear_model.LinearRegression()
LR.fit(x_train, y_train)

LinearRegression()

In [16]:
print(LR.coef_)

[[ 0.00130743 -0.00899312  0.01704702 -0.00674647  0.03912258  0.00756794
  -0.05030292 -0.00104679  0.01365529  0.0571915 ]]


In [17]:
y_predict = LR.predict(x_test)

In [19]:
print("MSE: " + str(mean_squared_error(y_test,y_predict)))

#print(metrics.mean_absolute_error(y_test,y_predict))
print(r2_score(y_test,y_predict))
print(LR.score(x_test,y_test))

#输出多元回归算法的各个特征的系数矩阵
print(LR.coef_)               

#输出多元线性回归算法各个特征的系数排序，可以知道各个特征的影响度
print(np.argsort(LR.coef_))  

#输出各个特征按照影响系数从小到大的顺序
print(Filtered_Data.columns[np.argsort(LR.coef_)])  

MSE: 38806.34170203962
0.6011007277642084
0.6011007277642084
[[ 0.00130743 -0.00899312  0.01704702 -0.00674647  0.03912258  0.00756794
  -0.05030292 -0.00104679  0.01365529  0.0571915 ]]
[[6 1 3 7 0 5 8 2 4 9]]
[['a1_existing_new_orders' 'traffic_a1' 'traffic_a1_existing_new'
  'a1_repeat_orders' 'grass_date' 'a1_new_orders' 'gmv_usd'
  'traffic_a1_new' 'traffic_a1_repeat' 'nmv_usd']]


  print(Filtered_Data.columns[np.argsort(LR.coef_)])


In [None]:
#Using Pearson Correlation
Pure_Value_Raw = Raw_Final[[ 'traffic_a1',
                'traffic_a1_new',
                'traffic_a1_existing_new',
                'traffic_a1_repeat',
                'a1_new_orders',
                'a1_existing_new_orders',
                'a1_repeat_orders',
                'gmv_usd',
                'nmv_usd',
                'net_orders',
                'net_shopee_coin_rebate_usd']]

plt.figure(figsize=(12,10))
cor = Pure_Value_Raw.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
variables = [ 'traffic_a1',
                'traffic_a1_new',
                'traffic_a1_existing_new',
                'traffic_a1_repeat',
                'a1_new_orders',
                'a1_existing_new_orders',
                'a1_repeat_orders',
                'gmv_usd',
                'nmv_usd',
                'net_orders']
x = x.values
y = y.values

In [None]:
x_train_standard = StandardScaler().fit_transform(x)

In [None]:
x = pd.DataFrame(x_train_standard)

In [None]:
pca = PCA()
x_pca = pca.fit_transform(x)
x_pca = pd.DataFrame(x_pca)
x_pca.head()

In [None]:
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
x_pca['net_shopee_coin_rebate_usd']=y

In [None]:
x_pca.columns = ['traffic_a1',
                'traffic_a1_new',
                'traffic_a1_existing_new',
                'traffic_a1_repeat',
                'a1_new_orders',
                'a1_existing_new_orders',
                'a1_repeat_orders',
                'gmv_usd',
                'nmv_usd',
                'net_orders','net_shopee_coin_rebate_usd']

In [None]:
x_pca.head()

In [None]:
x1 = x_pca[['traffic_a1','traffic_a1_new','traffic_a1_existing_new','traffic_a1_repeat','a1_new_orders','a1_existing_new_orders','a1_repeat_orders','gmv_usd','nmv_usd','net_orders']]
y1 = x_pca[['net_shopee_coin_rebate_usd']]

list1=[]
i=1
while i<10000:
    x_train,x_test,y_train,y_test=model_selection.train_test_split(x1,y1,random_state=i)
    LR = linear_model.LinearRegression()
    LR.fit(x_train, y_train)
    y_predict = LR.predict(x_test)
    a = x_pca.columns[np.argmax(LR.coef_)]
    list1.append(a)
    i=i+1 
    
dict = {}
for key in list1:
    dict[key] = dict.get(key, 0) + 1
print(dict)    

In [None]:
print("MSE: " + str(mean_squared_error(y_test,y_predict)))

#print(metrics.mean_absolute_error(y_test,y_predict))
#print(r2_score(y_test,y_predict))
#print(LR.score(x_test,y_test))

#输出多元回归算法的各个特征的系数矩阵
#print(LR.coef_)               

#输出多元线性回归算法各个特征的系数排序，可以知道各个特征的影响度
#print(np.argsort(LR.coef_))  

#输出各个特征按照影响系数从小到大的顺序
a = x_pca.columns[np.argmax(LR.coef_)]

In [None]:
print(a)

In [None]:
x = Raw_Final[[ 'traffic_a1',
                'nmv_usd']]
y = Raw_Final[['net_shopee_coin_rebate_usd']]

In [None]:
x_train,x_test,y_train,y_test=model_selection.train_test_split(x,y,random_state=10000)

In [None]:
import IPython
IPython.display.clear_output(wait=True)

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

import ipympl

%matplotlib widget

In [None]:
LR = linear_model.LinearRegression()
LR.fit(x_train, y_train)
y_predict = LR.predict(x_test)

'''fig = plt.figure(1,(6,4),dpi = 250)
ax = fig.gca(projection='3d')
ax.scatter(Raw_Final['traffic_a1'],Raw_Final['nmv_usd'],Raw_Final['net_shopee_coin_rebate_usd'],s=1)'''

# creating figure
fig = plt.figure()
ax = Axes3D(fig)
  
# creating the plot
plot_geeks = ax.scatter(Raw_Final['traffic_a1'], Raw_Final['nmv_usd'], Raw_Final['net_shopee_coin_rebate_usd'], color='green')
  
# setting title and labels
ax.set_title("3D plot")
ax.set_xlabel('x-traffic_a1')
ax.set_ylabel('y-nmv_usd')
ax.set_zlabel('z-net_shopee_coin_rebate_usd')
  
# displaying the plot
plt.show()


'''print("MSE: " + str(mean_squared_error(y_test,y_predict)))

#print(metrics.mean_absolute_error(y_test,y_predict))
print(r2_score(y_test,y_predict))
print(LR.score(x_test,y_test))

#输出多元回归算法的各个特征的系数矩阵
print(LR.coef_)               

#输出多元线性回归算法各个特征的系数排序，可以知道各个特征的影响度
print(np.argsort(LR.coef_))  

#输出各个特征按照影响系数从小到大的顺序
print(Raw_Final.columns[np.argsort(LR.coef_)])  '''

In [None]:
LR.coef_

In [None]:
y_predict1 = pd.DataFrame(y_predict)
b = pd.concat([x_test,y_test],axis = 1)

b.reset_index(drop=True, inplace=True)
y_predict1.reset_index(drop=True, inplace=True)
result = pd.concat([b,y_predict1],axis=1)
result = result.rename(columns={0: "predict_result"})

In [None]:
result

TO PREDICT FUTURE COIN REBATE

In [None]:
test_data = pd.read_csv('LR_Test.csv')

In [None]:
#test_data['traffic_a1'] = test_data['a1_new'] + test_data['a1_existing_new'] + test_data['a1_repeat']

In [None]:
act_actual = test_data[['grass_date','traffic_a1','nmv_usd','net_shopee_coin_rebate_usd']]

In [None]:
act_actual1 = ps.sqldf("SELECT grass_date, sum(traffic_a1) as 'traffic_a1', sum(nmv_usd) as 'nmv_usd', sum(net_shopee_coin_rebate_usd) as 'net_shopee_coin_rebate_usd' FROM act_actual GROUP BY grass_date having sum(net_shopee_coin_rebate_usd)>0")

In [None]:
#y_actpredict = LR.predict(act_actual)

act_actual1['grass_date'] = pd.to_datetime(act_actual1['grass_date'], format = '%m/%d/%Y')

act_actual = act_actual1[act_actual1['grass_date']>'2022-04-17']
act_actual = act_actual[act_actual['net_shopee_coin_rebate_usd']>100]

In [None]:
act_actual

In [None]:
actual_predict = LR.predict(act_actual[['traffic_a1','nmv_usd']])

In [None]:
actual_predict

In [None]:
print("MSE: " + str(r2_score(act_actual['net_shopee_coin_rebate_usd'],actual_predict)))

In [None]:
act_actual['net_shopee_coin_rebate_usd']

FEATURE SELECTION TEST

In [None]:
from sklearn.feature_selection import VarianceThreshold

test_data1 = test_data['grass_date'] = pd.to_datetime(test_data['grass_date'], format = '%m/%d/%Y')


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression,LinearRegression

test_data_data = act_actual1.drop(columns=['grass_date','net_shopee_coin_rebate_usd'])
test_data_target = act_actual1[['net_shopee_coin_rebate_usd']]
#递归特征消除法，返回特征选择后的数据
#参数estimator为基模型
#参数n_features_to_select为选择的特征个数
c = RFE(estimator=LinearRegression(), n_features_to_select=1).fit_transform(test_data_data,test_data_target)

In [None]:
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model
 
# Suppress an annoying but harmless warning
warnings.filterwarnings(action="ignore", module="scipy",
                        message="^internal gelsd")
 
# Generate features matrix, target vector, and the true coefficients
features, target = make_regression(n_samples = 10000,
                                   n_features = 100,
                                   n_informative = 2,
                                   random_state = 1)
# Create a linear regression
ols = linear_model.LinearRegression()
 
# Recursively eliminate features
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)
rfecv.transform(features)
Once we have conducted RFE, we can see the number of features we should keep:
# Number of best features
rfecv.n_features_
5
We can also see which of those features we should keep:
# Which categories are best
rfecv.support_
We can even view the rankings of the features:
# Rank features best (1) to worst
rfecv.ranking_

In [None]:
features.columns

In [None]:
raw_data.head(30)

In [None]:
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandasql as ps
from sklearn import datasets, linear_model, model_selection, metrics
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns

# Create a linear regression
ols = linear_model.LinearRegression()
raw_data = pd.read_csv('LinearRegression_Raw.csv')
raw_1 = raw_data.drop(columns=['grass_date','first_day_of_month', 'first_day_of_week', 'if_mtd', 'if_wtd']).dropna()
features = raw_1.drop(columns=['net_shopee_coin_rebate_usd'])
target = raw_1['net_shopee_coin_rebate_usd']
# Recursively eliminate features
rfecv = RFECV(estimator=linear_model.LinearRegression(), step=2, scoring="neg_mean_squared_error")
rfecv.fit(features,target).get_feature_names_out()
#rfecv.transform(features)

In [None]:
#Once we have conducted RFE, we can see the number of features we should keep:
# Number of best features
rfecv.n_features_


In [None]:
#We can even view the rankings of the features:
# Rank features best (1) to worst
rfecv.ranking_

In [None]:
print(selector.get_support(indices=True))  

In [None]:
print(selector)