# Topic: Lecture 3 — Multiple Linear Regression
<img src="https://www.tribloom.com/wp-content/uploads/2019/08/CRISP-DM_Process_Diagram-768x769.png" height=300>

Follow the CRISP-DM method:
1. Step 1: Import library, import data
2. Step 2: Pre-processing (missing data, categorical type, normalization, format transform)
3. Step 3: Build ML Model
4. Step 4: Evaluate Model
5. Step 5: Deploy (Prediction)


In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so files stored there are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Step 1: Load data (also import library)

In [1]:
# import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the data set from the current working directory.
data = pd.read_csv("50_Startups.csv")
print(data.head())  # show the first 5 rows
print(type(data))
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit an extra "None" line after the report.
data.info()

#print(data.iloc[:,:-1])

   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB
None


# Step 2: Pre-process X, Y
* Format transform (資料型別的 transformation)：轉換成 NumPy format
* Missing data => imputation
* Normalization
* Data type：例如 categorical data 的 one-hot encoding、label encoding；pandas 套件有一個簡單的 `get_dummies` API



In [10]:
# 1. Missing data => imputation.
# The CSV contains no NaN values, but a spend of exactly 0 is treated as
# "missing" here and replaced by the mean of the remaining values in that column.
from sklearn.impute import SimpleImputer

# Positional indices of the columns to impute: 0 = R&D Spend, 2 = Marketing Spend.
ZERO_AS_MISSING_COLS = [0, 2]

# Mean imputation, one independent imputer per column.
# NOTE(review): the imputers are fitted on the full data set, i.e. before the
# train/test split below, so test rows contribute to the fill-in mean.
for col in ZERO_AS_MISSING_COLS:
    imr = SimpleImputer(missing_values=0, strategy='mean')
    imputed_data = imr.fit_transform(data.iloc[:, [col]].to_numpy())
    # fit_transform returns an (n, 1) array; flatten it before writing back.
    data.iloc[:, col] = imputed_data.ravel()


# 2. Normalization.
# Skipped; the original note says the sklearn linear model handles it — TODO confirm.

# 3. Nominal data => one-hot encoding with pd.get_dummies.
# Result: X with 5 features and a single target Y.

X = data.iloc[:, :-1]
Y = data.iloc[:, -1]
print(X.columns)

X.info()
# dtype=float keeps the dummy columns numeric on every pandas version; newer
# releases default to bool, which would turn the NumPy array into dtype=object.
X = pd.get_dummies(X, dtype=float)
print(X.columns)
X.info()

# 4. Format transform: DataFrame/Series -> NumPy arrays.
# to_numpy() keeps whatever number of columns get_dummies produced instead of
# assuming exactly 6.
X = X.to_numpy()
Y = Y.to_numpy().reshape(-1, 1)

# Drop the last dummy column (State_New York) so the remaining dummies are not
# perfectly collinear with the intercept (dummy-variable trap).
X = X[:, :-1]
print(type(X), X.shape)
print(type(Y), Y.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(type(X_train), X_train.shape)
print(type(Y_train), Y_train.shape)


Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
dtypes: float64(3), object(1)
memory usage: 1.7+ KB
Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State_California',
       'State_Florida', 'State_New York'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   R&D Spend         50 non-null     float64
 1   Administration    50 non-null     float64
 2   Marketing Spend   50 non-null     float64
 3   Sta

# Step 3: Build Model for training

In [3]:
from sklearn.linear_model import LinearRegression as LR

# "All-in" policy: construct the estimator and train it on every feature in one
# step. fit() returns the estimator itself, which then holds the learned
# coefficients a* and intercept b*.
model = LR().fit(X_train, Y_train)

print(model)
print("a*=", model.coef_, "b*=", model.intercept_)


LinearRegression()
a*= [[ 6.45209703e-01  1.00425490e-01  8.15150660e-02 -6.68132451e+03
  -1.81773441e+03]] b*= [33616.46584791]


# Step 4: Evaluate Model

# model selection (Lasso)

In [4]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn import datasets

from sklearn.linear_model import LassoCV

# Y_train is an (n, 1) column vector. The Lasso estimators work on a 1-D
# target, so flatten it explicitly instead of relying on an implicit
# conversion (which emits a warning) inside LassoCV.
y_train_1d = Y_train.ravel()

# Pick the regularisation strength alpha by 5-fold cross-validation.
lassocv = LassoCV(cv=5, random_state=0, max_iter=10000)
lassocv.fit(X_train, y_train_1d)
al = lassocv.alpha_
print(al)

# Refit a plain Lasso with the selected alpha and report R^2 (in percent) on
# the held-out test split.
lasso = Lasso(alpha=al)
lasso.fit(X_train, y_train_1d)
print(lasso.score(X_test, Y_test)*100)

80638246.1944936
89.11191245602875


  return f(**kwargs)


In [5]:
from sklearn.metrics import r2_score as R2
from sklearn.metrics import mean_squared_error as MSE

# Goodness of fit of the Step 3 linear model, measured on the training split.
yPre = model.predict(X_train)
print("MSE=", MSE(Y_train, yPre))
print("R2=", R2(Y_train, yPre))


# Backward selection: refit OLS on shrinking column subsets and inspect the
# p-values in each summary to decide which features are important.
import statsmodels.api as sm

# Prepend an explicit column of ones to serve as the intercept term.
# The augmented matrix gets its own name so X_train is not overwritten: the cell
# can be re-run safely and model.predict(X_train) above keeps working. The row
# count comes from the data instead of a hard-coded 40.
X_train_ols = np.column_stack([np.ones(X_train.shape[0]), X_train])

# Separator lines are reproduced exactly as originally printed (the very first
# one is one character shorter than the rest).
_FIRST_OPENING_RULE = "==================================================="
_RULE = "===================================================="

# (printed label, columns of X_train_ols to keep, opening separator)
# NOTE(review): the label '01345' does not match its column list [0, 1, 2, 3, 5];
# confirm which of the two was intended.
_BACKWARD_STEPS = [
    ('0-5', [0, 1, 2, 3, 4, 5], _FIRST_OPENING_RULE),
    ('01345', [0, 1, 2, 3, 5], _RULE),
    ('0345', [0, 3, 4, 5], _RULE),
    ('035', [0, 3, 5], _RULE),
    ('03', [0, 3], _RULE),
]

for _label, _cols, _opening_rule in _BACKWARD_STEPS:
    X_opt = X_train_ols[:, _cols]
    regressor_OLS = sm.OLS(endog=Y_train, exog=X_opt).fit()
    print(_opening_rule)
    print(_label, regressor_OLS.summary())
    print(_RULE)




MSE= 361894553.49593556
R2= 0.7789917661408049
0-5                             OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.779
Model:                            OLS   Adj. R-squared:                  0.746
Method:                 Least Squares   F-statistic:                     23.97
Date:                Thu, 13 Oct 2022   Prob (F-statistic):           2.96e-10
Time:                        16:03:02   Log-Likelihood:                -450.89
No. Observations:                  40   AIC:                             913.8
Df Residuals:                      34   BIC:                             923.9
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c

In [13]:
pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-2.3.10-py3-none-any.whl (320 kB)
[K     |████████████████████████████████| 320 kB 29.5 MB/s 
[?25hCollecting scipy<=1.5.4
  Downloading scipy-1.5.4-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)
[K     |████████████████████████████████| 25.9 MB 1.4 MB/s 
[?25hCollecting pyyaml<6.0.0
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 44.9 MB/s 
Collecting Boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.4 MB/s 
Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 6.1 MB/s 
[?25hCollecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 57.2 MB/s 
[?25hCollecting 

In [None]:
pip install numba==0.53

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting numba==0.53
  Downloading numba-0.53.0-cp37-cp37m-manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting llvmlite<0.37,>=0.36.0rc1
  Downloading llvmlite-0.36.0-cp37-cp37m-manylinux2010_x86_64.whl (25.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: llvmlite, numba
  Attempting uninstall: llvmlite
    Found existing installation: llvmlite 0.37.0
    Uninstalling llvmlite-0.37.0:
      Successfully uninstalled llvmlite-0.37.0
  Attempting uninstall: numba
    Found existing installation: numba 0.54.1
    Uninstalling numba-0.54.1:
      Successfully uninstalled numba-0.54.1
Successfully installed llvmlite-0.36.0 numba-0.53.0
[0m

# Encoding

In [49]:
# One-hot encode State; drop_first removes the first level so only two
# indicator columns remain.
states = pd.get_dummies(data['State'], drop_first=True)
states.head()

# Put the indicators next to the first five columns, then discard the raw
# State column they replace.
features_with_dummies = pd.concat([data.iloc[:, :5], states], axis=1)
X_data = features_with_dummies.drop('State', axis=1)
X_data.columns = [
    'R&D Spend',
    'Administration',
    'Marketing Spend',
    'Profit',
    'Florida',
    'New York',
]
X_data

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0
5,131876.9,99814.71,362861.36,156991.12,0,1
6,134615.46,147198.87,127716.82,156122.51,0,0
7,130298.13,145530.06,323876.68,155752.6,1,0
8,120542.52,148718.95,311613.29,152211.77,0,1
9,123334.88,108679.17,304981.62,149759.96,0,0


In [30]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the State column (position 3) for display.
# Work on a copy: `real_x = data` would only alias the frame, so the encoding
# would silently overwrite State in `data`, which other cells also read.
le = LabelEncoder()
real_x = data.copy()
real_x.iloc[:, 3] = le.fit_transform(real_x.iloc[:, 3])
real_x

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94
5,131876.9,99814.71,362861.36,2,156991.12
6,134615.46,147198.87,127716.82,0,156122.51
7,130298.13,145530.06,323876.68,1,155752.6
8,120542.52,148718.95,311613.29,2,152211.77
9,123334.88,108679.17,304981.62,0,149759.96


# model selection (Pycaret)

In [51]:
from pycaret.classification import *

# Initialise the PyCaret experiment on the encoded frame.
# NOTE(review): this sets up a classification of the `New York` indicator
# column rather than a regression on `Profit` — confirm this is intended.
clf1 = setup(target='New York', data=X_data)


Unnamed: 0,Description,Value
0,session_id,797
1,Target,New York
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(50, 6)"
5,Missing Values,False
6,Numeric Features,5
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


INFO:logs:create_model_container: 0
INFO:logs:master_model_container: 0
INFO:logs:display_container: 1
INFO:logs:Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[],
                                      ml_usecase='classification',
                                      numerical_features=[], target='New York',
                                      time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_value_categorical=None,
                                fill_value_numerical=None,
                                numeric_stra...
                ('scaling', 'passthrough'), ('P_transform', 'passthrough'),
                ('binn', 'passthrough'), ('rem_outliers', 'passthrough'),
                ('clust

In [64]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Features are every column but the last; the last column is the target.
real_x = data.iloc[:, :-1].values
real_y = data.iloc[:, -1].values

# One-hot encode column 3 and pass the remaining columns through unchanged.
one_hot_step = ('oneHe', OneHotEncoder(categories='auto'), [3])
ct = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')
real_x = ct.fit_transform(real_x)

# 70/30 split with a fixed seed.
train_x, test_x, train_y, test_y = train_test_split(
    real_x, real_y, test_size=0.3, random_state=99
)
train_x.shape, train_y.shape

((35, 6), (35,))

In [52]:
# Compare models:
# evaluate the performance of the available algorithms and keep the best model.
# Debug variant that surfaces errors instead of skipping failing models:
# best = compare_models(errors = 'raise')
best = compare_models()


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.7,0.5167,0.4,0.3,0.3333,0.2,0.2155,0.017
rf,Random Forest Classifier,0.7,0.4833,0.3,0.3,0.3,0.1667,0.1667,0.463
lightgbm,Light Gradient Boosting Machine,0.7,0.5,0.0,0.0,0.0,0.0,0.0,0.095
dummy,Dummy Classifier,0.7,0.5,0.0,0.0,0.0,0.0,0.0,0.017
et,Extra Trees Classifier,0.625,0.7167,0.4,0.3,0.3333,0.1067,0.1167,0.446
knn,K Neighbors Classifier,0.6167,0.4333,0.0,0.0,0.0,-0.1167,-0.1167,0.119
lr,Logistic Regression,0.6083,0.3333,0.0,0.0,0.0,-0.1333,-0.1333,0.381
lda,Linear Discriminant Analysis,0.5917,0.7167,0.4,0.2167,0.2667,0.0433,0.0667,0.021
dt,Decision Tree Classifier,0.5833,0.5333,0.4,0.25,0.3,0.0233,0.0488,0.02
ridge,Ridge Classifier,0.5833,0.0,0.3,0.1167,0.1667,-0.0233,0.0,0.015


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:GaussianNB(priors=None, var_smoothing=1e-09)
INFO:logs:compare_models() succesfully completed......................................


In [55]:
# Build one sub-model per algorithm, created in this order:
# lr, ridge, lda, gbc.
lr, ridge, lda, gbc = (
    create_model(model_id) for model_id in ('lr', 'ridge', 'lda', 'gbc')
)

# Stack ridge, lda and gbc as base estimators with lr as the meta-model.
stacker = stack_models(meta_model=lr, estimator_list=[ridge, lda, gbc])
stacker

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.75,0.0,0.0,0.0,0.0,0.0,0.0
1,0.5,0.0,0.0,0.0,0.0,-0.3333,-0.3333
2,0.75,0.3333,0.0,0.0,0.0,0.0,0.0
3,0.75,1.0,0.0,0.0,0.0,0.0,0.0
4,0.6667,1.0,0.0,0.0,0.0,0.0,0.0
5,0.6667,0.0,0.0,0.0,0.0,0.0,0.0
6,0.6667,0.5,0.0,0.0,0.0,0.0,0.0
7,0.3333,0.0,0.0,0.0,0.0,-0.5,-0.5
8,0.3333,0.0,0.0,0.0,0.0,-0.5,-0.5
9,0.6667,0.5,0.0,0.0,0.0,0.0,0.0


INFO:logs:create_model_container: 22
INFO:logs:master_model_container: 22
INFO:logs:display_container: 10
INFO:logs:StackingClassifier(cv=5,
                   estimators=[('ridge',
                                RidgeClassifier(alpha=1.0, class_weight=None,
                                                copy_X=True, fit_intercept=True,
                                                max_iter=None, normalize=False,
                                                random_state=797, solver='auto',
                                                tol=0.001)),
                               ('lda',
                                LinearDiscriminantAnalysis(n_components=None,
                                                           priors=None,
                                                           shrinkage=None,
                                                           solver='svd',
                                                           store_covariance=False,
                 

StackingClassifier(cv=5,
                   estimators=[('ridge',
                                RidgeClassifier(alpha=1.0, class_weight=None,
                                                copy_X=True, fit_intercept=True,
                                                max_iter=None, normalize=False,
                                                random_state=797, solver='auto',
                                                tol=0.001)),
                               ('lda',
                                LinearDiscriminantAnalysis(n_components=None,
                                                           priors=None,
                                                           shrinkage=None,
                                                           solver='svd',
                                                           store_covariance=False,
                                                           tol=0.0001)),
                               ('gbc',
                     

In [56]:
# Save the stacked model as a pkl file, then load it back.
save_model(stacker, 'stacker')
# Keep the reloaded pipeline under its own name: `model` already refers to the
# LinearRegression fitted in Step 3, and rebinding it would break re-running
# the evaluation cell that calls model.predict(X_train).
stacker_loaded = load_model('stacker')

# The `Label` column of the result holds the model's prediction.
# NOTE(review): X_data is the same frame that was passed to setup(), so these
# predictions are not made on unseen data.
pred = predict_model(stacker_loaded, data=X_data)

pred.head()

INFO:logs:Initializing save_model()
INFO:logs:save_model(model=StackingClassifier(cv=5,
                   estimators=[('ridge',
                                RidgeClassifier(alpha=1.0, class_weight=None,
                                                copy_X=True, fit_intercept=True,
                                                max_iter=None, normalize=False,
                                                random_state=797, solver='auto',
                                                tol=0.001)),
                               ('lda',
                                LinearDiscriminantAnalysis(n_components=None,
                                                           priors=None,
                                                           shrinkage=None,
                                                           solver='svd',
                                                           store_covariance=False,
                                                           tol=0.0001)

Transformation Pipeline and Model Successfully Saved


INFO:logs:Initializing load_model()
INFO:logs:load_model(model_name=stacker, platform=None, authentication=None, verbose=True)
INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[],
                                      ml_usecase='classification',
                                      numerical_features=[], target='New York',
                                      time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_value_categorical=None,
                                fill_value_numerical=None,
                                numeric_stra...
                                                                            verbose=0,

Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Stacking Classifier,0.64,0.492,0.0,0.0,0.0,-0.0393,-0.1025


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Florida,New York,Label,Score
0,165349.2,136897.8,471784.1,192261.83,0,1,0,0.7171
1,162597.7,151377.59,443898.53,191792.06,0,0,0,0.7488
2,153441.51,101145.55,407934.54,191050.39,1,0,0,0.5513
3,144372.41,118671.85,383199.62,182901.99,0,1,0,0.6234
4,142107.34,91391.77,366168.42,166187.94,1,0,0,0.5762


In [57]:
from sklearn.metrics import accuracy_score

# Share of rows whose predicted Label equals the actual New York indicator.
accuracy_score(y_true=pred['New York'], y_pred=pred['Label'])

0.64

# Step 5: Deploy Model to predict new value

In [None]:
# Write the `data` frame to result.csv without the index, replacing any existing file.
# NOTE(review): this exports `data`, not the predictions in `pred` — confirm which was intended.
data.to_csv(path_or_buf="result.csv", mode='w', index=False)