## Target variable : tip

In [1]:
!cd

C:\Users\admin\Desktop\2021-K-Digital-Training-main\SQL


In [2]:
import pymysql.cursors
import pandas as pd
import numpy as np

In [3]:
# MySQL DB에서 데이터 받아와서 DataFrame에 저장

conn = pymysql.connect(host='localhost', user='root', 
                       password='1281', db='tip', charset='utf8',
                       autocommit=True, cursorclass=pymysql.cursors.DictCursor)
try:

    with conn.cursor() as curs:
      sql = "SELECT * FROM tips;"
      curs.execute(sql)
      rs = curs.fetchall()

      # DB에서 받아온 값을 DataFrame에 넣음

      tips = pd.DataFrame(rs)

finally:
   conn.close()

tips.tail(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
274,20.65,3.35,Male,No,,Dinner,4.0
275,20.65,3.35,,No,,Dinner,5.0
276,10.34,1.66,,No,Sun,Dinner,


In [4]:
tips.isnull().sum() 

total_bill    0
tip           0
sex           7
smoker        0
day           0
time          0
size          2
dtype: int64

In [5]:
tips.replace('',np.nan,inplace=True) # '' -> np.nan

In [6]:
tips.tail(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
274,20.65,3.35,Male,No,,Dinner,4.0
275,20.65,3.35,,No,,Dinner,5.0
276,10.34,1.66,,No,Sun,Dinner,


In [7]:
tips.isnull().sum() 

total_bill     0
tip            0
sex            7
smoker         0
day           17
time           0
size           2
dtype: int64

## Encoding

In [8]:
tips['sex'].replace({'Female':0, 'Male':1}, inplace=True) # 바이너리인코딩 -> 원핫인코딩
tips["smoker"].replace({"No" : 0, "Yes" : 1}, inplace=True)  # 바이너리인코딩
tips["day"].replace({"Thur" : 0, "Fri" : 1, "Sat" : 2, "Sun" : 3}, inplace=True) #라벨인코딩
tips["time"].replace({"Lunch" : 0, "Dinner" : 1}, inplace=True) # 바이너리인코딩

In [9]:
tips.info() #모든 타입이 object가 아니어야함

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  277 non-null    float64
 1   tip         277 non-null    float64
 2   sex         270 non-null    float64
 3   smoker      277 non-null    int64  
 4   day         260 non-null    float64
 5   time        277 non-null    int64  
 6   size        275 non-null    float64
dtypes: float64(5), int64(2)
memory usage: 15.3 KB


## SimpleImputer
NaN <- median

In [10]:
from sklearn.impute import SimpleImputer

In [11]:
data = tips.values
data[-3:, :]

array([[20.65,  3.35,  1.  ,  0.  ,   nan,  1.  ,  4.  ],
       [20.65,  3.35,   nan,  0.  ,   nan,  1.  ,  5.  ],
       [10.34,  1.66,   nan,  0.  ,  3.  ,  1.  ,   nan]])

In [12]:
# 카테고리형 변수이므로 중앙값 또는 최빈값 주기
imputer = SimpleImputer(strategy='median') # 최빈값은 strategy='most_frequent
imputer.fit(data)
data_trans = imputer.transform(data) # Imputed data set (nan값 없는 data set)

In [13]:
tips = pd.DataFrame(data_trans, columns=tips.columns)

In [14]:
tips.tail(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
274,20.65,3.35,1.0,0.0,2.0,1.0,4.0
275,20.65,3.35,1.0,0.0,2.0,1.0,5.0
276,10.34,1.66,1.0,0.0,3.0,1.0,2.0


### tip_rate라는 파생변수 생성

In [15]:
tips['tip_rate'] = tips['tip'] / tips['total_bill'] #파생변수
tips.tail(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_rate
274,20.65,3.35,1.0,0.0,2.0,1.0,4.0,0.162228
275,20.65,3.35,1.0,0.0,2.0,1.0,5.0,0.162228
276,10.34,1.66,1.0,0.0,3.0,1.0,2.0,0.160542


### 목표변수(y)를 tip으로 설정

In [16]:
y=tips['tip'] # Target variable -> Tip

In [17]:
X=tips.drop('tip',axis=1)

In [18]:
X.columns

Index(['total_bill', 'sex', 'smoker', 'day', 'time', 'size', 'tip_rate'], dtype='object')

In [19]:
data = X.values # Train data

In [20]:
y = y.values

## 1. RFE (Report which features were selected)

In [21]:
from sklearn.feature_selection import RFE
# from sklearn.tree import DecisionTreeClassifier  # 변수가 0, 1일 때 사용
from sklearn.svm import SVR # regression은 SVR 사용

In [22]:
estimator = SVR(kernel="linear") # 목표변수(tip)가 regression이라서 SVR 사용

In [23]:
# define RFE
rfe = RFE(estimator, n_features_to_select=4)

In [24]:
# fit RFE
selector=rfe.fit(data, y)

In [25]:
selector.support_ 

array([ True, False,  True, False, False,  True,  True])

In [26]:
# summarize all features
x = [name for name in tips.columns if name not in ['tip']]
for i in range(data.shape[1]):
  print(f'{x[i]}     \tRank: {rfe.ranking_[i]}')

total_bill     	Rank: 1
sex     	Rank: 2
smoker     	Rank: 1
day     	Rank: 4
time     	Rank: 3
size     	Rank: 1
tip_rate     	Rank: 1


## MinMaxScaler

In [27]:
from sklearn.preprocessing import MinMaxScaler  #Nomalization 호출

In [28]:
trans=MinMaxScaler()

In [29]:
X_norm=trans.fit_transform(data)

In [30]:
df_norm=pd.DataFrame(X_norm)

In [31]:
#Nomalization 확인
df_norm.describe() #값이 min 0과 max 1

Unnamed: 0,0,1,2,3,4,5,6
count,277.0,277.0,277.0,277.0,277.0,277.0,277.0
mean,0.343624,0.66426,0.33574,0.604091,0.754513,0.33574,0.18238
std,0.178734,0.473103,0.473103,0.374423,0.431155,0.203022,0.087719
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.213867,0.0,0.0,0.333333,1.0,0.2,0.140329
50%,0.308965,1.0,0.0,0.666667,1.0,0.2,0.182465
75%,0.413699,1.0,1.0,1.0,1.0,0.4,0.221647
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### StandardScaler

In [32]:
from sklearn.preprocessing import StandardScaler #Standardization 호출

In [33]:
sc = StandardScaler()

In [34]:
df_sc = sc.fit_transform(data)

In [35]:
df_transform_sc=pd.DataFrame(df_sc)

In [36]:
df_transform_sc.describe().round() #평균이 0, 분산이 1로 바뀜

Unnamed: 0,0,1,2,3,4,5,6
count,277.0,277.0,277.0,277.0,277.0,277.0,277.0
mean,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.0,-1.0,-1.0,-2.0,-2.0,-2.0,-2.0
25%,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-0.0
50%,-0.0,1.0,-1.0,0.0,1.0,-1.0,0.0
75%,0.0,1.0,1.0,1.0,1.0,0.0,0.0
max,4.0,1.0,1.0,1.0,1.0,3.0,9.0


참고로 정규화하고 표준화 값은 아래 모델에 사용되지 않음

## PCA (Principal Component Analysis)

In [37]:
# PCA는 목표변수 설정이 필요하지 않음
from sklearn.decomposition import PCA

In [38]:
data.shape

(277, 7)

In [39]:
# define the transform
trans = PCA(n_components=4)

In [40]:
# transform the data
X_dim = trans.fit_transform(data_trans)

In [41]:
# summarize data after the transform
X_dim[:3, :]

array([[-2.70197953, -1.13007487,  1.76847415, -0.07258704],
       [-9.15736726, -1.5494515 , -0.08333977, -0.73286243],
       [ 1.62589542, -1.23073393, -0.42041362,  0.10046241]])

## 2. Regression Feature Selection(Select-KBest)

In [42]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [43]:
fs = SelectKBest(score_func=f_regression, k=4)

In [44]:
data.shape

(277, 7)

In [45]:
y.shape

(277,)

In [46]:
# apply feature selection
X_selected = fs.fit_transform(data, y)

In [47]:
X_selected.shape

(277, 4)

In [48]:
x = [name for name in tips.columns if name not in ['tip']]
for i in range(data.shape[1]):
    print(f'{x[i]}     \tScore: {fs.scores_[i]:.4}')

total_bill     	Score: 237.0
sex     	Score: 3.69
smoker     	Score: 0.4186
day     	Score: 1.008
time     	Score: 2.104
size     	Score: 73.08
tip_rate     	Score: 43.98
