# 특성에 범주형 데이터가 있는 데이터 수치 예측 모델 만들기

In [38]:
import pandas as pd

tips = pd.read_csv("../contents/tips.csv")
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [39]:
input = tips.drop("tip", axis=1)
target = tips["tip"]
input

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,Male,No,Sat,Dinner,3
240,27.18,Female,Yes,Sat,Dinner,2
241,22.67,Male,Yes,Sat,Dinner,2
242,17.82,Male,No,Sat,Dinner,2


In [40]:
input["sex"][input["sex"] == "Female"] = 1
input["sex"][input["sex"] == "Male"] = 0

input["smoker"][input["smoker"] == "Yes"] = 1
input["smoker"][input["smoker"] == "No"] = 0

input["time"][input["time"] == "Dinner"] = 1
input["time"][input["time"] == "Lunch"] = 0
input

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input["sex"][input["sex"] == "Female"] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input["sex"][input["sex"] == "Male"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input["smoker"][input["smoker"] == "Yes"] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input["smoker"][input["smoker"] == "No

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,1,0,Sun,1,2
1,10.34,0,0,Sun,1,3
2,21.01,0,0,Sun,1,3
3,23.68,0,0,Sun,1,2
4,24.59,1,0,Sun,1,4
...,...,...,...,...,...,...
239,29.03,0,0,Sat,1,3
240,27.18,1,1,Sat,1,2
241,22.67,0,1,Sat,1,2
242,17.82,0,0,Sat,1,2


In [41]:
# get_dummies : Convert categorical variable into dummy/indicator variables.
#               Each variable is converted in as many 0/1 variables as there are different values.
input = pd.concat([input, pd.get_dummies(tips["day"])], axis=1)
input = input.drop("day", axis=1)
input

Unnamed: 0,total_bill,sex,smoker,time,size,Fri,Sat,Sun,Thur
0,16.99,1,0,1,2,False,False,True,False
1,10.34,0,0,1,3,False,False,True,False
2,21.01,0,0,1,3,False,False,True,False
3,23.68,0,0,1,2,False,False,True,False
4,24.59,1,0,1,4,False,False,True,False
...,...,...,...,...,...,...,...,...,...
239,29.03,0,0,1,3,False,True,False,False
240,27.18,1,1,1,2,False,True,False,False
241,22.67,0,1,1,2,False,True,False,False
242,17.82,0,0,1,2,False,True,False,False


In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(input.values.astype("float"), target, test_size=0.2)

In [43]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()

mms.fit(X_train)
X_train_mms = mms.transform(X_train)
X_test_mms = mms.transform(X_test)

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_mms, y_train)
print(lr.score(X_train_mms, y_train))
print(lr.score(X_test_mms, y_test))

0.48150152025002846
0.37998966645927335


In [44]:
from sklearn.linear_model import Lasso
penalties=[0.001,0.01,0.1,1,10,100]
for i in penalties:
    lasso=Lasso(alpha=i)
    lasso.fit(X_train_mms, y_train)
    print("규제 알파:", i)
    print(lasso.score(X_train_mms, y_train))
    print(lasso.score(X_test_mms, y_test))
    print("-"*100)

규제 알파: 0.001
0.4814581543232759
0.38382823846676795
----------------------------------------------------------------------------------------------------
규제 알파: 0.01
0.47748269979853253
0.41078250128916083
----------------------------------------------------------------------------------------------------
규제 알파: 0.1
0.31575003041884153
0.34767134077548334
----------------------------------------------------------------------------------------------------
규제 알파: 1
0.0
-0.007445908340548701
----------------------------------------------------------------------------------------------------
규제 알파: 10
0.0
-0.007445908340548701
----------------------------------------------------------------------------------------------------
규제 알파: 100
0.0
-0.007445908340548701
----------------------------------------------------------------------------------------------------


In [45]:
from sklearn.linear_model import Ridge
penalties=[0.001,0.01,0.1,1,10,100]
for i in penalties:
    r=Ridge(alpha=i)
    r.fit(X_train_mms, y_train)
    print("규제 알파:", i)
    print(r.score(X_train_mms, y_train))
    print(r.score(X_test_mms, y_test))
    print("-"*100)

규제 알파: 0.001
0.4815015140722846
0.37999428182329276
----------------------------------------------------------------------------------------------------
규제 알파: 0.01
0.48150090459164696
0.38003594099642546
----------------------------------------------------------------------------------------------------
규제 알파: 0.1
0.4814419850405235
0.38046290530633275
----------------------------------------------------------------------------------------------------
규제 알파: 1
0.4769949092895014
0.38468345683279936
----------------------------------------------------------------------------------------------------
규제 알파: 10
0.3660552143807074
0.34626472228505756
----------------------------------------------------------------------------------------------------
규제 알파: 100
0.09537881753319088
0.10774735318171869
----------------------------------------------------------------------------------------------------


In [46]:
lr.coef_

array([ 3.99847686,  0.0580028 , -0.11416161,  0.06250041,  1.53958447,
        0.17924448, -0.10978618, -0.03083499, -0.0386233 ])