> Lasso Regression

In [3]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()   #sklearn.datasets 모듈 함수 / 캘리포니아 주택 가격 데이터셋 불러옴
data = pd.DataFrame(housing.data, columns=housing.feature_names)
data['TARGET'] = housing.target


print(data.head())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  TARGET  
0    -122.23   4.526  
1    -122.22   3.585  
2    -122.24   3.521  
3    -122.25   3.413  
4    -122.25   3.422  


In [5]:
from sklearn.feature_selection import VarianceThreshold

threshold = 0.1
selector = VarianceThreshold(threshold=threshold)
data_selected = selector.fit_transform(data)

print(f"선택된 변수 수: {data_selected.shape[1]}")  #열의 개수

선택된 변수 수: 9


In [6]:
variances = data.var()
print(variances.sort_values())  # 가장 낮은 분산부터 확인
#EDA로 분산 분포를 보고 적절한 값을 실험적으로 정하기

AveBedrms     2.245915e-01
TARGET        1.331615e+00
MedInc        3.609323e+00
Longitude     4.014139e+00
Latitude      4.562293e+00
AveRooms      6.121533e+00
AveOccup      1.078700e+02
HouseAge      1.583963e+02
Population    1.282470e+06
dtype: float64


In [11]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

model = LinearRegression()   # 중요한 변수 5개를 선택하는 예제 , 선형 회귀 모델
rfe = RFE(estimator=model , n_features_to_select=5)  # 선형 회귀 모델을 기준으로 변수 5개 select
rfe.fit(data.iloc[:, :-1], data['TARGET'])  # RFE를 실제 데이터에 적용 / 마지막 열을 제외한 입력변수 X , 에측 대상인 타겟변수 y

selected_features = data.columns[:-1][rfe.support_]  # 마지막 열을 제외한 모든 feature 이름
print(f'선택된 변수: {selected_features.tolist()}')

선택된 변수: ['MedInc', 'AveRooms', 'AveBedrms', 'Latitude', 'Longitude']


> Filter 방식 : 높은 상관계수를 가지는 feature를 사용 ,표준편차가 0에 가까운 해당 피처는 삭제

In [12]:
df= pd.read_csv('train.csv')
print(df)
df.head().T   # 전치 ! 행과 열을 뒤바꾸는 것    가로에서 세로로 보기 쉽게 정렬

                  datetime  season  holiday  workingday  weather   temp  \
0      2011-01-01 00:00:00       1        0           0        1   9.84   
1      2011-01-01 01:00:00       1        0           0        1   9.02   
2      2011-01-01 02:00:00       1        0           0        1   9.02   
3      2011-01-01 03:00:00       1        0           0        1   9.84   
4      2011-01-01 04:00:00       1        0           0        1   9.84   
...                    ...     ...      ...         ...      ...    ...   
10881  2012-12-19 19:00:00       4        0           1        1  15.58   
10882  2012-12-19 20:00:00       4        0           1        1  14.76   
10883  2012-12-19 21:00:00       4        0           1        1  13.94   
10884  2012-12-19 22:00:00       4        0           1        1  13.94   
10885  2012-12-19 23:00:00       4        0           1        1  13.12   

        atemp  humidity  windspeed  casual  registered  count  
0      14.395        81     0.0000 

Unnamed: 0,0,1,2,3,4
datetime,2011-01-01 00:00:00,2011-01-01 01:00:00,2011-01-01 02:00:00,2011-01-01 03:00:00,2011-01-01 04:00:00
season,1,1,1,1,1
holiday,0,0,0,0,0
workingday,0,0,0,0,0
weather,1,1,1,1,1
temp,9.84,9.02,9.02,9.84,9.84
atemp,14.395,13.635,13.635,14.395,14.395
humidity,81,80,80,75,75
windspeed,0.0,0.0,0.0,0.0,0.0
casual,3,8,5,3,0


In [16]:
df['year'] = df['datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[0]))
df['month'] = df['datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))
df['day'] = df['datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[2]))
df['hour'] = df['datetime'].apply(lambda x: int(x.split(' ')[1].split(':')[0]))
df['minute'] = df['datetime'].apply(lambda x: int(x.split(' ')[1].split(':')[1]))
df['second'] = df['datetime'].apply(lambda x: int(x.split(' ')[1].split(':')[2]))
df

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour,minute,second
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,2011,1,1,0,0,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2011,1,1,1,0,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,2011,1,1,2,0,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,2011,1,1,3,0,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,2011,1,1,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,2012,12,19,19,0,0
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,2012,12,19,20,0,0
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168,2012,12,19,21,0,0
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,2012,12,19,22,0,0


In [17]:
sample = '2011-01-01 00:00:00'

In [18]:
parsed = pd.to_datetime(sample)
parsed.dayofweek

5

In [19]:
df['dayofweek'] = df['datetime'].apply(lambda x: pd.to_datetime(x).dayofweek)
df

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour,minute,second,dayofweek
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,2011,1,1,0,0,0,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2011,1,1,1,0,0,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,2011,1,1,2,0,0,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,2011,1,1,3,0,0,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,2011,1,1,4,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,2012,12,19,19,0,0,2
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,2012,12,19,20,0,0,2
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168,2012,12,19,21,0,0,2
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,2012,12,19,22,0,0,2


In [21]:
set(df['dayofweek'])

{0, 1, 2, 3, 4, 5, 6}

In [23]:
df.describe().T.sort_values(by='std')['std']  # 표준편차.  값이 크면 멀리 떨어져있음
# T 전치. feature가 행이 됨. 표준편를 기준으로 오름차순 정렬하고 그 중에서 std컬럼만 추출

minute          0.000000
second          0.000000
holiday         0.166599
workingday      0.466159
year            0.500019
weather         0.633839
season          1.116174
dayofweek       2.004585
month           3.444373
day             5.476608
hour            6.915838
temp            7.791590
windspeed       8.164537
atemp           8.474601
humidity       19.245033
casual         49.960477
registered    151.039033
count         181.144454
Name: std, dtype: float64

> Wrapper : 
> feature를 하나씩 넣고, 빼면서 평가 지표 점수 확인 -> 시행착오를 거친다.
Forward Selection(전진 선택), Backward Elimination(후방 제거), Stepwise Selection(단계별 선택)

In [24]:
df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek'],
      dtype='object')