In [1]:
from IPython.display import display, HTML

# Notebook-wide CSS overrides (container width, font sizes for code cells,
# outputs and dataframes, prompt width, TOC padding), injected as raw HTML.
notebook_css = """
<style>
div.container{width:86% !important;}
div.cell.code_cell.rendered{width:100%;}
div.CodeMirror {font-family:Consolas; font-size:15pt;}
div.output {font-size:15pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:15pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:12pt;padding:5px;}
table.dataframe{font-size:15px;}
</style>
"""
display(HTML(notebook_css))

## 1. 데이터 가져오기

In [2]:
import pandas as pd

# Location of the source CSV, kept in one variable so it is easy to change.
csv_path = r'C:\Ai_x\download\부동산\최종전국평당분양가격(결측치제외).csv'

# The file is decoded as cp949; the whole frame is then rendered for a first look.
df = pd.read_csv(csv_path, encoding='cp949')
display(df)


Unnamed: 0,지역명,평당분양가격,연도,월
0,서울,18189.0,2013,12
1,부산,8111.0,2013,12
2,대구,8080.0,2013,12
3,인천,10204.0,2013,12
4,광주,6098.0,2013,12
...,...,...,...,...
2171,전북,12058.2,2024,8
2172,전남,13120.8,2024,8
2173,경북,13827.0,2024,8
2174,경남,13252.8,2024,8


## 2. 지역명의 라벨 인코딩
- 지역명을 라벨 인코딩하여 지역명2 컬럼으로 추가 (분석에 사용할 경우 원-핫 인코딩까지 수행)

In [3]:
# Select the region-name column as a Series (equivalent to df['지역명'])
# and print its first rows as a sanity check.
region_data = df.loc[:, '지역명']
print(region_data.head())

0    서울
1    부산
2    대구
3    인천
4    광주
Name: 지역명, dtype: object


In [5]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the region names into an integer column '지역명2'.
#
# The encoder is fitted on the original text column df['지역명'] rather than on
# the `region_data` variable: this cell rebinds `region_data` to the encoded
# array, so fitting on it would encode integers on a second run and
# le.classes_ would no longer hold the region names.
le = LabelEncoder()
region_data = le.fit_transform(df['지역명'])

# Assign the array directly. Column assignment from an array is positional,
# whereas wrapping it in a new pd.Series (default RangeIndex) would be aligned
# on index labels and produce NaN if df had a non-default index.
df['지역명2'] = region_data
display(df.head())

Unnamed: 0,지역명,평당분양가격,연도,월,지역명2
0,서울,18189.0,2013,12,8
1,부산,8111.0,2013,12,7
2,대구,8080.0,2013,12,5
3,인천,10204.0,2013,12,11
4,광주,6098.0,2013,12,4


In [38]:
# Split the frame into NumPy arrays: inputs (encoded region, year, month)
# and a single-column 2-D target (price per pyeong).
feature_cols = ['지역명2', '연도', '월']
target_cols = ['평당분양가격']

X_data = df[feature_cols].values
y_data = df[target_cols].values


## 3. Normalization 스케일 조정
- 입력변수(지역명2, 연도, 월)와 타겟변수(평당분양가격)를 따로 스케일 조정 (MinMaxScaler 이용)
 - 전체내용 : df
 - 독립변수를 넘파이배열로 : df.iloc[:,:-1].values
 - 종속변수를 넘파이배열로 : df.iloc[:,-1:].values

In [39]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Min-max scale the inputs and the target with two separate scaler objects.
input_frame = df[['지역명2', '연도', '월']]
target_frame = df[['평당분양가격']]

scaler_X = MinMaxScaler()
scaled_X_data = scaler_X.fit_transform(input_frame)

scaler_y = MinMaxScaler()
scaled_y_data = scaler_y.fit_transform(target_frame)

# Show both scaled arrays as the cell output.
scaled_X_data, scaled_y_data

(array([[0.5       , 0.        , 1.        ],
        [0.4375    , 0.        , 1.        ],
        [0.3125    , 0.        , 1.        ],
        ...,
        [0.1875    , 1.        , 0.63636364],
        [0.125     , 1.        , 0.63636364],
        [0.875     , 1.        , 0.63636364]]),
 array([[0.32819817],
        [0.06527439],
        [0.06446563],
        ...,
        [0.21439846],
        [0.19941822],
        [0.51684429]]))

## 4. Standardization 스케일 조정
 - 입력변수와 타겟변수 따로 스케일 조정(StandardScaler 이용)
 - 지역명2s, 연도s, 월s, 평당분양가격s 필드 추가

In [40]:

# Rebind scaler_X / scaler_y to StandardScaler instances and store the
# standardized values in new '...s' columns next to the originals.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

standardized_inputs = scaler_X.fit_transform(df[['지역명2', '연도', '월']])
standardized_target = scaler_y.fit_transform(df[['평당분양가격']])

df[['지역명2s', '연도s', '월s']] = standardized_inputs
df[['평당분양가격s']] = standardized_target

df.head()


Unnamed: 0,지역명,평당분양가격,연도,월,지역명2,지역명2s,연도s,월s,평당분양가격s
0,서울,1.168591,2013,12,8,0.0,-1.875367,1.62196,1.168591
1,부산,-0.728312,2013,12,7,-0.204124,-1.875367,1.62196,-0.728312
2,대구,-0.734147,2013,12,5,-0.612372,-1.875367,1.62196,-0.734147
3,인천,-0.334363,2013,12,11,0.612372,-1.875367,1.62196,-0.334363
4,광주,-1.107203,2013,12,4,-0.816497,-1.875367,1.62196,-1.107203


## 5. 지역명을 원핫인코딩


In [49]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer region codes and show the resulting array.
region_one_hot = to_categorical(df['지역명2'])
region_one_hot

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]], dtype=float32)

In [52]:
# Build the list of region names ordered by their label code, so that the
# name at position i corresponds to label code i.
#
# drop_duplicates() collects every distinct (name, code) pair from the whole
# frame, instead of assuming that the first 17 rows contain each region
# exactly once.
loc_info = df[['지역명', '지역명2']].drop_duplicates().sort_values(by='지역명2')
loc_column_names = loc_info['지역명'].tolist()
print(loc_column_names)

['강원', '경기', '경남', '경북', '광주', '대구', '대전', '부산', '서울', '세종', '울산', '인천', '전남', '전북', '제주', '충남', '충북']


In [53]:
# Add one column per region name, filled from the one-hot matrix of the
# region codes, then preview the widened frame.
one_hot_matrix = to_categorical(df['지역명2'])
df[loc_column_names] = one_hot_matrix
df.head()

Unnamed: 0,지역명,평당분양가격,연도,월,지역명2,지역명2s,연도s,월s,평당분양가격s,강원,...,부산,서울,세종,울산,인천,전남,전북,제주,충남,충북
0,서울,1.168591,2013,12,8,0.0,-1.875367,1.62196,1.168591,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,부산,-0.728312,2013,12,7,-0.204124,-1.875367,1.62196,-0.728312,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,대구,-0.734147,2013,12,5,-0.612372,-1.875367,1.62196,-0.734147,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,인천,-0.334363,2013,12,11,0.612372,-1.875367,1.62196,-0.334363,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,광주,-1.107203,2013,12,4,-0.816497,-1.875367,1.62196,-1.107203,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# Maximum number of DataFrame columns pandas will render before truncating.
pd.set_option('display.max_columns', 30)