<a href="https://colab.research.google.com/github/dantae74/tensorflow/blob/main/01-basic-sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## TensorFlow 2.0

In [None]:
import os
import tarfile
import urllib.request
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# Where the housing archive is hosted, and the local directory it is stored in.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

In [None]:
def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Directory that receives both the downloaded archive and its
        extracted contents; it is created when missing.
    """
    # exist_ok=True already tolerates an existing directory, so no isdir() guard is needed.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, 'data_filter'):
            # The archive comes from the network: reject members that would
            # land outside housing_path (absolute paths, "..", unsafe links).
            housing_tgz.extractall(path=housing_path, filter='data')
        else:
            # Older Python without extraction filters.
            housing_tgz.extractall(path=housing_path)

fetch_housing_data()

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

housing = load_housing_data()

In [None]:
# Preview the first rows of the freshly loaded dataset.
housing.head()

In [None]:
# Column names, dtypes and non-null counts (reveals columns with missing values).
housing.info()

In [None]:
# Number of rows for each distinct ocean_proximity value.
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics of the numeric columns.
housing.describe()

In [None]:
# One 50-bin histogram per numeric column.
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Purely random 80/20 split; the fixed random_state makes the split reproducible across runs.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Bucket median_income into five labelled categories (1-5); the last bin is open-ended.
housing['income_cat'] = pd.cut(housing['median_income'],
                               bins=[0.,1.5,3.0,4.5,6., np.inf],
                               labels=[1,2,3,4,5])

In [None]:
# Distribution of rows across the income categories.
housing['income_cat'].hist()

### 소득 카테고리 기반 계층 샘플링

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Single stratified 80/20 split, stratified on income_cat.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the only split directly instead of looping over it.
train_index, test_index = next(split.split(housing, housing['income_cat']))
# split() yields positional row numbers, so select by position (.iloc);
# label-based .loc only matches while housing keeps its default 0..n-1 index.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
# Share of each income category in the stratified test set.
strat_test_set['income_cat'].value_counts() / len(strat_test_set)

In [None]:
# Remove the income_cat feature from both splits.
# Rebinding (rather than inplace=True) with errors="ignore" keeps this cell safe
# to re-run: a second run no longer fails with KeyError on the missing column.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")


In [None]:
# Work on a copy so the training set itself is preserved.
housing = strat_train_set.copy()

In [None]:
# Visualise the data using its geographic information (latitude and longitude).
housing.plot(kind='scatter', x="longitude", y="latitude")

In [None]:
# Look for distinctive patterns across California: a low alpha makes dense areas stand out.
housing.plot(kind='scatter', x="longitude", y="latitude", alpha=0.1)

In [None]:
# California housing prices: colour encodes median_house_value (red = high,
# blue = low); larger circles mark more densely populated districts.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    sharex=False,
)
plt.legend()

In [None]:
# Correlation study.
# Restrict to numeric columns first: pandas >= 2.0 no longer silently skips
# non-numeric columns (e.g. ocean_proximity) in corr() and raises instead.
corr_matrix = housing.select_dtypes(include="number").corr()

# Correlation of every numeric attribute with the target, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# List the column names of the working copy.
housing.columns

In [None]:
# Correlations (scatter plots)
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for a handful of selected attributes.
attributes = ['median_house_value', 'median_income', 'housing_median_age', 'total_bedrooms']
scatter_matrix(housing[attributes], figsize=(12,8))

In [None]:
# Scatter plot of median_income against median_house_value to inspect their correlation.
housing.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.1)