# 下载数据

In [None]:
def callback(blocknum, blocksize, totalsize):
    """Print download progress for a block-wise transfer.

    The signature matches the ``reporthook`` expected by ``urlretrieve``.

    :param blocknum: number of blocks transferred so far
    :param blocksize: size of one block in bytes
    :param totalsize: size of the remote file in bytes; zero or negative
        when the size is unknown
    :return: None
    """
    downloaded = blocknum * blocksize
    if totalsize <= 0:
        # Without a usable total a percentage cannot be computed (dividing
        # would raise ZeroDivisionError or yield a negative number), so
        # report the raw byte count instead.
        print('%d bytes' % downloaded)
        return
    # The last block is usually only partly filled, so cap at 100%.
    percent = min(100.0 * downloaded / totalsize, 100.0)
    print('%.2f%%' % percent)

In [None]:
def download(path, outpath='.'):
    """Use urllib to download a file, reusing an existing local copy.

    Parameters
    ----------
    path : str
        Url to download. The text after its last ``/`` becomes the local
        file name.
    outpath : str, optional
        Directory the file is stored in; created when it does not exist.
        Defaults to the current directory.

    Returns
    -------
    path : str
        Location of downloaded file.

    Raises
    ------
    ValueError
        If no file name can be derived from ``path`` (it ends with ``/``).
    """
    import os

    fname = path.split('/')[-1]
    if not fname:
        # An empty name would make the target equal to the directory itself,
        # which always "exists" and would be returned as if it were the file.
        raise ValueError('Cannot derive a file name from URL: %r' % (path,))

    if not os.path.isdir(outpath):
        os.makedirs(outpath)

    output = os.path.join(outpath, fname)
    if os.path.exists(output):
        # Already fetched by an earlier call: skip the transfer.
        return output

    # Imported only once a transfer is really needed.
    from six.moves import urllib

    print('Downloading ' + path)

    def progress(count, block_size, total_size):
        # Report every 20th block to keep the output readable.
        if count % 20 == 0:
            done_mb = count * block_size / 1024.0 / 1024.0
            if total_size > 0:
                print('Downloaded %02.02f/%02.02f MB' % (
                    done_mb,
                    total_size / 1024.0 / 1024.0), end='\r')
            else:
                # Size unknown (urlretrieve reports -1): show progress only.
                print('Downloaded %02.02f MB' % done_mb, end='\r')

    # Write to a temporary name first so an interrupted transfer never leaves
    # a truncated file that the existence check above would accept as valid.
    partial = output + '.part'
    try:
        urllib.request.urlretrieve(
            path, filename=partial, reporthook=progress)
        os.replace(partial, output)
    finally:
        # Only present if the transfer or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)
    return output

In [None]:
import os
import tarfile
from six.moves import urllib

# Raw-content host. A github.com ".../blob/master/..." address serves the HTML
# viewer page instead of the file bytes, so downloading from it stores a web
# page rather than the archive.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Local directory (relative to the working directory) for the housing data.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the compressed housing dataset.
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the archive; defaults to ``HOUSING_URL``.
    housing_path : str
        Directory the archive is stored in; defaults to ``HOUSING_PATH``.

    Returns
    -------
    str
        The value returned by ``download`` for the archive.
    """
    filepath = download(housing_url, housing_path)
    print('\nDownload finished!')
    # Hand the location back so callers can pass it straight to the
    # extraction step instead of rebuilding the path by hand.
    return filepath



In [None]:
import tarfile
def extract(tgz_path):
    """Unpack a tar archive into the directory that contains it.

    Parameters
    ----------
    tgz_path : str
        Path of the archive to unpack.
    """
    # NOTE: extractall() trusts the member paths stored in the archive; only
    # use this on archives from a trusted source.
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=os.path.dirname(tgz_path))

# 加载数据

In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
"下载数据，很奇怪，只能下载55kb，文件有400k，目前没找到原因"
# Download the archive using the default URL and target directory.
# NOTE(review): the truncated ~55 KB file mentioned in the note above is
# presumably the HTML page that a github.com "/blob/" URL serves instead of
# the archive itself -- confirm that DOWNLOAD_ROOT points at raw content.
fetch_housing_data()
#housing = load_housing_data()
#housing.head()

In [None]:
"解压数据文件"
# Unpack the downloaded archive next to itself inside HOUSING_PATH.
extract(os.path.join(HOUSING_PATH, "housing.tgz"))

# 快速查看数据结构

In [None]:
"加载数据"
# Load the CSV into a DataFrame and show its first rows.
housing = load_housing_data()
print(housing.head())
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
housing.info()

In [None]:
"用value_counts() 方法查看离大海距离项中都有哪些类别"
# Count how many rows carry each distinct ocean_proximity value.
housing["ocean_proximity"].value_counts()

In [None]:
"数值属性概括"
# Summary statistics for the DataFrame's columns.
housing.describe()

In [None]:
#%matplotlib inline
import matplotlib.pyplot as plt
# One histogram per column. bins: number of bars per histogram;
# figsize: figure width and height (inches).
housing.hist(bins=20, figsize=(20,15))
plt.show()

# 创建测试集

In [None]:
from sklearn.model_selection import train_test_split
# Purely random split: 20% of the rows go to the test set. The fixed
# random_state makes the split reproducible between runs.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)


In [None]:
import numpy as np
# Bucket median_income into categories: divide by 1.5 and round up.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Merge every category of 5 and above into category 5. The result is assigned
# back explicitly: calling where(..., inplace=True) on housing["income_cat"]
# is chained in-place mutation, which does not update `housing` itself when
# pandas copy-on-write is active.
housing["income_cat"] = housing["income_cat"].where(
    housing["income_cat"] < 5, 5.0)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# A single stratified split on income_cat with 20% of the rows held out as
# the test set; n_splits=1 means the splitter yields exactly one pair of
# index arrays, which is taken directly.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing["income_cat"]))
strat_train_set = housing.loc[train_index]
strat_test_set = housing.loc[test_index]


In [None]:
# Share of all rows that falls into each income_cat value.
housing["income_cat"].value_counts() / len(housing)

In [None]:
# Fraction of the rows that ended up in the stratified test set.
len(strat_test_set) / len(housing)

In [None]:
# Share of each income_cat value within the stratified test set.
strat_test_set["income_cat"].value_counts() / len(strat_test_set["income_cat"])

In [None]:
# Share of each income_cat value within the stratified training set.
strat_train_set["income_cat"].value_counts() / len(strat_train_set["income_cat"])

In [None]:
# Remove the helper column from both splits. The loop variable is not called
# `set`: that name would shadow the builtin and stay bound to a DataFrame in
# the notebook namespace after the loop finishes.
for subset in (strat_train_set, strat_test_set):
    subset.drop(["income_cat"], axis=1, inplace=True)

In [None]:
# Show the first rows of the training set.
strat_train_set.head()

# 数据探索和可视化、发现规律

In [None]:
# From here on `housing` is a copy of the training set, so changes made to it
# below do not alter strat_train_set.
housing = strat_train_set.copy()

## 地理数据可视化

In [None]:
# Scatter plot of latitude against longitude; the low alpha makes overlapping
# points show up as darker areas.
housing.plot(kind="scatter", x="longitude", y="latitude",alpha=0.1)

In [None]:
# Same scatter plot, with marker size taken from population / 100 and marker
# color taken from median_house_value via the "jet" colormap.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
s=housing["population"]/100, label="population",
c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,)
plt.legend()

## 查找关联

In [None]:
# Pairwise correlations between the numeric columns. Non-numeric columns are
# filtered out first because DataFrame.corr() in pandas >= 2.0 raises on them
# instead of skipping them silently.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# Correlation of every column with median_house_value, strongest positive first.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Plot each of the selected attributes against every other one.
attributes = ["median_house_value", "median_income", "total_rooms","housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
# Scatter plot of median_house_value against median_income.
housing.plot(kind="scatter", x="median_income",y="median_house_value",alpha=0.1)

In [None]:
# Derived ratio attributes built from the raw count columns.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]
# Recompute the correlations including the new columns. Non-numeric columns
# are filtered out first because DataFrame.corr() in pandas >= 2.0 raises on
# them instead of skipping them silently.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Split the training set into features (every column except
# median_house_value) and labels (a copy of the median_house_value column).
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()