In [0]:
# Base URI of the folder that holds the raw input files.
source = "abfss://raw@cloudinfrastg.dfs.core.windows.net/00_data_source/"
# File name of the housing dataset; appended to `source` when the CSV is read.
data = "housing.csv"

In [0]:
import pandas as pd

# Load the housing CSV with Spark: the first row supplies the column names and
# the column types are inferred from the data.
housing_path = source + data
housing = spark.read.csv(housing_path, header=True, inferSchema=True)

# Render the Spark DataFrame as an interactive table.
housing.display()

In [0]:
# Print the column names and the types Spark inferred while reading the CSV.
housing.printSchema()

In [0]:
# Collect the Spark DataFrame onto the driver as a pandas DataFrame so the
# rest of the notebook can use pandas / scikit-learn / matplotlib.
housing_pd = housing.toPandas()

# Number of rows for each distinct value of the ocean_proximity column.
ocean_proximity_counts = housing_pd['ocean_proximity'].value_counts()
ocean_proximity_counts


In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
housing_pd.describe()

In [0]:
import matplotlib.pyplot as plt

# One histogram per numeric column (50 bins each) to inspect the distributions.
housing_pd.hist(bins=50, figsize=(12, 8))
plt.show()

#### Train-test split with a custom function

In [0]:
import numpy as np

def shuffle_and_split_data(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a train part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Any object supporting ``len()`` and positional ``.iloc`` indexing.
    test_ratio : float
        Fraction of the rows, between 0 and 1 inclusive, that goes to the
        test part. The test size is ``int(len(data) * test_ratio)``.
    random_state : int, optional
        Seed for a dedicated NumPy generator, making the split repeatable.
        When ``None`` (the default) NumPy's global random state is used, so
        existing callers, including ones relying on ``np.random.seed``,
        behave exactly as before.

    Returns
    -------
    tuple
        ``(train, test)``: two disjoint row selections of ``data``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``. A negative ratio would
        otherwise produce a negative slice bound and silently return an
        inverted split.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(data)
    if random_state is None:
        # Legacy path: draw from the global NumPy random state.
        shuffled_indices = np.random.permutation(n_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [0]:
# Seed NumPy's global random generator so that the shuffle performed inside
# shuffle_and_split_data(), and therefore the train/test membership, is the
# same every time the notebook is re-run from a fresh kernel.
np.random.seed(42)
train_set, test_set = shuffle_and_split_data(housing_pd, 0.2)

In [0]:
# Number of rows in the train and test sets currently bound to these names.
len(train_set), len(test_set)

In [0]:
from zlib import crc32

def is_id_in_test_set(identifier, test_ratio):
    """Decide, from a hash of ``identifier``, whether a row belongs to the test set.

    The identifier is converted to a 64-bit integer and hashed with CRC-32.
    The row is assigned to the test set when the checksum is below
    ``test_ratio * 2**32``. The decision depends only on the identifier, so a
    given row always lands on the same side of the split.
    """
    checksum = crc32(np.int64(identifier))
    threshold = test_ratio * 2**32
    return checksum < threshold

def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into train and test parts using a hash of ``id_column``.

    Each value of ``data[id_column]`` is passed to ``is_id_in_test_set``
    together with ``test_ratio``. Rows for which it returns True form the
    test part, all other rows the train part.

    Returns a ``(train, test)`` tuple.
    """
    test_mask = data[id_column].apply(is_id_in_test_set, args=(test_ratio,))
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [0]:
# reset_index() moves the current row index into a regular column (named
# "index" here), which the hash-based split below uses as the row identifier.
housing_with_id = housing_pd.reset_index() 

In [0]:
# housing_with_id is a pandas DataFrame, which has no .display() method
# (calling it raises AttributeError). Render it with the notebook's built-in
# display() function instead.
display(housing_with_id)

# Hash-based 80/20 split keyed on the "index" column.
train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, "index")

#### Train-test split with scikit-learn

In [0]:
from sklearn.model_selection import train_test_split

# Random 80/20 split; the fixed random_state makes it repeatable across runs.
# Note: this rebinds train_set/test_set, replacing the earlier manual splits.
train_set, test_set = train_test_split(housing_pd, test_size=0.2, random_state=42)

In [0]:
# Number of rows in the train and test sets currently bound to these names.
len(train_set), len(test_set)

In [0]:
# Bucket median_income into five ordered categories (labels 1-5) so that the
# split can later be stratified on income.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing_pd["income_cat"] = pd.cut(
    housing_pd["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [0]:
# housing_pd is a pandas DataFrame, which has no .display() method (calling it
# raises AttributeError). Render it with the notebook's built-in display()
# function instead.
display(housing_pd)

In [0]:
# Bar chart of the number of districts in each income category, in category order.
income_cat_counts = housing_pd["income_cat"].value_counts().sort_index()
ax = income_cat_counts.plot.bar(rot=0, grid=True)
ax.set_xlabel("Income category")
ax.set_ylabel("Number of districts")
plt.show()

#### Stratified split by income category

In [0]:
from sklearn.model_selection import StratifiedShuffleSplit

# Generate 10 different stratified 80/20 splits, each stratified on the
# income_cat column. Every entry of strat_splits is a [train, test] pair.
splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
income_categories = housing_pd["income_cat"]

strat_splits = []
for train_index, test_index in splitter.split(housing_pd, income_categories):
    strat_splits.append(
        [housing_pd.iloc[train_index], housing_pd.iloc[test_index]]
    )

In [0]:
# Use the first of the stratified [train, test] pairs stored in strat_splits.
strat_train_set, strat_test_set = strat_splits[0]

In [0]:
# strat_train_set is a pandas DataFrame, which has no .display() method
# (calling it raises AttributeError). Render it with the notebook's built-in
# display() function instead.
display(strat_train_set)

In [0]:
from sklearn.model_selection import train_test_split

# Single stratified 80/20 split: stratify=... asks scikit-learn to keep the
# income_cat proportions (approximately) equal in the train and test sets.
# Note: this rebinds strat_train_set/strat_test_set from the previous cells.
strat_train_set, strat_test_set = train_test_split(
    housing_pd, test_size=0.2, stratify=housing_pd["income_cat"], random_state=42)

In [0]:
# strat_train_set is a pandas DataFrame, which has no .display() method
# (calling it raises AttributeError). Render it with the notebook's built-in
# display() function instead.
display(strat_train_set)

In [0]:
# Share of the stratified test set that falls into each income category.
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [0]:
# income_cat was only needed for stratification, so remove it from both sets.
# Rebinding the names to new frames (rather than drop(..., inplace=True))
# avoids mutating frames derived from housing_pd, and errors="ignore" keeps
# the cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
strat_train_set, strat_test_set = (
    set_.drop(columns="income_cat", errors="ignore")
    for set_ in (strat_train_set, strat_test_set)
)

#### Visualize the data

In [0]:
# Geographic scatter plot of the training rows: longitude on x, latitude on y.
strat_train_set.plot(kind="scatter", x="longitude", y="latitude", grid=True)
plt.show()

In [0]:
# Same plot with semi-transparent markers (alpha=0.2), so regions where many
# points overlap appear darker than sparse ones.
strat_train_set.plot(kind="scatter", x="longitude", y="latitude", grid=True, alpha=0.2)
plt.show()

In [0]:
# Marker size scales with the population column (population / 100) and marker
# colour encodes median_house_value through the "jet" colormap, with a colorbar.
strat_train_set.plot(kind="scatter", x="longitude", y="latitude", grid=True,
             s=strat_train_set["population"] / 100, label="population",
             c="median_house_value", cmap="jet", colorbar=True,
             legend=True, sharex=False, figsize=(10, 7))
plt.show()

In [0]:
# Pairwise correlation between the columns of the training set;
# numeric_only=True restricts the computation to numeric columns.
corr_matrix = strat_train_set.corr( numeric_only=True)

In [0]:
# Correlation of each numeric column with median_house_value, largest value first.
corr_matrix["median_house_value"].sort_values(ascending=False)