# House Prices and Nutrition

## I) Module imports

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from utils import calculate_nutripoints
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pprint
# Pretty-printer used to dump nested result dictionaries.
pp = pprint.PrettyPrinter(depth=4)

# Seed NumPy's global RNG so that estimators left with random_state=None
# give repeatable results when the notebook is run top to bottom.
np.random.seed(42)

# sns.color_palette() only *returns* a palette, and set_theme() resets the
# active palette to its own `palette` argument. Passing the palette here is
# what actually makes the colorblind-friendly colours the default.
sns.set_theme(context="notebook", palette="colorblind")

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. `utils`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## II) Data imports

In [None]:
# Identifier column plus the energy-related columns of the grocery table.
NUTRITION_COLS = [
    "area_id",
    "energy_tot",
    "energy_fat",
    "energy_saturate",
    "energy_sugar",
    "energy_protein",
    "energy_carb",
    "energy_fibre",
    "energy_alcohol",
    "h_nutrients_calories",
]

# Read the yearly grocery table and show its size and first rows.
year_grocery = pd.read_csv("data/year_lsoa_grocery.csv")
print(year_grocery.shape)
display(year_grocery.head())

In [None]:
# Columns whose per-column extremes are handed to calculate_nutripoints.
nutrient_cols = ['energy_tot', 'saturate', 'salt',
                 'sugar', 'f_fruit_veg', 'fibre', 'protein']
nutrient_values = year_grocery[nutrient_cols]

# Column-wise minimum and maximum over all areas.
min_grocery = nutrient_values.min(axis=0)
max_grocery = nutrient_values.max(axis=0)

# Score every row; the extremes are forwarded as extra positional arguments.
year_grocery["nutripoints"] = year_grocery.apply(
    calculate_nutripoints, axis=1, args=(min_grocery, max_grocery))

In [None]:
COLS = ["Code", "Year ending Dec 2014"]

# Read the "Mean" sheet, keep the area code and the 2014 price column,
# rename the price column and drop rows with missing values.
housing_prices = (
    pd.read_excel("data/land-registry-house-prices-LSOA.xls",
                  sheet_name="Mean")[COLS]
    .rename(columns={"Year ending Dec 2014": "mean house price"})
    .dropna()
)

print(housing_prices.shape)
display(housing_prices.head())

### A) Merge datasets

In [None]:
# Join groceries and prices on the area code, then drop the duplicate key
# column that comes from the housing table.
grocery_housing = (
    year_grocery
    .merge(housing_prices, left_on="area_id", right_on="Code")
    .drop(columns="Code")
)

In [None]:
# Convert the price column to numbers; values that cannot be parsed become NaN.
price_as_number = pd.to_numeric(
    grocery_housing['mean house price'], errors='coerce')
grocery_housing['mean house price'] = price_as_number

display(grocery_housing)

## Exploratory Data Analysis

In [None]:
# Histogram of the mean house price, followed by its median.
ax = sns.histplot(grocery_housing, x="mean house price")

hist_median = grocery_housing["mean house price"].median()
print(hist_median)

In [None]:
# Complementary ECDF of the price, drawn twice: linear axes on the left,
# log-log axes on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

for panel in ax:
    sns.ecdfplot(data=grocery_housing, x="mean house price",
                 complementary=True, ax=panel)
ax[1].set(xscale="log", yscale="log")

# Mean, median and their ratio as a quick skewness indicator.
prices = grocery_housing["mean house price"]
price_mean = prices.mean()
price_median = prices.median()

print(price_mean, price_median, price_mean / price_median)

The distribution of mean house prices is skewed to the right: a long tail of very high prices pulls the mean up more than the median. According to the graphs, the data clearly does not follow a normal distribution.

In [None]:
COLS = ["energy_tot", "energy_fat", "energy_saturate", "energy_sugar", "energy_protein",
        "energy_carb", "energy_fibre", "energy_alcohol", "h_nutrients_calories", "nutripoints"]

# Correlate only the columns that are actually plotted. Calling corr() on the
# whole frame raises on pandas >= 2.0 as soon as a string column is present
# (for instance the "pricing" labels this notebook adds to grocery_housing),
# which made this cell fail when re-run. Selecting first also avoids
# computing correlations that are never used.
correlation = (
    grocery_housing[COLS + ["mean house price"]]
    .corr(method="spearman")
    .loc[COLS]
)

# One bar per nutrition column: its Spearman correlation with the price.
correlation['mean house price'].plot.bar(width=0.8)

plt.title("Mean house price")
plt.ylabel("spearman correlation")

In [None]:
COLS_CORRELATED = ["energy_tot", "energy_sugar", "energy_carb", "energy_fibre", "h_nutrients_calories", "nutripoints"]
colors = ["r", "r", "r", "g", "g", "r"]

N = len(COLS_CORRELATED)

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(25, 10))

# Walk the 2x3 grid row by row, pairing each panel with one column and colour.
n_points = len(grocery_housing)
for panel, col, shade in zip(ax.flat, COLS_CORRELATED, colors):
    panel.scatter(grocery_housing["mean house price"],
                  y=grocery_housing[col], c=[shade] * n_points)
    panel.set(xscale="log", title=col, xlabel="mean house price")

fig.tight_layout()

Nutripoints are negatively related to mean house prices. The graph on the ri

In [None]:
# Median of the mean house price column; used as the threshold that splits
# areas into "high" and "low" priced.
house_prices_median = grocery_housing["mean house price"].median()


def classify_median(row, median):
    """Label a row as "high" or "low" priced relative to a threshold.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key "mean house price".
    median : float
        Threshold to compare the price against.

    Returns
    -------
    str or None
        "high" if the price is strictly greater than ``median``, "low" if it
        is less than or equal to it, and None when the price is missing.
    """
    price = row["mean house price"]
    # A missing price compares False against any threshold and would otherwise
    # be silently counted as "low"; leave such rows unlabelled instead.
    if pd.isna(price):
        return None
    return "high" if price > median else "low"


# Label every row, forwarding the median threshold as an extra positional
# argument to classify_median.
grocery_housing["pricing"] = grocery_housing.apply(
    classify_median, axis=1, args=(house_prices_median,))

In [None]:
# Box plot of nutripoints, one box per pricing label.
grocery_housing.boxplot(column="nutripoints", by="pricing", figsize=(10, 5))

plt.show()

## III) Predictive Models

In [None]:
# Remove every row that has a missing value in any column (mutates the frame).
grocery_housing.dropna(axis=0, inplace=True)

# Standardise price and nutripoints on a copy so the original values survive.
# NOTE(review): the scaler is fitted on all rows before cross-validation.
scaler = StandardScaler()
scaled_cols = ["mean house price", "nutripoints"]
grocery_housing_stand = grocery_housing.copy()
grocery_housing_stand[scaled_cols] = scaler.fit_transform(
    grocery_housing[scaled_cols])

# Single feature (price) and target (nutripoints).
X = grocery_housing_stand[["mean house price"]]
y = grocery_housing_stand["nutripoints"]

In [None]:
# Candidate regressors
lin_reg = LinearRegression()
ridge_reg = Ridge(alpha=.5)
dt_reg = DecisionTreeRegressor()
gb_boost_reg = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100)
mlp_reg = MLPRegressor(solver='lbfgs', alpha=1e-5,
                       hidden_layer_sizes=(5, 2))

# Evaluation order; it also fixes the key order of the score dictionary.
reg_models = [lin_reg, gb_boost_reg, ridge_reg, mlp_reg, dt_reg]

mse = 'neg_mean_squared_error'
mse_scores = {}

# 5-fold cross-validation per model; keep the per-fold test scores under the
# model's class name.
for model in reg_models:
    model_scores = cross_validate(model, X, y, cv=5, scoring=[mse])
    model_name = model.__class__.__name__
    mse_scores[model_name] = model_scores["test_" + mse]

pp.pprint(mse_scores)

In [None]:
# One column per model (the dictionary keys). The scores were collected with
# the 'neg_mean_squared_error' scorer, so the absolute value turns them back
# into plain, positive MSE values.
mse_scores_df = pd.DataFrame(mse_scores).abs()

In [None]:
# Grouped bars: one group per row of mse_scores_df, one bar per model.
ax = mse_scores_df.plot(kind="bar", figsize=(15, 5))

In [None]:
# Average each model's MSE over its rows and pick the smallest.
mean_mse_per_model = mse_scores_df.mean(axis=0)
best_model = mean_mse_per_model.idxmin()

print(best_model)

## IV) Conclusion