# London Housing and Population Analysis (2002 - 2015 data)

## 1. Initial EDA of House Sales Figures

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


In [2]:
# Load the monthly London housing variables from the raw data folder:
housing_csv = '../raw_data/housing_in_london_monthly_variables.csv'
housing_df = pd.read_csv(housing_csv)

# Summarise the columns, dtypes and non-null counts:
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13549 entries, 0 to 13548
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           13549 non-null  object 
 1   area           13549 non-null  object 
 2   average_price  13549 non-null  int64  
 3   code           13549 non-null  object 
 4   houses_sold    13455 non-null  float64
 5   no_of_crimes   7439 non-null   float64
 6   borough_flag   13549 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 741.1+ KB


In [3]:
# Convert the 'date' column to datetime values:
housing_df = housing_df.assign(date=pd.to_datetime(housing_df['date']))

NaN values are present in `houses_sold` and `no_of_crimes`.  
The NaN values in `houses_sold` are analysed below;  
`no_of_crimes` is not expected to be required in this EDA and is disregarded for now.


In [4]:
# Total houses sold per area, largest first:
sales_per_area = (
    housing_df.groupby('area')['houses_sold']
    .sum()
    .sort_values(ascending=False)
)

# Distinct area names (evaluated for inspection only) and distinct codes:
housing_df['area'].unique()
unique_codes = housing_df['code'].unique()

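The data set contains data for England at four levels: Country, English Region (NUTS 1), Inner and Outer London, and London Borough.
These levels overlap: the same sales appear at more than one level, so figures should not be added together across levels.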

In [5]:
# Hypothesis: codes that begin 'E090' identify London boroughs.
# Area name(s) recorded against each code:
codes = housing_df.groupby('code')['area'].unique()

# Area names of the rows whose code starts 'E090':
is_e090 = housing_df['code'].str.startswith('E090')
london_boroughs = housing_df[is_e090]
london_boroughs['area'].unique()

# Label every row with one of the four area types, keyed on the first
# three characters of its code:
gss_map = {
    'E09': 'London Borough',
    'E12': 'English Region',
    'E13': 'Inner/Outer London',
    'E92': 'Country',
}
housing_df['GSS_prefix'] = housing_df['code'].str.slice(0, 3)
housing_df['area_type'] = housing_df['GSS_prefix'].map(gss_map)
housing_df[housing_df['area_type'] == 'Country']

# Total houses sold per area type, largest first:
sales_by_area_type = (
    housing_df.groupby('area_type')['houses_sold']
    .sum()
    .sort_values(ascending=False)
)

The English Region total shows fewer houses sold than the Country total, when the two should be the same.

In [6]:
# Distinct area names among the rows typed as 'English Region':
is_english_region = housing_df['area_type'] == 'English Region'
regions = housing_df.loc[is_english_region, 'area'].unique()

Hackney appears incorrectly as an English Region, i.e. it is incorrectly listed against an 'E12' code.

In [7]:
# Check whether the "hackney" area appearing as both borough and region is a duplication:
hackney = housing_df[housing_df['area'].str.contains('hackney')]
hackney[hackney['area_type'] == 'English Region']
hackney[hackney['date'] == '1998-04-01']

# Locate the stray row: 'hackney' typed as an English Region on 1998-04-01.
duplicate = housing_df[
    (housing_df['area'] == 'hackney')
    & (housing_df['area_type'] == 'English Region')
    & (housing_df['date'] == '1998-04-01')
]

# Remove the duplicate entry by its looked-up index label rather than a
# hard-coded row number, so the right row is still removed if the row
# order of the source CSV ever changes.
clean_housing_df = housing_df.drop(duplicate.index)

# Check removal (area names recorded against each code after the drop):
check = clean_housing_df['area'].groupby(clean_housing_df['code']).unique()

In [8]:
# Count NaNs per column and isolate the rows with no 'houses_sold' value:
missing_values = clean_housing_df.isna().sum()
missing_sales_numbers = clean_housing_df[clean_housing_df['houses_sold'].isna()]

# Index by date (keeping 'date' as a column too) so date ranges can be
# sliced with .loc below:
clean_housing_df = clean_housing_df.set_index('date', drop=False)

# All rows dated 2019-12-01 to 2020-01-01:
dec19_jan20 = clean_housing_df.loc['2019-12-01':'2020-01-01']

# Enfield and Tower Hamlets rows dated 1995-09-01 to 1996-03-01:
sep95_to_mar96 = slice('1995-09-01', '1996-03-01')
is_enfield = clean_housing_df['area'] == 'enfield'
is_tower_hamlets = clean_housing_df['area'] == 'tower hamlets'
nineties_enfield = clean_housing_df[is_enfield].loc[sep95_to_mar96]
nineties_tower_hamlets = clean_housing_df[is_tower_hamlets].loc[sep95_to_mar96]

# Rows for code 'E09000012' dated 1998-01-01 to 1998-06-01:
se = clean_housing_df[clean_housing_df['code'] == 'E09000012'].loc['1998-01-01':'1998-06-01']

I can see three reasons for NaN values in 'houses_sold':
1. No data is available yet for Dec 2019 and Jan 2020;
2. Enfield and Tower Hamlets both have duplicate entries for 1996-02-01; 
3. Hackney has a duplicate entry for 1998-04-01.

I am now confident that the remaining rows in which `houses_sold` is NaN can be dropped.

In [9]:
# Discard every row that still has no 'houses_sold' value:
clean_housing_df = clean_housing_df.dropna(subset=['houses_sold'])

In [12]:
# Regions: per area, resample the date-indexed rows with the 'A' (annual)
# alias, taking the mean price and the summed sales, then write the result
# to csv for further analysis.
# NOTE(review): pandas 2.2+ deprecates the 'A' alias in favour of 'YE';
# left unchanged so the cell keeps working on older pandas versions.
regions = clean_housing_df[clean_housing_df['area_type'].eq('English Region')]
region_df = (
    regions[['area', 'average_price', 'houses_sold']]
    .groupby('area')
    .resample('A')
    .agg({'average_price': 'mean', 'houses_sold': 'sum'})
)
region_df.to_csv('../raw_data/regions_dataset.csv')

# London boroughs: write the monthly rows (area, price, sales) to csv for
# further analysis.
boroughs = clean_housing_df[clean_housing_df['area_type'].eq('London Borough')]
borough_columns = ['area', 'average_price', 'houses_sold']
boroughs[borough_columns].to_csv('../raw_data/borough_dataset.csv')