In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [91]:
# Load the raw tables. The Tesco grocery file is read with the default header
# row; every other ward-level file is read with header=2.
ward_tesco = pd.read_csv('data/tesco/year_osward_grocery.csv')
(ward_crime,
 ward_demographics,
 ward_education,
 ward_environment,
 ward_property) = [
    pd.read_csv(csv_path, header=2)
    for csv_path in (
        'data/crime_wards.csv',
        'data/demographics_ward.csv',
        'data/education_ward.csv',
        'data/environment_ward.csv',
        'data/property_wards.csv',
    )
]

In [92]:
# Discard incomplete data. Crime, education and property lose every ROW that
# contains a missing value; demographics instead loses every COLUMN that
# contains one (axis=1), so all of its rows are kept.
ward_crime, ward_education, ward_property = [
    frame.dropna() for frame in (ward_crime, ward_education, ward_property)
]
ward_demographics = ward_demographics.dropna(axis=1)

In [93]:
# Register the ward-level tables under short labels, then give each one the
# same key column name ("New Code" -> "area_id") so they can be merged later.
# The rename is done in place, so the ward_* variables see it as well.
master_data = {
    'crime': ward_crime,
    'demographics': ward_demographics,
    'education': ward_education,
    'environment': ward_environment,
    'property': ward_property,
}

for key, frame in master_data.items():
    frame.rename(columns={"New Code": "area_id"}, inplace=True)

In [94]:
# Keep only the most recent household figures from the demographics table:
# the Tesco data is from 2015, so the 2011 census columns are used.
column_names = [
    "area_id",
    "Names",
    "All Household spaces - 2011 Census",
    "Household composition - 2011 Census All Households",
    "Household composition - 2011 Census Couple household with dependent children",
    "Household composition - 2011 Census Couple household without dependent children",
    "Household composition - 2011 Census Lone parent household",
    "Household composition - 2011 Census One person household",
    "Household composition - 2011 Census Other multi person household",
    "Accomodation Type - 2011 Census Whole house or bungalow: Detached",
    "Accomodation Type - 2011 Census Whole house or bungalow: Semi-detached",
    "Accomodation Type - 2011 Census Whole house or bungalow: Terraced",
    "Accomodation Type - 2011 Census Flat, maisonette or apartment",
]
housing = pd.DataFrame(columns=column_names, data=ward_demographics)

# Extend the list with the Tesco population / area / density column names.
# housing was already built from the shorter list above, so it is unaffected.
column_names += ["population", "area_sq_km", "people_per_sq_km"]

In [95]:
# Census household-composition columns; index 0 is the total number of
# households, indices 1-5 are the individual household categories.
household_type = [
    "Household composition - 2011 Census All Households",
    "Household composition - 2011 Census Couple household with dependent children",
    "Household composition - 2011 Census Couple household without dependent children",
    "Household composition - 2011 Census Lone parent household",
    "Household composition - 2011 Census One person household",
    "Household composition - 2011 Census Other multi person household",
]

# Keep only wards present in both tables, then estimate the average household
# size as a weighted head count divided by the total number of households.
# Weights: 4 for couples with dependent children (two children assumed),
# 2 for couples without, 1 for lone-parent and one-person households,
# 3 for other multi-person households.
# NOTE(review): lone-parent households are counted as a single person, i.e.
# without any child -- confirm this is the intended assumption.
housing = housing.merge(ward_tesco, how='inner', on='area_id').assign(
    avg_people_per_household=lambda df: (
        df[household_type[1]] * 4
        + df[household_type[2]] * 2
        + df[household_type[3]]
        + df[household_type[4]]
        + 3 * df[household_type[5]]
    ) / df[household_type[0]]
)
# Estimate the average number of bedrooms per dwelling in each ward and attach
# it to housing.
#
# The bedroom counts come from ward_property, whose row index is unrelated to
# housing's: housing received a fresh 0..n-1 index from the merge with the
# Tesco table, while ward_property keeps the labels left over after dropna().
# Assigning the Series directly would therefore align on row position and pair
# wards with the wrong values, so the result is joined on the area_id key.
# NOTE(review): relies on ward_property carrying "area_id" (renamed from
# "New Code" in the earlier cell) -- confirm against the property file.
rooms = ["1 bedroom","2 bedrooms", "3 bedrooms", "4+ bedrooms", "All properties (2015)", "Annex/Other/Unknown"]
rooms_temp = pd.DataFrame(data=ward_property, columns=["area_id"] + rooms)

# Weighted bedroom total ("4+ bedrooms" counted as 4) divided by all properties
# minus the Annex/Other/Unknown category, rounded to a whole number.
rooms_temp = rooms_temp.assign(avg_rooms= round((1*rooms_temp["1 bedroom"]+2*rooms_temp["2 bedrooms"]+
                               3*rooms_temp["3 bedrooms"]+4*rooms_temp["4+ bedrooms"])
                                                /(rooms_temp[rooms[4]]-rooms_temp[rooms[5]])))

# One row per ward, so the left merge below cannot duplicate housing rows.
avg_rooms_by_ward = (
    rooms_temp[["area_id", "avg_rooms"]]
    .drop_duplicates(subset="area_id")
    .rename(columns={"avg_rooms": "avg_rooms_per_household"})
)

# Left merge keeps every housing row in its current order; wards that are
# missing from ward_property get NaN. Any previous copy of the column is
# dropped first so re-running this code does not create _x/_y duplicates.
housing = (
    housing
    .drop(columns="avg_rooms_per_household", errors="ignore")
    .merge(avg_rooms_by_ward, on="area_id", how="left")
)
housing = housing.assign(rooms_shared_per_household = housing["avg_rooms_per_household"]/housing["avg_people_per_household"])

# Rough household density per ward: household spaces per square kilometre,
# and that figure relative to the population density (households per person).
# Possible refinement: weight the dwelling types differently, since e.g.
# apartments are packed more densely.
housing = housing.assign(
    household_per_sq_km=lambda df: df["All Household spaces - 2011 Census"] / df["area_sq_km"],
    household_density=lambda df: df["household_per_sq_km"] / df["people_per_sq_km"],
)

# Take the 2014 median house price from the demographics table, keyed by
# area_id, and give it a snake_case column name.
house_cost = pd.DataFrame(
    data=master_data["demographics"],
    columns=["area_id", "Median House Price 2014"],
).rename(columns={"Median House Price 2014": "median_house_price"})

# Final layout of the housing table: identifiers first, then the features.
column_names = [
    "area_id",
    "Names",
    "median_house_price",
    "population",
    "area_sq_km",
    "people_per_sq_km",
    "avg_people_per_household",
    "avg_rooms_per_household",
    "household_per_sq_km",
    "household_density",
]

# Inner join on area_id, then keep only the columns listed above.
housing = pd.DataFrame(
    data=housing.merge(house_cost, how='inner', on='area_id'),
    columns=column_names,
)

In [96]:
# Display the assembled housing table.
housing

Unnamed: 0,area_id,Names,median_house_price,population,area_sq_km,people_per_sq_km,avg_people_per_household,avg_rooms_per_household,household_per_sq_km,household_density
0,E05000026,Abbey,173000,14370.0,1.26,11404.761905,2.260061,2.0,3772.222222,0.330759
1,E05000027,Alibon,215000,10845.0,1.36,7974.264706,2.096701,2.0,2974.264706,0.372983
2,E05000028,Becontree,210000,13856.0,1.29,10741.085271,2.104980,2.0,3393.798450,0.315964
3,E05000029,Chadwell Heath,240500,10850.0,3.38,3210.059172,2.055048,2.0,1201.775148,0.374378
4,E05000030,Eastbrook,240000,11348.0,3.45,3289.275362,2.194237,2.0,1152.753623,0.350458
...,...,...,...,...,...,...,...,...,...,...
478,E05000645,Tachbrook,715650,8996.0,0.36,24988.888889,1.709201,3.0,14444.444444,0.578035
479,E05000646,Vincent Square,840000,11276.0,0.60,18793.333333,1.791183,3.0,9523.333333,0.506740
480,E05000647,Warwick,857250,10086.0,0.58,17389.655172,1.772537,3.0,9434.482759,0.542534
481,E05000648,Westbourne,499975,13668.0,0.67,20400.000000,1.904780,2.0,8165.671642,0.400278


In [102]:
def _scale(centered, spread):
    """Divide `centered` by `spread`, mapping a zero spread to 0 instead of NaN."""
    if spread == 0:
        # Constant column: every centered value is 0, so 0/0 would give NaN.
        # Multiplying by 0.0 yields float zeros and keeps missing values missing.
        return centered * 0.0
    return centered / spread


def normalize(d, columns,method='standardize', remove_outliers = [True],
              outlier_quantiles=(0.025, 0.975)):
    """Return a copy of `d` with the given columns normalized.

    Parameters
    ----------
    d : pandas.DataFrame
        Input table; it is copied and never modified.
    columns : list of str, or str
        Columns to normalize. A single column name may be given as a string.
    method : str
        'standardize' (also accepted: 'standardise') computes
        (x - mean) / std with pandas' default standard deviation.
        're_scale' computes (x - min) / (max - min), i.e. rescales to [0, 1].
        Any other value prints a warning for each column and leaves the
        values un-normalized (outlier clipping, if requested, still applies).
    remove_outliers : list of bool, or bool
        Whether to clip each column to its outlier quantiles before
        normalizing. A single flag ([True], [False], or a bare bool) applies
        to every column; otherwise give one flag per entry of `columns`.
        A longer list only prints a warning; a shorter one also raises
        IndexError when the loop runs past its end.
    outlier_quantiles : (float, float)
        Lower and upper quantiles used for clipping. Both are taken from the
        unclipped column.

    Returns
    -------
    pandas.DataFrame
        The normalized copy. A column with zero spread (std or max - min
        equal to 0) becomes all zeros rather than NaN.
    """
    data  = d.copy()

    # A lone column name would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    if isinstance(remove_outliers, bool):
        remove_outliers = [remove_outliers]
    if len(remove_outliers) == 1 :
        # Broadcast the single flag; builds a new list, the default is not mutated.
        remove_outliers = list(remove_outliers)*len(columns)
    elif len(remove_outliers) != len(columns) :
        print('Warning, remove_outliers size should match the one of colums')

    lower_q, upper_q = outlier_quantiles
    for i,col in enumerate(columns) :
        values = data[col]
        if remove_outliers[i] :
            # Clip (rather than drop) extreme values so the row count is kept.
            values = values.clip(lower = values.quantile(lower_q),
                                 upper = values.quantile(upper_q))
        if method in ('standardize', 'standardise') :
            values = _scale(values - values.mean(), values.std())
        elif method == 're_scale' :
            low = values.min()
            values = _scale(values - low, values.max() - low)
        else :
            print('Warning, unknown method ', method)
        # Replace the whole column instead of writing through .loc[:, col]:
        # an integer column can then become float without an in-place
        # incompatible-dtype write.
        data[col] = values
    return data

In [106]:
# Numeric housing features to normalize.
c = [
    'median_house_price',
    'population',
    'area_sq_km',
    'people_per_sq_km',
    'avg_people_per_household',
    'avg_rooms_per_household',
    'household_per_sq_km',
    'household_density',
]
# Min-max rescale each feature to [0, 1] without clipping outliers.
housing_normalized = normalize(housing, c, method='re_scale', remove_outliers=[False])
housing_normalized

Unnamed: 0,area_id,Names,median_house_price,population,area_sq_km,people_per_sq_km,avg_people_per_household,avg_rooms_per_household,household_per_sq_km,household_density
0,E05000026,Abbey,0.000000,0.407621,0.031381,0.383150,0.663354,0.0,0.257314,0.196936
1,E05000027,Alibon,0.012624,0.249966,0.034868,0.266042,0.474495,0.0,0.201783,0.304884
2,E05000028,Becontree,0.011121,0.384633,0.032427,0.360494,0.484066,0.0,0.230979,0.159114
3,E05000029,Chadwell Heath,0.020289,0.250190,0.105300,0.103404,0.426340,0.0,0.078435,0.308450
4,E05000030,Eastbrook,0.020138,0.272463,0.107741,0.106109,0.587256,0.0,0.075023,0.247299
...,...,...,...,...,...,...,...,...,...,...
478,E05000645,Tachbrook,0.163105,0.167270,0.000000,0.846876,0.026509,1.0,1.000000,0.829107
479,E05000646,Vincent Square,0.200481,0.269243,0.008368,0.635376,0.121288,1.0,0.657537,0.646839
480,E05000647,Warwick,0.205666,0.216020,0.007671,0.587458,0.099732,1.0,0.651354,0.738348
481,E05000648,Westbourne,0.098279,0.376224,0.010809,0.690223,0.252616,0.0,0.563056,0.374665


In [100]:
# Standard deviation of the raw (un-normalized) median house price.
housing.median_house_price.std()

242726.29281528242

In [66]:

# 2.5% quantile of each normalized feature column.
# NOTE(review): this cell's execution count (66) is lower than that of the cell
# which builds housing_normalized (106), so the output shown may be stale --
# re-run after a kernel restart.
housing_normalized.loc[:,c ].quantile(0.025)


median_house_price          0.000000
population                  0.000103
area_sq_km                  0.000000
people_per_sq_km            0.000020
avg_people_per_household    0.000098
avg_rooms_per_household     0.000000
household_per_sq_km         0.000064
household_density           0.000100
Name: 0.025, dtype: float64

In [67]:
# Column-wise minimum of the normalized feature columns.
# NOTE(review): executed (count 67) before the cell that builds
# housing_normalized (106); re-run to make sure the output is current.
housing_normalized.loc[:, c].min()

median_house_price          0.0
population                  0.0
area_sq_km                  0.0
people_per_sq_km            0.0
avg_people_per_household    0.0
avg_rooms_per_household     0.0
household_per_sq_km         0.0
household_density           0.0
dtype: float64

1