# London Demographics

## Imports

In [1]:
import pandas as pd
import numpy as np
import csv
from pandas_profiling import ProfileReport

## Reading, Processing And Generating Reports

In [2]:
# Load the borough profile table (CSV) and the borough population table (Excel)
demographics_data = pd.read_csv('D:/Universidade-Fcul/2_semestre/TPD/projeto/novoscsvs/london-borough-profiles-2016 Data set.csv')
borough_population = pd.read_excel('D:/Universidade-Fcul/2_semestre/TPD/projeto/novoscsvs/london_borough_population.xlsx')

# Cells holding '.' become NaN, then every unwanted column is discarded in one call
demographics_data = (
    demographics_data
    .replace('.', np.nan)
    .drop(columns=[
        # Identifier columns that are not needed
        'Code',
        'New code',
        # Unnecessary columns that do not bring much information
        'Life satisfaction score 2011-14 (out of 10)',
        'Worthwhileness score 2011-14 (out of 10)',
        'Happiness score 2011-14 (out of 10)',
        'Number of cars per household, (2011 Census)',
        'Teenage conception rate (2014)',
        'People aged 17+ with diabetes (%)',
    ])
)

demographics_data.head()

Unnamed: 0,Area name,Inner/ Outer London,GLA Population Estimate 2016,GLA Household Estimate 2016,Inland Area (Hectares),Population density (per hectare) 2016,"Average Age, 2016","Proportion of population aged 0-15, 2016","Proportion of population of working-age, 2016","Proportion of population aged 65 and over, 2016",...,"Male life expectancy, (2012-14)","Female life expectancy, (2012-14)",Anxiety score 2011-14 (out of 10),Childhood Obesity Prevalance (%) 2014/15,Mortality rate from causes considered preventable 2012/14,Political control in council,Proportion of seats won by Conservatives in 2014 election,Proportion of seats won by Labour in 2014 election,Proportion of seats won by Lib Dems in 2014 election,Turnout at 2014 local elections
0,,,,,,,,,,,...,,,,,,,,,,
1,City of London,Inner London,8548.0,5179.0,290.4,28.9,42.9,27.2,90.6,9.4,...,,,5.57,,128.8,,,,,
2,Barking and Dagenham,Outer London,205773.0,76841.0,3610.8,57.3,32.9,21.0,86.1,13.9,...,77.6,82.1,3.05,25.3,227.6,Lab,0.0,100.0,0.0,36.5
3,Barnet,Outer London,385108.0,149147.0,8674.8,44.5,37.2,21.0,83.3,16.7,...,82.1,85.1,2.75,18.4,133.8,Cons,50.8,47.6,1.6,40.5
4,Bexley,Outer London,243303.0,97233.0,6058.1,39.9,38.9,20.8,89.0,11.0,...,80.4,84.4,3.29,21.4,164.3,Cons,71.4,23.8,0.0,39.6


In [3]:
# Sanity check: print the (rows, columns) shape of the table after the column clean-up
print(demographics_data.shape)

(40, 77)


In [4]:
# Rows that are not necessary: NaN rows and generic places that are not boroughs.
# The same numbers are used as positions (iloc) and as index labels (drop).
non_borough_rows = [0, 1, *range(34, 40)]

# Set those rows aside for comparison, then remove them from the main table
comparison_rows = demographics_data.iloc[non_borough_rows]
demographics_data = demographics_data.drop(index=non_borough_rows)

# Continue on an independent copy of the borough-only table
demographics = demographics_data.copy()
demographics.head()

Unnamed: 0,Area name,Inner/ Outer London,GLA Population Estimate 2016,GLA Household Estimate 2016,Inland Area (Hectares),Population density (per hectare) 2016,"Average Age, 2016","Proportion of population aged 0-15, 2016","Proportion of population of working-age, 2016","Proportion of population aged 65 and over, 2016",...,"Male life expectancy, (2012-14)","Female life expectancy, (2012-14)",Anxiety score 2011-14 (out of 10),Childhood Obesity Prevalance (%) 2014/15,Mortality rate from causes considered preventable 2012/14,Political control in council,Proportion of seats won by Conservatives in 2014 election,Proportion of seats won by Labour in 2014 election,Proportion of seats won by Lib Dems in 2014 election,Turnout at 2014 local elections
2,Barking and Dagenham,Outer London,205773,76841,3610.8,57.3,32.9,21.0,86.1,13.9,...,77.6,82.1,3.05,25.3,227.6,Lab,0.0,100.0,0.0,36.5
3,Barnet,Outer London,385108,149147,8674.8,44.5,37.2,21.0,83.3,16.7,...,82.1,85.1,2.75,18.4,133.8,Cons,50.8,47.6,1.6,40.5
4,Bexley,Outer London,243303,97233,6058.1,39.9,38.9,20.8,89.0,11.0,...,80.4,84.4,3.29,21.4,164.3,Cons,71.4,23.8,0.0,39.6
5,Brent,Outer London,328568,119166,4323.3,76.1,35.5,20.1,82.5,17.5,...,80.1,85.1,2.92,23.9,169.4,Lab,9.5,88.9,1.6,36.3
6,Bromley,Outer London,326560,139654,15013.5,21.7,40.1,15.8,88.4,11.6,...,81.4,84.9,3.26,16.5,148.5,Cons,85.0,11.7,0.0,40.8


In [5]:
# Removes pound signs and commas and makes column float
count = 0
for col in demographics.columns:
    try:
        demographics[col] = demographics[col].apply(lambda x: str(x).replace('£', '').replace(',', ''))
        demographics[col] = demographics[col].astype(float)
    except:
        count +=1
print('There are {0} string columns'.format(count))

There are 9 string columns


In [6]:
# Replaces NaN values with the mean
count = 0
for col in demographics.columns:
    try:
        demographics[col] = demographics[col].replace(np.nan, demographics[col].mean())
    except:
        count += 1
print('{0} columns remained intanct'.format(count))

9 columns remained intanct


In [7]:
# Report of Borough Population
# Builds a pandas_profiling ProfileReport for `borough_population` and writes it
# to "borough_population_profile.html" (a relative path).
# NOTE(review): `explorative` is passed as the int 1; presumably treated as True — confirm
borough_population_profile = ProfileReport(borough_population, 
                                           title="Borough Population Profiling Report", 
                                           explorative=1)

borough_population_profile.to_file("borough_population_profile.html")

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [8]:
# Report of Borough Demographics
# Builds a pandas_profiling ProfileReport for the cleaned `demographics` table and
# writes it to "borough_demographics_profile.html" (a relative path).
# NOTE(review): `explorative` and `minimal` are both passed as the int 1; presumably
# treated as True — confirm how the library combines the two modes
borough_demographics_profile = ProfileReport(demographics, 
                                             title='Borough Demographics Profiling Report',
                                             explorative=1, 
                                             minimal=1)
borough_demographics_profile.to_file("borough_demographics_profile.html")

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=86.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




## Reports Analysis

### GLA Population Estimate 2016

Has a fairly standard spread, with values distributed quite uniformly. \
Most values lie between 200k and 350k, with a few outliers:
- Croydon, with value 383,408
- Barnet, with value 385,108
- Kensington and Chelsea, with value 158,447
- Kingston upon Thames, with value 173,853

### GLA Household Estimate 2016

Quite a difference between min and max values:

- Kingston upon Thames, with value 69,132
- Croydon, with value 156,979

### Inland Area (Hectares)

Most values centered on Q1(2834.9), with some extreme values:

- Bromley, with value 15,013.5
- Havering, with value 11,235
- Hillingdon, with value 11,570.1

### Population density (per hectare) 2016

Standard deviation with some outliers:

- Bromley, with value 21.7
- Havering, with value 22.3
- Tower Hamlets with value 149.1
- Islington, with value 153

### Average Age, 2016

Fairly standard, with a mean of 36.04 and a median of 35.9.

### Proportion of population aged 0-15, 2016

All but one value around the mean of 19.456:
- Camden, with value 8.8

### Proportion of population of working-age, 2016

Has a fairly even distribution of values but, looking at the column above, we see that 15 must already count as working age.

for instance Barking and Dagenham has aged 0-15 -> 21.0 and working-age -> 86.1

### Proportion of population aged 65 and over, 2016

Analysing this column, we see that it and the working-age column above add up to 100, so working age is counted as 65 and under, which doesn't seem the right way to count it.

i.e. Barking and Dagenham working-age -> 86.1 , 65+ age ->13.9

### Net internal migration (2014)

With a mean of -2149.125, we see that there are more people leaving London than entering.

### Net international migration (2014)

We have a mean of 3348.5, meaning lots of people are coming to London from other countries; since the net balance of both migrations is positive, we can infer that the population of London is increasing.

### Net natural change (2014)

has a mean of 2572.59 and one outlier:
- Newham, with value 5092

### % of resident population born abroad (2014)

Quite a standard spread:
- Enfield, with value 32.8
- Greenwich, with value 32.9


### Largest migrant population by country of birth (2011)

We can see that India has the most instances, with 10 (nearly a third).

### % of largest migrant population (2011)

Most values are below 5, with some instances around 10 and one extreme occurrence:
- Tower Hamlets, with value 15.3

### Second largest migrant population by country of birth (2011)
Like the largest migrant population, most instances are from India, but here India has only 6 instances (about 1/5) and there is more variation, since the number of distinct values went from 11 to 15.

### % of second largest migrant population (2011)

Similar distribution to the first one, since most values are below 5, but this time only 3 values are above it:
- Ealing, with value 6.4
- Newham, with value 6.8
- Redbridge, with value 5.3

### Third largest migrant population by country of birth (2011)

Biggest number of distinct values of the three, with 17. \
The greatest difference is that Ireland has the greatest number of instances: 8

### % of third largest migrant population (2011)
The distribution is closer to that of the second migrant population than to the first; most values are below 3, with only 2 outliers:
- Harrow, with value 4.3
- Newham, with value 5.3

### % of population from BAME groups (2016)

Presents a fairly standard spread with a mean of 41.153 and extremes at:
- Newham, with value 73.1
- Havering, with value 15.7
- Richmond upon Thames, with value 15.7

### % people aged 3+ whose main language is not English (2011 Census)

Notable values: with a mean of 21.78, roughly 1/5 of London's population does not have English as a main language, with an extreme value of:
- Newham, with a value of 41.4

### Overseas nationals entering the UK (NINo), (2014/15)
Mostly centered around 10k, with some outliers:
- Brent, with value 25,130
- Newham, with value 26,478

### New migrant (NINo) rates, (2014/15)

Quite a standard distribution, with extremes at:
-  Bexley, with value 14
- Bromley, with value 14
- Newham, with value 116

### Largest migrant population arrived during 2014/15

Quite interesting since there's only 5 distinct values:
- Romania, with number of instances 18
- Italy, with number of instances 8
- Poland, with number of instances 3
- Spain, with number of instances 2
- Bulgaria, with number of instances 1

### Second largest migrant population arrived during 2014/15

With 9 distinct values and Bulgaria jumping from last to first place:
- Italy, with number of instances 6
- Bulgaria, with number of instances 6

### Third largest migrant population arrived during 2014/15
Still 9 distinct values, but now Poland takes 1st place with most instances: 7

### Employment rate (%) (2015)

Not bad values having min and max:
- Westminster, with value 65.6
- Richmond upon Thames, with value 79.6

### Male employment rate (2015)
Has a mean of 79.42, with an outlier at:
- Westminster, with value 68.6

### Female employment rate (2015)
More standard distribution compared to males but, as usual, a lower mean: 66.53

### Unemployment rate (2015)
Mostly below 10, which is good, but there is an outlier:
- Barking and Dagenham, with value 11

### Youth Unemployment (claimant) rate 18-24 (Dec-14)

Fairly standard; has a mean of 5.13, with min and max:
- Kingston upon Thames, with value 2.8
- Waltham Forest, with value 8

### Proportion of 16-18 year olds who are NEET (%) (2014)
Very good values, mostly below 5, with some outliers:
- Barking and Dagenham, with value 5.7
- Islington, with value 5.2
- Greenwich, with value 5

### Proportion of the working-age population who claim out-of-work benefits (%) (Aug-2015)

Fairly standard spread, with extremes:
- Barking and Dagenham, with value 11.7
- Hackney, with value 11.7
- Richmond upon Thames, with value 4.7

### % working-age with a disability (2015)
Centered around its mean (15.96), with an outlier:
- Camden, with value 21.3

### Proportion of working age people with no qualifications (%) 2015
some extremes at
- Waltham Forest, with value 11.6
- Richmond upon Thames, with value 3.2

### Proportion of working age with degree or equivalent and above (%) 2015
quite a low mean (50) and a really low outlier:
- Havering, with value 26

### Gross Annual Pay, (2015)
Mainly around its mean of 33,622, with a max:
- Westminster, with value 42,798

### Gross Annual Pay - Male (2015)
Very similar distribution as above, but higher mean (36,339) and max:
- Westminster, with value 45,872

### Gross Annual Pay - Female (2015)
Similar distribution but lower values with mean of 30,520 and max:
- Westminster, with value 37,918

### Modelled Household median income estimates 2012/13
Centered around its mean of 52,173, with an outlier:
- Kensington and Chelsea, with value 116,350

### % adults that volunteered in past 12 months (2010/11 to 2012/13)

Has a fairly standard spread, with values distributed quite uniformly.
However, there are about 3 values that stand out from the others:
- Newham, with value 8%
- Richmond upon Thames, with value 49%
- Kingston upon Thames, with value 43%

### Number of jobs by workplace (2014)

Have a peculiar distribution of values, clearly identifying the outliers of the distribution:
- Westminster, with value 730700
- Camden, with value 377400

### % of employment that is in public sector (2014)

Has a fairly even distribution, with values close together within a certain range, but there is one value that exceeds the average by almost 10:
- Greenwich, with value 27.3%

### Jobs Density, 2014

Has a distribution similar to _Number of jobs by workplace (2014)_, with the same two boroughs as the major outliers.
For example, a jobs density of 1.0 would mean that there is one job for every resident aged 16-64.
- Westminster, with value 4.33
- Camden, with value 2.25

### Number of active businesses, 2014

Similar to features like _Jobs Density, 2014_, the distribution has one clear value that exceeds all the rest, although other values are also somewhat apart from the mean.
- Westminster, with value 53160

### Two-year business survival rates (started in 2012)

In this case the data has to be read differently, since there is one value that is apart from all the rest but on the lower side, indicating that businesses there have a lower chance of survival:
- Westminster, with value 64

### Crime rates per thousand population 2014/15

Looking at the distribution, once again it's possible to see that the values are close together with little spread, but there is one clear outlier that shows a big difference:
- Westminster, with value 212.4

### Fires per thousand population (2014)

The distribution of values in this case appears to have two clear outliers that, despite being outliers, don't differ too much from the others:
- Westminster, with value 4
- Tower Hamlets, with value 5

### Ambulance incidents per hundred population (2014)

Similar to previous distributions the values are concentrated in a particular value, being clear the one outlier:
- Westminster, with value 19.9

### Median House Price, 2014

In terms of the distribution of values we now see that we are talking about thousands of pounds, and clearly identify two outliers:
- Kensington and Chelsea, with value of 1195000 pounds (1 million)
- Westminster, with value 875000 pounds

### Average Band D Council Tax charge (£), 2015/16

The distribution of values in this case is quite interesting because there are two outliers that are clearly distinguishable:
- Westminster, with value 674
- Wandsworth, with value 683

### New Homes (net) 2014/15 (provisional)

The distribution of these values is quite uniform, with one outlier above the rest and one outlier below the rest:
- Newham, with value 2050
- Haringey, with value 130

### Homes Owned outright, (2014) %

The distribution of values is pretty much standard, with no outliers or values too far from the mean, but with a wide range. Examples of the min and max:
- Tower Hamlets, with value 7
- Bexley, with value 38.1

### Being bought with mortgage or loan, (2014) %

Once again the values are well distributed not having clear outliers, but again the range is noticeable:
- Westminster, with value 11.6
- Sutton, with value 41.8

### Rented from Local Authority or Housing Association, (2014) %

This distribution is somewhat interesting in terms of data separation. It seems to have 2 clusters separating sets of values. TODO: Might be analysed better later
- Richmond upon Thames, with value 8.7
- Hackney, with value 45.4

### Rented from Private landlord, (2014) %

In terms of values it seems to have a uniform distribution, but some values above and below the mean seem a little too far from the others:
- Westminster, with value 43.3
- Bexley, with value 11.4
- Havering, with value 13.8

### % of area that is Greenspace, 2005

It's somehow a standard distribution but there is a clear value that is superior to the rest of the distribution.
- Havering, with value 59.3

### Total carbon emissions (2013)

Similar to previous distributions there is a clear distribution of values where a couple of values stand out from the rest. Values in tons
- Westminster, with value 3048.4
- Hillingdon, with value 1998.4
- Tower Hamlets, with value 1948.3

### Household Waste Recycling Rate, 2014/15

In terms of value distribution it's pretty much uniform, having only a max value that might be a little away from the average but besides that everything standard.
- Lewisham, with value 17.1
- Bexley, with value 54

### Number of cars, (2011 Census)

The distribution of values is pretty standard, but there are once again values that are a bit higher than the average, and some a bit lower too. The range of values is incredibly wide.
- Islington, with value 38629
- Bromley, with value 153908

### % of adults who cycle at least once per month, 2013/14

The values are pretty standard with a reasonable mean value, but there is one place where the share of people who cycle at least once a month is way bigger than the rest:
- Richmond upon Thames, with value 31.7

### Average Public Transport Accessibility score, 2014

the distribution is quite normal but there are a couple of values that are "away" from the most common values.
- Westminster, with value 6.5
- Kensington and Chelsea, with value 5.8
- Camden, with value 5.7
- Islington, with value 5.7

### Achievement of 5 or more A*- C grades at GCSE or equivalent including English and Maths, 2013/14

A bit odd distribution similar to previous cases where it seems to have two separate groups of data. In each of those groups the values that stand out are:
- Richmond upon Thames, with value 70.5
- Newham, with value 55.7

### Rates of Children Looked After (2015)

Once again this distribution almost suggest two great groups. Values that stand out are:
- Islington, with value 90
- Richmond upon Thames, with value 22

### % of pupils whose first language is not English (2015)

In this distribution we can see that in many of the boroughs there is a wide variety of pupils whose first language is not English. However, there are still boroughs that have a very low percentage compared to the rest.
- Bromley, with value 10.8
- Havering, with value 12.5
- Tower Hamlets, with value 73.8

### % children living in out-of-work households (2014)

In this case the distribution of values is pretty standard, with values near the mean, but there are values that stand out because of how far they are from the mean:
- Richmond upon Thames, with value 6.1
- Islington, with value 26.4

### Male life expectancy, (2012-14)

In terms of values, the distribution is pretty standard, leaning slightly towards higher values. One of the low values is:
- Barking and Dagenham, with value 77.6

### Female life expectancy, (2012-14)

The values on this distribution seem to have a 3 cluster group apart from each other even though their values are really close. Two values that are more apart from the mean are
- Barking and Dagenham, with value 82.1
- Camden, with value 86.7

### Anxiety score 2011-14 (out of 10)

The distribution has a little more tendency for higher values even though the difference is really short. However, one value to stand out is:
- Enfield, with value 2.6

### Childhood Obesity Prevalance (%) 2014/15

In terms of data there is a clear distinction of values that are apart from the mean, and the most evident one is:
- Richmond upon Thames, with value 11.2

### Mortality rate from causes considered preventable 2012/14

This distribution is somehow standard more to the side of less values, but there are two incredible values that stand out and represent places where a lot of mortality could be avoided
- Tower Hamlets, with value 238.7
- Barking and Dagenham, with value 227.6

### Political control in council

A non standard distribution where it's clear the most common values. The ones apart from the "regular" are
- Tower Hamlets, with 'Tower Hamlets First'
- Sutton, with 'Lib Dem'
- Havering, with 'No Overall Control'

### Proportion of seats won by Conservatives in 2014 election

An odd distribution where the most common value is 0, but some high values too
- Barking and Dagenham, Haringey, Islington, Lewisham, Newham, with value 0
- Bromley, with value 85

### Proportion of seats won by Labour in 2014 election

A somewhat similar distribution to the one above, with a smaller percentage of 0's, but some high values too
- Richmond upon Thames, Sutton, with value 0
- Barking and Dagenham, Newham, with value 100

### Proportion of seats won by Lib Dems in 2014 election

Unlike the distributions above, this one conveys its information very clearly. Most of the boroughs have 0 seats won by the Lib Dems and only a small percentage have any. Of those, the majority have a low proportion and only one has a very high value. All the boroughs that are not mentioned below have either 0 or a value below 6%
- Sutton, with value 83.3
- Kingston upon Thames, with value 37.5
- Richmond upon Thames, with value 27.8
- Southwark, with value 20.6
- Haringey, with value 15.8

### Turnout at 2014 local elections

The distribution is quite standard but with clear values that are apart from all the others
- Tower Hamlets, with value 47.2
- Richmond upon Thames, with value 46.1
- Kensington and Chelsea, with value 29.8
- Westminster, with value 32.3
- Lambeth, with value 34.5

## Merging Into Final Demographics Files

In [9]:
# Print the (rows, columns) shape and display the first rows of the cleaned table
print(demographics.shape)
demographics.head()

(32, 77)


Unnamed: 0,Area name,Inner/ Outer London,GLA Population Estimate 2016,GLA Household Estimate 2016,Inland Area (Hectares),Population density (per hectare) 2016,"Average Age, 2016","Proportion of population aged 0-15, 2016","Proportion of population of working-age, 2016","Proportion of population aged 65 and over, 2016",...,"Male life expectancy, (2012-14)","Female life expectancy, (2012-14)",Anxiety score 2011-14 (out of 10),Childhood Obesity Prevalance (%) 2014/15,Mortality rate from causes considered preventable 2012/14,Political control in council,Proportion of seats won by Conservatives in 2014 election,Proportion of seats won by Labour in 2014 election,Proportion of seats won by Lib Dems in 2014 election,Turnout at 2014 local elections
2,Barking and Dagenham,Outer London,205773.0,76841.0,3610.8,57.3,32.9,21.0,86.1,13.9,...,77.6,82.1,3.05,25.3,227.6,Lab,0.0,100.0,0.0,36.5
3,Barnet,Outer London,385108.0,149147.0,8674.8,44.5,37.2,21.0,83.3,16.7,...,82.1,85.1,2.75,18.4,133.8,Cons,50.8,47.6,1.6,40.5
4,Bexley,Outer London,243303.0,97233.0,6058.1,39.9,38.9,20.8,89.0,11.0,...,80.4,84.4,3.29,21.4,164.3,Cons,71.4,23.8,0.0,39.6
5,Brent,Outer London,328568.0,119166.0,4323.3,76.1,35.5,20.1,82.5,17.5,...,80.1,85.1,2.92,23.9,169.4,Lab,9.5,88.9,1.6,36.3
6,Bromley,Outer London,326560.0,139654.0,15013.5,21.7,40.1,15.8,88.4,11.6,...,81.4,84.9,3.26,16.5,148.5,Cons,85.0,11.7,0.0,40.8


In [10]:
# FIXME(review): `crimes_df` is not defined in any visible cell of this notebook,
# and the recorded output of this cell is a NameError. It looks like a leftover from
# another notebook — load/define `crimes_df` first or remove this cell.
# Intent: renumber the rows from 0 and store that row number as `crime_id`.
crimes_df = crimes_df.reset_index(drop=True)
crimes_df['crime_id'] = crimes_df.index 

NameError: name 'crimes_df' is not defined

In [None]:
# Renumber the remaining rows from 0 and store that row number as `borough_id`
demographics = (
    demographics
    .reset_index(drop=True)
    .assign(borough_id=lambda frame: frame.index)
)

In [None]:
# Print the (rows, columns) shape and display the first rows of the current table
print(demographics.shape)
demographics.head()

In [None]:
# Write the final table as demographics.csv inside demographics.zip,
# leaving the row index out of the file
compression_opts = {'method': 'zip', 'archive_name': 'demographics.csv'}

demographics.to_csv('demographics.zip', compression=compression_opts, index=False)


In [None]:
# Draft: for each row of `crimes`, look up and store the matching crime, date and
# borough keys. `r` is an (index label, row) pair, so r[0] is the label and r[1][k]
# is the k-th value of the row.
# FIXME(review): `crimes`, `crimes_df` and `date_df` are not defined in any visible
# cell of this notebook.
for r in crimes.iterrows():
    print(r[1][1])
    # Row label of the first `crimes_df` entry with the same major/minor category
    crimes.at[r[0],'crime_id'] = crimes_df.loc[(crimes_df['major_category'] == r[1][1]) & (crimes_df['minor_category'] == r[1][2])].index[0]
    # Row label of the first `date_df` entry with the same year and month
    crimes.at[r[0],'date_id'] = date_df.loc[(date_df['year'] == r[1][4]) & (date_df['month'] == r[1][5])].index[0]
    # FIXME(review): the displayed column is 'Area name' (lower-case n), not 'Area Name';
    # r[1][1] is also the value compared against 'major_category' above; and, unlike the
    # two lines above, the whole row selection is assigned instead of a single id.
    crimes.at[r[0],'borough_id'] = demographics.loc[(demographics['Area Name'] == r[1][1])]
    # NOTE(review): stops after the first row — looks like a debugging leftover
    break

crimes.head()