# Introduction to `pandas`

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Source workbooks. Adjacent string literals are joined by the parser, so each
# constant is still a single URL string.
AHP_URL = (
    'https://files.datapress.com/london/dataset/'
    'average-house-prices-borough/2017-09-01T16:00:50.63/'
    'land-registry-house-prices-borough.xls'
)
INCOMES_URL = (
    'https://files.datapress.com/london/dataset/'
    'average-income-tax-payers-borough/2017-05-31T14:05:04.25/'
    'income-of-tax-payers.xls'
)

## Exercise 1

Read in the 'Median' sheet from the *Average House Prices by Borough* dataset at `AHP_URL` into a DataFrame called `ahp`.

In [3]:
# Load only the 'Median' worksheet of the house-price workbook into `ahp`.
ahp = pd.read_excel(io=AHP_URL, sheet_name='Median')

## Exercise 2

Print the `head` and `tail` of `ahp`.

In [4]:
# First five rows (five is the default for `head`).
ahp.head(n=5)

Unnamed: 0,Code,Area,1995,1996,1997,1998,1999,2000,2001,2002,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,,,,,,,,,,,...,,,,,,,,,,
1,E09000001,City of London,105000.0,138000.0,132000.0,185000.0,187000.0,203000.0,240000.0,265000.0,...,417500.0,365000.0,365000.0,410000.0,455000.0,445000.0,625000.0,730000.0,768900.0,631750.0
2,E09000002,Barking and Dagenham,49000.0,49995.0,53500.0,59995.0,65000.0,78000.0,89500.0,115250.0,...,185000.0,190000.0,160000.0,170000.0,170000.0,174000.0,180000.0,215000.0,244000.0,287500.0
3,E09000003,Barnet,85500.0,87000.0,100000.0,114000.0,135000.0,159000.0,174975.0,207000.0,...,295000.0,280000.0,290000.0,325000.0,325000.0,330000.0,345000.0,392500.0,444000.0,460000.0
4,E09000004,Bexley,62000.0,65000.0,71000.0,79000.0,86000.0,98500.0,116972.5,139500.0,...,212000.0,210000.0,196000.0,215000.0,215000.0,210000.0,225000.0,250000.0,275000.0,320000.0


In [5]:
# Last five rows (five is the default for `tail`).
ahp.tail(n=5)

Unnamed: 0,Code,Area,1995,1996,1997,1998,1999,2000,2001,2002,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
46,E12000009,South West,55000.0,57250.0,60000.0,66500.0,74000.0,85000.0,96250.0,123000.0,...,190000.0,185000.0,175000.0,190000.0,185000.0,187500.0,190000.0,200000.0,215000.0,227000.0
47,,,,,,,,,,,...,,,,,,,,,,
48,E92000001,England,55000.0,57500.0,60000.0,66000.0,74000.0,82000.0,92000.0,114000.0,...,178000.0,173500.0,170000.0,185000.0,180000.0,183000.0,187000.0,195000.0,210000.0,220000.0
49,W92000004,Wales,44000.0,45000.0,47950.0,49500.0,53000.0,56500.0,60000.0,71000.0,...,140000.0,135000.0,132000.0,134950.0,130000.0,133000.0,133500.0,136000.0,141000.0,145000.0
50,K04000001,England And Wales,54950.0,57000.0,60000.0,65000.0,72950.0,80000.0,90000.0,214950.0,...,175000.0,170000.0,168500.0,182000.0,176000.0,180000.0,184995.0,190000.0,205000.0,215000.0


## Exercise 3

Filter the DataFrame so that only boroughs are included.

**Hint**: check the structure of `ahp.Code`.

In [6]:
# Keep only rows whose code begins with 'E09'. `na=False` makes rows with a
# missing code count as non-matches instead of putting NaN into the mask.
is_borough = ahp['Code'].str.startswith('E09', na=False)
ahp = ahp[is_borough]

## Exercise 4

Set `Code` as row names (index), dropping it from the DataFrame.

In [7]:
# Move `Code` out of the columns and into the row index. The result is rebound
# to `ahp` rather than using `inplace=True`, so no existing frame object is
# mutated and the call can be chained if needed.
ahp = ahp.set_index('Code', drop=True)

## Exercise 5

Examine data types and shape.

In [8]:
# Per-column data types of `ahp`.
ahp.dtypes

Area     object
1995    float64
1996    float64
1997    float64
1998    float64
1999    float64
2000    float64
2001    float64
2002    float64
2003    float64
2004    float64
2005    float64
2006    float64
2007    float64
2008    float64
2009    float64
2010    float64
2011    float64
2012    float64
2013    float64
2014    float64
2015    float64
2016    float64
dtype: object

In [9]:
# Dimensions of `ahp` as a (rows, columns) tuple.
ahp.shape

(33, 23)

## Exercise 6

Calculate mean house prices by year.

In [10]:
# Column-wise mean, i.e. one value per year column. `numeric_only=True` skips
# the text `Area` column explicitly: pandas >= 2.0 raises a TypeError when
# asked to average an object column, whereas older versions dropped it
# automatically (later with a FutureWarning).
ahp.mean(numeric_only=True)

1995     78588.409091
1996     84625.606061
1997     94710.878788
1998    108960.303030
1999    127674.696970
2000    151809.545455
2001    170867.045455
2002    196926.363636
2003    215975.227273
2004    232032.742424
2005    243993.030303
2006    260976.863636
2007    294757.424242
2008    295305.227273
2009    285096.181818
2010    314285.575758
2011    321724.090909
2012    333931.818182
2013    368291.651515
2014    425016.303030
2015    462640.151515
2016    493474.545455
dtype: float64

## *Interlude*: melting

We will now convert (`melt`) the dataset from 'wide' to 'long' format.

In [11]:
# Reshape wide -> long: `Area` is kept as the identifier, the remaining column
# labels go into `Year` and their values into `Price`.
ahp = ahp.melt(id_vars='Area', var_name='Year', value_name='Price')
ahp.head()

Unnamed: 0,Area,Year,Price
0,City of London,1995,105000.0
1,Barking and Dagenham,1995,49000.0
2,Barnet,1995,85500.0
3,Bexley,1995,62000.0
4,Brent,1995,68000.0


## Exercise 7

How would you convert the dataset back to 'wide' format?

In [12]:
# Rebuild the wide layout: one row per Area, one column per Year, with Price
# as the cell value (summed if an Area/Year pair occurs more than once).
pd.pivot_table(ahp, values='Price', index='Area', columns='Year', aggfunc='sum')

Year,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Barking and Dagenham,49000.0,49995.0,53500.0,59995.0,65000.0,78000.0,89500.0,115250.0,140000.0,159995.0,...,185000.0,190000.0,160000.0,170000.0,170000.0,174000.0,180000.0,215000.0,244000.0,287500.0
Barnet,85500.0,87000.0,100000.0,114000.0,135000.0,159000.0,174975.0,207000.0,233000.0,249000.0,...,295000.0,280000.0,290000.0,325000.0,325000.0,330000.0,345000.0,392500.0,444000.0,460000.0
Bexley,62000.0,65000.0,71000.0,79000.0,86000.0,98500.0,116972.5,139500.0,163000.0,175000.0,...,212000.0,210000.0,196000.0,215000.0,215000.0,210000.0,225000.0,250000.0,275000.0,320000.0
Brent,68000.0,73000.0,83500.0,96000.0,112000.0,139000.0,158000.0,187000.0,210000.0,223500.0,...,280000.0,275000.0,275000.0,283000.0,300000.0,321000.0,345000.0,380000.0,405000.0,437500.0
Bromley,76850.0,80000.0,86000.0,99995.0,116500.0,135000.0,152725.0,176820.0,205000.0,225000.0,...,260000.0,250000.0,249000.0,275000.0,275000.0,277500.0,295000.0,332747.5,372250.0,415000.0
Camden,114000.0,127500.0,145000.0,160000.0,190000.0,235000.0,249995.0,279950.0,273000.0,305000.0,...,410000.0,439975.0,417500.0,478700.0,485000.0,500000.0,590000.0,670000.0,705000.0,720000.0
City of London,105000.0,138000.0,132000.0,185000.0,187000.0,203000.0,240000.0,265000.0,310000.0,248863.0,...,417500.0,365000.0,365000.0,410000.0,455000.0,445000.0,625000.0,730000.0,768900.0,631750.0
Croydon,60000.0,64500.0,71025.0,79000.0,89000.0,110000.0,126000.0,150300.0,172000.0,185000.0,...,222500.0,226500.0,200000.0,220000.0,220000.0,230000.0,237750.0,260000.0,300000.0,331000.0
Ealing,75000.0,80000.0,90000.0,103000.0,125000.0,145000.0,163000.0,190000.0,210000.0,231000.0,...,270000.0,270000.0,257750.0,279275.0,293000.0,310000.0,325000.0,385000.0,428150.0,450000.0
Enfield,66000.0,70000.0,78000.0,85000.0,94000.0,114000.0,130000.0,153000.0,180000.0,193000.0,...,233000.0,234600.0,225000.0,241000.0,240000.0,248000.0,250000.0,280000.0,320000.0,371500.0


## Exercise 8

Convert `Year` to integer.

In [13]:
# Cast the melted year labels to integers so they can be compared numerically.
ahp['Year'] = ahp['Year'].astype('int')

## Exercise 9

Calculate mean house prices by year.

In [14]:
# Average price across all areas for each year.
ahp.groupby('Year')['Price'].mean()

Year
1995     78588.409091
1996     84625.606061
1997     94710.878788
1998    108960.303030
1999    127674.696970
2000    151809.545455
2001    170867.045455
2002    196926.363636
2003    215975.227273
2004    232032.742424
2005    243993.030303
2006    260976.863636
2007    294757.424242
2008    295305.227273
2009    285096.181818
2010    314285.575758
2011    321724.090909
2012    333931.818182
2013    368291.651515
2014    425016.303030
2015    462640.151515
2016    493474.545455
Name: Price, dtype: float64

## Exercise 10

Calculate mean house prices by borough using only data from 2010 onwards.

In [15]:
# Restrict to rows from 2010 onwards, then average the price per area.
since_2010 = ahp[ahp['Year'] >= 2010]
since_2010.groupby('Area')['Price'].mean()

Area
Barking and Dagenham      205785.714286
Barnet                    374500.000000
Bexley                    244285.714286
Brent                     353071.428571
Bromley                   320356.785714
Camden                    592671.428571
City of London            580807.142857
Croydon                   256964.285714
Ealing                    352917.857143
Enfield                   278642.857143
Greenwich                 293899.285714
Hackney                   391966.428571
Hammersmith and Fulham    582071.428571
Haringey                  359992.857143
Harrow                    343703.571429
Havering                  251285.714286
Hillingdon                294278.428571
Hounslow                  298735.714286
Islington                 482064.285714
Kensington and Chelsea    999267.857143
Kingston upon Thames      357035.714286
Lambeth                   378750.000000
Lewisham                  289350.000000
Merton                    346357.142857
Newham                    256285.71

## Exercise 11

Identify the three boroughs with the highest mean house prices.

In [16]:
# Mean price per area over every year in the data, sorted largest first;
# the first three entries are the most expensive areas.
mean_price_by_area = ahp.groupby('Area')['Price'].mean()
mean_price_by_area.sort_values(ascending=False).head(3)

Area
Kensington and Chelsea    595151.136364
Westminster               451033.068182
Camden                    371732.727273
Name: Price, dtype: float64

## Exercise 12

Read in the *Average Income of Tax Payers by Borough* from `INCOMES_URL` into a DataFrame called `incomes`, keeping only the columns indicating the borough and the medians for each year.
Rename the columns to only include the starting year (e.g. '1999-00' = 1999).

In [17]:
# Load the 'Total Income' sheet; reset_index() moves the index into column 0.
# NOTE(review): the positional slicing below assumes column 0 holds the area
# code, column 1 the area name, and that each year spans three consecutive
# columns whose label sits on the first and whose median is the third.
# Confirm against the workbook layout.
incomes = pd.read_excel(INCOMES_URL, sheet_name='Total Income').reset_index()

# Keep rows whose column-0 value starts with 'E09'; missing values count as
# non-matches.
code_column = incomes.iloc[:, 0]
incomes = incomes[code_column.str.startswith('E09', na=False)]

# Take every third header starting at position 2 and capture the digits that
# precede the '-' (e.g. '1999-00' -> '1999').
year_labels = incomes.columns[2::3].str.extract('(\\d+)-', expand=False)
column_names = ['Area'] + list(year_labels)

# Keep column 1 plus every third column from position 4, then relabel.
incomes = pd.concat([incomes.iloc[:, 1], incomes.iloc[:, 4::3]], axis=1)
incomes.columns = column_names

## Exercise 13

'Melt' the DataFrame and adjust data types as needed.

In [18]:
# Reshape to long format (`Area` stays the identifier, the other labels go
# into `Year` and their values into `Income`), then cast both new columns in
# one `astype` call.
incomes = incomes.melt(id_vars='Area', var_name='Year', value_name='Income')
incomes = incomes.astype({'Year': 'int', 'Income': 'float'})

## Exercise 14

Merge `incomes` with `ahp`, keeping only observations found in both DataFrames.

In [19]:
# Inner join: only rows present in both frames survive. With no `on=` given,
# pandas joins on the columns the two frames have in common.
ahp_incomes = ahp.merge(incomes, how='inner')

## Exercise 15

Compute mean house prices and incomes by year.

In [20]:
# Yearly averages of both measures; 'mean' is pivot_table's default
# aggregation, spelled out here for clarity.
pd.pivot_table(ahp_incomes, values=['Price', 'Income'], index='Year', aggfunc='mean')

Unnamed: 0_level_0,Income,Price
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1999,17361.290323,125800.483871
2000,19116.129032,149555.322581
2001,19129.032258,169084.274194
2002,19500.0,195181.290323
2003,19761.290323,214418.951613
2004,19877.419355,230026.790323
2005,21435.483871,242316.774194
2006,21867.741935,259104.435484
2007,23135.483871,292128.870968
2009,24958.064516,282637.064516


## Exercise 16

Compute the correlation between house prices and incomes by area.

In [21]:
# Correlation between Price and Income computed separately within each area;
# the Income series is matched to each group's rows by index label.
ahp_incomes.groupby('Area')['Price'].corr(ahp_incomes['Income'])

Area
Barking and Dagenham      0.868372
Barnet                    0.919618
Bexley                    0.846362
Brent                     0.920543
Bromley                   0.951147
Camden                    0.967402
City of London            0.595057
Croydon                   0.905613
Ealing                    0.969878
Enfield                   0.902590
Greenwich                 0.920023
Hackney                   0.968930
Hammersmith and Fulham    0.959749
Haringey                  0.947072
Harrow                    0.932091
Havering                  0.927207
Hillingdon                0.970027
Hounslow                  0.949986
Islington                 0.955748
Kensington and Chelsea    0.923848
Lambeth                   0.968563
Lewisham                  0.959669
Merton                    0.915172
Newham                    0.779182
Redbridge                 0.900974
Southwark                 0.987826
Sutton                    0.937435
Tower Hamlets             0.943302
Waltham Forest 