# Spatial Analysis of Food Business Data

### Exploring gentrification through consumer preferences

In [174]:
#Load packages
import pandas as pd
import numpy as np

from scipy import stats

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors
from matplotlib.colors import Normalize
from matplotlib.collections import PatchCollection
from descartes import PolygonPatch

import pyproj
from pyproj import Proj, transform

import shapefile
import shapely
from shapely.geometry import Polygon
from shapely.geometry import Point

## Step 1 - Data import and cleaning

In [175]:
#Set working directory
import os
os.chdir("C:/Users/Claire/Google Drive/LondonGentrification")

In [176]:
#Import .csv data 

#Word tokens 
tokens_df = pd.read_csv("Data/FoodPremises/tokens_spatial.csv")

#Food businesses
food_bus_df = pd.read_csv("data/FoodPremises/london_premises.csv")
#Remove records with no coordinates
food_bus_df = food_bus_df.loc[food_bus_df['Latitude']>0]

In [177]:
#Check column names and data types
print tokens_df.dtypes
print food_bus_df.dtypes

BusinessID                float64
BusinessName               object
BusinessType               object
BusinessTypeID            float64
ConfidenceInManagement    float64
Hygiene                   float64
LocalAuthorityCode        float64
LocalAuthorityName         object
PostCode                   object
RatingValue               float64
Structural                float64
Token                      object
lat                       float64
lon                       float64
dtype: object
Index                       int64
BusinessName               object
BusinessType               object
BusinessTypeID              int64
PostCode                   object
RatingValue               float64
RatingDate                 object
LocalAuthorityCode          int64
LocalAuthorityName         object
Hygiene                   float64
Structural                float64
ConfidenceInManagement    float64
Longitude                 float64
Latitude                  float64
dtype: object


In [178]:
#Function to add eastings and northings from lat an lon
#Add eastings and northings to word tokens

def transform_coordinates (data,input_espg, output_espg, input_x, input_y):

    #define input and output projection
    input_projection = pyproj.Proj("+init=" + input_espg) #wgs84
    output_projection = pyproj.Proj("+init=" + output_espg) #osgb36

    eastings = []
    northings = []

    x_list = input_x.tolist()
    y_list = input_y.tolist()

    for i in range (len(input_x)):
        x = x_list[i]
        y = y_list[i]
        new_x, new_y = pyproj.transform(input_projection, output_projection, x, y)
        eastings.append(new_x)
        northings.append(new_y)

    #Add to tokens dataframe
    data['eastings'] = eastings
    data['northings'] = northings

In [179]:
#Apply funtion to dataframe

transform_coordinates(tokens_df, "EPSG:4326", "EPSG:27700", 
                       tokens_df['lon'], tokens_df['lat'])

transform_coordinates(food_bus_df, "EPSG:4326", "EPSG:27700", 
                       food_bus_df['Longitude'], food_bus_df['Latitude'])



In [180]:
#Create subsetted dataframes for each token of interest

#List of tokens of interest:
tokens_lst = ['cafe', 'coffee', 'pizza', 'wine', 'sushi', 'thai', 'chicken', 
                   'fried', 'fish', 'kebab', 'costcutter', 'waitrose', 'sainsburys', 'tesco']

df = {}

for x in tokens_lst:
    df[x] = pd.DataFrame(tokens_df.loc[tokens_df['Token'] == x])

In [181]:
#Import london wards shapefile and save as matplot lib patches for plotting

#Load the shapefile of polygons and convert it to shapely polygon objects
polygons_sf = shapefile.Reader("Data/ESRI/london_wards.shp")
polygon_shapes = polygons_sf.shapes()
polygon_points = [q.points for q in polygon_shapes ]
polygons = [Polygon(q) for q in polygon_points]

#Create matplotlib patches from shapely polygons
ward_patches = []
for x in range (len(polygons)):
    a = PolygonPatch(polygons[x])
    ward_patches.append(a)
    
#Define bounding box (eastings and northings)
xmin = polygons_sf.bbox[0]
xmax = polygons_sf.bbox[2]
ymin = polygons_sf.bbox[1]
ymax = polygons_sf.bbox[3]

In [182]:
#Create wards dataframe from shapefile

#polygons_sf.fields #Access shapefile fields
#polygons_sf.records #Access shapefile records

records = polygons_sf.records()

position = []
ward_name = []
ward_code = []

for x in range (len(records)):
    position.append(records[x][0] - 1)
    ward_name.append(records[x][1])
    ward_code.append(records[x][2])
    
ward_variables = pd.DataFrame({'position': position, 'ward_name': ward_name,'ward_code':ward_code})


In [183]:
#Calculate centroids for each ward and add to ward dataframe
centroids = []

for x in range(len(polygons)):
    a = polygons[x].centroid
    centroids.append(a)
    
ward_variables['centroid'] = centroids

centroid_x = []
centroid_y = []


for x in ward_variables['centroid']:
    centroid_x.append(x.x)
    centroid_y.append(x.y)
    
ward_variables['centroid_x'] = centroid_x 
ward_variables['centroid_y'] = centroid_y

In [184]:
#Join income data to ward variables dataframe

#Import income .csv
ward_income = pd.read_csv("Data/modelled-household-income-estimates-wards.csv")

#Change code for city of london to be consistent with shapefile
ward_income.set_value(0,['Code'], 'E05001554')

#Join to ward variables dataframe
ward_variables = ward_variables.merge(ward_income[['Code','Median 2012_13']], left_on='ward_code', right_on = 'Code')

#Rename Column
ward_variables=ward_variables.rename(columns = {'Median 2012_13':'med_income_2012_13'})

In [185]:
#Categorise as high or low income

#Caluculate median
med_income = np.median(ward_variables['med_income_2012_13'])

#Define function for categories
def income_category (row):
    if row['med_income_2012_13'] >= med_income:
          return 'high'
    if row['med_income_2012_13'] < med_income:
          return 'low'

#Apply function to create new dataframe coloumn
ward_variables['income_category'] = ward_variables.apply(lambda row: income_category (row), axis=1)

In [186]:
#Import occupation data by ward

#Import occupation .csv
ward_occupation = pd.read_csv("Data/ward_occupation_data.csv")

ward_occupation['combined_professionals_pct'] = ((ward_occupation['Sex: All persons; Occupation: 1. Managers, directors and senior officials; measures: Value'] +
    ward_occupation['Sex: All persons; Occupation: 2. Professional occupations; measures: Value'])/
    ward_occupation['Sex: All persons; Occupation: All categories: Occupation; measures: Value']) * 100

In [187]:
ward_occupation.to_csv('spatial_analysis/occupation_check.csv')

In [188]:
ward_variables = ward_variables.merge(ward_occupation[['geography code', 'combined_professionals_pct']], left_on='ward_code', right_on='geography code')

In [189]:
ward_variables.to_csv('spatial_analysis/check.csv')

## Step 2 - Create KDE surface and attach KDE values to ward centroids

In [190]:
kernels = {}

for x in tokens_lst:

    #Set up grid and KDE calculation
    X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values_coffee = np.vstack([df[x]['eastings'], df[x]['northings']])
    kernels[x] = stats.gaussian_kde(values_coffee, 0.1) #Bandwidth set ot 0.1
    Z = np.reshape(kernels[x](positions).T, X.shape)
    
    #Plot KDE surface and save
    token = x

    import matplotlib.pyplot as plt
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)
    
    #Add london wards as patches
    ax.add_collection(PatchCollection(ward_patches, alpha=1, facecolor='None', lw = 0.1, 
                                      edgecolor = '0'))
    kde_surface = ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
               extent=[xmin, xmax, ymin, ymax])
    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=2)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title(token)
    plt.xlabel('eastings')
    plt.ylabel('northings')

    #ax.set_ylim([ymin, ymax])
    plt.colorbar(kde_surface, cmap=plt.cm.gist_earth_r )

    #plt.show()
    fig.savefig('spatial_analysis/figures/' + token + '_kde.png', dpi=200, figsize = (12,8))

    #Evaluate density at each ward centroid
    ward_kde = []
    
    for y in range(len(ward_variables['ward_name'])):
        kde = kernels[x].evaluate([ward_variables['centroid'].iloc[y].x, 
                             ward_variables['centroid'].iloc[y].y])
        ward_kde.append(kde[0])
        
    ward_variables[x + '_kde'] = ward_kde

## Step 3 - Calculate number of each token within each ward

In [191]:
# Convert token coordinates to shapely point file for each dataframe
from shapely.geometry import Point

token_points = {}
token_points_coords = {}

for x in tokens_lst:
    #Make shapely points
    token_points[x] = [Point(xy) for xy in zip(df[x]['eastings'], df[x]['northings'])]
    
    #convert shapely points into coordinate tuples
    point_coords = []
    for i in range(len(token_points[x])):
        a = ([token_points[x][i].x, token_points[x][i].y])
        point_coords.append(a)
    token_points_coords[x] = point_coords

In [192]:
#Build a spatial index based on the bounding boxes of the polygons
from rtree import index
idx = index.Index()
count = -1
for q in polygon_shapes:
    count +=1
    idx.insert(count, q.bbox)

In [193]:
#Assign one or more matching polygons to each point

for x in tokens_lst:
    
    matches = []

    for i in range(len(df[x]['Token'])): #Iterate through each point
        temp= None
        #print "Point ", i

        #Iterate only through the bounding boxes which contain the point
        for j in idx.intersection(token_points_coords[x][i]):
            #Verify that point is within the polygon itself not just the bounding box
            if token_points[x][i].within(polygons[j]):
                temp=j
                break
        matches.append(temp) #Either the first match found, or None for no matches
    
    df[x]['ward_no'] = matches

    df[x] = df[x].merge(ward_variables[['position','ward_name', 'ward_code']], left_on='ward_no', right_on='position')
    

    

In [194]:
#Names of wards
london_wards = ward_variables['ward_name']

#Count the number of tokens for each ward and join to ward_variables dataframe
for x in tokens_lst:
    token_count = []
    for b in london_wards:
            temp_df = df[x].loc[(df[x]['Token'] == x) & 
                                        (df[x]['ward_name'] == b)]
            token_count.append(len(temp_df.index))
    ward_variables[x + '_count'] = token_count

## Step 4 - Calculate number of businesses within each ward

In [195]:
# Convert token coordinates to shapely point file for each dataframe
from shapely.geometry import Point

#Make shapely points
food_bus_points = [Point(xy) for xy in zip(food_bus_df['eastings'], food_bus_df['northings'])]
    
#convert shapely points into coordinate tuples
point_coords = []
for i in range(len(food_bus_points)):
    a = ([food_bus_points[i].x, food_bus_points[i].y])
    point_coords.append(a)
food_bus_coords = point_coords

In [196]:
matches = []

for i in range(len(food_bus_df['Latitude'])): #Iterate through each point
    temp= None
    #print "Point ", i

    #Iterate only through the bounding boxes which contain the point
    for j in idx.intersection(food_bus_coords[i]):
        #Verify that point is within the polygon itself not just the bounding box
        if food_bus_points[i].within(polygons[j]):
            temp=j
            break
    matches.append(temp) #Either the first match found, or None for no matches
    
food_bus_df['ward_no'] = matches

food_bus_df = food_bus_df.merge(ward_variables[['position','ward_name', 'ward_code']], left_on='ward_no', right_on='position')


In [197]:
token_count = []

for b in london_wards:
        temp_df = food_bus_df.loc[food_bus_df['ward_name'] == b]
        token_count.append(len(temp_df.index))
ward_variables['all_business_count'] = token_count

## Step 5 - Calculate Location Quotient

In [198]:
#Calculate LQ for each token

for x in tokens_lst:
    ward_variables[x +'_lq'] = (ward_variables[x + '_count']/(ward_variables['all_business_count']))/(len(df[x])/float(len(food_bus_df.index)))

In [199]:
#Try Visualise the LQ

for x in tokens_lst:
    
    import matplotlib.pyplot as plt
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)

    #Add london wards as patches
    cmap = plt.get_cmap('Blues')
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(ward_variables[x + '_lq']), lw = 0.1, 
                                          edgecolor = '0')
    ax.add_collection(test)

    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=6, markerfacecolor='red', alpha=0.8)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title(x + ' LQ')
    plt.xlabel('eastings')
    plt.ylabel('northings')


    m = cm.ScalarMappable(cmap=cm.Blues)
    m.set_array(ward_variables[x + '_lq'])
    plt.colorbar(m)

    #plt.show()
    
     #plt.show()
    fig.savefig('spatial_analysis/figures/' + x + '_LQ.png', dpi=200, figsize = (12,8))


## Step 6 - Calculate smoothed location quotient

In [200]:
#Calculate kde for all wards

#Add values for LQ for all businesses by ward
#Set up grid and KDE calculation
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([food_bus_df['eastings'], food_bus_df['northings']])
kernels = stats.gaussian_kde(values, 0.1) #Bandwidth set ot 0.1
Z = np.reshape(kernels(positions).T, X.shape)
    

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
    
#Add london wards as patches
ax.add_collection(PatchCollection(ward_patches, alpha=1, facecolor='None', lw = 0.1, 
                                      edgecolor = '0'))
kde_surface = ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
               extent=[xmin, xmax, ymin, ymax])
ax.plot(food_bus_df['eastings'], food_bus_df['northings'], 'k.', markersize=2)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
plt.title('food_business_KDE')
plt.xlabel('eastings')
plt.ylabel('northings')

#ax.set_ylim([ymin, ymax])
plt.colorbar(kde_surface, cmap=plt.cm.gist_earth_r )

#plt.show()
fig.savefig('spatial_analysis/figures/foodbusiness_kde.png', dpi=200, figsize = (12,8))

#Evaluate density at each ward centroid
ward_kde = []
    
for y in range(len(ward_variables['ward_name'])):
    kde = kernels.evaluate([ward_variables['centroid'].iloc[y].x, 
                             ward_variables['centroid'].iloc[y].y])
    ward_kde.append(kde[0])
        
ward_variables['all_business_kde'] = ward_kde

In [201]:
for x in tokens_lst:
    ward_variables[x +'_smooth_lq'] = (ward_variables[x + '_kde']/(ward_variables['all_business_kde']))/(sum(ward_variables[x + '_kde'])/float(sum(ward_variables['all_business_kde'])))

In [202]:
ward_variables

Unnamed: 0,position,ward_code,ward_name,centroid,centroid_x,centroid_y,Code,med_income_2012_13,income_category,geography code,...,sushi_smooth_lq,thai_smooth_lq,chicken_smooth_lq,fried_smooth_lq,fish_smooth_lq,kebab_smooth_lq,costcutter_smooth_lq,waitrose_smooth_lq,sainsburys_smooth_lq,tesco_smooth_lq
0,0,E05000405,Chessington South,POINT (517652.3434795503 162339.1609310878),517652.343480,162339.160931,E05000405,38310,high,E05000405,...,1.380314e-14,8.373234e-06,2.034534,3.047417,2.092624,2.098936,0.000002,3.391052e-07,3.137974,5.832419e-02
1,1,E05000414,Tolworth and Hook Rise,POINT (519124.9351873993 165300.0168107432),519124.935187,165300.016811,E05000414,37840,low,E05000414,...,3.183376e-03,1.112350e+00,0.591534,1.090986,0.642651,1.491646,0.939468,2.135553e-01,3.413788,1.709067e+00
2,2,E05000401,Berrylands,POINT (519108.406980712 167344.3249926214),519108.406981,167344.324993,E05000401,42330,high,E05000401,...,1.583157e-02,2.524468e+00,0.490083,0.716221,0.724135,1.148679,1.502875,2.132931e+00,2.317537,2.757362e-01
3,3,E05000400,Alexandra,POINT (520118.1401777423 166393.3293737157),520118.140178,166393.329374,E05000400,41390,high,E05000400,...,4.618977e-01,3.842511e-01,0.521756,1.488705,0.542988,1.482573,1.742734,5.058457e-01,0.413352,1.355478e+00
4,4,E05000402,Beverley,POINT (521204.9458895464 168516.7882485987),521204.945890,168516.788249,E05000402,40700,high,E05000402,...,1.857454e+00,1.726968e-02,0.071723,0.002836,0.696751,1.442905,2.197082,3.941100e+00,0.065761,1.281378e+00
5,5,E05000406,Coombe Hill,POINT (520755.1783848286 170623.2344953832),520755.178385,170623.234495,E05000406,45650,high,E05000406,...,3.667128e-03,6.095854e-01,0.210204,0.029624,0.903743,0.611975,0.215452,8.868991e-01,1.207561,4.300950e-01
6,6,E05000404,Chessington North and Hook,POINT (518064.3902391941 164595.0191127136),518064.390239,164595.019113,E05000404,37230,low,E05000404,...,1.386950e-06,5.200541e-01,1.315066,0.692906,0.749167,1.859986,0.173956,3.624646e-02,7.106886,2.193644e+00
7,7,E05000413,Surbiton Hill,POINT (518303.1766019526 166453.847793755),518303.176602,166453.847794,E05000413,43160,high,E05000413,...,9.497274e-05,4.569783e+00,0.598943,0.916065,0.717714,1.063128,2.162570,2.220716e+00,2.856802,7.224158e-01
8,8,E05000410,Old Malden,POINT (521495.0642951772 166551.3661053034),521495.064295,166551.366105,E05000410,41760,high,E05000410,...,6.776935e+00,4.963256e-03,0.268776,0.187637,0.757987,1.670406,0.491553,4.671473e+00,0.087281,1.796966e+00
9,9,E05000412,St. Mark's,POINT (518248.3378622264 167703.0217714706),518248.337862,167703.021771,E05000412,44930,high,E05000412,...,1.866450e-02,3.241981e+00,0.788001,1.478903,0.747690,0.922346,0.529081,3.492923e+00,3.815740,8.371581e-02


In [203]:
#Try Visualise the Smooth LQ

for x in tokens_lst:
    
    import matplotlib.pyplot as plt
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)

    #Add london wards as patches
    cmap = plt.get_cmap('BuGn')
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(ward_variables[x + '_smooth_lq']), lw = 0.1, 
                                          edgecolor = '0')
    ax.add_collection(test)

    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=6, markerfacecolor='red', alpha=0.8)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title(x + ' Smoooth_LQ')
    plt.xlabel('eastings')
    plt.ylabel('northings')


    m = cm.ScalarMappable(cmap=cm.BuGn)
    m.set_array(ward_variables[x + '_smooth_lq'])
    plt.colorbar(m)

    #plt.show()
    
     #plt.show()
    fig.savefig('spatial_analysis/figures/' + x + '_Smooth_LQ.png', dpi=200, figsize = (12,8))

## Step 6 - Moran's I Calculation

In [249]:
#Global Moran's I

import pysal
import numpy as np


#Create rooks weights matrix
w = pysal.rook_from_shapefile("Data/ESRI/london_wards.shp")

morans_value = []
morans_EI = []
morans_p = []

for x in tokens_lst:
    #Import variable as array
    y = np.array(ward_variables[x + '_kde'])

    #Calculate Moran's I
    mi = pysal.Moran(y, w, two_tailed=False)

    
    morans_value.append("%.3f"%mi.I)
    morans_EI.append(mi.EI)
    morans_p.append("%.5f"%mi.p_norm)

global_stats_df = pd.DataFrame({'morans_value': morans_value, 'morans_EI': morans_EI, 'morans_p':morans_p}, 
                                index=tokens_lst)

In [250]:
global_stats_df

Unnamed: 0,morans_EI,morans_p,morans_value
cafe,-0.001603,0.0,0.883
coffee,-0.001603,0.0,0.822
pizza,-0.001603,0.0,0.806
wine,-0.001603,0.0,0.811
sushi,-0.001603,0.0,0.727
thai,-0.001603,0.0,0.745
chicken,-0.001603,0.0,0.79
fried,-0.001603,0.0,0.767
fish,-0.001603,0.0,0.827
kebab,-0.001603,0.0,0.735


In [206]:
#Local Moran's I

for x in tokens_lst:

    y = np.array(ward_variables[x + '_kde'])
    lm = pysal.Moran_Local(y,w)
    
    ward_variables[x + '_lmoran_value'] = lm.Is
    ward_variables[x + '_lmoran_p'] = lm.p_sim

In [208]:
#Visualise Local Morans Values

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)

for x in tokens_lst:
    
    cmap = plt.get_cmap('bwr')
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(ward_variables[x + '_lmoran_p']), lw = 0.1, 
                                          edgecolor = '0')
    ax.add_collection(test)

    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=6, markerfacecolor='green', alpha=0.8)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title(x + ' Local Morans P Values')
    plt.xlabel('eastings')
    plt.ylabel('northings')


    #m = cm.ScalarMappable(cmap=cm.bwr)
    #m.set_array(ward_variables[x + '_lmoran_value'])
    #plt.colorbar(m)
    
    #plt.show()
    fig.savefig('spatial_analysis/figures/' + x + '_local_morans_p.png', dpi=200, figsize = (12,8))



In [209]:
#Visualise Local Morans P Values

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)

for x in tokens_lst:
    
    cmap = plt.get_cmap('bwr')
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(ward_variables[x + '_lmoran_value']), lw = 0.1, 
                                          edgecolor = '0')
    ax.add_collection(test)

    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=6, markerfacecolor='green', alpha=0.8)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title(x + ' Local Morans Values')
    plt.xlabel('eastings')
    plt.ylabel('northings')


    #m = cm.ScalarMappable(cmap=cm.bwr)
    #m.set_array(ward_variables[x + '_lmoran_value'])
    #plt.colorbar(m)
    
    #plt.show()
    fig.savefig('spatial_analysis/figures/' + x + '_local_morans.png', dpi=200, figsize = (12,8))


## Step 7 - Correlation

In [278]:
#Create scatter Matrix of Correlations

#Subset Data
lst = ['med_income_2012_13', 'combined_professionals_pct']

subset_columns = []

for x in lst:
    subset_columns.append(x)

for x in tokens_lst:
    a = x + '_smooth_lq'
    subset_columns.append(a)
    
ward_variables_subset = ward_variables[subset_columns]

#Rename columns for visualisation
for x in tokens_lst:
    #Rename columns for visualisation
    ward_variables_subset.rename(columns={x + '_smooth_lq': x}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [279]:
# takes the dataset, an alpha value for opacity, a figure size setting, and a specification of the diagonal charts

from pandas.tools.plotting import scatter_matrix
a = pd.scatter_matrix(ward_variables_subset, alpha=0.2, diagonal='kde', figsize=(12,12))
plt.show()

In [298]:
#Calculate correlation coefficients
a = ward_variables_subset.corr()
subset = a.iloc[2:16]

global_stats_df['income_correlation'] = subset['med_income_2012_13']
global_stats_df['professional_pct_correlation'] = subset['combined_professionals_pct']

In [299]:
global_stats_df

Unnamed: 0,morans_EI,morans_p,morans_value,income_r2,income_ar2,incomemoran_res,incomecoefficient,incomet_stat,professionals_pct_r2,professionals_pct_ar2,...,income_coefficient,income_t_stat,professionals_pct_moran_res,professionals_pct_coefficient,professionals_pct_t_stat,income_professionals_moran_res,income_professionals_coefficient,income_professionals_t_stat,income_correlation,professional_pct_correlation
cafe,-0.001603,0.0,0.883,0.024742,0.023176,"(0.865262661502, 36.7879471005, 2.87761672922e...","[[1.52945253772e-10], [2.16222366336e-14]]","[(0.703666161568, 0.48190359777), (3.975569157...",0.108671,0.10724,...,"[[1.52945253772e-10], [2.16222366336e-14]]","[(0.703666161568, 0.48190359777), (3.975569157...","(0.857470165403, 36.4691239767, 3.42380475962e...","[[-1.0462430304e-10], [3.29249106764e-11]]","[(-0.788188733408, 0.430886113059), (8.7152910...","(0.876395638557, 37.375727952, 9.68899835583e-...","[[1.00679504615e-09], [-5.9905575067e-14], [6....","[(4.60406356944, 5.0256056669e-06), (-6.289561...",0.0071,0.098086
coffee,-0.001603,0.0,0.822,0.072582,0.071093,"(0.796343390326, 33.8666405165, 2.06493054556e...","[[-1.2799334748e-09], [5.89093739768e-14]]","[(-3.79625661141, 0.000161256353228), (6.98265...",0.12902,0.127622,...,"[[-1.2799334748e-09], [5.89093739768e-14]]","[(-3.79625661141, 0.000161256353228), (6.98265...","(0.80641019424, 34.3044312513, 6.73985490349e-...","[[-8.84819033267e-10], [5.70664314833e-11]]","[(-4.2391859044, 2.58326221267e-05), (9.606570...","(0.814969932243, 34.7663672506, 7.84311674101e...","[[-3.75403110431e-10], [-2.74575513085e-14], [...","[(-1.06131811013, 0.288957270315), (-1.7822245...",0.334537,0.41445
pizza,-0.001603,0.0,0.806,0.010935,0.009348,"(0.787044335273, 33.4724780379, 1.2123363094e-...","[[5.64364076747e-10], [9.88781653517e-15]]","[(3.74831349883, 0.000194587001536), (2.624490...",0.068193,0.066698,...,"[[5.64364076747e-10], [9.88781653517e-15]]","[(3.74831349883, 0.000194587001536), (2.624490...","(0.766531356112, 32.6137638101, 2.61710313097e...","[[3.49645442089e-10], [1.79407254456e-11]]","[(3.74524334109, 0.000196928744339), (6.752301...","(0.766960329512, 32.7269219079, 6.4680015832e-...","[[1.08047394517e-09], [-3.93917037562e-14], [4...","[(6.99741931072, 6.76436426959e-12), (-5.85708...",-0.191656,-0.211303
wine,-0.001603,0.0,0.811,0.014917,0.013336,"(0.78804723578, 33.5149883518, 2.91549134266e-...","[[4.60414855924e-10], [1.26643154123e-14]]","[(2.79418209996, 0.00536317410238), (3.0715326...",0.084062,0.082592,...,"[[4.60414855924e-10], [1.26643154123e-14]]","[(2.79418209996, 0.00536317410238), (3.0715326...","(0.759623334088, 32.3208975064, 3.55842540286e...","[[2.23556889287e-10], [2.18432238637e-11]]","[(2.20253550057, 0.0279930011797), (7.56155716...","(0.749715841601, 31.9943769887, 1.30559037404e...","[[1.07199290079e-09], [-4.57307560946e-14], [5...","[(6.41037668645, 2.87066771468e-10), (-6.27845...",-0.117326,-0.097904
sushi,-0.001603,0.0,0.727,0.109434,0.108004,"(0.682447404347, 29.0388893222, 2.12566775857e...","[[-3.19009151947e-09], [1.09617187346e-13]]","[(-6.3715127421, 3.64017423894e-10), (8.749557...",0.131869,0.130476,...,"[[-3.19009151947e-09], [1.09617187346e-13]]","[(-6.3715127421, 3.64017423894e-10), (8.749557...","(0.69850421255, 29.7297460669, 3.16939199636e-...","[[-1.82443446661e-09], [8.74293011504e-11]]","[(-5.7774260282, 1.19855623739e-08), (9.727982...","(0.691955403102, 29.5407166668, 8.6422563805e-...","[[-2.32414696446e-09], [2.69345360721e-14], [7...","[(-4.33658275493, 1.68801829435e-05), (1.15384...",0.345556,0.409616
thai,-0.001603,0.0,0.745,0.170472,0.169141,"(0.63298494664, 26.9423058718, 7.02046889666e-...","[[-2.4332405269e-09], [8.9606700179e-14]]","[(-7.68832492685, 5.83293057487e-14), (11.3150...",0.273053,0.271886,...,"[[-2.4332405269e-09], [8.9606700179e-14]]","[(-7.68832492685, 5.83293057487e-14), (11.3150...","(0.635931187023, 27.0769562184, 1.83967185513e...","[[-1.68419317815e-09], [8.23984670198e-11]]","[(-8.8987447545, 6.10902200601e-18), (15.29732...","(0.644717651284, 27.5340595095, 6.86862644894e...","[[-1.24925834266e-09], [-2.34430158659e-14], [...","[(-3.8938846264, 0.000109322699028), (-1.67763...",0.453133,0.505881
chicken,-0.001603,0.0,0.79,0.087108,0.085643,"(0.776646168553, 33.0317271074, 2.84671323472e...","[[2.39161314448e-09], [-3.50506969915e-14]]","[(13.164086208, 4.28543502653e-35), (-7.710179...",0.013595,0.012012,...,"[[2.39161314448e-09], [-3.50506969915e-14]]","[(13.164086208, 4.28543502653e-35), (-7.710179...","(0.775394268468, 32.9895078904, 1.14859451141e...","[[1.35351839785e-09], [-1.00608709192e-11]]","[(11.2194625949, 9.83236886506e-27), (-2.93023...","(0.731121931934, 31.2045087057, 9.25486914125e...","[[2.89429219561e-09], [-8.30478077158e-14], [4...","[(15.2192983441, 1.0731757469e-44), (-10.02613...",-0.512788,-0.480941
fried,-0.001603,0.0,0.767,0.034215,0.032665,"(0.759747099919, 32.3154200481, 4.24825654076e...","[[2.41179786741e-09], [-3.38916091088e-14]]","[(8.36556270549, 3.93025202302e-16), (-4.69801...",0.001889,0.000287,...,"[[2.41179786741e-09], [-3.38916091088e-14]]","[(8.36556270549, 3.93025202302e-16), (-4.69801...","(0.75555180988, 32.1482848191, 9.33333133976e-...","[[1.27556241638e-09], [-5.78656041907e-12]]","[(6.81291458107, 2.25890295012e-11), (-1.08595...","(0.738438706781, 31.5153249078, 5.35713145602e...","[[3.06956308037e-09], [-9.66967522429e-14], [5...","[(10.0514677822, 3.97059648844e-22), (-7.26972...",-0.345304,-0.280755
fish,-0.001603,0.0,0.827,0.015412,0.013832,"(0.811576381168, 34.5123269215, 5.24045739491e...","[[1.26781894074e-09], [-9.58809929175e-15]]","[(10.3324391966, 3.31522683317e-23), (-3.12281...",0.000695,-0.000909,...,"[[1.26781894074e-09], [-9.58809929175e-15]]","[(10.3324391966, 3.31522683317e-23), (-3.12281...","(0.808419776731, 34.3896277076, 3.60348711783e...","[[8.41646000629e-10], [1.47907671795e-12]]","[(10.6580914384, 1.77013891987e-24), (0.658112...","(0.782255030438, 33.3766399809, 2.99268571644e...","[[1.58965224702e-09], [-4.03175852356e-14], [2...","[(12.3297520269, 2.09838492856e-31), (-7.17958...",-0.311339,-0.462713
kebab,-0.001603,0.0,0.735,0.031252,0.029697,"(0.73568424467, 31.2954589216, 5.37972709319e-...","[[1.53924391992e-09], [-1.58625375931e-14]]","[(10.8853220334, 2.20971538501e-25), (-4.48305...",2e-06,-0.001603,...,"[[1.53924391992e-09], [-1.58625375931e-14]]","[(10.8853220334, 2.20971538501e-25), (-4.48305...","(0.724567188354, 30.8346886049, 8.98712569824e...","[[9.13106549068e-10], [9.86527833606e-14]]","[(9.94918412695, 9.61597034183e-22), (0.037768...","(0.679756585319, 29.0225115433, 3.42159633698e...","[[1.96938365273e-09], [-5.69334044534e-14], [3...","[(13.4104620559, 3.3461264639e-36), (-8.900890...",-0.32702,-0.405374


## Step 8 - Regression

In [300]:
#Regression - Income only

w = pysal.rook_from_shapefile("Data/ESRI/london_wards.shp")
x = np.reshape(ward_variables['med_income_2012_13'], newshape=(625, 1))

var = 'income'

r2 = []
ar2 = []
moran_res = []
coefficient = []
t_stat = []

for i in tokens_lst:
    y = np.reshape(a=ward_variables[i + '_smooth_lq'],newshape=(625,1))
    a = pysal.spreg.ols.OLS(y,x,w, spat_diag=True, moran=True)
    ward_variables[i + '_' + var + '_residuals'] = a.u
    
    r2.append(a.r2)
    ar2.append(a.ar2)
    moran_res.append(a.moran_res)
    coefficient.append(a.betas)
    t_stat.append(a.t_stat)

global_stats_df[var + '_r2'] = r2
global_stats_df[var + '_ar2'] = ar2
global_stats_df[var + '_moran_res'] = moran_res
global_stats_df[var + '_coefficient'] = coefficient
global_stats_df[var + '_t_stat'] = t_stat

In [301]:
#Regression - proportion of population professional only

w = pysal.rook_from_shapefile("Data/ESRI/london_wards.shp")
x = np.reshape(ward_variables['combined_professionals_pct'], newshape=(625, 1))

var = 'professionals_pct'

r2 = []
ar2 = []
moran_res = []
coefficient = []
t_stat = []

for i in tokens_lst:
    y = np.reshape(a=ward_variables[i + '_smooth_lq'],newshape=(625,1))
    a = pysal.spreg.ols.OLS(y,x,w, spat_diag=True, moran=True)
    ward_variables[i + '_' + var + '_residuals'] = a.u
    
    r2.append(a.r2)
    ar2.append(a.ar2)
    moran_res.append(a.moran_res)
    coefficient.append(a.betas)
    t_stat.append(a.t_stat)

global_stats_df[var + '_r2'] = r2
global_stats_df[var + '_ar2'] = ar2
global_stats_df[var + '_moran_res'] = moran_res
global_stats_df[var + '_coefficient'] = coefficient
global_stats_df[var + '_t_stat'] = t_stat

In [302]:
#Regression - proportion of population professional and income

w = pysal.rook_from_shapefile("Data/ESRI/london_wards.shp")
x = ward_variables[['med_income_2012_13', 'combined_professionals_pct']].as_matrix()

var = 'income_professionals'

r2 = []
ar2 = []
moran_res = []
coefficient = []
t_stat = []

for i in tokens_lst:
    y = np.reshape(a=ward_variables[i + '_smooth_lq'],newshape=(625,1))
    a = pysal.spreg.ols.OLS(y,x,w, spat_diag=True, moran=True)
    ward_variables[i + '_' + var + '_residuals'] = a.u
    
    r2.append(a.r2)
    ar2.append(a.ar2)
    moran_res.append(a.moran_res)
    coefficient.append(a.betas)
    t_stat.append(a.t_stat)

global_stats_df[var + '_r2'] = r2
global_stats_df[var + '_ar2'] = ar2
global_stats_df[var + '_moran_res'] = moran_res
global_stats_df[var + '_coefficient'] = coefficient
global_stats_df[var + '_t_stat'] = t_stat

In [305]:
global_stats_df.to_csv('spatial_analysis/test_global_stats.csv')

## Step 9 - Explore Clustering Techniques

In [303]:
list(ward_variables.columns.values)

['position',
 'ward_code',
 'ward_name',
 'centroid',
 'centroid_x',
 'centroid_y',
 'Code',
 'med_income_2012_13',
 'income_category',
 'geography code',
 'combined_professionals_pct',
 'cafe_kde',
 'coffee_kde',
 'pizza_kde',
 'wine_kde',
 'sushi_kde',
 'thai_kde',
 'chicken_kde',
 'fried_kde',
 'fish_kde',
 'kebab_kde',
 'costcutter_kde',
 'waitrose_kde',
 'sainsburys_kde',
 'tesco_kde',
 'cafe_count',
 'coffee_count',
 'pizza_count',
 'wine_count',
 'sushi_count',
 'thai_count',
 'chicken_count',
 'fried_count',
 'fish_count',
 'kebab_count',
 'costcutter_count',
 'waitrose_count',
 'sainsburys_count',
 'tesco_count',
 'all_business_count',
 'cafe_lq',
 'coffee_lq',
 'pizza_lq',
 'wine_lq',
 'sushi_lq',
 'thai_lq',
 'chicken_lq',
 'fried_lq',
 'fish_lq',
 'kebab_lq',
 'costcutter_lq',
 'waitrose_lq',
 'sainsburys_lq',
 'tesco_lq',
 'all_business_kde',
 'cafe_smooth_lq',
 'coffee_smooth_lq',
 'pizza_smooth_lq',
 'wine_smooth_lq',
 'sushi_smooth_lq',
 'thai_smooth_lq',
 'chicken_smoo

In [307]:
cluster_var = [['med_income_2012_13','coffee'],['med_income_2012_13','chicken'], ['coffee','chicken'],
              ['med_income_2012_13','coffee','chicken'], ['med_income_2012_13','coffee','chicken', 'thai', 'kebab'], 
             ['coffee','chicken', 'thai', 'kebab']]

cluster_names=['income_coffee', 'income_chicken', 'coffee_chicken', 'income_coffee_chicken', 'income_coffee_chicken_thai_kebab',
              'coffee_chicken_thai_kebab']
for x in cluster_names:
    print x

income_coffee
income_chicken
coffee_chicken
income_coffee_chicken
income_coffee_chicken_thai_kebab
coffee_chicken_thai_kebab


In [313]:
cluster_var = [['med_income_2012_13','coffee'],['med_income_2012_13', 'chicken'], ['coffee','chicken'],
              ['med_income_2012_13','coffee','chicken'], ['med_income_2012_13','coffee','chicken', 'thai', 'kebab']]

cluster_names=['income_coffee', 'income_chicken', 'coffee_chicken', 'income_coffee_chicken', 'income_coffee_chicken_thai_kebab',
              'coffee_chicken_thai_kebab']

for i in range(len(cluster_var)):

    for x in range(7):

        from sklearn.cluster import AgglomerativeClustering

        subset = ward_variables_subset[cluster_var[i]]

        import sklearn as sk
        scaled = sk.preprocessing.scale(subset)
        AgClustering = AgglomerativeClustering(n_clusters=x+1)
        AgClustering.fit(scaled)
        AgClustering_labels = AgClustering.labels_
        ward_variables['ag_clusters'+ cluster_names[i] + str(x)] = AgClustering_labels

        #Visualise Scatter
        fig = plt.figure()
        cmap = plt.get_cmap('gist_ncar')
        ward_variables.plot(kind = 'scatter', x='centroid_x', y = 'centroid_y', c='ag_clusters'+ cluster_names[i] + str(x), cmap =  cmap)
        #plt.show()

        #cloropleth
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(111, axisbg='w', frame_on=False)

        # use a blue colour ramp - we'll be converting it to a map using cmap()
        cmap = plt.get_cmap('gist_ncar')
        pc = PatchCollection(ward_patches, alpha=1, lw = 0.1, edgecolor = '0')

        # impose our colour map onto the patch collection
        norm = Normalize()
        pc.set_facecolor(cmap(norm(ward_variables['ag_clusters' + cluster_names[i] + str(x)].values)))
        ax.add_collection(pc)

        ax.set_xlim([xmin, xmax])
        ax.set_ylim([ymin, ymax])
        plt.title('ag_cluster' + cluster_names[i] + str(x))
        plt.xlabel('eastings')
        plt.ylabel('northings')

        #plt.show()

        fig.savefig('spatial_analysis/figures/' + cluster_names[i] + str(x+1) + '_cluster.png', dpi=200, figsize = (12,8))


In [310]:
cluster_names[0]

'income_coffee'

In [142]:
from sklearn.cluster import DBSCAN

subset = ward_variables_subset[['med_income_2012_13','coffee','chicken', 'sushi', 'kebab']]

import sklearn as sk
scaled = sk.preprocessing.scale(subset)
dbscan = DBSCAN(eps=0.8, min_samples=5)
dbscan.fit(scaled)
dbscan_labels = dbscan.labels_
ward_variables['clusters'] = dbscan_labels

#Visualise
cmap = plt.get_cmap('gist_ncar')
ward_variables.plot(kind = 'scatter', x='centroid_x', y = 'centroid_y', c='clusters', cmap =  cmap)
plt.show()
