# Spatial Analysis of Food Business Data

### Exploring gentrification through consumer preferences

In [69]:
#Load packages
import pandas as pd
import numpy as np

from scipy import stats

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors
from matplotlib.colors import Normalize
from matplotlib.collections import PatchCollection
from descartes import PolygonPatch

import pyproj
from pyproj import Proj, transform

import shapefile
import shapely
from shapely.geometry import Polygon
from shapely.geometry import Point

## Step 1 - Data import and cleaning

In [70]:
#Set working directory
import os
os.chdir("C:/Users/Claire/Google Drive/LondonGentrification")

In [71]:
#Import .csv data 

#Word tokens 
tokens_df = pd.read_csv("Data/FoodPremises/tokens_spatial.csv")

#Food businesses
food_bus_df = pd.read_csv("data/FoodPremises/london_premises.csv")
#Remove records with no coordinates
food_bus_df = food_bus_df.loc[food_bus_df['Latitude']>0]

In [72]:
#Check column names and data types
print tokens_df.dtypes
print food_bus_df.dtypes

BusinessID                float64
BusinessName               object
BusinessType               object
BusinessTypeID            float64
ConfidenceInManagement    float64
Hygiene                   float64
LocalAuthorityCode        float64
LocalAuthorityName         object
PostCode                   object
RatingValue               float64
Structural                float64
Token                      object
lat                       float64
lon                       float64
dtype: object
Index                       int64
BusinessName               object
BusinessType               object
BusinessTypeID              int64
PostCode                   object
RatingValue               float64
RatingDate                 object
LocalAuthorityCode          int64
LocalAuthorityName         object
Hygiene                   float64
Structural                float64
ConfidenceInManagement    float64
Longitude                 float64
Latitude                  float64
dtype: object


In [73]:
#Function to add eastings and northings from lat an lon
#Add eastings and northings to word tokens

def transform_coordinates (data,input_espg, output_espg, input_x, input_y):

    #define input and output projection
    input_projection = pyproj.Proj("+init=" + input_espg) #wgs84
    output_projection = pyproj.Proj("+init=" + output_espg) #osgb36

    eastings = []
    northings = []

    x_list = input_x.tolist()
    y_list = input_y.tolist()

    for i in range (len(input_x)):
        x = x_list[i]
        y = y_list[i]
        new_x, new_y = pyproj.transform(input_projection, output_projection, x, y)
        eastings.append(new_x)
        northings.append(new_y)

    #Add to tokens dataframe
    data['eastings'] = eastings
    data['northings'] = northings

In [74]:
#Apply funtion to dataframe

transform_coordinates(tokens_df, "EPSG:4326", "EPSG:27700", 
                       tokens_df['lon'], tokens_df['lat'])

transform_coordinates(food_bus_df, "EPSG:4326", "EPSG:27700", 
                       food_bus_df['Longitude'], food_bus_df['Latitude'])



In [75]:
#Create subsetted dataframes for each token of interest

#List of tokens of interest:
tokens_lst = ['cafe', 'coffee', 'pizza', 'wine', 'sushi', 'thai', 'chicken', 
                   'fried', 'fish', 'kebab', 'costcutter', 'waitrose', 'sainsburys', 'tesco']

df = {}

for x in tokens_lst:
    df[x] = pd.DataFrame(tokens_df.loc[tokens_df['Token'] == x])

In [76]:
#Import london wards shapefile and save as matplot lib patches for plotting

#Load the shapefile of polygons and convert it to shapely polygon objects
polygons_sf = shapefile.Reader("Data/ESRI/london_wards.shp")
polygon_shapes = polygons_sf.shapes()
polygon_points = [q.points for q in polygon_shapes ]
polygons = [Polygon(q) for q in polygon_points]

#Create matplotlib patches from shapely polygons
ward_patches = []
for x in range (len(polygons)):
    a = PolygonPatch(polygons[x])
    ward_patches.append(a)
    
#Define bounding box (eastings and northings)
xmin = polygons_sf.bbox[0]
xmax = polygons_sf.bbox[2]
ymin = polygons_sf.bbox[1]
ymax = polygons_sf.bbox[3]

In [77]:
#Create wards dataframe from shapefile

#polygons_sf.fields #Access shapefile fields
#polygons_sf.records #Access shapefile records

records = polygons_sf.records()

position = []
ward_name = []
ward_code = []

for x in range (len(records)):
    position.append(records[x][0] - 1)
    ward_name.append(records[x][1])
    ward_code.append(records[x][2])
    
ward_variables = pd.DataFrame({'position': position, 'ward_name': ward_name,'ward_code':ward_code})


In [78]:
#Calculate centroids for each ward and add to ward dataframe
centroids = []

for x in range(len(polygons)):
    a = polygons[x].centroid
    centroids.append(a)
    
ward_variables['centroid'] = centroids

centroid_x = []
centroid_y = []


for x in ward_variables['centroid']:
    centroid_x.append(x.x)
    centroid_y.append(x.y)
    
ward_variables['centroid_x'] = centroid_x 
ward_variables['centroid_y'] = centroid_y

In [79]:
#Join income data to ward variables dataframe

#Import income .csv
ward_income = pd.read_csv("Data/modelled-household-income-estimates-wards.csv")

#Change code for city of london to be consistent with shapefile
ward_income.set_value(0,['Code'], 'E05001554')

#Join to ward variables dataframe
ward_variables = ward_variables.merge(ward_income[['Code','Median 2012_13']], left_on='ward_code', right_on = 'Code')

#Rename Column
ward_variables=ward_variables.rename(columns = {'Median 2012_13':'med_income_2012_13'})

In [81]:
#Categorise as high or low income

#Caluculate median
med_income = np.median(ward_variables['med_income_2012_13'])

#Define function for categories
def income_category (row):
    if row['med_income_2012_13'] >= med_income:
          return 'high'
    if row['med_income_2012_13'] < med_income:
          return 'low'

#Apply function to create new dataframe coloumn
ward_variables['income_category'] = ward_variables.apply(lambda row: income_category (row), axis=1)

In [82]:
#Import occupation data by ward

#Import occupation .csv
ward_occupation = pd.read_csv("Data/ward_occupation_data.csv")

ward_occupation['combined_professionals_pct'] = ((ward_occupation['Sex: All persons; Occupation: 1. Managers, directors and senior officials; measures: Value'] +
    ward_occupation['Sex: All persons; Occupation: 2. Professional occupations; measures: Value'])/
    ward_occupation['Sex: All persons; Occupation: All categories: Occupation; measures: Value']) * 100

In [83]:
ward_occupation.to_csv('spatial_analysis/occupation_check.csv')

In [84]:
ward_variables = ward_variables.merge(ward_occupation[['geography code', 'combined_professionals_pct']], left_on='ward_code', right_on='geography code')

In [85]:
ward_variables.to_csv('spatial_analysis/check.csv')

## Step 2 - Create KDE surface and attach KDE values to ward centroids

In [86]:
kernels = {}

for x in tokens_lst:

    #Set up grid and KDE calculation
    X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values_coffee = np.vstack([df[x]['eastings'], df[x]['northings']])
    kernels[x] = stats.gaussian_kde(values_coffee, 0.1) #Bandwidth set ot 0.1
    Z = np.reshape(kernels[x](positions).T, X.shape)
    
    #Plot KDE surface and save
    token = x

    import matplotlib.pyplot as plt
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)
    
    #Add london wards as patches
    ax.add_collection(PatchCollection(ward_patches, alpha=1, facecolor='None', lw = 0.1, 
                                      edgecolor = '0'))
    kde_surface = ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
               extent=[xmin, xmax, ymin, ymax])
    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=2)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title(token)
    plt.xlabel('eastings')
    plt.ylabel('northings')

    #ax.set_ylim([ymin, ymax])
    plt.colorbar(kde_surface, cmap=plt.cm.gist_earth_r )

    #plt.show()
    fig.savefig('spatial_analysis/figures/' + token + '_kde.png', dpi=200, figsize = (12,8))

    #Evaluate density at each ward centroid
    ward_kde = []
    
    for y in range(len(ward_variables['ward_name'])):
        kde = kernels[x].evaluate([ward_variables['centroid'].iloc[y].x, 
                             ward_variables['centroid'].iloc[y].y])
        ward_kde.append(kde[0])
        
    ward_variables[x + '_kde'] = ward_kde

## Step 3 - Calculate number of each token within each ward

In [87]:
# Convert token coordinates to shapely point file for each dataframe
from shapely.geometry import Point

token_points = {}
token_points_coords = {}

for x in tokens_lst:
    #Make shapely points
    token_points[x] = [Point(xy) for xy in zip(df[x]['eastings'], df[x]['northings'])]
    
    #convert shapely points into coordinate tuples
    point_coords = []
    for i in range(len(token_points[x])):
        a = ([token_points[x][i].x, token_points[x][i].y])
        point_coords.append(a)
    token_points_coords[x] = point_coords

In [88]:
#Build a spatial index based on the bounding boxes of the polygons
from rtree import index
idx = index.Index()
count = -1
for q in polygon_shapes:
    count +=1
    idx.insert(count, q.bbox)

In [89]:
#Assign one or more matching polygons to each point

for x in tokens_lst:
    
    matches = []

    for i in range(len(df[x]['Token'])): #Iterate through each point
        temp= None
        #print "Point ", i

        #Iterate only through the bounding boxes which contain the point
        for j in idx.intersection(token_points_coords[x][i]):
            #Verify that point is within the polygon itself not just the bounding box
            if token_points[x][i].within(polygons[j]):
                temp=j
                break
        matches.append(temp) #Either the first match found, or None for no matches
    
    df[x]['ward_no'] = matches

    df[x] = df[x].merge(ward_variables[['position','ward_name', 'ward_code']], left_on='ward_no', right_on='position')
    

    

In [90]:
#Names of wards
london_wards = ward_variables['ward_name']

#Count the number of tokens for each ward and join to ward_variables dataframe
for x in tokens_lst:
    token_count = []
    for b in london_wards:
            temp_df = df[x].loc[(df[x]['Token'] == x) & 
                                        (df[x]['ward_name'] == b)]
            token_count.append(len(temp_df.index))
    ward_variables[x + '_count'] = token_count

## Step 4 - Calculate number of businesses within each ward

In [91]:
# Convert token coordinates to shapely point file for each dataframe
from shapely.geometry import Point

#Make shapely points
food_bus_points = [Point(xy) for xy in zip(food_bus_df['eastings'], food_bus_df['northings'])]
    
#convert shapely points into coordinate tuples
point_coords = []
for i in range(len(food_bus_points)):
    a = ([food_bus_points[i].x, food_bus_points[i].y])
    point_coords.append(a)
food_bus_coords = point_coords

In [92]:
matches = []

for i in range(len(food_bus_df['Latitude'])): #Iterate through each point
    temp= None
    #print "Point ", i

    #Iterate only through the bounding boxes which contain the point
    for j in idx.intersection(food_bus_coords[i]):
        #Verify that point is within the polygon itself not just the bounding box
        if food_bus_points[i].within(polygons[j]):
            temp=j
            break
    matches.append(temp) #Either the first match found, or None for no matches
    
food_bus_df['ward_no'] = matches

food_bus_df = food_bus_df.merge(ward_variables[['position','ward_name', 'ward_code']], left_on='ward_no', right_on='position')


In [93]:
token_count = []

for b in london_wards:
        temp_df = food_bus_df.loc[food_bus_df['ward_name'] == b]
        token_count.append(len(temp_df.index))
ward_variables['all_business_count'] = token_count

## Step 5 - Calculate Location Quotient

In [94]:
#Calculate LQ for each token

for x in tokens_lst:
    ward_variables[x +'_lq'] = (ward_variables[x + '_count']/(ward_variables['all_business_count']))/(len(df[x])/float(len(food_bus_df.index)))

In [95]:
#Try Visualise the LQ

for x in tokens_lst:
    
    import matplotlib.pyplot as plt
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)

    #Add london wards as patches
    cmap = plt.get_cmap('Blues')
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(ward_variables[x + '_lq']), lw = 0.1, 
                                          edgecolor = '0')
    ax.add_collection(test)

    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=6, markerfacecolor='red', alpha=0.8)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title(x + ' LQ')
    plt.xlabel('eastings')
    plt.ylabel('northings')


    m = cm.ScalarMappable(cmap=cm.Blues)
    m.set_array(ward_variables[x + '_lq'])
    plt.colorbar(m)

    #plt.show()
    
     #plt.show()
    fig.savefig('spatial_analysis/figures/' + x + '_LQ.png', dpi=200, figsize = (12,8))


## Step 6 - Calculate smoothed location quotient

In [96]:
#Calculate kde for all wards

#Add values for LQ for all businesses by ward
#Set up grid and KDE calculation
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([food_bus_df['eastings'], food_bus_df['northings']])
kernels = stats.gaussian_kde(values, 0.1) #Bandwidth set ot 0.1
Z = np.reshape(kernels(positions).T, X.shape)
    

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
    
#Add london wards as patches
ax.add_collection(PatchCollection(ward_patches, alpha=1, facecolor='None', lw = 0.1, 
                                      edgecolor = '0'))
kde_surface = ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
               extent=[xmin, xmax, ymin, ymax])
ax.plot(food_bus_df['eastings'], food_bus_df['northings'], 'k.', markersize=2)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
plt.title('food_business_KDE')
plt.xlabel('eastings')
plt.ylabel('northings')

#ax.set_ylim([ymin, ymax])
plt.colorbar(kde_surface, cmap=plt.cm.gist_earth_r )

#plt.show()
fig.savefig('spatial_analysis/figures/foodbusiness_kde.png', dpi=200, figsize = (12,8))

#Evaluate density at each ward centroid
ward_kde = []
    
for y in range(len(ward_variables['ward_name'])):
    kde = kernels.evaluate([ward_variables['centroid'].iloc[y].x, 
                             ward_variables['centroid'].iloc[y].y])
    ward_kde.append(kde[0])
        
ward_variables['all_business_kde'] = ward_kde

In [97]:
for x in tokens_lst:
    ward_variables[x +'_smooth_lq'] = (ward_variables[x + '_kde']/(ward_variables['all_business_kde']))/(sum(ward_variables[x + '_kde'])/float(sum(ward_variables['all_business_kde'])))

In [98]:
ward_variables

Unnamed: 0,position,ward_code,ward_name,centroid,centroid_x,centroid_y,Code,med_income_2012_13,income_category,geography code,...,sushi_smooth_lq,thai_smooth_lq,chicken_smooth_lq,fried_smooth_lq,fish_smooth_lq,kebab_smooth_lq,costcutter_smooth_lq,waitrose_smooth_lq,sainsburys_smooth_lq,tesco_smooth_lq
0,0,E05000405,Chessington South,POINT (517652.3434795503 162339.1609310878),517652.343480,162339.160931,E05000405,38310,high,E05000405,...,1.380314e-14,8.373234e-06,2.034534,3.047417,2.092624,2.098936,0.000002,3.391052e-07,3.137974,5.832419e-02
1,1,E05000414,Tolworth and Hook Rise,POINT (519124.9351873993 165300.0168107432),519124.935187,165300.016811,E05000414,37840,low,E05000414,...,3.183376e-03,1.112350e+00,0.591534,1.090986,0.642651,1.491646,0.939468,2.135553e-01,3.413788,1.709067e+00
2,2,E05000401,Berrylands,POINT (519108.406980712 167344.3249926214),519108.406981,167344.324993,E05000401,42330,high,E05000401,...,1.583157e-02,2.524468e+00,0.490083,0.716221,0.724135,1.148679,1.502875,2.132931e+00,2.317537,2.757362e-01
3,3,E05000400,Alexandra,POINT (520118.1401777423 166393.3293737157),520118.140178,166393.329374,E05000400,41390,high,E05000400,...,4.618977e-01,3.842511e-01,0.521756,1.488705,0.542988,1.482573,1.742734,5.058457e-01,0.413352,1.355478e+00
4,4,E05000402,Beverley,POINT (521204.9458895464 168516.7882485987),521204.945890,168516.788249,E05000402,40700,high,E05000402,...,1.857454e+00,1.726968e-02,0.071723,0.002836,0.696751,1.442905,2.197082,3.941100e+00,0.065761,1.281378e+00
5,5,E05000406,Coombe Hill,POINT (520755.1783848286 170623.2344953832),520755.178385,170623.234495,E05000406,45650,high,E05000406,...,3.667128e-03,6.095854e-01,0.210204,0.029624,0.903743,0.611975,0.215452,8.868991e-01,1.207561,4.300950e-01
6,6,E05000404,Chessington North and Hook,POINT (518064.3902391941 164595.0191127136),518064.390239,164595.019113,E05000404,37230,low,E05000404,...,1.386950e-06,5.200541e-01,1.315066,0.692906,0.749167,1.859986,0.173956,3.624646e-02,7.106886,2.193644e+00
7,7,E05000413,Surbiton Hill,POINT (518303.1766019526 166453.847793755),518303.176602,166453.847794,E05000413,43160,high,E05000413,...,9.497274e-05,4.569783e+00,0.598943,0.916065,0.717714,1.063128,2.162570,2.220716e+00,2.856802,7.224158e-01
8,8,E05000410,Old Malden,POINT (521495.0642951772 166551.3661053034),521495.064295,166551.366105,E05000410,41760,high,E05000410,...,6.776935e+00,4.963256e-03,0.268776,0.187637,0.757987,1.670406,0.491553,4.671473e+00,0.087281,1.796966e+00
9,9,E05000412,St. Mark's,POINT (518248.3378622264 167703.0217714706),518248.337862,167703.021771,E05000412,44930,high,E05000412,...,1.866450e-02,3.241981e+00,0.788001,1.478903,0.747690,0.922346,0.529081,3.492923e+00,3.815740,8.371581e-02


In [99]:
#Try Visualise the Smooth LQ

for x in tokens_lst:
    
    import matplotlib.pyplot as plt
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)

    #Add london wards as patches
    cmap = plt.get_cmap('BuGn')
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(ward_variables[x + '_smooth_lq']), lw = 0.1, 
                                          edgecolor = '0')
    ax.add_collection(test)

    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=6, markerfacecolor='red', alpha=0.8)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title(x + ' Smoooth_LQ')
    plt.xlabel('eastings')
    plt.ylabel('northings')


    m = cm.ScalarMappable(cmap=cm.BuGn)
    m.set_array(ward_variables[x + '_smooth_lq'])
    plt.colorbar(m)

    #plt.show()
    
     #plt.show()
    fig.savefig('spatial_analysis/figures/' + x + '_Smooth_LQ.png', dpi=200, figsize = (12,8))

## Step 6 - Moran's I Calculation

In [103]:
#Global Moran's I

import pysal
import numpy as np


#Create rooks weights matrix
w = pysal.rook_from_shapefile("Data/ESRI/london_wards.shp")

morans_value = []
morans_EI = []
morans_p = []

for x in tokens_lst:
    #Import variable as array
    y = np.array(ward_variables[x + '_kde'])

    #Calculate Moran's I
    mi = pysal.Moran(y, w, two_tailed=False)

    
    morans_value.append("%.3f"%mi.I)
    morans_EI.append(mi.EI)
    morans_p.append("%.5f"%mi.p_norm)

global_morans_df = pd.DataFrame({'morans_value': morans_value, 'morans_EI': morans_EI, 'morans_p':morans_p}, 
                                index=tokens_lst)

In [104]:
global_morans_df

Unnamed: 0,morans_EI,morans_p,morans_value
cafe,-0.001603,0.0,0.883
coffee,-0.001603,0.0,0.822
pizza,-0.001603,0.0,0.806
wine,-0.001603,0.0,0.811
sushi,-0.001603,0.0,0.727
thai,-0.001603,0.0,0.745
chicken,-0.001603,0.0,0.79
fried,-0.001603,0.0,0.767
fish,-0.001603,0.0,0.827
kebab,-0.001603,0.0,0.735


In [112]:
#Local Moran's I

for x in tokens_lst:

    y = np.array(ward_variables[x + '_kde'])
    lm = pysal.Moran_Local(y,w)
    
    ward_variables[x + '_lmoran_value'] = lm.Is
    ward_variables[x + '_lmoran_p'] = lm.p_sim

In [113]:
ward_variables

Unnamed: 0,position,ward_code,ward_name,centroid,centroid_x,centroid_y,Code,med_income_2012_13,income_category,geography code,...,kebab_lmoran_value,kebab_lmoran_p,costcutter_lmoran_value,costcutter_lmoran_p,waitrose_lmoran_value,waitrose_lmoran_p,sainsburys_lmoran_value,sainsburys_lmoran_p,tesco_lmoran_value,tesco_lmoran_p
0,0,E05000405,Chessington South,POINT (517652.3434795503 162339.1609310878),517652.343480,162339.160931,E05000405,38310,high,E05000405,...,0.611771,0.343,0.890744,0.169,0.384511,0.334,-0.326732,0.175,0.374731,0.449
1,1,E05000414,Tolworth and Hook Rise,POINT (519124.9351873993 165300.0168107432),519124.935187,165300.016811,E05000414,37840,low,E05000414,...,0.278705,0.208,0.164763,0.364,0.166883,0.388,0.059255,0.308,0.153881,0.217
2,2,E05000401,Berrylands,POINT (519108.406980712 167344.3249926214),519108.406981,167344.324993,E05000401,42330,high,E05000401,...,0.086588,0.350,0.008305,0.409,0.119076,0.123,0.148508,0.114,0.512422,0.072
3,3,E05000400,Alexandra,POINT (520118.1401777423 166393.3293737157),520118.140178,166393.329374,E05000400,41390,high,E05000400,...,0.235632,0.127,0.063727,0.266,-0.052399,0.355,0.025539,0.497,0.237289,0.071
4,4,E05000402,Beverley,POINT (521204.9458895464 168516.7882485987),521204.945890,168516.788249,E05000402,40700,high,E05000402,...,0.002482,0.308,-0.174475,0.155,0.359984,0.157,-0.078463,0.303,0.076998,0.022
5,5,E05000406,Coombe Hill,POINT (520755.1783848286 170623.2344953832),520755.178385,170623.234495,E05000406,45650,high,E05000406,...,0.627183,0.045,0.469056,0.037,-0.159007,0.170,-0.010260,0.417,0.641590,0.006
6,6,E05000404,Chessington North and Hook,POINT (518064.3902391941 164595.0191127136),518064.390239,164595.019113,E05000404,37230,low,E05000404,...,0.466848,0.118,0.722509,0.084,0.368003,0.182,-0.068562,0.456,0.246370,0.133
7,7,E05000413,Surbiton Hill,POINT (518303.1766019526 166453.847793755),518303.176602,166453.847794,E05000413,43160,high,E05000413,...,0.212598,0.200,-0.110116,0.266,0.013754,0.381,0.160310,0.214,0.399098,0.054
8,8,E05000410,Old Malden,POINT (521495.0642951772 166551.3661053034),521495.064295,166551.366105,E05000410,41760,high,E05000410,...,0.209479,0.255,0.484237,0.052,0.047471,0.300,0.317207,0.093,0.317589,0.019
9,9,E05000412,St. Mark's,POINT (518248.3378622264 167703.0217714706),518248.337862,167703.021771,E05000412,44930,high,E05000412,...,0.188680,0.162,0.213638,0.238,0.264186,0.285,0.697290,0.160,0.820426,0.013


In [119]:
#Visualise Local Morans Values

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)

for x in tokens_lst:
    
    cmap = plt.get_cmap('bwr')
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(ward_variables[x + '_lmoran_p']), lw = 0.1, 
                                          edgecolor = '0')
    ax.add_collection(test)

    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=6, markerfacecolor='green', alpha=0.8)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title(x + ' Local Morans P Values')
    plt.xlabel('eastings')
    plt.ylabel('northings')


    #m = cm.ScalarMappable(cmap=cm.bwr)
    #m.set_array(ward_variables[x + '_lmoran_value'])
    #plt.colorbar(m)
    
    #plt.show()
    fig.savefig('spatial_analysis/figures/' + x + '_local_morans_p.png', dpi=200, figsize = (12,8))



In [120]:
#Visualise Local Morans P Values

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)

for x in tokens_lst:
    
    cmap = plt.get_cmap('bwr')
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(ward_variables[x + '_lmoran_value']), lw = 0.1, 
                                          edgecolor = '0')
    ax.add_collection(test)

    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=6, markerfacecolor='green', alpha=0.8)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title(x + ' Local Morans Values')
    plt.xlabel('eastings')
    plt.ylabel('northings')


    #m = cm.ScalarMappable(cmap=cm.bwr)
    #m.set_array(ward_variables[x + '_lmoran_value'])
    #plt.colorbar(m)
    
    #plt.show()
    fig.savefig('spatial_analysis/figures/' + x + '_local_morans.png', dpi=200, figsize = (12,8))


## Step 7 - Geographically Weighted Regression

In [156]:
y = np.array([[ward_variables['coffee_kde']]])
x = 
            
            
            
            
            ), ndmin=2)
x = np.array([ward_variables['med_income_2012_13']], ndmin=2)

a = pysal.spreg.ols.OLS(y,x,w)

SyntaxError: invalid syntax (<ipython-input-156-c943fba266cd>, line 2)

In [152]:
x

array([[38310, 37840, 42330, 41390, 40700, 45650, 37230, 43160, 41760,
        44930, 41410, 45600, 34650, 44320, 41850, 48530, 42660, 45320,
        46610, 34630, 44160, 44600, 47030, 39770, 39130, 31000, 32400,
        33530, 36810, 28770, 42260, 26720, 39500, 31220, 38600, 33890,
        33720, 37430, 34430, 38070, 42810, 45660, 40510, 44850, 40930,
        46210, 41650, 44700, 44590, 48020, 43460, 47190, 48530, 34060,
        36090, 43170, 39460, 46250, 32330, 38090, 31740, 32230, 31870,
        31150, 30860, 37040, 29490, 34030, 42140, 35040, 51830, 34010,
        34340, 32850, 32530, 31890, 33580, 39920, 34790, 35830, 49680,
        47080, 29020, 27150, 28550, 29350, 30500, 36480, 48480, 32930,
        27930, 38970, 46240, 32520, 31950, 36220, 41730, 35420, 36040,
        43600, 45540, 34300, 50170, 36500, 44760, 47580, 34470, 33160,
        34730, 35250, 37160, 35820, 40010, 39740, 38130, 43790, 41740,
        37010, 42250, 33590, 29560, 29400, 39890, 33170, 37990, 32680,
      

In [146]:
np.array([1, 2, 3], ndmin=2)

array([[1, 2, 3]])

In [134]:
x = []
x.append([])
x[0].append(ward_variables['med_income_2012_13'])
x[0].append(ward_variables['combined_professionals_pct'])

In [135]:
x

[[0      38310
  1      37840
  2      42330
  3      41390
  4      40700
  5      45650
  6      37230
  7      43160
  8      41760
  9      44930
  10     41410
  11     45600
  12     34650
  13     44320
  14     41850
  15     48530
  16     42660
  17     45320
  18     46610
  19     34630
  20     44160
  21     44600
  22     47030
  23     39770
  24     39130
  25     31000
  26     32400
  27     33530
  28     36810
  29     28770
         ...  
  595    32310
  596    30740
  597    34550
  598    31630
  599    32380
  600    31080
  601    31570
  602    32340
  603    32330
  604    34270
  605    31580
  606    29730
  607    29480
  608    32580
  609    33490
  610    31960
  611    33920
  612    40570
  613    32870
  614    33250
  615    31530
  616    33000
  617    32470
  618    33200
  619    31340
  620    37180
  621    37400
  622    33920
  623    31860
  624    63620
  Name: med_income_2012_13, dtype: int64, 0      26.755113
  1      28.841285
  2    

## Step 7 - Explore Clustering Techniques

In [136]:
list(ward_variables.columns.values)

['position',
 'ward_code',
 'ward_name',
 'centroid',
 'centroid_x',
 'centroid_y',
 'Code',
 'med_income_2012_13',
 'income_category',
 'geography code',
 'combined_professionals_pct',
 'cafe_kde',
 'coffee_kde',
 'pizza_kde',
 'wine_kde',
 'sushi_kde',
 'thai_kde',
 'chicken_kde',
 'fried_kde',
 'fish_kde',
 'kebab_kde',
 'costcutter_kde',
 'waitrose_kde',
 'sainsburys_kde',
 'tesco_kde',
 'cafe_count',
 'coffee_count',
 'pizza_count',
 'wine_count',
 'sushi_count',
 'thai_count',
 'chicken_count',
 'fried_count',
 'fish_count',
 'kebab_count',
 'costcutter_count',
 'waitrose_count',
 'sainsburys_count',
 'tesco_count',
 'all_business_count',
 'cafe_lq',
 'coffee_lq',
 'pizza_lq',
 'wine_lq',
 'sushi_lq',
 'thai_lq',
 'chicken_lq',
 'fried_lq',
 'fish_lq',
 'kebab_lq',
 'costcutter_lq',
 'waitrose_lq',
 'sainsburys_lq',
 'tesco_lq',
 'all_business_kde',
 'cafe_smooth_lq',
 'coffee_smooth_lq',
 'pizza_smooth_lq',
 'wine_smooth_lq',
 'sushi_smooth_lq',
 'thai_smooth_lq',
 'chicken_smoo

In [147]:
#Create scatter Matrix of Correlations

#Subset Data
lst = ['med_income_2012_13', 'combined_professionals_pct']

subset_columns = []

for x in lst:
    subset_columns.append(x)

for x in tokens_lst:
    a = x + '_smooth_lq'
    subset_columns.append(a)
    
ward_variables_subset = ward_variables[subset_columns]

#Rename columns for visualisation
for x in tokens_lst:
    #Rename columns for visualisation
    ward_variables_subset.rename(columns={x + '_smooth_lq': x}, inplace=True)

In [148]:
# takes the dataset, an alpha value for opacity, a figure size setting, and a specification of the diagonal charts

from pandas.tools.plotting import scatter_matrix

a = pd.scatter_matrix(ward_variables_subset, alpha=0.2, diagonal='kde', figsize=(12,12))

plt.show()r

In [149]:
#Calculate correlation coefficients

ward_variables_subset.corr()

Unnamed: 0,med_income_2012_13,combined_professionals_pct,cafe,coffee,pizza,wine,sushi,thai,chicken,fried,fish,kebab,costcutter,waitrose,sainsburys,tesco
med_income_2012_13,1.0,0.864697,0.011773,0.322872,-0.188277,-0.11503,0.332464,0.455396,-0.512733,-0.342476,-0.3088,-0.324942,-0.17918,0.412454,0.084619,-0.094055
combined_professionals_pct,0.864697,1.0,0.106937,0.416787,-0.210705,-0.09553,0.411923,0.508384,-0.484032,-0.279398,-0.468047,-0.408628,-0.264116,0.406241,0.052243,-0.104012
cafe,0.011773,0.106937,1.0,0.161265,-0.009975,-0.183108,0.060346,0.039879,-0.002654,0.166965,0.011098,0.081446,-0.056293,-0.045642,-0.05716,-0.091326
coffee,0.322872,0.416787,0.161265,1.0,-0.072129,-0.19389,0.301811,0.267482,-0.303784,-0.209188,-0.359817,-0.220067,-0.204655,0.191651,0.052386,-0.092915
pizza,-0.188277,-0.210705,-0.009975,-0.072129,1.0,0.056752,-0.164339,-0.041257,0.294167,0.069308,0.143843,0.264059,-0.065838,-0.110813,-0.122068,0.144514
wine,-0.11503,-0.09553,-0.183108,-0.19389,0.056752,1.0,-0.065418,-0.017326,0.153688,0.028947,-9e-06,0.006606,0.053377,-0.049691,-0.131439,0.230798
sushi,0.332464,0.411923,0.060346,0.301811,-0.164339,-0.065418,1.0,0.264629,-0.261847,-0.127685,-0.395438,-0.2809,-0.20251,0.262574,-0.081309,-0.020326
thai,0.455396,0.508384,0.039879,0.267482,-0.041257,-0.017326,0.264629,1.0,-0.276747,-0.156053,-0.210065,-0.099268,-0.205431,0.199938,0.112459,-0.077094
chicken,-0.512733,-0.484032,-0.002654,-0.303784,0.294167,0.153688,-0.261847,-0.276747,1.0,0.678215,0.296923,0.278714,-0.00852,-0.297823,-0.060511,0.022118
fried,-0.342476,-0.279398,0.166965,-0.209188,0.069308,0.028947,-0.127685,-0.156053,0.678215,1.0,0.201447,0.201289,0.006171,-0.263855,-0.033316,0.097611


In [142]:
from sklearn.cluster import DBSCAN

subset = ward_variables_subset[['med_income_2012_13','coffee','chicken', 'sushi', 'kebab']]

import sklearn as sk
scaled = sk.preprocessing.scale(subset)
dbscan = DBSCAN(eps=0.8, min_samples=5)
dbscan.fit(scaled)
dbscan_labels = dbscan.labels_
ward_variables['clusters'] = dbscan_labels

#Visualise
cmap = plt.get_cmap('gist_ncar')
ward_variables.plot(kind = 'scatter', x='centroid_x', y = 'centroid_y', c='clusters', cmap =  cmap)
plt.show()


In [206]:
for x in range(5):

    from sklearn.cluster import AgglomerativeClustering

    subset = ward_variables_subset[['coffee','chicken','sushi', 'kebab']]

    import sklearn as sk
    scaled = sk.preprocessing.scale(subset)
    AgClustering = AgglomerativeClustering(n_clusters=x+1)
    AgClustering.fit(scaled)
    AgClustering_labels = AgClustering.labels_
    ward_variables['ag_clusters'] = AgClustering_labels

    #Visualise Scatter
    cmap = plt.get_cmap('gist_ncar')
    ward_variables.plot(kind = 'scatter', x='centroid_x', y = 'centroid_y', c='ag_clusters', cmap =  cmap)
    #plt.show()

    #cloropleth
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(111, axisbg='w', frame_on=False)

    # use a blue colour ramp - we'll be converting it to a map using cmap()
    cmap = plt.get_cmap('gist_ncar')
    pc = PatchCollection(ward_patches, alpha=1, lw = 0.1, edgecolor = '0')

    # impose our colour map onto the patch collection
    norm = Normalize()
    pc.set_facecolor(cmap(norm(ward_variables['ag_clusters'].values)))
    ax.add_collection(pc)

    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title('ag_cluster')
    plt.xlabel('eastings')
    plt.ylabel('northings')

    #plt.show()

    fig.savefig('spatial_analysis/figures/' + str(x) + 'cluster.png', dpi=200, figsize = (12,8))
