# Spatial Analysis of Food Business Data

### Exploring gentrification through consumer preferences

In [93]:
#LOAD REQUIRED PACKAGES

#Stats and data structures
import pandas as pd
import numpy as np

from scipy import stats

#Matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors
from matplotlib.colors import Normalize
from matplotlib.collections import PatchCollection
from descartes import PolygonPatch

#Coordinate system transformation
import pyproj
from pyproj import Proj, transform

#Shapefile reading and manipulation
import shapefile
import shapely
from shapely.geometry import Polygon
from shapely.geometry import Point

## Step 1 - Data import and cleaning

In [94]:
#Set working directory so that the relative "Data/..." and "spatial_analysis/..." paths used below resolve
#NOTE(review): hard-coded, machine-specific absolute path - change it before running elsewhere
import os
os.chdir("C:/Users/Claire/Google Drive/LondonGentrification")

In [95]:
#Import .csv data (paths are relative to the working directory)

#Word tokens
tokens_df = pd.read_csv("Data/FoodPremises/tokens_spatial.csv")

#Food businesses
#Same "Data/" capitalisation as every other path in this notebook; the
#lower-case "data/" spelling only resolves on case-insensitive filesystems
food_bus_df = pd.read_csv("Data/FoodPremises/london_premises.csv")

#Remove records with no coordinates (a missing or non-positive Latitude fails
#the comparison). The explicit copy makes this an independent frame, so the
#columns added to it later are not written to a slice of the raw table.
food_bus_df = food_bus_df.loc[food_bus_df['Latitude']>0].copy()

In [96]:
#Check column names and data types
#print() with a single argument behaves identically on Python 2 and Python 3
print(tokens_df.dtypes)
print(food_bus_df.dtypes)

BusinessID                float64
BusinessName               object
BusinessType               object
BusinessTypeID            float64
ConfidenceInManagement    float64
Hygiene                   float64
LocalAuthorityCode        float64
LocalAuthorityName         object
PostCode                   object
RatingValue               float64
Structural                float64
Token                      object
lat                       float64
lon                       float64
dtype: object
Index                       int64
BusinessName               object
BusinessType               object
BusinessTypeID              int64
PostCode                   object
RatingValue               float64
RatingDate                 object
LocalAuthorityCode          int64
LocalAuthorityName         object
Hygiene                   float64
Structural                float64
ConfidenceInManagement    float64
Longitude                 float64
Latitude                  float64
dtype: object


In [97]:
#Function to add eastings and northings from lat an lon
#Add eastings and northings to word tokens

def transform_coordinates (data,input_espg, output_espg, input_x, input_y):
    """
    Reproject point coordinates and store the result on ``data``.

    :param data: dataframe that receives the 'eastings' and 'northings' columns (modified in place)
    :param input_espg: code of the source coordinate system, e.g. "EPSG:4326"
    :param output_espg: code of the target coordinate system, e.g. "EPSG:27700"
    :param input_x: x values in the source system, one per row of ``data`` (must provide ``tolist()``)
    :param input_y: y values in the source system, same length as ``input_x``
    :return: None - the converted values are written to ``data['eastings']`` and ``data['northings']``
    """
    #define input and output projection
    input_projection = pyproj.Proj("+init=" + input_espg)
    output_projection = pyproj.Proj("+init=" + output_espg)

    x_list = input_x.tolist()
    y_list = input_y.tolist()

    if x_list:
        #pyproj.transform accepts whole sequences, so every point is converted
        #in one call instead of one Python-level call per point
        eastings, northings = pyproj.transform(input_projection, output_projection,
                                               x_list, y_list)
    else:
        #Nothing to convert
        eastings, northings = [], []

    #Add to the dataframe (positional assignment, one value per row)
    data['eastings'] = eastings
    data['northings'] = northings

In [98]:
#Apply function to both dataframes - convert tokens and businesses from
#longitude / latitude to eastings and northings

for frame, lon_column, lat_column in ((tokens_df, 'lon', 'lat'),
                                      (food_bus_df, 'Longitude', 'Latitude')):
    transform_coordinates(frame, "EPSG:4326", "EPSG:27700",
                          frame[lon_column], frame[lat_column])

In [99]:
#Create one subsetted dataframe per token of interest, reused by the cells below

#Tokens of interest - the list every later loop iterates over
tokens_lst = ['cafe', 'coffee', 'pizza', 'wine', 'sushi', 'thai', 'chicken',
              'fried', 'fish', 'kebab', 'costcutter', 'waitrose', 'sainsburys', 'tesco', 'grill']

#Colour code associated with each token (same order as tokens_lst)
tokens_color = ['#9fc54d', '#75c156', '#33a457', '#71b67b', '#61bdf0', '#1e71b8', '#e03c00',
                '#e56000', '#f08c00', '#ffea00', '#18563e', '#729f1e', '#ee7a01', '#0053a0', '#1ea86c']

#Token -> dataframe holding only the rows whose 'Token' column equals that token
df = dict((token_name, pd.DataFrame(tokens_df.loc[tokens_df['Token'] == token_name]))
          for token_name in tokens_lst)

In [100]:
#Import london wards shapefile and turn it into matplotlib patches for plotting

#Ward boundaries, plus the separate shapefiles for wards south / north of the river
polygons_sf = shapefile.Reader("Data/ESRI/london_wards.shp")
polygons_south_sf = shapefile.Reader("Data/ESRI/south_london_wards.shp")
polygons_north_sf = shapefile.Reader("Data/ESRI/north_london_wards.shp")

#One shapely polygon per shape, built from that shape's list of points
#NOTE(review): Polygon(points) reads the points as a single ring - confirm no ward is multi-part
polygon_shapes = polygons_sf.shapes()
polygon_points = [ward_shape.points for ward_shape in polygon_shapes]
polygons = [Polygon(ring) for ring in polygon_points]

#One matplotlib patch per polygon, in the same order, for the mapping figures
ward_patches = [PolygonPatch(ward_polygon) for ward_polygon in polygons]

#Figure extent: bounding box of the shapefile (eastings and northings) widened by 2000 on every side
west, south, east, north = polygons_sf.bbox
xmin, xmax = west - 2000, east + 2000
ymin, ymax = south - 2000, north + 2000

In [101]:
#Create wards dataframe from the attribute records of the wards shapefile

#polygons_sf.fields #Access shapefile fields
#polygons_sf.records #Access shapefile records

records = polygons_sf.records()

#First attribute minus one is stored as 'position'; the spatial joins below match it against polygon list indices
#NOTE(review): presumably the first attribute is a 1-based ward id in shape order - confirm against the shapefile
position = [record[0] - 1 for record in records]
ward_name = [record[1] for record in records]
ward_code = [record[2] for record in records]

#Compile dataframe
ward_variables = pd.DataFrame({'position': position, 'ward_name': ward_name,'ward_code':ward_code})

In [102]:
#Create field to define wards north and south of the river

records_north = polygons_north_sf.records()
records_south = polygons_south_sf.records()

#Ward code (third attribute) of every ward in each shapefile, with a matching 'n' / 's' flag
code_north = [record[2] for record in records_north]
n = ['n'] * len(code_north)
code_south = [record[2] for record in records_south]
s = ['s'] * len(code_south)

#Compile dataframe
north_wards_df = pd.DataFrame({'code':code_north, 'n_s':n})
south_wards_df = pd.DataFrame({'code':code_south, 'n_s':s})
#pd.concat is the supported equivalent of DataFrame.append, which newer pandas no longer provides
north_south_df = pd.concat([north_wards_df, south_wards_df])

#Merge north south dataframe to ward variables dataframe by ward code
#(inner join: a ward whose code is in neither shapefile is dropped here)
ward_variables = pd.merge(ward_variables, north_south_df,left_on='ward_code', right_on='code' , how='inner')


In [103]:
#Calculate the centroid of each ward polygon and add it to the ward variables dataframe
#NOTE(review): the assignment is positional - it relies on ward_variables still having
#one row per polygon, in polygon order, after the merge above; confirm
centroids = [ward_polygon.centroid for ward_polygon in polygons]
ward_variables['centroid'] = centroids

#Split each centroid point into separate x and y columns
centroid_x = [centre.x for centre in ward_variables['centroid']]
centroid_y = [centre.y for centre in ward_variables['centroid']]

ward_variables['centroid_x'] = centroid_x
ward_variables['centroid_y'] = centroid_y

In [104]:
#Join income data to ward variables dataframe

#Import income .csv
ward_income = pd.read_csv("Data/modelled-household-income-estimates-wards.csv")

#Change code for city of london to be consistent with shapefile
#.loc replaces DataFrame.set_value, which is deprecated and absent from newer pandas
#NOTE(review): assumes the City of London is the first row (index label 0) of the csv - confirm
ward_income.loc[0, 'Code'] = 'E05001554'

#Join to ward variables dataframe (default inner join on the ward code)
ward_variables = ward_variables.merge(ward_income[['Code','Median 2012_13']], left_on='ward_code', right_on = 'Code')

#Rename Column
ward_variables=ward_variables.rename(columns = {'Median 2012_13':'med_income_2012_13'})

In [105]:
#Categorise as high or low income (below or above median)

#Calculate the median of the ward income column; income_category compares each ward against it
med_income = np.median(ward_variables['med_income_2012_13'])

#Define function for categories
def income_category (row):
    """
    Label a ward as 'high' or 'low' income relative to the global ``med_income``.

    :param row: row of the ward dataframe (anything indexable by 'med_income_2012_13')
    :return: 'low' when the income is below ``med_income``, 'high' when it is equal or above,
             None when neither comparison holds (e.g. a NaN income)
    """
    income = row['med_income_2012_13']
    if income < med_income:
        return 'low'
    elif income >= med_income:
        return 'high'

#Apply the function row by row to create the new dataframe column
ward_variables['income_category'] = ward_variables.apply(income_category, axis=1)

In [106]:
#Import occupation data by ward

#Import occupation .csv
ward_occupation = pd.read_csv("Data/ward_occupation_data.csv")

#Percentage of all counted persons who are managers / directors / senior officials
#or in professional occupations
managers = ward_occupation['Sex: All persons; Occupation: 1. Managers, directors and senior officials; measures: Value']
professionals = ward_occupation['Sex: All persons; Occupation: 2. Professional occupations; measures: Value']
all_occupations = ward_occupation['Sex: All persons; Occupation: All categories: Occupation; measures: Value']

ward_occupation['combined_professionals_pct'] = ((managers + professionals) / all_occupations) * 100

In [107]:
#Attach the professionals percentage to the ward variables dataframe, matched on ward code
occupation_columns = ward_occupation[['geography code', 'combined_professionals_pct']]
ward_variables = ward_variables.merge(occupation_columns, left_on='ward_code', right_on='geography code')

In [108]:
#Add colours for web mapping - define function

#FUNCTION TO ASSIGN COLOURS (EQUAL INTERVAL)
import numpy as np

def color_assign (row, data, var, N, c):
    """
    Return the hex colour of the equal-width interval that a row's value falls in.

    :param row: marker for apply function of dataframe (anything indexable by ``var``)
    :param data: column of dataframe used to derive the interval limits (needs ``min()`` and ``max()``)
    :param var: column name
    :param N: number of desired colour intervals
    :param c: desired colour ramp as string, either 'r' (red), 'g' (green), 'p' (purple) or 'b' (blue)
    :return: hex colour of the first interval containing ``row[var]``; None if no interval contains it (e.g. NaN)
    :raises ValueError: if ``c`` is not one of the four supported ramps
    :example of application to a Selected column of a dataframe:
    df['Color'] = df.apply(lambda row: color_assign (row, df['<Selected Column>'], '<Selected Column>', 7, 'b'), axis=1)
    """
    if c not in ('r', 'g', 'p', 'b'):
        raise ValueError("c must be one of 'r', 'g', 'p' or 'b'")

    #CREATE A SPECTRUM OF COLOURS
    #Floor division keeps every channel an integer on Python 2 and Python 3
    #alike ('%02x' rejects floats on Python 3)
    increment = 255 // (N + 1)

    hex_colour = []
    for x in range(N):
        fade = 230 - (x * increment)

        if c == 'r':
            rgb = (255, fade, fade)
        elif c == 'g':
            rgb = (fade, 255 - (x // 2 * increment), fade)
        elif c == 'p':
            rgb = (230 - (x // 4 * increment), fade, 255)
        else:
            rgb = (fade, fade, 255)

        #For large N the darkest steps would drop below zero and format as an
        #invalid hex string, so clamp each channel at 0
        rgb = tuple(max(0, channel) for channel in rgb)

        #Convert to hex color format for mapping
        hex_colour.append('#%02x%02x%02x' % rgb)

    #N equal-width intervals between the smallest and largest value of the column
    intervals = np.linspace(data.min(), data.max(), num=N+1)

    value = row[var]
    for x in range(N):
        if intervals[x] <= value <= intervals[x+1]:
            return hex_colour[x]

In [109]:
#Assign one of 7 blue shades to each ward according to its median income
income_column = 'med_income_2012_13'
ward_variables['income_color'] = ward_variables.apply(
    lambda row: color_assign(row, ward_variables[income_column], income_column, 7, 'b'), axis=1)

In [110]:
#Save the upper limit of each of the 7 income intervals to file for the website legend

income_values = ward_variables['med_income_2012_13']
minimum = income_values.min()
maximum = income_values.max()

#8 evenly spaced limits give 7 intervals; the first limit (the minimum) is left out of the legend
intervals = np.linspace(minimum, maximum, num=8)
income_intervals_df = pd.DataFrame({'income_intervals': intervals[1:8]})
income_intervals_df.to_csv('Data/income_intervals_legend.csv')

## Step 2 - Calculate number of each token within each ward

In [111]:
# Build shapely points, and plain coordinate pairs, for the rows of every token dataframe
from shapely.geometry import Point

token_points = {}
token_points_coords = {}

for x in tokens_lst:
    token_frame = df[x]

    #Shapely point per row, from the eastings / northings columns
    token_points[x] = [Point(easting, northing)
                       for easting, northing in zip(token_frame['eastings'], token_frame['northings'])]

    #Same points as [x, y] lists, the form handed to the spatial index later
    token_points_coords[x] = [[token_point.x, token_point.y] for token_point in token_points[x]]

In [112]:
#Build a spatial index from the bounding box of every ward shape;
#the id stored for each box is the position of the shape in polygon_shapes
from rtree import index
idx = index.Index()
for shape_number, ward_shape in enumerate(polygon_shapes):
    idx.insert(shape_number, ward_shape.bbox)

In [113]:
#Assign a matching ward polygon to each token point

for x in tokens_lst:
    matches = []

    for i in range(len(df[x]['Token'])): #Iterate through each point
        token_point = token_points[x][i]

        #Only polygons whose bounding box contains the point are candidates
        candidates = idx.intersection(token_points_coords[x][i])

        #Keep the first candidate whose actual outline contains the point, or None if there is none
        matches.append(next((j for j in candidates if token_point.within(polygons[j])), None))

    df[x]['ward_no'] = matches

    #Bring in ward name, code and north/south flag for the matched polygon;
    #points without a match have no counterpart in 'position' and drop out of this inner merge
    df[x] = df[x].merge(ward_variables[['position','ward_name', 'ward_code','n_s']], left_on='ward_no', right_on='position')
    

In [None]:
#Names of wards (kept for the cells below that iterate over them)
london_wards = ward_variables['ward_name']

#Count the number of tokens for each ward and join to ward_variables dataframe.
#Counts are keyed on ward_code, the key every merge in this notebook uses;
#ward_name is not guaranteed to be unique, and wards sharing a name would each
#have received the combined count.
for x in tokens_lst:
    token_rows = df[x].loc[df[x]['Token'] == x]
    counts_by_code = token_rows['ward_code'].value_counts()

    #Wards with no token of this kind are absent from the counts -> 0
    ward_variables[x + '_count'] = ward_variables['ward_code'].map(counts_by_code).fillna(0).astype(int)

## Step 3 - Create KDE surface and attach KDE values to ward centroids

Separate surfaces are used for wards north and south of the river to take into account the accessibility barrier effect of the Thames. This ensures that a high concentration of tokens along one side of the river does not affect calculations for the other side of the river.

In [None]:
#In order to record density surface for each token
kernels = {}
kernels_n = {}
kernels_s = {}

#The evaluation grid depends only on the map extent, so it is built once
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])

#Ward centroids as a 2 x n array (the layout gaussian_kde evaluates) and
#boolean row selectors for the wards north / south of the river
centroid_xy = np.array([[centre.x for centre in ward_variables['centroid']],
                        [centre.y for centre in ward_variables['centroid']]])
is_north = (ward_variables['n_s'] == 'n').values
is_south = (ward_variables['n_s'] == 's').values

for x in tokens_lst:

    #Set up north south subsets
    n_subset_df = df[x].loc[df[x]['n_s']=='n']
    s_subset_df = df[x].loc[df[x]['n_s']=='s']

    #Calculate entire density surface
    values_coffee = np.vstack([df[x]['eastings'], df[x]['northings']])
    kernels[x] = stats.gaussian_kde(values_coffee, 0.1) #Bandwidth factor set to 0.1
    Z = np.reshape(kernels[x](positions).T, X.shape)

    #Calculate north density surface
    values_coffee_n = np.vstack([n_subset_df['eastings'], n_subset_df['northings']])
    kernels_n[x] = stats.gaussian_kde(values_coffee_n, 0.1) #Bandwidth factor set to 0.1

    #Calculate south density surface
    values_coffee_s = np.vstack([s_subset_df['eastings'], s_subset_df['northings']])
    kernels_s[x] = stats.gaussian_kde(values_coffee_s, 0.1) #Bandwidth factor set to 0.1

    #Plot KDE surface and save
    #('token' stays defined at notebook level; the LQ figure cell reads it)
    token = x

    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)
    plt.setp(ax.get_xticklabels(), fontsize=8)
    plt.setp(ax.get_yticklabels(), fontsize=8)

    #Add london wards as patches
    ax.add_collection(PatchCollection(ward_patches, alpha=1, facecolor='None', lw = 0.1,
                                      edgecolor = '0'))
    kde_surface = ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
               extent=[xmin, xmax, ymin, ymax])
    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=2)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title("Kernel Density Surface (KDE) for Word Token '" + token +"'", weight = 'heavy', y=1.05)
    ax.arrow(560000, 158000, 0, 4000, head_width=700, head_length=700, fc='k', ec='k')

    plt.colorbar(kde_surface, cmap=plt.cm.gist_earth_r )

    #The figure size is fixed when the figure is created; figsize is not a savefig option
    fig.savefig('spatial_analysis/figures/KDE/' + token + '_kde.png', dpi=200, transparent=True)

    #Evaluate density at each ward centroid: north wards on the north surface,
    #south wards on the south surface, all centroids of a side in one call.
    #A ward flagged neither 'n' nor 's' is left as NaN.
    kde_values = np.full(len(ward_variables), np.nan)
    if is_north.any():
        kde_values[is_north] = kernels_n[x](centroid_xy[:, is_north])
    if is_south.any():
        kde_values[is_south] = kernels_s[x](centroid_xy[:, is_south])

    #Float column, ready for the later operations
    ward_variables[x + '_kde'] = kde_values


## Step 4 - Calculate number of businesses within each ward

In [None]:
# Build shapely points, and plain coordinate pairs, for the food business locations
from shapely.geometry import Point

#Shapely point per business, from the eastings / northings columns
food_bus_points = [Point(easting, northing)
                   for easting, northing in zip(food_bus_df['eastings'], food_bus_df['northings'])]

#Same points as [x, y] lists, the form handed to the spatial index
point_coords = [[business_point.x, business_point.y] for business_point in food_bus_points]
food_bus_coords = point_coords

In [None]:
#Spatial join of businesses to wards (reuses the bounding box index built above to save time)

matches = []

for i in range(len(food_bus_df['Latitude'])): #Iterate through each point
    business_point = food_bus_points[i]

    #Only polygons whose bounding box contains the point are candidates
    candidates = idx.intersection(food_bus_coords[i])

    #Keep the first candidate whose actual outline contains the point, or None if there is none
    matches.append(next((j for j in candidates if business_point.within(polygons[j])), None))

#Join ward information to the food business dataframe based on the matching ward shapes;
#businesses without a match drop out of this inner merge
food_bus_df['ward_no'] = matches
food_bus_df = food_bus_df.merge(ward_variables[['position','ward_name', 'ward_code', 'n_s']], left_on='ward_no', right_on='position')

In [None]:
#Calculate number of businesses within each ward.
#Counts are keyed on ward_code, the key every merge in this notebook uses;
#ward_name is not guaranteed to be unique, and wards sharing a name would each
#have received the combined count.

business_counts = food_bus_df['ward_code'].value_counts()

#One count per row of ward_variables; wards without any business get 0
token_count = ward_variables['ward_code'].map(business_counts).fillna(0).astype(int).tolist()

ward_variables['all_business_count'] = token_count

## Step 5 - Calculate location quotient

The location quotient is a double ratio: the local ratio (the count of the word token in a ward divided by the count of all food businesses in that ward) divided by the global ratio (the total count of the word token divided by the total count of all food businesses).

In [None]:
#Calculate LQ for each token:
#(token count in ward / business count in ward) / (all tokens of that kind / all businesses)

total_businesses = float(len(food_bus_df.index))

for x in tokens_lst:
    local_share = ward_variables[x + '_count']/(ward_variables['all_business_count'])
    global_share = len(df[x])/total_businesses
    ward_variables[x +'_lq'] = local_share/global_share

In [None]:
#Visualise the LQ

for x in tokens_lst:

    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)
    plt.setp(ax.get_xticklabels(), fontsize=8)
    plt.setp(ax.get_yticklabels(), fontsize=8)

    #One shared Normalize scales the LQ values onto the colour map, so the ward
    #colours and the colour bar agree; NaN / infinite LQs are masked out
    cmap = plt.get_cmap('Blues')
    lq_values = np.ma.masked_invalid(ward_variables[x + '_lq'].values.astype(float))
    norm = Normalize(vmin=lq_values.min(), vmax=lq_values.max())

    #Add london wards as patches
    #NOTE(review): colours are matched to patches by position - assumes ward_variables rows follow ward_patches order
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(norm(lq_values)), lw = 0.1,
                                          edgecolor = '0')
    ax.add_collection(test)

    #Add token locations as red points
    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=4, markerfacecolor='red', alpha=0.8)

    #Titles and figure text settings (title names the token of this iteration)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title("Location Quotient (LQ) for Number of Word Token '" + x +"' by Ward", weight = 'heavy', y=1.05)
    ax.arrow(560000, 158000, 0, 4000, head_width=700, head_length=700, fc='k', ec='k')

    #Add colourbar built on the same norm and colour map as the patches
    m = cm.ScalarMappable(norm=norm, cmap=cmap)
    m.set_array(lq_values)
    plt.colorbar(m, ax=ax)

    #Save figures to new folder (figure size is set on the figure; figsize is not a savefig option)
    fig.savefig('spatial_analysis/figures/LQ_token_no/' + x + '_LQ_no.png', dpi=200, transparent=True)


## Step 6 - Calculate smoothed location quotient

The smoothed location quotient has been calculated using the kernel density estimate evaluations performed above for each ward centroid rather than a count of businesses in each ward. This measure smooths out the arbitrary effect of ward boundaries. 

The smoothed location quotient is the same double ratio computed from KDE values: the local ratio (the token KDE divided by the all-food-business KDE at a ward centroid) divided by the global ratio (the same two KDE values summed over all ward centroids).

In [None]:
#Calculate the ward centroid KDE for all businesses in order to complete operation

#North / south subsets of the business points
n_subset_df = food_bus_df.loc[food_bus_df['n_s']=='n']
s_subset_df = food_bus_df.loc[food_bus_df['n_s']=='s']

#Set up grid and KDE calculation for all businesses
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([food_bus_df['eastings'], food_bus_df['northings']])
kernels = stats.gaussian_kde(values, 0.1) #Bandwidth factor set to 0.1
Z = np.reshape(kernels(positions).T, X.shape)

#Calculate north density surface
values_coffee_n = np.vstack([n_subset_df['eastings'], n_subset_df['northings']])
kernels_n = stats.gaussian_kde(values_coffee_n, 0.1) #Bandwidth factor set to 0.1

#Calculate south density surface
values_coffee_s = np.vstack([s_subset_df['eastings'], s_subset_df['northings']])
kernels_s = stats.gaussian_kde(values_coffee_s, 0.1) #Bandwidth factor set to 0.1

fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
plt.setp(ax.get_xticklabels(), fontsize=8)
plt.setp(ax.get_yticklabels(), fontsize=8)

#Add london wards as patches
ax.add_collection(PatchCollection(ward_patches, alpha=1, facecolor='None', lw = 0.1,
                                      edgecolor = '0'))
kde_surface = ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
               extent=[xmin, xmax, ymin, ymax])
ax.plot(food_bus_df['eastings'], food_bus_df['northings'], 'k.', markersize=2)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
plt.title("Kernel Density Surface (KDE) for All Food Businesses", weight = 'heavy', y=1.05)
ax.arrow(560000, 158000, 0, 4000, head_width=700, head_length=700, fc='k', ec='k')

plt.colorbar(kde_surface, cmap=plt.cm.gist_earth_r )

#The figure size is fixed when the figure is created; figsize is not a savefig option
fig.savefig('spatial_analysis/figures/KDE/allfoodbusiness_kde.png', dpi=200, transparent=True)

#Ward centroids as a 2 x n array (the layout gaussian_kde evaluates)
centroid_xy = np.array([[centre.x for centre in ward_variables['centroid']],
                        [centre.y for centre in ward_variables['centroid']]])

#Density of the whole-of-London surface at each ward centroid
ward_kde = list(kernels(centroid_xy))

#Density used for the location quotient: north wards on the north surface,
#south wards on the south surface. A ward flagged neither 'n' nor 's' is left as NaN.
is_north = (ward_variables['n_s'] == 'n').values
is_south = (ward_variables['n_s'] == 's').values

all_business_kde = np.full(len(ward_variables), np.nan)
if is_north.any():
    all_business_kde[is_north] = kernels_n(centroid_xy[:, is_north])
if is_south.any():
    all_business_kde[is_south] = kernels_s(centroid_xy[:, is_south])

ward_variables['all_business_kde'] = all_business_kde

In [None]:
#Calculate the smooth location quotient for each ward based on centroid KDEs:
#(token KDE / all-business KDE at the centroid) / (summed token KDE / summed all-business KDE)
all_business_density = ward_variables['all_business_kde']
all_business_total = float(sum(all_business_density))

for x in tokens_lst:
    token_density = ward_variables[x + '_kde']
    local_ratio = token_density/(all_business_density)
    global_ratio = sum(token_density)/all_business_total

    #Stored as float
    ward_variables[x +'_smooth_lq'] = (local_ratio/global_ratio).astype(float)

In [None]:
#Colour set for visualisation of location quotient (quantiles)

#One hex colour per 20-percentile band, lowest band first (indexed by lq_color)
color_list = ['#0066ff', '#00FFFF', '#ffff3C', '#ff00ff', '#cc0099']
#Row labels of the legend table that stores the upper limit of each band
interval_list = ['Max Interval 1', 'Max Interval 2','Max Interval 3','Max Interval 4','Max Interval 5']

#Define function for categories
def lq_color (row, df, var):
    """
    Pick the colour of the 20-percentile band that a row's value falls in.

    :param row: row being coloured (anything indexable by ``var``)
    :param df: data whose ``var`` column defines the percentiles
    :param var: column name
    :return: entry of the global ``color_list`` for the first of the 20th, 40th, 60th, 80th
             and 100th percentiles that the value does not exceed; None if it exceeds all of them
    """
    value = row[var]
    for band, upper_percentile in enumerate((20, 40, 60, 80, 100)):
        if value <= np.percentile(df[var], upper_percentile):
            return color_list[band]

In [None]:
#Colour field (bright colours) for every smooth location quotient, via lq_color defined above
for x in tokens_lst:
    lq_column = x +'_smooth_lq'
    ward_variables[lq_column + '_color'] = ward_variables.apply(
        lambda row: lq_color(row, ward_variables, lq_column), axis=1)

In [None]:
#Generate the legend dataframe: upper limit of each 20-percentile band, one column per token

legend_intervals_df = pd.DataFrame(index=interval_list)

for x in tokens_lst:
    lq_column = x +'_smooth_lq'
    intervals = [np.percentile(ward_variables[lq_column], upper_percentile)
                 for upper_percentile in (20, 40, 60, 80, 100)]
    legend_intervals_df [lq_column] = intervals


In [None]:
#Save the location quotient legend limits to file
legend_intervals_df.to_csv('Data/legend_location_quotient_intervals.csv')

In [None]:
#Visualise the Smooth LQ

for x in tokens_lst:

    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)
    plt.setp(ax.get_xticklabels(), fontsize=8)
    plt.setp(ax.get_yticklabels(), fontsize=8)

    #One shared Normalize scales the smooth LQ values onto the colour map, so the
    #ward colours and the colour bar agree; NaN / infinite values are masked out
    cmap = plt.get_cmap('BuGn')
    smooth_lq_values = np.ma.masked_invalid(ward_variables[x + '_smooth_lq'].values.astype(float))
    norm = Normalize(vmin=smooth_lq_values.min(), vmax=smooth_lq_values.max())

    #Add london wards as patches
    #NOTE(review): colours are matched to patches by position - assumes ward_variables rows follow ward_patches order
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(norm(smooth_lq_values)), lw = 0.1,
                                          edgecolor = '0')
    ax.add_collection(test)

    #Plot points
    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=4, markerfacecolor='red', alpha=0.8)

    #Axis options
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title("KDE Location Quotient (LQ) for Word Token '" + x +"'", weight = 'heavy', y=1.05)
    ax.arrow(560000, 158000, 0, 4000, head_width=700, head_length=700, fc='k', ec='k')

    #Add colour bar built on the same norm and colour map as the patches
    m = cm.ScalarMappable(norm=norm, cmap=cmap)
    m.set_array(smooth_lq_values)
    plt.colorbar(m, ax=ax)

    #Save figure (figure size is set on the figure; figsize is not a savefig option)
    fig.savefig('spatial_analysis/figures/LQ_KDE_smooth/' + x + '_smooth_LQ.png', dpi=200, transparent = True)

## Step 6 - Moran's I Calculation

In [None]:
#Global Moran's I

import pysal
import numpy as np


#Rook contiguity weights matrix, read from the wards shapefile
w = pysal.rook_from_shapefile("Data/ESRI/london_wards.shp")

#One Moran's I result per token, computed on the ward centroid KDE values
#NOTE(review): assumes ward_variables rows are in the same order as the shapes behind w - confirm
morans_results = [pysal.Moran(np.array(ward_variables[x + '_kde'].astype(float)), w, two_tailed=False)
                  for x in tokens_lst]

#Statistic and p value are stored as formatted text, the expected value as a number
morans_value = ["%.3f"%result.I for result in morans_results]
morans_EI = [result.EI for result in morans_results]
morans_p = ["%.5f"%result.p_norm for result in morans_results]

global_stats_df = pd.DataFrame({'morans_value': morans_value, 'morans_EI': morans_EI, 'morans_p':morans_p},
                                index=tokens_lst)

In [None]:
#Display the global Moran's I table (one row per token)
global_stats_df

Unnamed: 0,morans_EI,morans_p,morans_value
cafe,-0.001603,0.0,0.753
coffee,-0.001603,0.0,0.681
pizza,-0.001603,0.0,0.547
wine,-0.001603,0.0,0.625
sushi,-0.001603,0.0,0.437
thai,-0.001603,0.0,0.537
chicken,-0.001603,0.0,0.655
fried,-0.001603,0.0,0.592
fish,-0.001603,0.0,0.638
kebab,-0.001603,0.0,0.535


In [None]:
#Local Moran's I

#p_sim comes from random permutations; fixing the seed makes the p values, and
#the colours and legend derived from them, identical on every run
np.random.seed(12345)

for x in tokens_lst:

    #Ward centroid KDE values of the token
    #NOTE(review): assumes ward_variables rows are in the same order as the shapes behind w - confirm
    y = np.array(ward_variables[x + '_kde'])
    lm = pysal.Moran_Local(y,w)

    #Local statistic and its permutation based pseudo p value, one per ward
    ward_variables[x + '_lmoran_value'] = lm.Is
    ward_variables[x + '_lmoran_p'] = lm.p_sim

In [None]:
#Add colours for web mapping of morans I value 

def morans_color (row, df, p_var, lq_var, mor_var):
    """
    Pick the web map colour of a ward from its local Moran's I result.

    :param row: row being coloured (anything indexable by the three column names)
    :param df: dataframe holding all wards, used to derive the median split values
    :param p_var: name of the column with the local Moran's p value
    :param lq_var: name of the column with the (smooth) location quotient
    :param mor_var: name of the column with the local Moran's I value
    :return: '#acacac' if p > 0.05; for significant wards with LQ < 1 '#00FFFF' below and
             '#0066ff' at or above the median I of all significant LQ < 1 wards; for significant
             wards with LQ > 1 '#ff00ff' below and '#cc0099' at or above the median I of all
             significant LQ > 1 wards; None otherwise (e.g. LQ exactly 1 or NaN values)
    """
    if row[p_var] > 0.05: #if not significant, colour grey
        return '#acacac'

    #Only the median of the group the row belongs to is needed, so only that
    #group is selected; an empty opposite group therefore cannot raise an error
    significant = df[p_var] <= 0.05

    if row[lq_var] < 1:
        med_low = np.median(df[significant & (df[lq_var] < 1)][mor_var])
        if row[mor_var] < med_low: #significant low concentration, below the median: light blue
            return '#00FFFF'
        elif row[mor_var] >= med_low: #significant low concentration, at or above the median: dark blue
            return '#0066ff'
    elif row[lq_var] > 1:
        med_high = np.median(df[significant & (df[lq_var] > 1)][mor_var])
        if row[mor_var] < med_high: #significant high concentration, below the median: light pink
            return '#ff00ff'
        elif row[mor_var] >= med_high: #significant high concentration, at or above the median: dark pink
            return '#cc0099'

#Colour field for the local Moran's I result of every token
for x in tokens_lst:
    p_column, lq_column, value_column = x + '_lmoran_p', x + '_smooth_lq', x + '_lmoran_value'
    ward_variables[x +'_morans_color'] = ward_variables.apply(
        lambda row: morans_color(row, ward_variables, p_column, lq_column, value_column), axis=1)

In [None]:
#Create dataframe for local morans values
labels = ['very low value cluster lower limit', 'very low value cluster upper limit', 'low value cluster limit',
          'not significant', 'high value cluster lower limit', 'high value cluster upper limit', 'very high value cluster limit']

lmoran_legend_df = pd.DataFrame(index = labels)

for x in tokens_lst:

    value_column = x + '_lmoran_value'

    #Significant wards (p <= 0.05), split into high (LQ > 1) and low (LQ < 1) concentration
    significant = ward_variables[x + '_lmoran_p'] <= 0.05
    subset_high_df = ward_variables[significant & (ward_variables[x + '_smooth_lq'] > 1)]
    subset_low_df = ward_variables[significant & (ward_variables[x + '_smooth_lq'] < 1)]

    #Series.min() / max() / median() give NaN for an empty subset, where the
    #builtin min() / max() would raise ValueError and stop the whole legend
    min_high = subset_high_df[value_column].min()
    max_high = subset_high_df[value_column].max()
    med_high = subset_high_df[value_column].median()

    min_low = subset_low_df[value_column].min()
    max_low = subset_low_df[value_column].max()
    med_low = subset_low_df[value_column].median()

    #Same order as labels; '0' is the placeholder for the 'not significant' row
    values = [min_low, med_low, max_low, '0', min_high, med_high, max_high]

    lmoran_legend_df[value_column] = values

In [None]:
#Save the local Moran's I legend limits to file
lmoran_legend_df.to_csv('Data/local_morans_legend_values.csv')

In [None]:
#Visualise Local Morans Values

for x in tokens_lst:
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)
    plt.setp(ax.get_xticklabels(), fontsize=8)
    plt.setp(ax.get_yticklabels(), fontsize=8)

    #One shared Normalize scales the local Moran's I values (which can be negative
    #or larger than 1) onto the colour map, so ward colours and colour bar agree
    cmap = plt.get_cmap('Purples')
    moran_values = np.ma.masked_invalid(ward_variables[x + '_lmoran_value'].values.astype(float))
    norm = Normalize(vmin=moran_values.min(), vmax=moran_values.max())

    #NOTE(review): colours are matched to patches by position - assumes ward_variables rows follow ward_patches order
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(norm(moran_values)), lw = 0.1,
                                          edgecolor = '0')
    ax.add_collection(test)

    #Token locations as green points
    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=4, markerfacecolor='green', alpha=0.8)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title("Local Morans I Values for Word Token '" + x +"'", weight = 'heavy', y=1.05)
    ax.arrow(560000, 158000, 0, 4000, head_width=700, head_length=700, fc='k', ec='k')

    #Colour bar built on the same norm and colour map as the patches
    m = cm.ScalarMappable(norm=norm, cmap=cmap)
    m.set_array(moran_values)
    plt.colorbar(m, ax=ax)

    #Figure size is set on the figure; figsize is not a savefig option
    fig.savefig('spatial_analysis/figures/local_morans/' + x + '_local_morans.png', dpi=200, transparent = True)

In [None]:
#Visualise Local Morans P Values
#
#For every token: shade each ward by the p value of its Local Moran's I,
#overlay the token's point locations and save the map as a PNG.

for x in tokens_lst:

    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)
    plt.setp(ax.get_xticklabels(), fontsize=8)
    plt.setp(ax.get_yticklabels(), fontsize=8)

    p_values = ward_variables[x + '_lmoran_p'].values

    #p values already lie in [0, 1]; fixing the norm to that range leaves the ward
    #colours as they were and stops the colorbar from rescaling itself to the
    #observed minimum/maximum, which made it disagree with the map.
    cmap = plt.get_cmap('bwr')
    norm = Normalize(vmin=0, vmax=1)
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(norm(p_values)), lw = 0.1,
                           edgecolor = '0')
    ax.add_collection(test)

    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=4, markerfacecolor='green', alpha=0.8)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    plt.title("Local Morans I P Values for Word Token '" + x +"'", weight = 'heavy', y=1.05)
    #Vertical arrow in map coordinates (presumably a north arrow)
    ax.arrow(560000, 158000, 0, 4000, head_width=700, head_length=700, fc='k', ec='k')

    m = cm.ScalarMappable(norm=norm, cmap=cmap)
    m.set_array(p_values)
    plt.colorbar(m, ax=ax)

    #figsize is a figure property, not a savefig argument; the size is set at figure creation above
    fig.savefig('spatial_analysis/figures/local_morans/' + x + '_local_morans_p.png', dpi=200, transparent=True)


## Step 7 - Correlation

In [None]:
#Create scatter Matrix of Correlations

#Subset Data: the two socio-economic variables first, then every token's smoothed location quotient
lst = ['med_income_2012_13', 'combined_professionals_pct']

subset_columns = list(lst) + [x + '_smooth_lq' for x in tokens_lst]

#Rename columns for visualisation: '<token>_smooth_lq' becomes '<token>'.
#A single rename that returns a new frame replaces the per-token in-place rename,
#which modified a column slice of ward_variables (a pattern that can raise
#SettingWithCopyWarning) and rebuilt the axis once per token.
ward_variables_subset = ward_variables[subset_columns].rename(
    columns=dict((x + '_smooth_lq', x) for x in tokens_lst))

In [None]:
# scatter_matrix takes the dataset, an alpha value for opacity, a figure size setting,
# and a specification of the diagonal charts

# The function lives in pandas.plotting in newer pandas releases and in
# pandas.tools.plotting in older ones; try the newer location first.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# scatter_matrix builds its own figure from figsize, so no figure is created beforehand
# (the previous plt.figure(figsize=(13,13)) call only produced an extra, empty figure).
a = scatter_matrix(ward_variables_subset, alpha=0.2, diagonal='kde', figsize=(12,12))

In [None]:
#Calculate correlation coefficients
a = ward_variables_subset.corr()

#Keep the token rows only. ward_variables_subset starts with the len(lst)
#socio-economic columns, so everything after them is a token; slicing to the end
#(instead of the fixed 2:16) keeps working when tokens_lst grows beyond 14 entries.
subset = a.iloc[len(lst):]

#Assignment aligns on the index of global_stats_df
global_stats_df['income_correlation'] = subset['med_income_2012_13']
global_stats_df['professional_pct_correlation'] = subset['combined_professionals_pct']

In [None]:
#Display the global statistics table, now including the two correlation columns
global_stats_df

## Step 8 - Regression

In [None]:
#Regression - Income only
#
#Fits one OLS model per token with the token's smoothed LQ as dependent variable
#and median income as the only explanatory variable. Residuals are stored per ward
#in ward_variables; model summaries are stored per token in global_stats_df.

w = pysal.rook_from_shapefile("Data/ESRI/london_wards.shp")

#Column vector of the explanatory variable. Reshaping the underlying array with -1
#lets numpy infer the row count instead of hard-coding 625, and avoids calling
#np.reshape on a Series (which goes through the deprecated Series.reshape).
x = ward_variables['med_income_2012_13'].values.reshape(-1, 1)

var = 'income'

r2 = []
ar2 = []
moran_res = []
coefficient = []
t_stat = []

for i in tokens_lst:
    y = ward_variables[i + '_smooth_lq'].values.reshape(-1, 1)
    #spat_diag / moran switch on the spatial diagnostics, including Moran's I of the residuals
    a = pysal.spreg.ols.OLS(y,x,w, spat_diag=True, moran=True)
    ward_variables[i + '_' + var + '_residuals'] = a.u

    r2.append(a.r2)
    ar2.append(a.ar2)
    moran_res.append(a.moran_res)
    coefficient.append(a.betas)
    t_stat.append(a.t_stat)

#One entry per token, in tokens_lst order
global_stats_df[var + '_r2'] = r2
global_stats_df[var + '_ar2'] = ar2
global_stats_df[var + '_moran_res'] = moran_res
global_stats_df[var + '_coefficient'] = coefficient
global_stats_df[var + '_t_stat'] = t_stat

In [None]:
#Visualise income regression residuals
#
#For every token: shade each ward by the residual of the income-only regression,
#overlay the token's point locations and save the map as a PNG.

for x in tokens_lst:

    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)
    plt.setp(ax.get_xticklabels(), fontsize=8)
    plt.setp(ax.get_yticklabels(), fontsize=8)

    residuals = ward_variables[x + '_income_residuals'].values

    #Calling a colormap on floats treats them as positions on the ramp in [0, 1],
    #so residuals (which are signed) must be rescaled first. A norm symmetric about
    #zero puts the midpoint of the diverging 'bwr' ramp at a residual of zero, and
    #sharing it with the colorbar keeps map and colorbar on one scale.
    limit = np.nanmax(np.abs(residuals))
    cmap = plt.get_cmap('bwr')
    norm = Normalize(vmin=-limit, vmax=limit)
    test = PatchCollection(ward_patches, alpha=1, facecolor=cmap(norm(residuals)), lw = 0.1,
                           edgecolor = '0')
    ax.add_collection(test)

    ax.plot(df[x]['eastings'], df[x]['northings'], 'k.', markersize=4, markerfacecolor='green', alpha=0.8)
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])
    #Title previously read "Local Morans I P Values", copied from the p value maps
    plt.title("Income Regression Residuals for Word Token '" + x +"'", weight = 'heavy', y=1.05)
    #Vertical arrow in map coordinates (presumably a north arrow)
    ax.arrow(560000, 158000, 0, 4000, head_width=700, head_length=700, fc='k', ec='k')

    m = cm.ScalarMappable(norm=norm, cmap=cmap)
    m.set_array(residuals)
    plt.colorbar(m, ax=ax)

    #figsize is a figure property, not a savefig argument; the size is set at figure creation above
    fig.savefig('spatial_analysis/figures/regression_residuals/' + x + '_income_residuals.png', dpi=200, transparent=True)


In [None]:
#Regression - proportion of population professional only
#
#Fits one OLS model per token with the token's smoothed LQ as dependent variable
#and the professionals percentage as the only explanatory variable. Residuals are
#stored per ward in ward_variables; model summaries per token in global_stats_df.

w = pysal.rook_from_shapefile("Data/ESRI/london_wards.shp")

#Column vector of the explanatory variable. Reshaping the underlying array with -1
#lets numpy infer the row count instead of hard-coding 625, and avoids calling
#np.reshape on a Series (which goes through the deprecated Series.reshape).
x = ward_variables['combined_professionals_pct'].values.reshape(-1, 1)

var = 'professionals_pct'

r2 = []
ar2 = []
moran_res = []
coefficient = []
t_stat = []

for i in tokens_lst:
    y = ward_variables[i + '_smooth_lq'].values.reshape(-1, 1)
    #spat_diag / moran switch on the spatial diagnostics, including Moran's I of the residuals
    a = pysal.spreg.ols.OLS(y,x,w, spat_diag=True, moran=True)
    ward_variables[i + '_' + var + '_residuals'] = a.u

    r2.append(a.r2)
    ar2.append(a.ar2)
    moran_res.append(a.moran_res)
    coefficient.append(a.betas)
    t_stat.append(a.t_stat)

#One entry per token, in tokens_lst order
global_stats_df[var + '_r2'] = r2
global_stats_df[var + '_ar2'] = ar2
global_stats_df[var + '_moran_res'] = moran_res
global_stats_df[var + '_coefficient'] = coefficient
global_stats_df[var + '_t_stat'] = t_stat

In [None]:
#Regression - proportion of population professional and income
#
#Fits one OLS model per token with the token's smoothed LQ as dependent variable
#and both median income and the professionals percentage as explanatory variables.
#Residuals are stored per ward in ward_variables; model summaries per token in
#global_stats_df.

w = pysal.rook_from_shapefile("Data/ESRI/london_wards.shp")

#Two-column design matrix; .values replaces the deprecated DataFrame.as_matrix()
x = ward_variables[['med_income_2012_13', 'combined_professionals_pct']].values

var = 'income_professionals'

r2 = []
ar2 = []
moran_res = []
coefficient = []
t_stat = []

for i in tokens_lst:
    #-1 lets numpy infer the row count instead of hard-coding 625, and reshaping the
    #underlying array avoids the deprecated Series.reshape path used by np.reshape
    y = ward_variables[i + '_smooth_lq'].values.reshape(-1, 1)
    #spat_diag / moran switch on the spatial diagnostics, including Moran's I of the residuals
    a = pysal.spreg.ols.OLS(y,x,w, spat_diag=True, moran=True)
    ward_variables[i + '_' + var + '_residuals'] = a.u

    r2.append(a.r2)
    ar2.append(a.ar2)
    moran_res.append(a.moran_res)
    coefficient.append(a.betas)
    t_stat.append(a.t_stat)

#One entry per token, in tokens_lst order
global_stats_df[var + '_r2'] = r2
global_stats_df[var + '_ar2'] = ar2
global_stats_df[var + '_moran_res'] = moran_res
global_stats_df[var + '_coefficient'] = coefficient
global_stats_df[var + '_t_stat'] = t_stat

## Step 9 - Explore Clustering Techniques

In [None]:
#List every column name currently held in ward_variables
list(ward_variables.columns.values)

In [None]:
#Agglomerative clustering of wards for several variable combinations.
#Every combination is clustered into 1..7 groups; each solution is stored as a
#column of ward_variables and drawn as a ward map.

#Imported once here rather than on every pass of the inner loop. The explicit
#submodule import also removes the reliance on sklearn.preprocessing having been
#loaded as a side effect of another import.
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import scale

cluster_var = [['med_income_2012_13','coffee'],['med_income_2012_13', 'chicken'], ['coffee','chicken'],
              ['med_income_2012_13','coffee','chicken'], ['med_income_2012_13','coffee','chicken', 'thai', 'kebab'], 
              ['coffee','chicken', 'thai', 'kebab']]

cluster_names=['income_coffee', 'income_chicken', 'coffee_chicken', 'income_coffee_chicken', 'income_coffee_chicken_thai_kebab',
              'coffee_chicken_thai_kebab']

for i in range(len(cluster_var)):

    #The standardised input does not depend on the number of clusters, so build it once per combination
    subset = ward_variables_subset[cluster_var[i]]
    scaled = scale(subset)

    for x in range(7):

        #NOTE: the column/file suffix is x, i.e. the number of clusters minus one
        #(suffix 3 holds the 4-cluster solution). Kept as is because the names are used elsewhere.
        column = 'ag_clusters' + cluster_names[i] + str(x)

        AgClustering = AgglomerativeClustering(n_clusters=x+1)
        AgClustering.fit(scaled)
        AgClustering_labels = AgClustering.labels_
        ward_variables[column] = AgClustering_labels

        #choropleth
        fig = plt.figure()
        ax = fig.add_subplot(111, axisbg='w', frame_on=False)
        plt.setp(ax.get_xticklabels(), fontsize=8)
        plt.setp(ax.get_yticklabels(), fontsize=8)

        # use the 'gist_ncar' colour ramp - cluster labels are converted to colours through cmap()
        cmap = plt.get_cmap('gist_ncar')
        pc = PatchCollection(ward_patches, alpha=1, lw = 0.1, edgecolor = '0')

        # impose our colour map onto the patch collection (labels rescaled to [0, 1] first)
        norm = Normalize()
        pc.set_facecolor(cmap(norm(ward_variables[column].values)))
        ax.add_collection(pc)

        ax.set_xlim([xmin, xmax])
        ax.set_ylim([ymin, ymax])
        plt.title('Hierarchical Clustering for Variables: ' + cluster_names[i] + str(x), weight = 'heavy', y=1.05)
        #Vertical arrow in map coordinates (presumably a north arrow)
        ax.arrow(560000, 158000, 0, 4000, head_width=700, head_length=700, fc='k', ec='k')

        #figsize is a figure property, not a savefig argument, so it is no longer passed here
        fig.savefig('spatial_analysis/figures/cluster/' + cluster_names[i] + str(x) + '_cluster.png', dpi=200,
                   transparent = True)

        #42 figures are produced in total; close each one once it is on disk instead of
        #clearing the previous figure with plt.clf() and leaving it open
        plt.close(fig)


In [None]:
#Inspect the labels stored under suffix 3 for the income/coffee/chicken/thai/kebab combination
ward_variables['ag_clustersincome_coffee_chicken_thai_kebab3']

In [None]:
#NOTE(review): this expression is not the last statement of the cell, so it displays nothing here
ward_variables['ag_clustersincome_coffee_chicken_thai_kebab3']

def cluster_colour (row, var):
    """Return the hex colour assigned to the cluster label found in row[var].

    row -- mapping-like record (e.g. one DataFrame row) indexed by column name
    var -- name of the column holding the cluster label

    Labels 0 to 3 each have a colour; any other label gives None.
    """
    colour_by_label = (
        (0, '#33ccff'), #light blue
        (1, '#ff5050'), #orangy red
        (2, '#66ff66'), #green
        (3, '#ffff66'), #yellow
    )
    for label, colour in colour_by_label:
        if row[var] == label:
            return colour
    return None

#Build a new dataframe column holding one colour per ward, looked up row by row
#from the cluster labels stored under suffix 3
cluster_column = 'ag_clustersincome_coffee_chicken_thai_kebab3'
ward_variables['cluster_colors'] = ward_variables.apply(cluster_colour, axis=1, args=(cluster_column,))
print(ward_variables['cluster_colors'])

## Step 10 - Export .csvs

In [None]:
#Write the ward-level table and the global statistics table to disk
#(paths are relative to the working directory)
ward_variables.to_csv('Data/spatial_analysis_wards_output.csv')
global_stats_df.to_csv('Data/spatial_analysis_global_output.csv')

## Step 11 - Make charts for website content

In [None]:
#Scatter Plots
#
#One figure per token: smoothed location quotient (x) against median income (y)
#with a fitted straight line, saved as a transparent PNG.

for i, token in enumerate(tokens_lst):

    fig = plt.figure()
    ax = fig.add_subplot(111)

    y = ward_variables['med_income_2012_13']
    x = ward_variables[token + '_smooth_lq']

    plt.setp(ax.get_xticklabels(), fontsize=8)
    plt.setp(ax.get_yticklabels(), fontsize=8)

    ax.scatter(x, y, marker='^', color = tokens_color[i])
    #Degree-1 least-squares fit: slope m and intercept b
    m, b = np.polyfit(x, y, deg=1)
    ax.plot(x, m*x + b, '-', c='red')
    #Fully transparent axes background
    ax.patch.set_alpha(0)

    ax.set_title("Concentration (LQ) of Token '" + token + "' by Income per Ward", weight = 'normal', y=1.05)
    ax.set_xlabel("Location Quotient")
    ax.set_ylabel('Income')

    #left= replaces the xmin= keyword, which newer Matplotlib releases no longer accept
    ax.set_xlim(left=0)
    ax.set_ylim(20000, 90000)

    #figsize is a figure property, not a savefig argument, so it is no longer passed here
    fig.savefig('spatial_analysis/figures/scatter_plot/' + token + '_income_scatter.png', dpi=200,
                   transparent = True)



In [None]:
#Histograms
#
#One figure per token: 50-bin histogram of the smoothed location quotient across
#wards, saved as a transparent PNG.

for i, token in enumerate(tokens_lst):
    fig = plt.figure()
    ax = fig.add_subplot(111)

    x = ward_variables[token + '_smooth_lq']

    plt.setp(ax.get_xticklabels(), fontsize=8)
    plt.setp(ax.get_yticklabels(), fontsize=8)
    #The returned counts, bin edges and patches are not needed
    ax.hist(x, 50, facecolor=tokens_color[i], alpha=0.75)
    #Fully transparent axes background
    ax.patch.set_alpha(0)

    ax.set_title("Histogram of Concentration (LQ) of Token '" + token + "' per Ward", weight = 'normal', y=1.05)
    ax.set_xlabel("Location Quotient")
    ax.set_ylabel('Frequency')

    #figsize is a figure property, not a savefig argument, so it is no longer passed here
    fig.savefig('spatial_analysis/figures/histogram/' + token + '_histogram.png', dpi=200,
                       transparent = True)

In [None]:
#Display the final ward-level table
#NOTE(review): this renders the whole DataFrame; ward_variables.head() would keep the output short
ward_variables