In [1]:
# Based on Robin Lovelace's book: http://spatial-microsim-book.robinlovelace.net/

In [26]:
import pandas as pd
import numpy as np
import ipfn
import copy

# SimpleWorld Example

# Chapter 4 Data Preparation

In [646]:
# Create the SimpleWorld individual data

ids = range(1,6)
ages = [59,54,35,73,49]
sex = ["m","m","m","f","f"]
income = [2868, 2474, 2231, 3152, 2473]

ind = pd.DataFrame(np.transpose([ids,ages,sex,income]), columns=["id","age","sex","income"])
ind

Unnamed: 0,id,age,sex,income
0,1,59,m,2868
1,2,54,m,2474
2,3,35,m,2231
3,4,73,f,3152
4,5,49,f,2473


In [6]:
# Create the SimpleWorld age constraint data

con_age = pd.DataFrame([[8,4],[2,8],[7,4]],columns=["0-49","50+"])
con_age

Unnamed: 0,0-49,50+
0,8,4
1,2,8
2,7,4


In [7]:
# Create the SimpleWorld sex constraint data

con_sex = pd.DataFrame([[6,6],[4,6],[3,8]],columns=["m","f"])
con_sex

Unnamed: 0,m,f
0,6,6
1,4,6
2,3,8


In [8]:
# Check that the totals of the two constraints tables match
print "con_age total:", con_age.values.sum(), "; con_sex total:", con_sex.values.sum()

# Check the row totals (i.e. area populations match)
print "con_age row sums:", con_age.sum(axis=1).values, "; con_sex row sums:", con_sex.sum(axis=1).values

# Test row sum equivalence
con_age.sum(axis=1) == con_sex.sum(axis=1)

con_age total: 33 ; con_sex total: 33
con_age row sums: [12 10 11] ; con_sex row sums: [12 10 11]


0    True
1    True
2    True
dtype: bool

In [269]:
# Store the full individidual dataset as ind_orig. Use copy do make a 'deep' copy.
ind_orig = ind.copy()
# Drop the income field
ind = ind.drop(['income'],axis=1)
ind

Unnamed: 0,id,age,sex
0,1,59,m
1,2,54,m
2,3,35,m
3,4,73,f
4,5,49,f


In [270]:
# Now recategorise the age variable
ind['age'] = pd.to_numeric(ind['age'])

# overwrite the age variable with categorical age
ind['age'] = pd.cut(ind['age'], [0,49,120], labels = ['a0_49','a50+'])
ind

Unnamed: 0,id,age,sex
0,1,a50+,m
1,2,a50+,m
2,3,a0_49,m
3,4,a50+,f
4,5,a0_49,f


In [11]:
# Rename the con_age fields to match the categories in the ind table
con_age = con_age.rename(columns={'0-49':'a0_49','50+':'a50+'})
con_age

Unnamed: 0,a0_49,a50+
0,8,4
1,2,8
2,7,4


In [12]:
# Finally create a single contraint object by merging the constraints tables.
cons = con_age.merge(con_sex,left_index=True,right_index=True)
cons

Unnamed: 0,a0_49,a50+,m,f
0,8,4,6,6
1,2,8,4,6
2,7,4,3,8


In [13]:
# Check the dimensions of the ind and cons datasets
print "Shape of ind:",ind.shape
print "Shape of cons:",cons.shape

Shape of ind: (5, 3)
Shape of cons: (3, 4)


In [14]:
# As the dimensions are different, we need to 'flatten' the individual dataset.
# This means that responses become fields, and values become booleans, with rows reflecting individuals
age_pivot = pd.pivot_table(ind,columns=['age'],values='id', index=ind.index, aggfunc=len, fill_value=0)
# The last square bracket bit ensures that the column order is male then female.
sex_pivot = pd.pivot_table(ind,columns=['sex'],values='id', index=ind.index, aggfunc=len, fill_value=0)[['m','f']]

In [15]:
# merge pivoted data to make flatten dataframe
ind_cat = pd.DataFrame(age_pivot.to_records()).merge(pd.DataFrame(sex_pivot.to_records()),left_index=True,right_index=True)
# drop nuisance columns
ind_cat = ind_cat.drop(['index_x','index_y'],axis=1)
ind_cat

Unnamed: 0,a0_49,a50+,m,f
0,0,1,1,0
1,0,1,1,0
2,1,0,1,0
3,0,1,0,1
4,1,0,0,1


In [31]:
# Check the columns sums to be sure ind_cat is correct
ind_cat.sum(axis=0)
# store these values
ind_agg = ind_cat.sum(axis=0)

In [109]:
# Now the survey data is in the same shape as the cons data in terms of how the columns are set up.

# Chapter 5: Population Synthesis

In [18]:
# Create a weights matrix that contains a weights for each area - individual pairing.
# That is a measure of the representativeness of each individual for each area.

weights = np.ones((len(ind),len(cons)))
weights.shape

(5L, 3L)

In [81]:
# Use python to get teh basics for iterative proportional fitting.
# First create some intuitive names for the totals
n_zones = len(cons) # number of zones
n_ind = len(ind) # number of individuals
n_age = len(con_age.columns) # number of categories for age
n_sex = len(con_sex.columns) # number of categories for sex

# Now make some copies of the weights matrix
weights1 = copy.deepcopy(weights)
weights2 = copy.deepcopy(weights)

# Now create the marginal distribution of individuals in each zone.
ind_agg0 = cons.apply(lambda x: 1.0*ind_agg, axis=0).T[0:3].reset_index(drop=True)

In [82]:
# Now the fitting happens by iterative on zones and categories using nested loops
# Here we'll assign values to adapt to the age constraint
for i in range(0,n_zones):
    for j in range(0,n_age):
        index = ind_cat.iloc[:,j] == 1 # filter for selection
        weights1[index,i] = (weights[index, i] * con_age.iloc[i,j]) / ind_agg0.iloc[i, j]
    print weights1

[[ 1.33333333  1.          1.        ]
 [ 1.33333333  1.          1.        ]
 [ 4.          1.          1.        ]
 [ 1.33333333  1.          1.        ]
 [ 4.          1.          1.        ]]
[[ 1.33333333  2.66666667  1.        ]
 [ 1.33333333  2.66666667  1.        ]
 [ 4.          1.          1.        ]
 [ 1.33333333  2.66666667  1.        ]
 [ 4.          1.          1.        ]]
[[ 1.33333333  2.66666667  1.33333333]
 [ 1.33333333  2.66666667  1.33333333]
 [ 4.          1.          3.5       ]
 [ 1.33333333  2.66666667  1.33333333]
 [ 4.          1.          3.5       ]]


In [108]:
# Create additional ind_agg objects

ind_agg1 = (ind_agg0 * np.nan).copy()
ind_agg2 = (ind_agg0 * np.nan).copy()

# Assign values to the aggregated data after age constraints
for i in range(0,n_zones):
    ind_agg1.iloc[i] = ind_cat.apply(lambda x: x*weights1[:,i],axis=0).sum(axis=0)

In [117]:
# Check age constraints for each zone

# Simulated population each zone
ind_agg1.iloc[:,[0,1]].sum(axis=1)

0    12.0
1    10.0
2    11.0
dtype: float64

In [118]:
# Observed population in each zone
cons.iloc[:,[0,1]].sum(axis=1)

0    12
1    10
2    11
dtype: int64

In [129]:
# a simple way to check the goodness of fit is the Pearson correlation of a 1d representation of the data
# Initial weights
np.corrcoef(ind_agg0.values.ravel(),cons.values.ravel())[0,1]

-0.33686076842660762

In [131]:
# Age fitted weights
np.corrcoef(ind_agg1.values.ravel(),cons.values.ravel())[0,1]

0.62843395375330602

In [136]:
# Now constrain by sex
for i in range(0,n_zones):
    for j in range(0,n_sex+n_age):
        index = ind_cat.iloc[:,j] == 1 # filter for selection
        weights2[index, i] = weights1[index, i] * cons.iloc[i,j]/ind_agg1.iloc[i, j]

weights2

array([[ 1.2       ,  1.68421053,  0.64864865],
       [ 1.2       ,  1.68421053,  0.64864865],
       [ 3.6       ,  0.63157895,  1.7027027 ],
       [ 1.5       ,  4.36363636,  2.20689655],
       [ 4.5       ,  1.63636364,  5.79310345]])

In [147]:
# Get final values
for i in range(0,n_zones):
    ind_agg2.iloc[i] = ind_cat.apply(lambda x: x*weights2[:,i],axis=0).sum(axis=0)

ind_agg2

Unnamed: 0,a0_49,a50+,m,f
0,8.1,3.9,6.0,6.0
1,2.267943,7.732057,4.0,6.0
2,7.495806,3.504194,3.0,8.0


In [149]:
# Test correaltion
np.corrcoef(ind_agg2.values.ravel(),cons.values.ravel())[0,1]

0.99319919195100137

## Iterative Proportional Fitting with ipfn

In [471]:
# We can do the above easily using python ipfn package.
# It's a bit different to the equivalent R package ipfp functionality given by Lovelace.
# This example will calculate the weights for zone 1.

# The process acts on the dataframe of individual observations.
# We'll make a copy of the individuals to preserve the originals.
ind_copy = ind.copy()
# First we need to add a column that will hold the weights for each individual.
# Initial weights are all set to 1.
ind_copy['weight'] = np.ones(5)

# Now, we need to convert the age variable from the categorical data format  create by pd.cut() to string objects.
# The ipfn library can't handle categorical datatypes for some reason.
ind_copy['age'] = ind_copy['age'].tolist()

# Now get the aggregates (marginals) for zone 1 for age and sex.
age = cons.iloc[0,[0,1]]
sex = cons.iloc[0,[2,3]]

aggregates = [age,sex]
dimensions = [['age'],['sex']]

ipf = ipfn.ipfn(ind_copy,aggregates,dimensions,weight_col='weight',convergence_rate = 1e-15)
out = ipf.iteration()
out

ipfn converged: convergence_rate not updating or below rate_tolerance


Unnamed: 0,sex,age,id,weight
0,m,a50+,1,1.227998
1,m,a50+,2,1.227998
2,m,a0_49,3,3.544004
3,f,a50+,4,1.544004
4,f,a0_49,5,4.455996


In [475]:
# Set up individuals
ind_copy = ind.copy()
ind_copy['age'] = ind_copy['age'].tolist()

# Now let's do this for each zone
for i in range(0,n_zones):
    # Make weights column for zone i
    ind_copy['weight_' + str(i)] = np.ones(5)
    
    # Now get the aggregates (marginals) for zone 1 for age and sex.
    age = cons.iloc[i,[0,1]]
    sex = cons.iloc[i,[2,3]]
    
    # Do iterative proportional fitting
    ipf = ipfn.ipfn(ind_copy, [age,sex],[['age'],['sex']],weight_col='weight_'+str(i),convergence_rate = 1e-15)
    ind_copy = ipf.iteration()

ind_copy

ipfn converged: convergence_rate not updating or below rate_tolerance
ipfn converged: convergence_rate not updating or below rate_tolerance
ipfn converged: convergence_rate not updating or below rate_tolerance


Unnamed: 0,sex,age,id,weight_0,weight_1,weight_2
0,m,a50+,1,1.227998,1.725083,0.725083
1,m,a50+,2,1.227998,1.725083,0.725083
2,m,a0_49,3,3.544004,0.549834,1.549834
3,f,a50+,4,1.544004,4.549834,2.549834
4,f,a0_49,5,4.455996,1.450166,5.450166


In [487]:
# Check that the weights obtained make sense.
ind_agg3 = (ind_agg0 * np.nan).copy()

for i in range(0,n_zones):
    ind_agg3.iloc[i] = ind_cat.apply(lambda x: x*ind_copy['weight_'+str(i)],axis=0).sum(axis=0)

ind_agg3

Unnamed: 0,a0_49,a50+,m,f
0,8.0,4.0,6.0,6.0
1,2.0,8.0,4.0,6.0
2,7.0,4.0,3.0,8.0


In [486]:
# Compare above with constraints - success!
cons

Unnamed: 0,a0_49,a50+,m,f
0,8,4,6,6
1,2,8,4,6
2,7,4,3,8


# Integerisation

In [586]:
# The weights generated are fractional, to allcoate individuals to zones we need to convert these to integers.
# Ideally with a minimum loss of informtion.

# We'll start with a function for a method called 'proportional probabilities'.
def int_pp(weights):
    # convert to a vector if required
    xv = np.array(weights).ravel()
    # Sample the individuals
    rsum = round(xv.sum())
    xs = np.random.choice(len(xv),int(rsum),True,xv/xv.sum())
    # return the result
    return np.bincount(xs,minlength=len(xv))


In [605]:
# Test the function 
np.random.seed(24) # This seed reproduces the answer in Lovelace to give the intuitive result.
print int_pp([0.333,0.667,3])
print int_pp([1.333,1.333,1.333])

[0 1 3]
[1 1 2]


In [612]:
# Note that given the random nature of proportional probabilities, 
# unrepresentative arrangements have a non-zero chance of occurance.
# Lovelace and Ballas (2013) suggest a truncate, replicate, sample 'TRS' approach to deal with this.
# In effect this means that any individual with weight > 1 is sampled at least once.

def int_trs(weights):
    # convert to a vector if required
    xv = np.array(weights).ravel()
    # truncate - just get the integer part of the weight
    xint = np.floor(xv)
    # Get the decimal bit of the weight
    r = xv - xint
    # Work out the deficit population
    frac_sum = round(r.sum())
    # Sample based upon the deficit bit
    xs = np.random.choice(len(xv),int(frac_sum),True,r/r.sum())
    # Get the result of the deficit part
    topup = np.bincount(xs,minlength=len(xv))
    return xint + topup

In [632]:
# Test this function
np.random.seed(31) # This seed reproduces the answer in Lovelace.
print int_trs([0.333,0.667,3])
print int_trs([1.333,1.333,1.333])

[ 1.  0.  3.]
[ 1.  1.  2.]


In [640]:
# Now use the TRS approach to integerisation to generate some microdata for SimpleWorld
# First, get integer weights for area 1
int_weight1 = int_trs(ind_copy['weight_0'])
int_weight1

array([ 1.,  1.,  4.,  2.,  4.])

## Expansion

In [644]:
# Integerised weights correspond to the number of repetitions of a given individual.
# Firstly, expand the weights

def int_expand_vector(weights):
    return np.repeat(range(0,len(weights)),weights.astype(int))

print int_weight1
print int_expand_vector(int_weight1)

[ 1.  1.  4.  2.  4.]
[0 1 2 2 2 2 3 3 4 4 4 4]


In [648]:
# expand the indices for area 1
exp_indices = int_expand_vector(int_weight1)
# Generate the microdata from the individuals table
ind_orig.iloc[exp_indices]

Unnamed: 0,id,age,sex,income
0,1,59,m,2868
1,2,54,m,2474
2,3,35,m,2231
2,3,35,m,2231
2,3,35,m,2231
2,3,35,m,2231
3,4,73,f,3152
3,4,73,f,3152
4,5,49,f,2473
4,5,49,f,2473


In [655]:
# Now let's put the integeristation and expansion together for all areas

indivs = []
for i in range(0,n_zones):
    # Integerise and expand
    ints = int_expand_vector(int_trs(ind_copy['weight_'+str(i)]))
    # Select the relevant individuals
    temp = ind_orig.iloc[ints]
    temp['zone'] = i
    indivs.append(temp)
ints_df = pd.concat(indivs)
ints_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,id,age,sex,income,zone
0,1,59,m,2868,0
0,1,59,m,2868,0
0,1,59,m,2868,0
1,2,54,m,2474,0
2,3,35,m,2231,0
2,3,35,m,2231,0
2,3,35,m,2231,0
3,4,73,f,3152,0
4,5,49,f,2473,0
4,5,49,f,2473,0
