-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
442 lines (345 loc) · 12.3 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.colors
import pandas as pd
def get_reference_layout(figure):
    """Collect the x/y coordinates of the first two traces of a figure.

    Parameters
    ----------
    figure : object
        Object exposing a ``_data`` sequence whose first two entries can be
        indexed with ``'x'`` and ``'y'`` (presumably a plotly figure whose
        first two traces draw the graph -- verify against caller).

    Returns
    -------
    layout : list of dict
        Two dictionaries, one per trace, each holding the ``'x'`` and
        ``'y'`` entries of that trace.
    """
    layout = []
    # Only the first two traces are read; any further traces are ignored.
    for trace_idx in range(2):
        trace = figure._data[trace_idx]
        layout.append({'x': trace['x'], 'y': trace['y']})
    return layout
def get_filtered_values(v, indices):
    """Copy v and put NaNs in indices

    Parameters
    ----------
    v : ndarray,
        Values to copy. The input itself is left untouched.
    indices : list or iterable of indices, for which the value should be
        set to NaN

    Returns
    -------
    v_ : a copy of v, with NaNs at indices given by `indices`
    """
    v_ = v.copy()
    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was removed
    # in NumPy 2.0 and raises AttributeError there.
    v_[indices] = np.nan
    return v_
def get_cols_by_type():
    """Return the data frame column names grouped by kind.

    Returns
    -------
    cols : tuple
        Tuple of three lists, in this order:
            1. columns with numerical values,
            2. columns with general information,
            3. columns related to presidential election
    """
    # The numerical columns are assembled from thematic groups; the
    # concatenation order below defines the order of the returned list.
    totals = [
        'Personal income (thousands of dollars)',
        'Net earnings by place of residence',
        'Personal current transfer receipts',
        'Income maintenance benefits',
        'Unemployment insurance compensation',
        'Retirement and other',
        'Dividends, interest, and rent',
        'Population (persons)',
    ]
    per_capita = [
        'Per capita personal income',
        'Per capita net earnings',
        'Per capita personal current transfer receipts',
        'Per capita income maintenance benefits',
        'Per capita unemployment insurance compensation',
        'Per capita retirement and other',
        'Per capita dividends, interest, and rent',
    ]
    earnings = [
        'Earnings by place of work',
        'Wages and salaries',
        'Supplements to wages and salaries',
        'Employer contributions for employee pension and insurance funds',
        'Employer contributions for government social insurance',
        "Proprietors' income",
        "Farm proprietors' income",
        "Nonfarm proprietors' income",
    ]
    employment = [
        'Total employment (number of jobs)',
        'Wage and salary employment',
        'Proprietors employment',
        'Farm proprietors employment',
        'Nonfarm proprietors employment',
    ]
    averages = [
        'Average earnings per job (dollars)',
        'Average wages and salaries',
        "Average nonfarm proprietors' income",
    ]
    numerical = totals + per_capita + earnings + employment + averages

    general = ['year', 'state', 'county', 'fips', 'pres']
    election = ['republican', 'democrat', 'total_votes', 'n_electors',
                'winner']
    return numerical, general, election
def get_cols_for_mapper():
    """Return the column names fed into the Mapper algorithm.

    The columns were picked by comparing, for every column, its
    distribution across the different election years; those that differ
    over the years were kept (the selection was made by eye).

    Returns
    -------
    cols : list
        List of columns used for Mapper algorithm
    """
    mapper_cols = [
        # income totals
        'Personal income (thousands of dollars)',
        'Net earnings by place of residence',
        'Income maintenance benefits',
        'Unemployment insurance compensation',
        # per capita figures
        'Per capita personal income',
        'Per capita net earnings',
        'Per capita personal current transfer receipts',
        'Per capita income maintenance benefits',
        'Per capita unemployment insurance compensation',
        'Per capita retirement and other',
        'Per capita dividends, interest, and rent',
        # earnings
        'Earnings by place of work',
        "Proprietors' income",
        "Nonfarm proprietors' income",
        # employment
        'Total employment (number of jobs)',
        'Proprietors employment',
        'Farm proprietors employment',
        'Nonfarm proprietors employment',
        # averages
        'Average earnings per job (dollars)',
        'Average wages and salaries',
        "Average nonfarm proprietors' income",
    ]
    return mapper_cols
def get_data(df):
    """Build the scaled Mapper input from the data frame.

    Note that the Mapper columns of ``df`` are overwritten in place with
    their transformed values, so calling this twice on the same data frame
    transforms the data twice.

    Parameters
    ----------
    df : pandas data frame
        Must contain the columns returned by ``get_cols_for_mapper``.

    Returns
    -------
    data : ndarray
        Scaled relevant data values
    """
    mapper_cols = get_cols_for_mapper()

    # Shift by the magnitude of the global minimum (plus one) before taking
    # the logarithm.
    global_min = df[mapper_cols].min().min()
    shifted = df[mapper_cols] + abs(global_min) + 1
    df[mapper_cols] = shifted.apply(np.log)

    # Standardise every column to zero mean and unit standard deviation.
    df[mapper_cols] = StandardScaler().fit_transform(df[mapper_cols])

    return df[mapper_cols].values
def split_data_by_year(data, df):
    """Function to split entire data into different election years.

    Rows are selected by position, so row ``i`` of ``data`` is taken to
    belong to row ``i`` of ``df`` regardless of the data frame's index
    labels. The keys are derived from the values actually present in
    ``df['year']`` instead of a fixed list, so every slice is labelled with
    its own year even if the years are unordered or differ from
    2000-2016.

    Parameters
    ----------
    data : ndarray (n_samples x n_dim)
        Mapper input
    df : pandas data frame
        Data frame of entire data; must contain a ``'year'`` column with
        one entry per row of ``data``.

    Returns
    -------
    data_by_year : dict
        Dictionary with election year (as string, e.g. ``'2000'``) as key
        and corresponding economic data as values. Keys appear in order of
        first occurrence of the year in ``df``.
    """
    years = df['year'].to_numpy()
    data_by_year = {}
    for year in df['year'].unique():
        # Normalise numeric years (2000, 2000.0, '2000') to the same key.
        try:
            key = str(int(year))
        except (TypeError, ValueError):
            key = str(year)
        # Positional row numbers, independent of the data frame's index.
        rows = np.flatnonzero(years == year)
        data_by_year[key] = data[rows, :]
    return data_by_year
def log_transform_2d_filter_values(x):
    """Transformation of PCA values to obtain final filter.

    The first column is shifted so that its minimum becomes 1 and then
    log-transformed; the second column is replaced by the logarithm of its
    absolute value plus one. The array is modified in place and the same
    object is returned.

    Parameters
    ----------
    x : ndarray (n_shape x n_dim)
        Filter values; only the first two columns are transformed.

    Returns
    -------
    x_transformed : ndarray (n_shape x n_dim)
        Transformed PCA (the input array itself)
    """
    # Use the vectorised minimum rather than the builtin ``min``, which
    # walks the array element by element in Python.
    x[:, 0] = np.log(x[:, 0] - x[:, 0].min() + 1)
    x[:, 1] = np.log(np.abs(x[:, 1]) + 1)
    return x
def get_node_size(node_elements):
    """Count the data points contained in every node.

    Parameters
    ----------
    node_elements: tuple
        Tuple of arrays where array at position x contains the data points
        for node x

    Returns
    -------
    node_size : list
        List of node sizes, in the same order as ``node_elements``
    """
    return [len(members) for members in node_elements]
def get_node_summary(node_elements, data, summary_stat=np.mean):
    """Evaluate a summary statistic on the data of every node.

    Parameters
    ----------
    node_elements : tuple
        Tuple of arrays where array at position x contains the data points
        for node x
    data: ndarray
        Data to be used; it is indexed with each entry of
        ``node_elements``
    summary_stat : function
        Summary statistic applied to the selected data (mean by default)

    Returns
    -------
    node_summary : list
        List of summary statistics, one per node
    """
    node_summary = []
    for members in node_elements:
        node_summary.append(summary_stat(data[members]))
    return node_summary
def get_n_electors(node_elements, n_electors):
    """Function to calculate percentage of electors belonging to each node

    Parameters
    ----------
    node_elements : tuple
        Tuple of arrays where array at position x contains the data points
        for node x; the entries are used as positional indices into
        ``n_electors``
    n_electors : pandas series
        Pandas series of number of weighted electors per county

    Returns
    -------
    n_electors : list
        List of percentage of electors within a node (w.r.t. the total
        number of electors)
    """
    # The grand total does not depend on the node, so compute it only once
    # instead of once per node.
    total_electors = n_electors.sum()
    return [100 * n_electors.iloc[members].sum() / total_electors
            for members in node_elements]
def get_node_text(node_elements, n_electors, node_color, label):
    """Assemble the hover text shown for every node.

    Parameters
    ----------
    node_elements : dict-like
        Mapping from node id to the data points of that node; it is
        traversed with ``.items()``
    n_electors : iterable
        Percentage of weighted electors per node, in the same order as
        ``node_elements``
    node_color : list
        List of node colors (the value reported as the mean of ``label``)
    label: str
        Name of label (e.g. 'income')

    Returns
    -------
    node_text : list
        List of text for node labels, with lines separated by ``<br>``
    """
    node_text = []
    for (node_id, members), elector_pct, color_value in zip(
            node_elements.items(), n_electors, node_color):
        n_members = len(members)
        pieces = (
            f'Node Id: {node_id}<br>',
            f'Node size: {n_members}<br>',
            f'Percentage of Weighted Electors: {round(elector_pct, 2)}<br>',
            'Percentage of Weighted Electors per County: ',
            f'{round(elector_pct / n_members, 3)}<br>',
            f'Mean {label}: {color_value}',
        )
        node_text.append(''.join(pieces))
    return node_text
def get_subgraph(graph, vertices_to_remove):
    """Return a copy of a graph with some of its vertices removed.

    The input graph is not modified; the deletion is carried out on a copy.

    Parameters
    ----------
    graph : igraph object
    vertices_to_remove: List of vertices (defined by node id) to remove
        from graph

    Returns
    -------
    subgraph : igraph object
        An igraph object containing all but specified vertices (and
        corresponding edges)
    """
    pruned = graph.copy()
    pruned.delete_vertices(vertices_to_remove)
    return pruned
def get_county_plot_data(graph, df, col, cmap):
    """Prepare the per-county colouring for a map of the US.

    Every county receives the average, over all nodes it belongs to, of the
    node summaries of ``col``.

    Parameters
    ----------
    graph : igraph object
        Graph whose ``['node_metadata']['node_elements']`` entry lists the
        rows contained in every node
    df : pandas data frame
    col : str
        Column to base color of map on
    cmap : colormap
        Callable mapping a list of values to an array of colors, of which
        the first three channels are used

    Returns
    -------
    data : tuple
        Tuple of list of color of a node (numerical value) and list of
        colors to use
    """
    node_elements = graph['node_metadata']['node_elements']

    # Running sum of node values and number of nodes per county.
    accumulator = pd.DataFrame(np.zeros((df.shape[0], 2)),
                               columns=['color_sum', 'n_counties'])
    accumulator = accumulator.astype({'n_counties': 'int'})

    node_values = get_node_summary(node_elements, df[col])
    for rows, val in zip(node_elements, node_values):
        accumulator.loc[rows] = accumulator.loc[rows] + [val, 1]

    # Average node value per county; used both as the numerical color and
    # to look up the distinct hex colors.
    county_value = accumulator['color_sum'] / accumulator['n_counties']
    distinct_values = county_value.sort_values().unique().tolist()
    colors = [matplotlib.colors.rgb2hex(rgb)
              for rgb in cmap(distinct_values)[:, :3]]

    return (county_value.tolist(), colors)
def get_regions():
    """Return the hard-coded partition of node ids into regions.

    Returns
    -------
    regions : dict
        Dictionary with region id as key and sets of node ids belonging to
        them
    """
    regions = {}
    regions[0] = {1, 2, 7, 9, 13, 18, 24, 30, 37, 44, 45, 46, 47, 48, 49,
                  50, 51, 52, 53, 54, 55}
    regions[1] = {41, 42, 43}
    regions[2] = {38, 39, 40}
    regions[3] = {25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36}
    regions[4] = {14, 15, 16, 17, 19, 20, 21, 22, 23}
    regions[5] = {0, 3, 4, 5, 6, 8, 10, 11, 12}
    return regions
def get_data_per_region(regions, node_elements):
    """Function to assign to each region the data points enclosed in it.

    Parameters
    ----------
    regions : dict
        Dictionary of regions with ids as keys and set of nodes as values
    node_elements: tuple
        Tuple of arrays where array at position x contains the data points
        for node x

    Returns
    -------
    data_per_region : dict
        Dictionary with region ids as keys and corresponding data point ids
        (as a set) as values. A region without nodes maps to an empty set.
    """
    # The data points of a region are the union of the data points of all
    # of its nodes. Starting from an empty set keeps this well defined for
    # a region that contains no node at all (``set.union()`` without any
    # argument raises TypeError).
    return {region_id: set().union(*(node_elements[node] for node in nodes))
            for region_id, nodes in regions.items()}
def hex2rgb(hex_colors):
    """Convert hexadecimal color codes into RGB triples.

    Parameters
    ----------
    hex_colors : list
        List of hexa color codes of the format '#xxyyzz'

    Returns
    -------
    rgb_colors : list
        List of tuples representing RGB codes, each channel being an
        integer parsed from two hexadecimal digits
    """
    rgb_colors = []
    for hex_color in hex_colors:
        digits = hex_color.lstrip('#')
        # Two hexadecimal digits per channel: red, green, blue.
        channels = tuple(int(digits[start:start + 2], 16)
                         for start in (0, 2, 4))
        rgb_colors.append(channels)
    return rgb_colors
def mean_rgb(rgb_vals):
    """Average a list of equally long tuples entry by entry.

    Parameters
    ----------
    rgb_vals : list
        List of tuples of same length

    Returns
    -------
    mean_rgb : tuple
        Tuple containing the mean of each entry, converted to ``int``
    """
    # Mean over the list axis, i.e. one value per tuple position.
    channel_means = np.mean(rgb_vals, axis=0)
    return tuple(int(channel) for channel in channel_means)
def get_small_cluster_ids():
    """Return the hard-coded node ids of singletons/small clusters.

    Returns
    -------
    ids : list
        List of node ids of singletons/small clusters
    """
    small_cluster_ids = [
        45, 18, 1, 7, 52, 55, 50,
        49, 46, 51, 47, 30, 2, 44,
        37, 54, 53, 9, 48, 13, 24,
    ]
    return small_cluster_ids