-
Notifications
You must be signed in to change notification settings - Fork 16
/
how to map the european social survey.R
1233 lines (899 loc) · 37.3 KB
/
how to map the european social survey.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# # # # # # # # # # # # # # # # #
# # set the working directory # #
# # # # # # # # # # # # # # # # #
# setwd( "C:/My Directory/SWMAP/" )
# # # # # # # # # # # # # # # #
# # example survey data set # #
# # # # # # # # # # # # # # # #
# european social survey
# # # # # # # # # # # # # # # # # # # # #
# # different from other maps because # #
# # # # # # # # # # # # # # # # # # # # #
# displays an ordinal categorical variable
# approximates (unweighted) centroid calculations
# # # # # # # # # # # # # # # # # #
# # smallest level of geography # #
# # # # # # # # # # # # # # # # # #
# varying regions across multiple countries
# # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # asdfree.com blog post for this survey microdata # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # #
# http://www.asdfree.com/search/label/european%20social%20survey%20%28ess%29
# # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # r code repository for setup and analysis examples # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# https://github.com/ajdamico/asdfree/tree/master/European%20Social%20Survey
# # # # # # # # # # # # #
# # value of interest # #
# # # # # # # # # # # # #
# average hours of television watched every weekday
# # # # # # #
# # flaws # #
# # # # # # #
# map implies huge fluctuation in television viewing time
# when really, almost everybody watches about two hours of tv.
# # # # # # # # # # # # # # # # # # # # #
# # step 1: load the survey microdata # #
library(downloader)
# download the 2012 european social survey microdata onto the local disk
# note that this requires (free) registration before the download will work
# http://www.europeansocialsurvey.org/user/new
# replace this placeholder with the e-mail address you registered with the ess
your.email <- "email@address.com"
# run the asdfree download-automation script; it reads `your.email` from the
# global environment and writes one folder per survey year into the working directory
# NOTE(review): assumes the working directory is writable and has been set beforehand
source_url( "https://raw.github.com/ajdamico/asdfree/master/European%20Social%20Survey/download%20all%20microdata.R" , prompt = FALSE , echo = TRUE )
# # end of step 1 # #
# # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # step 2: conduct your analysis of interest at the smallest geography allowed # #
library(survey)
library(stringr)
# following the analysis examples in the r code repository --
# # https://github.com/ajdamico/asdfree/blob/master/European%20Social%20Survey/replication.R
# -- calculate the average number of hours of television watched
# at the smallest available geographic area, across all available countries
# load the complete integrated file as a data.frame `x`
# (the `.rda` saved by the step-1 script is assumed to contain an object named `x`)
load( "./2012/integrated.rda" )
# skip israel because it's too far away from the others to map
integrated <- subset( x , cntry != 'IL' )
# and also because it does not currently include a `SDDF`
# skip iceland because it's also on the border and because
integrated <- subset( integrated , cntry != 'IS' )
# its results in the final map are boring. it majorly decreases
# the map's resolution while not adding much of a story on its own
# build the pooled `SDDF` (sample design data file): one data.frame holding
# the clustering, strata, and selection-probability variables from every country.
# note: the original version grew `sddf` with `rbind( sddf , ... )` inside a
# `for` loop, which re-copies the accumulated data.frame on every iteration
# (quadratic cost) and also let `load()` clobber the global `x` that still
# held the integrated microdata.  `lapply` plus a single `do.call( rbind , ... )`
# stacks the pieces once, and loading into a private environment leaves the
# global workspace untouched.  the resulting `sddf` has the same rows in the
# same country order as before.
sddf <-
	do.call(
		rbind ,
		lapply(
			unique( integrated$cntry ) ,
			function( j ){
				# find the filepath of the `SDDF` file within the current country's 2012 folder
				sddf.tn <- grep( 'SDDF' , list.files( paste0( "./2012/" , j ) ) , value = TRUE )
				# fail loudly (instead of constructing a broken path) if the folder
				# does not contain exactly one `SDDF` file
				stopifnot( length( sddf.tn ) == 1 )
				# load that `.rda` file into a private environment so the `x` it
				# contains cannot overwrite anything in the global workspace
				e <- new.env()
				load( paste0( './2012/' , j , '/' , sddf.tn ) , envir = e )
				# keep only the merge keys and the complex-sample design variables
				e$x[ , c( 'cntry' , 'idno' , 'psu' , 'stratify' , 'prob' ) ]
			}
		)
	)
# merge the complete european social survey integrated file
# with this complex sample information
# NOTE(review): `merge()` with no `by =` keys on every shared column name --
# presumably `cntry` and `idno` uniquely identify respondents; confirm that
# no other column names overlap between the two data.frames
y <- merge( integrated , sddf )
# confirm that zero records have been lost at sea.
stopifnot( nrow( y ) == nrow( integrated ) )
# construct a complex sample survey design object
# psu within stratum, with sampling probabilities; `nest = TRUE` because
# psu identifiers repeat across strata and must be treated as nested
integrated.design <-
svydesign(
ids = ~psu ,
strata = ~stratify ,
probs = ~prob ,
data = y ,
nest = TRUE
)
# set R to produce conservative standard errors instead of crashing
# http://r-survey.r-forge.r-project.org/survey/exmample-lonely.html
options( survey.lonely.psu = "adjust" )
# this setting matches the MISSUNIT option in SUDAAN
# and now, in the blink of an eye, calculate the median category of
# hours of television watched per weekday. `tvtot` is an ordinal variable:
# average television viewing in half-hour intervals. computing its median
# across almost every region available in the european social survey
# does throw out some information, but it's probably a reasonable
# proxy for cross-continental high versus low television viewership areas
smallest.area.statistics <-
svyby(
~ tvtot ,
~ region ,
integrated.design ,
svyquantile ,
0.5 ,
ties = "rounded" ,
interval.type = "betaWald" ,
na.rm = TRUE ,
ci = TRUE
)
# these are the statistics to be mapped
print( smallest.area.statistics )
# the standard errors are a measure of precision,
# their inverse will serve as the mapping weights
# but wait, problem problem awooooogah
summary( smallest.area.statistics )
# some of the standard errors are missing.
# that will lead to missing weights, a no-no.
# replace all missing standard errors with the non-zero minimum
# (i.e. the smallest se observed among regions where it could be computed)
smallest.area.statistics[ is.na( smallest.area.statistics$se ) , 'se' ] <-
min( subset( smallest.area.statistics , !is.na( se ) )$se )
# excellent, now everybody has a weight
summary( smallest.area.statistics )
# but wait, another problem problem awooooogah
# many of the relative standard errors (the se divided by the statistic)
# are greater than 30%, which is often considered a threshold that's too uncertain.
regions.with.high.rse <-
smallest.area.statistics[
smallest.area.statistics$se / smallest.area.statistics$tvtot > 0.3 ,
'region'
]
# many of the regions also have very low unweighted counts
# `unwtd.count` returns the raw number of respondents per region
unwtd.by.region <-
svyby(
~ tvtot ,
~ region ,
integrated.design ,
unwtd.count
)
# identify regions with less than 50 surveyed respondents
regions.with.low.unwtd <-
unwtd.by.region[
unwtd.by.region$count < 50 ,
'region'
]
# combine both of these too-uncertain collections of regions
# into a single vector identifying what areas are just too unstable to display
# (`as.character` because `region` may be a factor; `unique` removes overlap)
unstable.regions <-
unique( c( as.character( regions.with.high.rse ) , as.character( regions.with.low.unwtd ) ) )
# # warning note warning of critical importance
# # just because a geography is available in your microdata
# # does not mean you can mindlessly throw it on a map.
# # you need to trust your statistics at small-areas,
# # otherwise you need to aggregate them until they become stable.
# # so aggregate them we shall.
# within each country, figure out the number of
# regions that need to be aggregated.
# (the first two characters of each region code are the country code)
table( substr( unstable.regions , 1 , 2 ) )
# from this table, *most* countries that have
# at least one unstable region have at least two.
# but switzerland (CH) and the netherlands (NL) do not.
# so we're going to enforce that the
# second-smallest-sample-sizes in both regions also
# be considered unstable
subset( unwtd.by.region , substr( region , 1 , 2 ) %in% c( 'CH' , 'NL' ) )
# looks like CH07 should be grouped with CH06
# and NL23 should be grouped with NL34.
# so let's manually add them to the unstable regions vector.
# NOTE(review): the trailing spaces in "NL34 " and "CH06 " are intentional --
# region codes in the microdata appear to be blank-padded (see the
# `str_trim()` call applied later when building `NUTS_ID`); confirm the padding
unstable.regions <- c( unstable.regions , "NL34 " , "CH06 " )
# note: this is not always ideal. maybe you'd prefer
# to just blank out one region and keep the other pure
# rather than aggregate both and potentially dilute
# any useful findings from the larger region.
# now that we've identified which regions potentially provide
# unstable estimates, we can aggregate those areas and re-calculate.
# `aggregion` collapses each unstable region to its two-letter country code
integrated.design <-
update(
integrated.design ,
aggregion =
ifelse(
# if the region is one causing unstable estimates
region %in% unstable.regions ,
# aggregate it with other unstable estimates in the same country
substr( as.character( region ) , 1 , 2 ) ,
# otherwise let it be.
as.character( region )
)
)
# re-compute the `tvtot` estimates,
# but this time using the `aggregion`
# variable that we hope will provide more reliable estimates
smallest.safe.statistics <-
svyby(
~ tvtot ,
~ aggregion ,
integrated.design ,
svyquantile ,
0.5 ,
ties = "rounded" ,
interval.type = "betaWald" ,
na.rm = TRUE ,
ci = TRUE
)
# suddenly the relative standard errors look a bit more stable. nothing above 30%.
sum( smallest.safe.statistics$se / smallest.safe.statistics$tvtot > 0.3 , na.rm = TRUE )
# extract the region x aggregated region columns from the survey design object
reg.aggreg.xwalk <- unique( integrated.design$variables[ , c( 'region' , 'aggregion' ) ] )
# keeping one record per crosswalked value.
# merge our final computations with the region-aggregion crosswalk
# so each original region re-appears, carrying its (possibly pooled) estimate
sas <- merge( smallest.safe.statistics , reg.aggreg.xwalk )
# confirm that `sas` contains the same number of records
# (one record per region) as the object with unstable estimates
stopifnot( nrow( smallest.area.statistics ) == nrow( sas ) )
# trim the regions, use the variable name
# that eurostat uses on its shapefiles
sas$NUTS_ID <- str_trim( sas$region )
# and remove the aggregated region variable
sas$aggregion <- NULL
# whoops don't forget to replace all missing standard errors with the non-zero minimum
sas[ is.na( sas$se ) , 'se' ] <-
min( subset( sas , !is.na( se ) )$se )
# # end of step 2 # #
# # # # # # # # # # #
# # # # # # # # # # # # # # # #
# # step 3: choose outlines # #
library(raster)
library(rgdal)
library(maptools)
library(downloader)
# pull in the `download_cached` helper used for all shapefile downloads below
source_url(
"https://raw.github.com/ajdamico/asdfree/master/Download%20Cache/download%20cache.R" ,
prompt = FALSE ,
echo = FALSE
)
# note: to re-download a file from scratch, add the parameter usecache = FALSE
# # # map of the world # # #
# initiate a temporary file
# (this same `tf` is re-used for every zipped download in the rest of the script)
tf <- tempfile()
# use eurostat's map of the world
world.fn <- "http://ec.europa.eu/eurostat/cache/GISCO/geodatafiles/CNTR_2014_03M_SH.zip"
# store it to the local disk
download_cached( world.fn , tf )
# unzip it
world.uz <- unzip( tf , exdir = tempdir() )
# identify the shapefile
# (the country polygons live in the file whose name starts with CNTR_RG)
world.sfn <- grep( 'CNTR_RG(.*)shp$' , world.uz , value = TRUE )
# read it in
world.shp <- readOGR( world.sfn , layer = gsub( "\\.shp" , "" , basename( world.sfn ) ) )
# here's the outline of every country in the world
plot(world.shp)
# # # map of europe # # #
# use eurostat's map of europe
eu.fn <- "http://ec.europa.eu/eurostat/cache/GISCO/geodatafiles/NUTS_2010_03M_SH.zip"
# store it to the local disk
download_cached( eu.fn , tf )
# unzip it
eu.uz <- unzip( tf , exdir = tempdir() )
# identify the shapefile
eu.sfn <- grep( 'NUTS_RG(.*)shp$' , eu.uz , value = TRUE )
# read it in
eu.shp <- readOGR( eu.sfn , layer = gsub( "\\.shp" , "" , basename( eu.sfn ) ) )
# here's the outline of every country in europe
plot( eu.shp )
# including all administrative regions
# here's the same plot, but only using national borders.
# (STAT_LEVL_ == 0 is the country level of the nuts hierarchy)
plot( subset( eu.shp , STAT_LEVL_ == 0 ) )
# # # map of eu countries available in the european social survey # # #
# keep only the nuts regions that actually appear in the analysis file `sas`
matches.shp <- subset( eu.shp , NUTS_ID %in% unique( sas$NUTS_ID ) )
plot( matches.shp , col = 'red' )
# the canary islands (spanish land off of the coast of morocco)
# have about two million inhabitants (6x the population of iceland)
# but they're isolated from all other landmasses
# (iceland has two regions, the canary islands has just one)
# re-draw the map without them..
plot( subset( matches.shp , NUTS_ID != 'ES70' ) , col = 'red' )
# ..alright, it looks a bit better, so i'm going to throw them out.
# store this shapefile
matnci.shp <- subset( matches.shp , NUTS_ID != 'ES70' )
# matches with no canary islands.
# plot the international borders on top of
plot( world.shp , add = TRUE )
# so look at that. there are some glaring omissions
# european union members like austria, greece, latvia did not participate
# and then there are some regions (like corsica and that italian province of molise)
# that are also missing. we'll deal with those troublemakers later.
# but guess what?
# list the region codes in `sas` that the eurostat shapefile could not supply
nonmatches <- unique( sas[ !( sas$NUTS_ID %in% matnci.shp@data$NUTS_ID ) , 'NUTS_ID' ] )
# the european social survey includes certain non-european states
# so let's fetch their provincial boundaries as well
print( nonmatches )
# # # add albania, ukraine, russian federation, kosovo # # #
# pulling from the administrative region data at http://www.gadm.org/
# specify the filenames of albania, ukraine, kosovo, and russian shapefiles
# NOTE(review): 'KO-' is the code this mirror uses for kosovo -- confirm the url still resolves
ab.fn <- 'http://biogeo.ucdavis.edu/data/diva/adm/ALB_adm.zip'
uk.fn <- 'http://biogeo.ucdavis.edu/data/diva/adm/UKR_adm.zip'
ko.fn <- 'http://biogeo.ucdavis.edu/data/diva/adm/KO-_adm.zip'
ru.fn <- 'http://biogeo.ucdavis.edu/data/diva/adm/RUS_adm.zip'
# albania #
download_cached( ab.fn , tf )
alb.files <- unzip( tf , exdir = tempdir() )
# there are four administrative files to choose from.
alb.files
# you want the regions, which are stored in `adm1`
alb.shp <- grep( 'adm1(.*)shp$' , alb.files , value = TRUE )
# read it in
alb <- readOGR( alb.shp , layer = gsub( "\\.shp" , "" , basename( alb.shp ) ) )
# compare the region names (in the ess codebook)
# with the region names in the shapefile
alb@data
# tack on the region identifiers
# NOTE(review): this hand-built vector maps shapefile row order to ess region
# codes -- it silently breaks if the shapefile's record order ever changes
alb@data$NUTS_ID <- paste0( 'AL' , c( '01' , '09' , '02' , '03' , '04' , '05' , '06' , '07' , '08' , '10' , '11' , '12' ) )
# re-map everything, suddenly we've got a reddened albania
# (across the adriatic sea from the bootheel of italy)
plot( matnci.shp , col = 'red' )
plot( world.shp , add = TRUE )
plot( alb , add = TRUE , col = 'red' )
# kosovo #
download_cached( ko.fn , tf )
kos.files <- unzip( tf , exdir = tempdir() )
# there are three administrative files to choose from.
kos.files
# you want the regions, which are stored in `adm1`
kos.shp <- grep( 'adm1(.*)shp$' , kos.files , value = TRUE )
# read it in
kos <- readOGR( kos.shp , layer = gsub( "\\.shp" , "" , basename( kos.shp ) ) )
# compare the region names (in the ess codebook)
# with the region names in the shapefile
kos@data
# tack on the region identifiers
# (again, hand-matched to the shapefile's record order -- order-sensitive)
kos@data$NUTS_ID <- paste0( 'XK' , c( 4 , 5 , 2 , 6 , 1 , 3 , 7 ) )
# still have your map loaded?
# make kosovo (northeast of albania) red.
plot( kos , add = TRUE , col = 'red' )
# ukraine #
download_cached( uk.fn , tf )
ukr.files <- unzip( tf , exdir = tempdir() )
# there are three administrative files to choose from.
ukr.files
# you want the regions, which are stored in `adm1`
ukr.shp <- grep( 'adm1(.*)shp$' , ukr.files , value = TRUE )
# read it in
ukr <- readOGR( ukr.shp , layer = gsub( "\\.shp" , "" , basename( ukr.shp ) ) )
# compare the region names (in the ess codebook)
# with the region names in the shapefile
ukr@data
# in the ukrainian shapefile, this is the order of UA01 - UA26
ukr.ord <- c( 4 , 24 , 25 , 5 , 6 , 27 , 23 , 26 , 7 , 11 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 21 , 22 , 8 , 9 , 10 , 1 , 3 , 2 , 12 )
# tack the regions on to the ukrainian shapefile
# rows whose index is absent from `ukr.ord` are left with NUTS_ID = NA
ukr@data[ ukr.ord , 'NUTS_ID' ] <- paste0( 'UA' , str_pad( 1:26 , 2 , pad = '0' ) )
# throw out regions not included in the ess
# on the southern tip of crimea
ukr <- subset( ukr , !is.na( NUTS_ID ) )
# look at what you've got
plot( ukr , add = TRUE , col = 'red' )
# russia #
# http://www.europeansocialsurvey.org/docs/round6/fieldwork/russian_federation/ESS_region_variable_in_the_russian_federation.pdf
download_cached( ru.fn , tf )
rus.files <- unzip( tf , exdir = tempdir() )
# there are three administrative files to choose from.
rus.files
# you want the regions, which are stored in `adm1`
rus.shp <- grep( 'adm1(.*)shp$' , rus.files , value = TRUE )
# read it in
rus <- readOGR( rus.shp , layer = gsub( "\\.shp" , "" , basename( rus.shp ) ) )
# pull in the russian regional crosswalk that i made by hand just for you
# (this re-uses `tf`, overwriting the previously downloaded zip)
download( "https://raw.githubusercontent.com/davidbrae/swmap/master/2012%20ESS%20crosswalk%20of%20Russian%20Regions.csv" , tf )
# http://www.europeansocialsurvey.org/docs/round6/fieldwork/russian_federation/ESS_region_variable_in_the_russian_federation.pdf
rus.xwalk <- read.csv( tf )
# tack on the various NUTS_IDs for each of the available regions
# matching by the gadm province identifier `ID_1`; provinces missing from
# the crosswalk receive NA
rus@data$NUTS_ID <- rus.xwalk[ match( rus@data$ID_1 , rus.xwalk$ID_1 ) , 'region' ]
# # want to explore this a bit? sure you do # #
# lop off the easternmost siberian province so this map doesn't wrap around
rusnc <- subset( rus , ID_1 != 2524 )
# russia without the far far eastern tip
plot(rusnc)
# plot all eight regions so you can see what this landmass looks like
# (`str_trim` because the crosswalk's region codes carry trailing blanks)
for ( i in 1:8 ) plot( subset( rus , str_trim( NUTS_ID ) == paste0( 'RU1' , i ) ) , add = TRUE , col = rainbow(8)[i] )
# but that's not terribly interesting,
# and the siberian sample sizes are tiny.
# the main point of this survey is continental europe,
# so the principle here will be: center on the continent
# and if parts of russia get colored on the edge of the map, cool.
# # exploration end # #
# re-map everything, suddenly we've got lot more available geographies
plot( matnci.shp , col = 'red' )
plot( world.shp , add = TRUE )
plot( alb , add = TRUE , col = 'red' )
plot( kos , add = TRUE , col = 'red' )
plot( ukr , add = TRUE , col = 'red' )
plot( rus , add = TRUE , col = 'red' )
# but it's still not as tight of a map as it could be
# # here's the goal # #
# russia is going to be off the map, no question
# but all of the other countries should be visible entirely.
# using the current bounding box of this shape,
plot( 1 ,
type = 'n' , axes = FALSE , xlab = "" , ylab = "" ,
xlim = bbox( matnci.shp )[ 1 , ] ,
ylim = bbox( matnci.shp )[ 2 , ]
)
# ukraine gets cut off.
plot( ukr , add = TRUE , col = 'red' )
# yes, russia gets cut off too, but we probably have to live with that
plot( rus , add = TRUE , col = 'red' )
# let's re-calculate the bounding box with ukraine
# so that it's not cut off.
# we can skip this for kosovo and albania
# because they are both *within* the bounding box
# created by including cyprus.
# start with the main europe map,
# transform its projection to standard longlat
ess.shp <- spTransform( matnci.shp , CRS( "+proj=longlat" ) )
# repeat this conversion for ukraine
ukr.ll <- spTransform( ukr , CRS( "+proj=longlat" ) )
# keep only the NUTS identifier in both shapefiles
# (rbind on Spatial*DataFrame objects requires identical attribute columns)
ess.shp@data <- ess.shp@data[ "NUTS_ID" ]
ukr.ll@data <- ukr.ll@data[ "NUTS_ID" ]
# merge both shapes
ess.shp <- rbind( ess.shp , ukr.ll )
# now you could plot it on its own..
plot( ess.shp )
# ..but the purpose of this was to determine
# (grow the combined extent by 5% so borders don't touch the frame)
( bb <- bbox( as( 1.05 * extent( ess.shp ), "SpatialPolygons" ) ) )
# the latitude and longitude limits that make sense.
# initiate the plot using that new ukrainian-extended bounding box
plot( 1 ,
type = 'n' , axes = FALSE , xlab = "" , ylab = "" ,
xlim = bb[ 1 , ] ,
ylim = bb[ 2 , ]
)
# add in every shape, one by one.
# no color for the world map
plot( world.shp , add = TRUE )
# then add red to all regions with ess microdata.
plot( matnci.shp , col = 'red' , add = TRUE )
plot( alb , add = TRUE , col = 'red' )
plot( kos , add = TRUE , col = 'red' )
plot( ukr , add = TRUE , col = 'red' )
plot( rus , add = TRUE , col = 'red' )
# # end of step 3 # #
# # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # step 4: merge the results of your survey analysis with the small-area geography # #
library(rgeos)
library(sqldf)
# # note: this uses the unweighted centroid,
# # rather than the population-weighted centroid.
# # population-weighted centroids aren't too difficult
# # to calculate (see the other mapping scripts for examples)
# # but require that you merge on the smallest possible area
# # and then distribute the small area statistics to even smaller geographies.
# # the population estimates *within* each of these regions
# # would mostly be the NUTS3 estimate from
# # each country's most recent decennial census.
# # but it's a lot of work with relatively little payoff for a
# # continent-wide map, so this map works around it.
# # it's not perfect, though. you do experience issues like
# # the centroid of norway's northernmost province is in sweden.
# calculate the centroids of each administrative boundary
# (`byid = TRUE` returns one centroid per polygon rather than one overall)
matnci.cen <- gCentroid( matnci.shp , byid = TRUE )
# you can have a little detour to see what these look like
plot( matnci.cen )
plot( matnci.shp , add = TRUE )
# those look pretty central, huh?
# additionally calculate the centroids
# of albania, kosovo, and ukraine
alb.cen <- gCentroid( alb , byid = TRUE )
kos.cen <- gCentroid( kos , byid = TRUE )
ukr.cen <- gCentroid( ukr , byid = TRUE )
# now russia is different from the others.
rus.cen <- gCentroid( rus , byid = TRUE )
# for each of the previous centroid calculations,
# you had one centroid per small area statistic
# but with russia, you have about ten.
# we'll have to deal with this later.
# # # construct the object containing values at every point # # #
# initiate a function that combines centroids with `tvtot` and `se`
# but also calculates the number of instances of each NUTS_ID.
#   cen: centroids (one per shapefile record); coerced with data.frame() to x/y columns
#   shp: the matching Spatial*DataFrame whose @data carries `NUTS_ID`
# returns a data.frame with one row per centroid: NUTS_ID, x, y, tvtot, se,
# and `nc` = the number of shapefile records sharing that NUTS_ID
# (used later to split a region's weight across its multiple centroids).
# reads the small-area statistics from the global `sas` built in step 2.
cte <-
	function( cen , shp ){
		out <- data.frame( cen )
		out$NUTS_ID <- shp@data$NUTS_ID
		# attach the survey estimate and its standard error by NUTS identifier
		out$tvtot <- sas[ match( shp@data$NUTS_ID , sas$NUTS_ID ) , 'tvtot' ]
		out$se <- sas[ match( shp@data$NUTS_ID , sas$NUTS_ID ) , 'se' ]
		# count records per NUTS_ID with base `table()` instead of a sqldf
		# round-trip -- no sql engine needed for a simple group-by count.
		# `useNA = "ifany"` keeps unmatched (NA) identifiers as their own group,
		# mirroring sql's treatment of a NULL key
		counts <- table( out$NUTS_ID , useNA = "ifany" )
		nc <- data.frame( NUTS_ID = names( counts ) , nc = as.integer( counts ) , stringsAsFactors = FALSE )
		# merge the per-identifier counts back onto the point-level table
		merge( out , nc )
	}
matnci.vals <- cte( matnci.cen , matnci.shp )
alb.vals <- cte( alb.cen , alb )
kos.vals <- cte( kos.cen , kos )
ukr.vals <- cte( ukr.cen , ukr )
rus.vals <- cte( rus.cen , rus )
# note that *only* russia has multiple centroids per NUTS_ID
head( rus.vals )
# all of the other data.frame objects have a `nc` field of one
unique( c( matnci.vals$nc , alb.vals$nc , kos.vals$nc , ukr.vals$nc ) )
# see? seeeee? i told you.
# stack each of these values data.frame objects together
# NOTE(review): this re-uses the name `x`, clobbering the microdata data.frame
# loaded in step 2 -- deliberate here, but worth knowing if you re-run sections
x <- rbind( matnci.vals , alb.vals , kos.vals , ukr.vals , rus.vals )
# alright. remember that the standard error (the `se` field) is a measure of precision.
print( x )
# the smaller the standard error, the more confident you should be
# that the estimate at a particular geography is correct.
# so invert it. you heard me. invert it.
x$invse <- 1 / x$se
# a smaller standard error indicates more precision.
# for our purposes, precision can be considered weight! #
# we also blankly distributed our values and standard errors
# across the multi-province regions in russia without
# accounting for multiple regions per province. let's do that now.
# dividing by `nc` splits each region's weight evenly among its centroids
x$weight <- x$invse / x$nc
# so we have quite a few more russian centroids,
# but each of them has far lower weight than other points on our map
# as mentioned before, this isn't as ideal. for example:
# st. petersburg gets exactly one eleventh of the total
# "north-western" weight because there are eleven
# administrative regions in the north west.
# that's a rough cut. this effect might be even more biased
# for kaliningrad (the russian enclave west of lithuania)
# which is also in the north-western region
# if you'd like to calculate population-weighted centroids
# for all of these countries, it's just a lot of manual labor
# note that this distributed weight sums to
# the `invse` on the original analysis file
stopifnot( all.equal( sum( x$weight ) , sum( 1 / subset( sas , NUTS_ID != 'ES70' )$se ) ) )
# don't forget to pull out the weight of the canary islands as well!
# once you do that, the sums of the inverted, divided standard errors match up
# scale all weights so that they average to one
x$weight <- x$weight / mean( x$weight )
# you're done preparing your data.
# keep only the columns you need.
x <- x[ , c( 'x' , 'y' , 'tvtot' , 'weight' ) ]
# # end of step 4 # #
# # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # #
# # step 5: decide on your map parameters # #
library(ggplot2)
library(scales)
library(mapproj)
# before you ever touch surface smoothing or kriging,
# make some decisions about how you generally want
# your map to look: the projection and coloring
# the options below simply use hadley wickham's ggplot2
# with the region-level television viewing rates and centroids
# initiate the simple map
# (each centroid becomes one colored point)
eu.map <-
qplot(
x ,
y ,
data = x ,
colour = tvtot ,
xlab = NULL ,
ylab = NULL
)
# look at that.. russia is still throwing things off
eu.map
# set the bounding box limits that
# you and i agreed to earlier in the script
# oh, also, remove all map crap
eu.map <-
eu.map +
scale_x_continuous( limits = bb[ 1 , ] , breaks = NULL ) +
scale_y_continuous( limits = bb[ 2 , ] , breaks = NULL ) +
theme(
legend.position = "none" ,
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
axis.ticks = element_blank()
)
# the asian part of russia has now been cut off
eu.map
# print the map with an albers projection.
# (reference latitudes taken from the data's own north/south extremes)
eu.map + coord_map( project = "albers" , lat0 = min( x$y ) , lat1 = max( x$y ) )
# see ?mapproject for a zillion alternatives
# if you like that projection, store it in the map object.
eu.map <-
eu.map + coord_map( project = "albers" , lat0 = min( x$y ) , lat1 = max( x$y ) )
# check out some purty colors.
eu.map + scale_colour_gradient( low = 'green' , high = 'red' )
eu.map + scale_colour_gradient( low = 'white' , high = 'blue' )
eu.map + scale_colour_gradient( low = muted( 'blue' ) , high = muted( 'red' ) )
# clear up RAM
rm( eu.map ) ; gc()
# # end of step 5 # #
# # # # # # # # # # #
# # # # # # # # # # #
# # step 6: krige # #
# note that because there are less than five hundred points
# that we're kriging across, smaller computers can handle
# this particular kriged regression without knot tying
nrow( x )
# interpolation option one #
library(fields)
# fit a kriging model: centroid coordinates predict `tvtot`,
# with each point's contribution scaled by the precision-based `weight`
krig.fit <-
Krig(
cbind( x$x , x$y ) ,
x$tvtot ,
weights = x$weight
)
# that is: what is the (weighted) relationship between
# your variable of interest (hours of television) and
# the x/y points on a grid?
# check this out!
surface( krig.fit )
# lookin' good. but this includes siberia
# # end of step 6 # #
# # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # #
# # step 7: make a grid and predict # #
library(raster)
x.range <- bb[ 1 , ]
y.range <- bb[ 2 , ]
# add five percent on each side
x.diff <- abs( x.range[ 2 ] - x.range[ 1 ] ) * 0.05
y.diff <- abs( y.range[ 2 ] - y.range[ 1 ] ) * 0.05
x.range[ 1 ] <- x.range[ 1 ] - x.diff
x.range[ 2 ] <- x.range[ 2 ] + x.diff
y.range[ 1 ] <- y.range[ 1 ] - y.diff
y.range[ 2 ] <- y.range[ 2 ] + y.diff
# choose the number of ticks (in each direction) on your grid
grid.length <- 400
# grid.length <- 600
# # note: smaller grids will render much much faster
# # (so they're better if you're just playing around)
# # but larger grids will prevent your final plot from
# # being too pixelated, even when zooming in.
# # anything beyond a 400 x 400 grid might
# # overload the RAM capacity of small computers.
# create some grid data.frame objects, one for each interpolation type
# (`expand.grid` builds the full grid.length^2 cartesian product of points)
grd <- krig.grd <-
expand.grid(
x = seq( from = x.range[1] , to = x.range[2] , length = grid.length ) ,
y = seq( from = y.range[1] , to = y.range[2] , length = grid.length )
)
# along your rectangular grid,
# what are the predicted values of
# television viewership hours?
# loop through this prediction to conserve RAM
# (predict in chunks of 20,000 rows, garbage-collecting between chunks)
for ( i in split( seq( nrow( grd ) ) , ceiling( seq( nrow( grd ) ) / 20000 ) ) ){
krig.grd[ i , 'kout' ] <- predict( krig.fit , krig.grd[ i , c( 'x' , 'y' ) ] )
gc()
}
# note: alternative prediction methods fare poorly on such a large surface,
# at least for me. but i'd love to be proven wrong. thanx
# # end of step 7 # #
# # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # #
# # step 8: ggplot and choose options # #
library(ggplot2)
# # # psa # # #
# capping your outliers might drastically change your map.
# if you find the 25th percentile and 75th percentile with
summary( krig.grd$kout )
# and then replace all `kout` values below the 25th or above the 75th
# with those capped percentile endpoints, i promise promise promise
# your maps will appear quite different. you could cap at the 25th and 75th with..
# grd.sum <- summary( krig.grd$kout )
# krig.grd[ krig.grd$kout > grd.sum[ 5 ] , 'kout' ] <- grd.sum[ 5 ]
# krig.grd[ krig.grd$kout < grd.sum[ 2 ] , 'kout' ] <- grd.sum[ 2 ]
# # # end # # #
# you don't want to cap at the 25th and 75th?
# well consider one other idea: at least cap at the 5th and 95th
# this will also increase the visible gradient ultimately plotted.
# if a numeric vector has values below the 5th percentile or above the 95th percentile, cap 'em
# winsorize a numeric vector at its own 5th and 95th percentiles:
# values outside that range are pulled in to the nearest endpoint,
# which widens the visible color gradient when the vector is plotted
minnmax.at.0595 <-
	function( z ){
		# lower and upper caps computed from the vector itself
		caps <- quantile( z , c( 0.05 , 0.95 ) )
		# clamp in one pass: raise everything to at least the 5th percentile,
		# then lower everything to at most the 95th
		pmin( pmax( z , caps[ 1 ] ) , caps[ 2 ] )
	}
# min and max all numeric values. this makes the gradient much more visible.
krig.grd$kout <- minnmax.at.0595( krig.grd$kout )
# it also (unfairly perhaps) amplifies the differences between points
# initiate the krige-based plot
# (one colored point per grid cell, shaded by the winsorized prediction)
krg.plot <-
ggplot( data = krig.grd , aes( x = x , y = y ) ) +
geom_point( data = krig.grd , aes( color = kout ) )
# view the grid
krg.plot
# initiate the entire plot
the.plot <-
krg.plot +
# blank out the legend and axis labels
theme(
legend.position = "none" ,
axis.title.x = element_blank() ,
axis.title.y = element_blank()
) +
# blank out other plot elements
# (`oob = squish` keeps out-of-bounds points at the limit instead of dropping them)
scale_x_continuous( limits = bb[ 1 , ] , breaks = NULL , oob = squish ) +
scale_y_continuous( limits = bb[ 2 , ] , breaks = NULL , oob = squish ) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
axis.ticks = element_blank()
)
# print the plot to the screen
the.plot
# # end of step 8 # #
# # # # # # # # # # #
# # # # # # # # # # #
# # step 9: blank # #
library(SpatialTools)
library(plyr)
# you have the krigged grid
# you have the map of world country borders
# you have a few maps of regions that you actually have data for
# everything outside country borders is water.
# everything inside country borders but outside of regions with data should be "missing"
# we had previously constructed the bounding box
# with `matnci.shp` plus ukraine. but we now need
# albania, kosovo, and russia added to that shape.
# standardize the projections of all shapes
# but for russia, we only need RU11 - RU15
# fix: the crosswalk's region codes carry trailing blanks (the exploration in
# step 3 had to use `str_trim( NUTS_ID ) == 'RU1x'` to match them), so comparing
# the raw `NUTS_ID` against untrimmed codes would select zero provinces.
# trimming first matches padded codes and is a no-op if they are already clean.
rusub <- subset( rusnc , str_trim( NUTS_ID ) %in% c( 'RU11' , 'RU12' , 'RU13' , 'RU14' , 'RU15' ) )
# reproject the non-eurostat shapes onto plain longitude/latitude
# so they can later be combined with the eurostat-based shape
rus.ll <- spTransform( rusub , CRS( "+proj=longlat" ) )
alb.ll <- spTransform( alb , CRS( "+proj=longlat" ) )
kos.ll <- spTransform( kos , CRS( "+proj=longlat" ) )
# keep only the NUTS identifier
# (rbind-ing Spatial*DataFrame objects requires identical attribute columns)
rus.ll@data <- rus.ll@data[ "NUTS_ID" ]