# Match Cells with their Sub-Region

Need the following files:
* list_25_75.csv - contains one row per urban center, with its coordinates rounded to the nearest grid-cell value ending in .25 or .75 (columns `lon_25_75` and `lat_25_75`)
* tasday_thresh.csv - contains the subset of the cells we are working with.

METHOD:
Use the cells from `tasday_thresh.csv` to subset `list_25_75.csv` so that we just have data from the cells we want.

In [3]:
# Load the region table and key every row by the (lon, lat) of its grid cell.
# The first CSV column is consumed as a throwaway index and then replaced by the cell coordinates.
regions = (
    pd.read_csv('list_25_75.csv', index_col=0)
    .set_index(['lon_25_75', 'lat_25_75'])
)
regions

Unnamed: 0_level_0,Unnamed: 1_level_0,ID_HDC_G0,GCPNT_LAT,GCPNT_LON,XC_NM_LST,XC_ISO_LST,GRGN_L1,GRGN_L2,UC_NM_MN,UC_NM_LST,P15
lon_25_75,lat_25_75,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
51.25,35.75,5737,35.698462,51.227700,Iran,IRN,Asia,South-Central Asia,Tehran,Tehran; Karaj; Eslamshahr; Qods; Shahriyar,1.251504e+07
3.25,6.75,2125,6.567182,3.297388,Nigeria,NGA,Africa,Western Africa,Lagos,Lagos,1.157504e+07
106.75,10.75,11800,10.851848,106.712677,Vietnam,VNM,Asia,South-Eastern Asia,Ho Chi Minh City,Ho Chi Minh City; Thu Dau Mot City; Gia Dinh; ...,1.148834e+07
77.75,13.25,8050,13.020146,77.574697,India,IND,Asia,South-Central Asia,Bengaluru,Bengaluru,1.063162e+07
116.25,23.25,12430,23.429406,116.462478,China,CHN,Asia,Eastern Asia,Jieyang,Jieyang; Shantou; Chaozhou; Puning; Chaoyang; ...,1.044518e+07
...,...,...,...,...,...,...,...,...,...,...,...
79.75,26.25,7694,26.489143,79.900656,India,IND,Asia,South-Central Asia,Rura,Rura,5.002188e+04
30.75,27.75,3958,27.991563,30.848404,Egypt,EGY,Africa,Northern Africa,Abyūhā,Abyūhā; Kawm az Zuhayr; Izbat an Nakhl; Al Sah...,5.001650e+04
68.75,22.25,6371,22.244177,68.968158,India,IND,Asia,South-Central Asia,Dwarka,Dwarka,5.001263e+04
106.25,34.75,10270,34.992450,106.211371,China,CHN,Asia,Eastern Asia,Zhangjiachuan,Zhangjiachuan,5.000717e+04


In [4]:
# Number of distinct (lon, lat) cells present in the region table
np.unique(regions.index).size

7191

In [5]:
# Build the sorted array of distinct (lon, lat) cells we want to work with
df = pd.read_csv('tasday_thresh.csv').set_index(['lon', 'lat'])
cells = np.unique(df.index)
cells.size

365

In [6]:
# Subset regions using just the cells we want.
# `cells` holds the unique (lon, lat) tuples taken from tasday_thresh.csv, so each
# lookup key is one grid cell. A cell that contains several urban centers contributes
# one row per center, which is why index values can repeat in the result.
# NOTE(review): presumably every cell in `cells` exists in `regions`; a missing cell
# would not be matched - confirm against the input files.
regions_matched = regions.loc[cells]
regions_matched

Unnamed: 0_level_0,Unnamed: 1_level_0,ID_HDC_G0,GCPNT_LAT,GCPNT_LON,XC_NM_LST,XC_ISO_LST,GRGN_L1,GRGN_L2,UC_NM_MN,UC_NM_LST,P15
lon_25_75,lat_25_75,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
-122.75,42.25,69,42.337058,-122.868137,United States,USA,Northern America,Northern America,Medford,Medford,9.101015e+04
-119.75,34.25,5,34.427664,-119.743693,United States,USA,Northern America,Northern America,Santa Barbara,Santa Barbara,1.147532e+05
-118.25,34.75,32,34.584848,-118.131720,United States,USA,Northern America,Northern America,Palmdale,Palmdale; Lancaster,2.771812e+05
-109.75,31.25,87,31.323486,-109.541843,Mexico;United States,MEX; USA,Latin America and the Caribbean,Central America,Agua Prieta,Agua Prieta,9.791077e+04
-106.75,32.25,147,32.311465,-106.769657,United States,USA,Northern America,Northern America,Las Cruces,Las Cruces,6.407468e+04
...,...,...,...,...,...,...,...,...,...,...,...
140.75,40.75,12827,40.821641,140.745742,Japan,JPN,Asia,Eastern Asia,Aomori,Aomori,1.874362e+05
141.25,43.25,12736,43.070521,141.373814,Japan,JPN,Asia,Eastern Asia,Sapporo,Sapporo; Ebetsu,1.894557e+06
141.25,43.25,12734,43.193801,141.014573,Japan,JPN,Asia,Eastern Asia,Otaru,Otaru,7.577249e+04
142.75,-3.25,13086,-3.474474,142.689366,Papua New Guinea,PNG,Oceania,Melanesia,,-,7.321599e+04


In [7]:
# One (lon, lat) tuple per row of regions_matched, used to build the 'urban_center_count' column.
# The tuples are in ascending order; a cell that appears more than once holds more than one urban center.
cells_non_unique = regions_matched.index.tolist()
cells_non_unique

[(-122.75, 42.25),
 (-119.75, 34.25),
 (-118.25, 34.75),
 (-109.75, 31.25),
 (-106.75, 32.25),
 (-106.25, 28.75),
 (-103.25, 25.75),
 (-103.25, 25.75),
 (-102.75, 20.75),
 (-99.75, 16.75),
 (-99.75, 17.75),
 (-98.75, 20.25),
 (-98.75, 20.25),
 (-98.25, 34.75),
 (-97.75, 25.75),
 (-97.25, 19.75),
 (-96.25, 42.25),
 (-91.25, 14.25),
 (-90.75, 42.75),
 (-88.25, 15.25),
 (-87.75, 42.25),
 (-85.75, 42.75),
 (-84.75, 42.75),
 (-84.25, 33.75),
 (-79.75, -1.25),
 (-79.75, -1.25),
 (-79.25, 44.25),
 (-77.75, 21.25),
 (-76.25, 4.25),
 (-75.25, 7.75),
 (-75.25, 7.75),
 (-73.75, 18.25),
 (-73.25, -40.75),
 (-73.25, 41.25),
 (-73.25, 41.25),
 (-72.75, 10.25),
 (-72.75, 46.25),
 (-71.75, -35.75),
 (-71.25, 10.25),
 (-71.25, 10.25),
 (-69.25, 9.75),
 (-65.75, -27.25),
 (-64.75, -21.75),
 (-62.25, 8.25),
 (-60.75, -32.75),
 (-58.75, -38.75),
 (-56.25, -25.75),
 (-54.25, -31.25),
 (-53.25, -26.25),
 (-51.25, -22.25),
 (-50.25, -29.75),
 (-50.25, -27.75),
 (-48.75, -2.75),
 (-47.75, -23.25),
 (-47.75, -

In [8]:
# The total number of urban centers in a cell is the number of rows of regions_matched
# that share that cell's (lon, lat) index key.
#
# Count them with a vectorized groupby/transform rather than len(regions_matched.loc[cell])
# per row:
#   * .loc[(lon, lat)] returns a single row as a Series when the index has no duplicates,
#     so len() would report the number of columns instead of the number of centers;
#   * one .loc lookup per row is needlessly quadratic.
# transform() returns one value per input row in the original row order, so the result is
# assigned positionally (.values) and no alignment on the duplicated index is required.
regions_matched['urban_center_count'] = (
    pd.Series(1, index=regions_matched.index)
    .groupby(level=['lon_25_75', 'lat_25_75'])
    .transform('sum')
    .values
)

In [9]:
# Sanity check: inspect the final 50 rows of the new column
regions_matched['urban_center_count'].tail(50)

lon_25_75  lat_25_75
119.25      35.25       2
            35.25       2
119.75      28.25       2
            28.25       2
            36.25       1
120.75      16.25       1
            40.75       2
            40.75       2
121.25      16.75       1
            41.75       1
121.75      36.75       1
122.25      6.75        1
            37.25       2
            37.25       2
            39.75       1
            41.25       2
            41.25       2
122.75      0.75        1
            45.75       1
124.25      42.75       2
            42.75       2
124.75      8.25        1
            45.25       1
125.75      38.75       4
            38.75       4
            38.75       4
            38.75       4
            40.25       1
            41.75       2
            41.75       2
            47.25       1
126.75      46.75       1
127.25      45.75       1
129.25      35.25       3
            35.25       3
            35.25       3
            40.75       2
            40.75

In [10]:
# Sorted array of the distinct GRGN_L2 sub-region names among the matched rows
np.unique(regions_matched['GRGN_L2'])

array(['Caribbean', 'Central America', 'Eastern Africa', 'Eastern Asia',
       'Eastern Europe', 'Melanesia', 'Middle Africa', 'Northern Africa',
       'Northern America', 'Northern Europe', 'South America',
       'South-Central Asia', 'South-Eastern Asia', 'Southern Europe',
       'Western Africa', 'Western Asia', 'Western Europe'], dtype=object)

In [11]:
# Group the matched rows by sub-region; each row is one urban center,
# so counting rows per group gives the number of urban centers in each region
grouped_by_region = regions_matched.groupby('GRGN_L2')
grouped_by_region['urban_center_count'].count()

GRGN_L2
Caribbean               2
Central America        13
Eastern Africa         46
Eastern Asia          110
Eastern Europe         23
Melanesia               2
Middle Africa          27
Northern Africa        22
Northern America       15
Northern Europe        15
South America          35
South-Central Asia    205
South-Eastern Asia     34
Southern Europe        16
Western Africa         52
Western Asia           26
Western Europe          7
Name: urban_center_count, dtype: int64

In [12]:
# Mean of 'urban_center_count' over the urban-center rows of each region.
# NOTE(review): this averages over rows, not over cells - a cell holding k centers
# contributes k rows with the value k, so the figure is weighted toward crowded cells
# and is not the plain "centers per cell" average (centers / distinct cells).
# The per-cell count also includes any centers in the same cell that belong to another region.
grouped_by_region['urban_center_count'].mean()

GRGN_L2
Caribbean             1.000000
Central America       1.307692
Eastern Africa        3.043478
Eastern Asia          2.545455
Eastern Europe        1.000000
Melanesia             1.000000
Middle Africa         1.888889
Northern Africa       2.090909
Northern America      1.133333
Northern Europe       3.133333
South America         1.228571
South-Central Asia    4.707317
South-Eastern Asia    1.823529
Southern Europe       1.875000
Western Africa        3.500000
Western Asia          1.846154
Western Europe        1.571429
Name: urban_center_count, dtype: float64

In [14]:
# Persist the matched table, including the new 'urban_center_count' column, for later use
regions_matched.to_csv(path_or_buf='regions_matched.csv')