# Imports

## Libraries

In [134]:
import geopandas as gpd
import pandas as pd
import warnings
# Blanket filter: with no category/module arguments this silences every
# warning for the rest of the session, so nothing raised by pandas or
# geopandas in later cells will be shown.
# NOTE(review): consider narrowing this to specific warning categories.
warnings.filterwarnings('ignore')

## Data

In [115]:
# Load the province geometries (GeoJSON) and the tabular province attributes (CSV).
prov_gdf = gpd.read_file("../data/it_provinces_shapes.geojson", driver='geojson')
# skipfooter drops the last line of the CSV. It is only implemented by the
# Python parser, so the engine is named explicitly instead of relying on
# pandas' fallback from the C engine (which also raises a ParserWarning).
provinces_df = pd.read_csv("../data/provinces.csv", skipfooter=1, engine='python')

# Fixing dataframes

## Incorrect Value

In [116]:
# Set the code of the row whose capital is Naples to 'NA' by hand.
# NOTE(review): presumably read_csv parsed the literal "NA" as a missing
# value, which is why this row needs repairing — confirm against the CSV.
naples_location = provinces_df[provinces_df['Capital'] == "Naples"].index[0]
# One label-based .loc write on provinces_df itself. The chained form
# provinces_df.Code[...] = ... can assign to a temporary copy and is a
# no-op under pandas Copy-on-Write.
provinces_df.loc[naples_location, 'Code'] = 'NA'

In [117]:
# Checking that all codes are 2-letter strings.
# Non-string entries are reported by their position in code_list and then
# skipped, so len() is never called on a value that does not support it.
# Any string whose length is not exactly 2 is printed.
code_list = list(provinces_df.Code.unique())
for idx, x in enumerate(code_list):
    if not isinstance(x, str):
        print(idx)
        continue
    if len(x) != 2:
        print(x)

## Feature Engineering

In [118]:
# Build the key used to combine prov_gdf with provinces_df: the last two
# characters of each value in the iso_3166_2 column.
prov_gdf['Code'] = prov_gdf['iso_3166_2'].map(lambda iso_code: iso_code[-2:])

## Datatype Correction

In [119]:
# Remove the comma separators from the raw population strings, then store
# the values as integers in a new 'Population' column.
population_text = provinces_df['Population (2019)[3]'].str.replace(',', '')
provinces_df['Population'] = population_text.astype(int)

## Dropping Unnecessary Columns

### From prov_gdf

In [120]:
# Columns with a single distinct value carry no information; drop them along
# with three further named columns (iso_3166_2 has already been used to
# derive 'Code').
constant_cols = [col for col in prov_gdf.columns if prov_gdf[col].nunique() == 1]
unnecessary_cols = constant_cols + ['adm1_code', 'iso_3166_2', 'gns_region']
prov_gdf.drop(columns=unnecessary_cols, inplace=True)

### From provinces_df

In [121]:
# Attribute columns that are not carried into the merged dataset.
dropped_attributes = ['Type', 'Capital', 'Comuni', 'President']
provinces_df.drop(columns=dropped_attributes, inplace=True)

## Sardinian regions

Due to a consolidation of the provinces in 2014, Sardinia went from 8 provinces to 4. The geodataframe still represents the pre-2014 structure. To correct for this, I will combine the provinces' geometries where possible (as in the case of "Olbia-Tempio", which was consolidated into "Sassari"). This only runs into an issue for the current provinces of Cagliari and South Sardinia, since their boundaries also changed in 2014. Therefore, I will combine the pre-2014 provinces that make up these two provinces and treat them together as a single "South_Sardinia" province.

### Changing prov_gdf (geometries)

In [122]:
# Positional rows 64 through 71: the slice this notebook treats as the
# Sardinian entries of prov_gdf.
sard_geo = prov_gdf.iloc[64:72]

In [123]:
# Relabel the Sardinian rows so that rows meant to be merged share a Code:
# rows 64, 69, 70 and 71 become 'South_Sardinia'; rows 65 and 67 become 'Sass'.
# .loc performs a single label-based write on prov_gdf itself. The chained
# form prov_gdf['Code'][i] = ... may write to a temporary copy and is a no-op
# under pandas Copy-on-Write.
prov_gdf.loc[[64, 69, 70, 71], 'Code'] = 'South_Sardinia'
prov_gdf.loc[[65, 67], 'Code'] = 'Sass'

In [124]:
# Collapse prov_gdf to one row per Code, merging the geometries of rows that
# share the same Code.
prov_gdf2 = prov_gdf.dissolve(by="Code")

### Changing provinces_df (populations)

In [125]:
# Combined population of the two rows that are merged into "South_Sardinia":
# row 17 (relabelled 'South_Sardinia' further down) and row 87 (the
# "South Sardinia" row, which is dropped further down).
# Row 83 must not be used here: it is the row relabelled 'Sass', so adding
# it would count the wrong province.
south_sard_new_population = provinces_df.loc[[17, 87], 'Population'].sum()

In [126]:
# Display positional rows 85 through 88 of provinces_df.
provinces_df.iloc[85:89]

Unnamed: 0,Province,Code,Region,Macroregion,Population (2019)[3],Area(km²),Density(p/km²),Population
85,Siena,SI,Tuscany,Centre,267197,3823,71,267197
86,Sondrio,SO,Lombardy,North-West,181095,3210,57,181095
87,South Sardinia,SU,Sardinia,Insular,350725,6530,54,350725
88,South Tyrol,BZ,Trentino-South Tyrol,North-East,531178,7400,69,531178


In [127]:
## intended new population for south_sardinia is 431038 + 350725 = 781763
# Row 17 becomes the combined 'South_Sardinia' entry, row 83 takes the 'Sass'
# code used in prov_gdf, and row 87 is removed.
# Every write goes through .loc so it lands on provinces_df itself rather than
# on a temporary produced by chained indexing (provinces_df['Code'][i] = ...).
provinces_df.loc[17, 'Code'] = 'South_Sardinia'
provinces_df.loc[83, 'Code'] = 'Sass'
provinces_df.drop(index=87, inplace=True)
provinces_df.loc[17, 'Population'] = south_sard_new_population

#### Merging Geometries

In [128]:
# Rebuild prov_gdf2: one row per Code, with the geometries of rows sharing a
# Code merged together.
prov_gdf2 = prov_gdf.dissolve(by="Code")

## Merging the 2 Dataframes

In [129]:
# Show the columns left after the dissolve. 'Code' is not among them because
# dissolve(by='Code') uses it as the index of the result.
prov_gdf2.columns

Index(['geometry', 'name'], dtype='object')

In [130]:
# Row and column counts of both tables, printed before they are joined.
for frame in (provinces_df, prov_gdf2):
    print(frame.shape)

(106, 8)
(106, 2)


In [131]:
# Inner join on 'Code': only codes present in both tables are kept.
provinces_data_gdf = prov_gdf2.merge(
    provinces_df,
    how='inner',
    on='Code',
)
# Displayed so the row count can be compared with the two inputs.
provinces_data_gdf.shape

(106, 10)

# Saving DataFrame

In [133]:
# Write the merged GeoDataFrame to disk as GeoJSON.
output_path = "../data/provinces_gdf.geojson"
provinces_data_gdf.to_file(output_path, driver='GeoJSON')