<a href="https://colab.research.google.com/github/cimbelli/LAU/blob/main/IT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download and check LAU files for Italy

In [112]:
### IMPORT LIBRARIES

import pandas as pd
import os
import zipfile
import glob
import shutil
import numpy as np

year = 2021

In [113]:
def extract(urlzipfile, cond, name):
  
  zipfilename = os.path.basename(urlzipfile)
  outfile = ''
  if not os.path.isfile(zipfilename):
    !wget $urlzipfile
  #print(os.path.splitext(zipfilename)[-1])
  if os.path.splitext(zipfilename)[-1] == '.zip': 
    zip = zipfile.ZipFile(zipfilename)

    for file in zip.namelist():
        #print(os.path.basename(file))
        if file.endswith(cond):
          if not file:
                continue
          source = zip.open(file)
          #target = open(os.path.basename(file), "wb")
          ext = os.path.splitext(cond)[-1]
          outfile = name + '_' + str(year) + ext
          target = open(outfile, "wb")
          with source, target:
              shutil.copyfileobj(source, target)
    zip.close()
    os.remove(zipfilename)
  else:
    outfile = name + '_' + str(year) + os.path.splitext(zipfilename)[-1] 
    os.rename(zipfilename, outfile)
  
  return outfile

def openfile(filename, cols):
  ext = os.path.splitext(filename)[-1] 
  if ext == '.xls' or ext == '.xlsx':
    df = pd.read_excel(filename, sheet_name=0, usecols = cols)
  if ext == '.csv':
    df = pd.read_csv(filename, sep=';', encoding="utf-8", quotechar='"',usecols = cols, skiprows=1, index_col=False,  dtype={population_code: object}).dropna()
  return df

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
### FILES URL and NAMES
yrange = ''
if year in range (2022, 2024):
  yrange = '2022-2023' 
elif year in range (2017, 2022):
   yrange = '2017-2021'
else:
  print("year not found")


nomenclature_zipfile = 'https://www.istat.it/storage/codici-unita-amministrative/Archivio-elenco-comuni-codici-e-denominazioni_Anni_' + yrange + '.zip'
nomenclature_cond = '31_12_' + str(year) + '.xls'
nomenclature_code = 'Codice Comune formato alfanumerico'
nomenclature_name = 'Denominazione in italiano'
nomenclature_nuts3 = 'Codice NUTS3 2021'
nomenclature_cols = [nomenclature_code,nomenclature_name,nomenclature_nuts3]

population_zipfile = 'https://demo.istat.it/data/p2/P2_' + str(year) + '_it_Comuni.zip'
population_cond = 'P2_'  + str(year) + '_it_Comuni.csv'
population_code = 'Codice comune'
population_name = 'Comune'
population_1stJan = 'Popolazione al 1° gennaio - Totale'
population_cols = [population_code, population_name, population_1stJan]

classifications_zipfile = 'https://www.istat.it/it/files//2015/04/Classificazioni-statistiche-Anni_' + yrange + '.zip'
classifications_cond = '31_12_' + str(year) + '.xls'
classifications_code = 'Codice Istat del Comune \n(numerico)'
classifications_name = 'Denominazione (Italiana e straniera)'
classifications_name2 = 'Denominazione altra lingua'
classifications_area = 'Superficie territoriale (kmq) al 01/01/' + str(year)
classifications_coast = 'Zone costiere'
classifications_deg = 'Grado di urbanizzazione'
classifications_cols = [classifications_code,classifications_name,classifications_name2,classifications_area,classifications_coast,classifications_deg]

cityfua_file = 'https://www.istat.it/it/files//2019/11/Elenco-city-e-composizione-FUA-31-12-2020.xlsx'
cityfua_cond = cityfua_file[-9:]
cityfua_code = 'Procom numerico'
cityfua_citycode = 'Codice City'
cityfua_cityname = 'Denominazione City'
cityfua_fuacode = 'Codice FUA'
cityfua_fuaname = 'Denominazione FUA'
cityfua_cols = [cityfua_code,cityfua_citycode,cityfua_cityname,cityfua_fuacode,cityfua_fuaname]

lau_code = "LAU CODE"
change = "CHANGE (Y/N)"

# field mapping
columns={nomenclature_nuts3: "NUTS 3 CODE",
         lau_code: "LAU CODE",
         population_name: "LAU NAME NATIONAL",
         classifications_name2: "LAU NAME LATIN",
         change: "CHANGE (Y/N)",
         population_1stJan: "POPULATION",
         classifications_area: "TOTAL AREA (m2)",
         classifications_deg: "DEGURBA",
         classifications_coast: "COASTAL AREA (yes/no)",
         cityfua_citycode: "CITY_ID",
         cityfua_cityname: "CITY_ID",
         cityfua_fuacode: "FUA_ID",
         cityfua_fuaname: "FUA_NAME"
         }

country ='IT'
outname = 'NUTS_LAU_A_IT_' + str(year) + '_0000_V0001.xlsx'

In [None]:
### FILES DOWNLOAD and RENAME

%%capture test
nomenclature_file = extract(nomenclature_zipfile, nomenclature_cond,'nomenclature')
population_file = extract(population_zipfile, population_cond,'population')
classifications_file = extract(classifications_zipfile, classifications_cond,'classifications')
cityfua_file = extract(cityfua_file, cityfua_cond,'cityfua')


In [114]:
### FILES OPEN
nomenclature = openfile(nomenclature_file, nomenclature_cols)
population = openfile(population_file, population_cols)
classifications = openfile(classifications_file,classifications_cols)
cityfua = openfile(cityfua_file, cityfua_cols)

# check LAU file of the previous year
prev_file = 'NUTS_LAU_A_IT_' + str(year-1) + '_0000_V0001.xlsx'
if os.path.exists(prev_file):
  previous = openfile(prev_file)

population[lau_code] = population[population_code].astype(str)
population[population_code] = pd.to_numeric(population[population_code])
population[population_1stJan] = population[population_1stJan].astype(int)

In [None]:
### 1st JOIN
df = pd.merge(nomenclature,population,left_on=nomenclature_code, right_on=population_code)
df = df[[nomenclature_nuts3,lau_code, population_code, population_name, population_1stJan]]

### 2nd JOIN
df1 = pd.merge(df,classifications,left_on=population_code, right_on=classifications_code)
df1 = df1[[nomenclature_nuts3,lau_code, population_code, population_name,classifications_name2, population_1stJan, classifications_area,classifications_deg,classifications_coast, ]]

### 3rd JOIN
df2 = pd.merge(df1,cityfua,left_on=population_code, how='left', right_on=cityfua_code)
df2[change]= ''
df2 = df2[[nomenclature_nuts3,lau_code, population_name,classifications_name2, population_1stJan, change,classifications_area,classifications_deg,
           classifications_coast, cityfua_citycode,cityfua_cityname,cityfua_fuacode,cityfua_fuaname]]

# some value adjustments
df2 = df2.replace('N.d.', 0)
df2[classifications_area] = df2[classifications_area].astype(float)*1000000
df2[classifications_area] = df2[classifications_area].astype(int)

df2.loc[df2[classifications_coast] == 1, classifications_coast] = 'yes'
df2.loc[df2[classifications_coast] == 0, classifications_coast] = 'no'

df2 = df2.fillna('')
df2.replace(np.nan,'',regex=True) 
df2 = df2.replace(0,'')

# output
out = df2.rename(columns=columns)
out.to_excel(outname, sheet_name = country, index=False)

In [None]:
out.head()

Unnamed: 0,NUTS 3 CODE,LAU CODE,LAU NAME NATIONAL,LAU NAME LATIN,POPULATION,CHANGE (Y/N),TOTAL AREA (m2),DEGURBA,COASTAL AREA (yes/no),CITY_ID,CITY_ID.1,FUA_ID,FUA_NAME
0,ITC11,1001,Agliè,,2545,,13146200,2,no,,,,
1,ITC11,1002,Airasca,,3633,,15739300,3,no,0.0,0.0,IT004F,Torino
2,ITC11,1003,Ala di Stura,,459,,46331500,3,no,,,,
3,ITC11,1004,Albiano d'Ivrea,,1638,,11731400,3,no,,,,
4,ITC11,1006,Almese,,6355,,17875600,2,no,0.0,0.0,IT004F,Torino
