In [86]:
import pandas as pd
import numpy as np
import re
df_bla = pd.read_excel('assets/prepared/BLA.xlsx')
df_bla04 = pd.read_excel('assets/prepared/BLA_04.xlsx')
df_bla16 = pd.read_excel('assets/prepared/BLA_16.xlsx')

#df_bla drop year, month
df_bla = df_bla.drop(columns=['YEAR', 'MONTH'])
#df_bla04 drop year, month
df_bla04 = df_bla04.drop(columns=['YEAR', 'MONTH'])
#df_bla16 drop year, month
df_bla16 = df_bla16.drop(columns=['YEAR', 'MONTH'])

# SET TO IGNORE WARNINGS


In [93]:
#outter join them on time_period all and frequency
df_bla_merged = df_bla.merge(df_bla04, on=['time_period', 'FREQ'], how='outer', suffixes=('', '_04'))
df_bla_merged = df_bla_merged.merge(df_bla16, on=['time_period', 'FREQ'], how='outer', suffixes=('', '_16'))

# change column position as
new_order = ['FREQ', 'time_period', 'REGION', 'CATEGORY', 'LICENCES', 'AREA', 'VOLUME', 'NUMBER',
       'ROOMS', 'NEW_DWELLINGS_VOLUME', 'SURFACE', 'IMPROVEMENTS_VOLUME',
        'TOTAL_NUMBER', 'TOTAL_VOLUME', 'URBAN_NUMBER',
       'URBAN_VOLUME', 'SEMI_URBAN_NUMBER', 'SEMI_URBAN_VOLUME',
       'RURAL_NUMBER', 'RURAL_VOLUME']
df_bla_merged = df_bla_merged[new_order]

df_bla_merged = df_bla_merged.astype('object')  # upcast once, no warnings later

# prepend 4 empty rows
empty_rows = pd.DataFrame([[pd.NA]*len(df_bla_merged.columns)]*4,
                          columns=df_bla_merged.columns, dtype='object')
df_bla_merged = pd.concat([empty_rows, df_bla_merged], ignore_index=True)

# write UNIT header row
df_bla_merged.iloc[0, 0] = 'UNIT'

#list of values of the row of unit
unit_values = ['_Z', '_Z', '_Z', 'N', 'A', 'V', 'N', 'N', 'V', 'A', 'V', 'N', 'V', 'N', 'V', 'N', 'V', 'N', 'V']

df_bla_merged.iloc[0, 1:] = unit_values

df_bla_merged.iloc[1, 0] = 'DWELLINGS'

dwellings_values = ['_Z', '_Z', '_Z', '_Z', '_Z', '_Z',  'D', 'D', 'D', 'D', 'I', '_Z', '_Z', '_Z', '_Z', '_Z', '_Z', '_Z', '_Z']

df_bla_merged.iloc[1, 1:] = dwellings_values

df_bla_merged.iloc[2, 0] = 'URBAN STATUS'

urban_status_values = ['_Z', '_Z', '_Z', '_Z', '_Z', '_Z', '_Z', '_Z', '_Z', '_Z', '_Z', 'ALL', 'ALL', 'URBAN', 'URBAN', 'SEMI_URBAN', 'SEMI_URBAN', 'RURAL', 'RURAL']

df_bla_merged.iloc[2, 1:] = urban_status_values

df_bla_merged.iloc[3, 0] = 'MEASURE'

measure_values = ['_Z', '_Z', '_Z', 'NR', 'M2', 'M3', 'NR', 'NR', 'M3', 'M2', 'M3', 'NR', 'M3', 'NR', 'M3', 'NR', 'M3', 'NR', 'M3']

df_bla_merged.iloc[3, 1:] = measure_values


# 1) ensure object dtype (avoid future dtype warnings)
df_bla_merged[['REGION', 'CATEGORY']] = df_bla_merged[['REGION', 'CATEGORY']].astype('object')

# 2) light normalization: trim, collapse spaces, unify dashes, uppercase for matching
def _norm(s):
    if pd.isna(s): 
        return s
    s = str(s)
    s = s.replace('\u2013', '-').replace('\u2014', '-')   # en/em dash -> hyphen
    s = re.sub(r'\s+', ' ', s).strip()
    return s

REGION_N   = df_bla_merged['REGION'].map(_norm).str.upper()
CATEGORY_N = df_bla_merged['CATEGORY'].map(_norm).str.upper()

# 3) domain of existing REGION labels (if REGION stores codes/names)
region_set = set(REGION_N.dropna()[REGION_N.ne('_Z')].unique())

# 4) treat any "regional-looking" CATEGORY as region (case-insensitive):
#    matches 'REGIONAL', 'REGIONAL UNIT', 'REGION OF' anywhere in the string
regional_like = CATEGORY_N.str.contains(r'\b(REGIONAL|REGIONAL UNIT|REGION OF)\b', na=False)

# 5) final mask: CATEGORY is non-empty & either equals a known REGION label or looks regional
mask = CATEGORY_N.notna() & CATEGORY_N.ne('_Z') & (CATEGORY_N.isin(region_set) | regional_like)

# OPTIONAL: only overwrite REGION if it is empty/_Z
# mask &= REGION_N.isna() | REGION_N.eq('_Z')

# 6) move and zero-out
# use the normalized original CATEGORY text (not uppercased) for the move
df_bla_merged.loc[mask, 'REGION'] = df_bla_merged.loc[mask, 'CATEGORY'].map(_norm)
df_bla_merged.loc[mask, 'CATEGORY'] = '_Z'



#fill in NaN region and Category with _Z
df_bla_merged['REGION'] = df_bla_merged['REGION'].fillna('_Z')
df_bla_merged['CATEGORY'] = df_bla_merged['CATEGORY'].fillna('_Z')

# 1) ensure object dtype for safe string ops
df_bla_merged[['REGION']] = df_bla_merged[['REGION']].astype('object')

# 2) insert REGIONAL_UNIT right after REGION (if not already present)
reg_idx = df_bla_merged.columns.get_loc('REGION')
if 'REGIONAL_UNIT' not in df_bla_merged.columns:
    df_bla_merged.insert(reg_idx + 1, 'REGIONAL_UNIT', pd.NA)
df_bla_merged['REGIONAL_UNIT'] = df_bla_merged['REGIONAL_UNIT'].astype('object')

# 3) detect "regional unit" values currently living in REGION
mask = (
    df_bla_merged['REGION'].notna() &
    df_bla_merged['REGION'].astype(str).str.contains(r'\bUNIT\b', case=False, na=False)
)

# 4) move value to REGIONAL_UNIT and set REGION to '_Z'
df_bla_merged.loc[mask, 'REGIONAL_UNIT'] = df_bla_merged.loc[mask, 'REGION']
df_bla_merged.loc[mask, 'REGION'] = '_Z'

df_bla_merged.loc[mask, 'REGIONAL_UNIT'] = (
    df_bla_merged.loc[mask, 'REGIONAL_UNIT']
      .astype(str)
      .str.replace(r'^\s*REGIONAL\s+UNIT\s+OF\s+', '', regex=True, case=False)
      .str.strip()
)

#fill in <NA> in REGIONAL_UNIT AS _Z
df_bla_merged['REGIONAL_UNIT'] = df_bla_merged['REGIONAL_UNIT'].fillna('_Z')

df_bla_merged.head(10)



  regional_like = CATEGORY_N.str.contains(r'\b(REGIONAL|REGIONAL UNIT|REGION OF)\b', na=False)


Unnamed: 0,FREQ,time_period,REGION,REGIONAL_UNIT,CATEGORY,LICENCES,AREA,VOLUME,NUMBER,ROOMS,...,SURFACE,IMPROVEMENTS_VOLUME,TOTAL_NUMBER,TOTAL_VOLUME,URBAN_NUMBER,URBAN_VOLUME,SEMI_URBAN_NUMBER,SEMI_URBAN_VOLUME,RURAL_NUMBER,RURAL_VOLUME
0,UNIT,_Z,_Z,_Z,_Z,N,A,V,N,N,...,A,V,N,V,N,V,N,V,N,V
1,DWELLINGS,_Z,_Z,_Z,_Z,_Z,_Z,_Z,D,D,...,D,I,_Z,_Z,_Z,_Z,_Z,_Z,_Z,_Z
2,URBAN STATUS,_Z,_Z,_Z,_Z,_Z,_Z,_Z,_Z,_Z,...,_Z,_Z,ALL,ALL,URBAN,URBAN,SEMI_URBAN,SEMI_URBAN,RURAL,RURAL
3,MEASURE,_Z,_Z,_Z,_Z,NR,M2,M3,NR,NR,...,M2,M3,NR,M3,NR,M3,NR,M3,NR,M3
4,A,2007,_Z,_Z,_Z,79407,20582961,77850009,,,...,,,,,,,,,,
5,M,2007-M01,_Z,_Z,_Z,5840,1381686,5062775,,,...,,,,,,,,,,
6,M,2007-M02,_Z,_Z,_Z,6203,1516375,5675738,,,...,,,,,,,,,,
7,M,2007-M03,_Z,_Z,_Z,7563,1888152,7019056,,,...,,,,,,,,,,
8,M,2007-M04,_Z,_Z,_Z,6348,1582101,5825110,,,...,,,,,,,,,,
9,M,2007-M05,_Z,_Z,_Z,7416,1858219,6997199,,,...,,,,,,,,,,


In [91]:
# PRINT UNIQUE REGION
unique_regions = df_bla_merged['REGION'].unique()
print("Unique regions:")
print(unique_regions)

Unique regions:
['_Z' 'Greece, total' 'REGION OF ATTIKI'
 'REGION OF ANATOLIKI MAKEDONIA,THRAKI' 'REGION OF KENTRIKI MAKEDONIA'
 'REGION OF DYTIKI MAKEDONIA' 'REGION OF IPEIROS' 'REGION OF THESSALIA'
 'REGION OF STEREA ELLADA' 'REGION OF IONIA NISIA'
 'REGION OF DYTIKI ELLADA' 'REGION OF PELOPONNISOS'
 'REGION OF VOREIO AIGAIO' 'REGION OF NOTIO AIGAIO' 'REGION OF KRITI']


In [92]:
# print unique categories
unique_categories = df_bla_merged['CATEGORY'].unique()
print("Unique categories:")
print(unique_categories)

Unique categories:
['_Z' 'Unspecified' 'Manufacturing' 'Agricultural' 'Offices' 'Educational'
 'Commercial' 'Short stay accommodation (rooms for rent)' 'Livestock'
 'Other' 'Residences for communities' 'Hotels' 'Health care']


In [94]:
# print uniq regional units
print(df_bla_merged['REGIONAL_UNIT'].unique())  

['_Z' 'RODOPI' 'DRAMA' 'EVROS' 'KAVALA' 'XANTHI' 'THESSALONIKI' 'IMATHIA'
 'KILKIS' 'PELLA' 'SERRES' 'KOZANI' 'IOANNINA' 'ARTA' 'THESPROTIA'
 'PREVEZA' 'LARISA' 'KARDITSA' 'MAGNISIA' 'SPORADES' 'TRIKALA' 'VOIOTIA'
 'EVOIA' 'FOKIDA' 'KERKYRA' 'ZAKYNTHOS' 'KEFALLINIA' 'LEFKADA' 'ACHAIA'
 'ETOLOAKARNANIA' 'ILEIA' 'ARKADIA' 'ARGOLIDA' 'KORINTHIA' 'LAKONIA'
 'MESSINIA' 'KENTRIKOS TOMEAS ATHINON (CENTRAL SECTOR OF ATHENS)'
 'VOREIOS TOMEAS ATHINON (NORTH SECTOR OF ATHENS)'
 'NOTIOS TOMEAS ATHINON (SOUTH SECTOR OF ATHENS)' 'ANATOLIKI ATTIKI'
 'DYTIKI ATTIKI' 'PIREAS' 'NISIA' 'LIMNOS' 'SAMOS' 'CHIOS' 'THIRA'
 'KALYMNOS' 'KEA - KYTHNOS' 'KOS' 'MILOS' 'MYKONOS' 'NAXOS' 'PAROS'
 'RODOS' 'IRAKLEIO' 'LASITHI' 'RETHYMNO' 'CHANIA'
 'KENTRIKOS TOMEAS ATHINON\n (CENTRAL SECTOR OF ATHENS)'
 'VOREIOS TOMEAS ATHINON\n (NORTH SECTOR OF ATHENS)'
 'DYTIKOS TOMEAS ATHINON\n (WESTERN SECTOR OF ATHENS)'
 'NOTIOS TOMEAS ATHINON\n (SOUTH SECTOR OF ATHENS)' 'NISIA (ISLANDS)'
 'THASOS' 'PIERIA' 'CHALKIDIKI' 'GREVEN