In [1]:
import numpy as np
import pandas as pd

In [130]:
# Load the 2020 workforce table (the file is read as Latin-1 text).
df = pd.read_csv("data/region/workforce_2020.csv", encoding='latin1')
# Remember the original column labels as a plain list.
header = df.columns.tolist()
# Coerce every column after the first to numeric; cells that cannot be parsed become NaN.
df.iloc[:, 1:] = df.iloc[:, 1:].apply(lambda column: pd.to_numeric(column, errors='coerce'))
# Postal code reduction: entries that start with two digits are cut down to those two digits.
def process_col(entry):
    """Shorten a postal-code label to its two-character numeric prefix.

    Args:
        entry: Label text; it must support slicing into a ``str``.

    Returns:
        The first two characters when they consist only of digits,
        otherwise ``entry`` unchanged.
    """
    prefix = entry[:2]
    return str(prefix) if prefix.isdigit() else entry
# Replace each label in the postal column with the result of process_col.
postal_column = 'Postal code area'
df[postal_column] = df[postal_column].apply(process_col)
# Lookup table from a two-digit postal prefix to the name of the region it is grouped under.
postal_map = {
    # NOTE(review): process_col only shortens labels that start with two digits,
    # so this key is hit only by a literal "WH" entry — confirm it is still needed.
    "WH": "WHOLE COUNTRY",
    "00": "Helsinki",
    "01": "Helsinki",
    "02": "Helsinki",
    "03": "Helsinki",
    "04": "Helsinki",
    "05": "Helsinki",
    "06": "Helsinki",
    "07": "Helsinki",
    "08": "Helsinki",
    "09": "Helsinki",
    "10": "Helsinki",
    "11": "Hämeenlinna",
    "12": "Hämeenlinna",
    "13": "Hämeenlinna",
    "14": "Hämeenlinna",
    "15": "Lahti",
    "16": "Lahti",
    "17": "Lahti",
    "18": "Lahti",
    "19": "Lahti",
    "20": "Turku",
    "21": "Turku",
    "22": "Turku",
    "23": "Uusikaupunki",
    "24": "Turku",
    "25": "Turku",
    "26": "Turku",
    "27": "Turku",
    "28": "Pori",
    "29": "Pori",
    "30": "Forssa",
    "31": "Forssa",
    "32": "Forssa",
    "33": "Tampere",
    "34": "Tampere",
    "35": "Tampere",
    "36": "Tampere",
    "37": "Tampere",
    "38": "Tampere",
    "39": "Tampere",
    "40": "Jyväskylä",
    "41": "Jyväskylä",
    "42": "Jyväskylä",
    "43": "Jyväskylä",
    "44": "Jyväskylä",
    "45": "Kouvola",
    "46": "Kouvola",
    "47": "Kouvola",
    "48": "Kotka",
    "49": "Kotka",
    "50": "Mikkeli",
    "51": "Mikkeli",
    "52": "Mikkeli",
    "53": "Lappeenranta",
    "54": "Lappeenranta",
    "55": "Lappeenranta",
    "56": "Lappeenranta",
    "57": "Savonlinna",
    "58": "Savonlinna",
    "59": "Savonlinna",
    "60": "Seinäjoki",
    "61": "Seinäjoki",
    "62": "Seinäjoki",
    "63": "Seinäjoki",
    "64": "Seinäjoki",
    "65": "Vaasa",
    "66": "Vaasa",
    "67": "Kokkola",
    "68": "Kokkola",
    "69": "Kokkola",
    "70": "Kuopio",
    "71": "Kuopio",
    "72": "Kuopio",
    "73": "Kuopio",
    "74": "Kuopio",
    "75": "Kuopio",
    "76": "Pieksämäki",
    "77": "Pieksämäki",
    "78": "Pieksämäki",
    "79": "Pieksämäki",
    "80": "Joensuu",
    "81": "Joensuu",
    "82": "Joensuu",
    "83": "Joensuu",
    "84": "Ylivieska",
    "85": "Ylivieska",
    "86": "Ylivieska",
    "87": "Kajaani",
    "88": "Kajaani",
    "89": "Kajaani",
    "90": "Oulu",
    "91": "Oulu",
    "92": "Oulu",
    "93": "Oulu",
    "94": "Kemi",
    "95": "Kemi",
    "96": "Rovaniemi",
    "97": "Rovaniemi",
    "98": "Rovaniemi",
    "99": "Rovaniemi"
}


def postal_mapping(entry):
    """Translate a postal prefix into its region name.

    Args:
        entry: A label to look up, typically a two-digit prefix.

    Returns:
        The region name stored in ``postal_map`` when ``entry`` is one of
        its keys; otherwise ``entry`` itself, unchanged.
    """
    try:
        # Direct dictionary lookup; falls back to the label itself when it is not a key.
        return postal_map.get(entry, entry)
    except TypeError:
        # Unhashable values can never be dictionary keys, so they pass through untouched.
        return entry
# Swap every postal prefix for its region name; labels without a mapping stay as they are.
df['Postal code area'] = df['Postal code area'].apply(postal_mapping)
# Treat missing values as zero.
df = df.fillna(0)
# NOTE(review): after the fill above no cell is NaN, so this how='all' drop
# looks like a no-op — confirm whether it was meant to run before the fill.
cleaned_df = df.dropna(how='all')
data = np.array(cleaned_df)

# Collapse all rows that share a region label into one summed row per region.
values = data[:, 1:].astype(int)
unique_values, inverse_indices = np.unique(data[:, 0], return_inverse=True)
sums_array = np.zeros((unique_values.shape[0], values.shape[1]), dtype=int)
# Unbuffered add: rows that point at the same region index accumulate instead of overwriting.
np.add.at(sums_array, inverse_indices, values)
# Region label in the first column, summed counts in the remaining ones.
results_with_names = np.column_stack((unique_values, sums_array))
summed_data = results_with_names.copy()

# Calculate each column's share of the first numeric column within a region
def cal_percentage(dataset):
    """Express each row's numeric columns as fractions of its first numeric column.

    Args:
        dataset: Iterable of rows. Element 0 of a row is its label and
            element 1 is the value every number in the row is divided by.

    Returns:
        A list of lists, one per input row: the label followed by
        ``value / row[1]`` for every element from index 1 onward. When
        ``row[1]`` equals 0, every fraction in that row is reported as 1.0.
    """
    results = []
    for row in dataset:
        label, base = row[0], row[1]
        numbers = row[1:]
        if base == 0:
            # A zero base cannot be divided by; the whole row is filled with 1.0.
            shares = [1.0 for _ in numbers]
        else:
            shares = [number / base for number in numbers]
        results.append([label, *shares])
    return results
# Turn the per-region sums into shares of each region's first numeric column.
percentages = cal_percentage(summed_data)
# Put the column labels on top of the share rows in a single array.
# NOTE(review): stacking text labels with numbers presumably yields a text-typed array — confirm.
final = np.vstack([header, percentages])
# Rebuild a table: the first stacked row supplies the column names, the rest are the data rows.
df = pd.DataFrame(data=final[1:], columns=final[0])
file_path = "data/region/workforce_2020_percentage_region.csv"
# Write the table as comma-separated text without the row index.
df.to_csv(file_path, index=False, sep=',')