In [1]:

from os.path import isdir
from os import makedirs

import re
import csv
import math

import csv

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import json
import requests

import plotly.io as pio

pio.templates[pio.templates.default].layout['font']["family"] = "Latin Modern Roman"
pio.templates[pio.templates.default].layout['font']["color"] = "black"

In [2]:
relPath = '.'
TASK = "geolingit"

In [3]:
def clean_input_text(text):
    text = re.sub(r'\t+', ' ', re.sub(r'\n+', ' ', re.sub(r'\s+', " ", text)))
    text = text.rstrip()
    return text

def get_data():
    if not isdir(f"out/{TASK}"):
        makedirs(f"out/{TASK}")

    data = dict()
    for split in ['dev', 'train']:
        with open(f"{relPath}/GeoLingIt/subtask_a/{split}_a.tsv", encoding="utf-8") as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            for row in reader:
                text = clean_input_text(row['text'])
                label = row['region']
                data[row['id']] = {
                    'text': text,
                    'label': label,
                }

        with open(f"{relPath}/GeoLingIt/subtask_b/{split}_b.tsv", encoding="utf-8") as f:
            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
            for row in reader:
                latitude = math.floor(eval(row['latitude'])*100)/100.
                longitude = math.floor(eval(row['longitude'])*100)/100.
                data[row['id']]['latitude'] = latitude
                data[row['id']]['longitude'] = longitude
                
    with open(f'{relPath}/GeoLingIt/test_a_GOLD.tsv', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            text = clean_input_text(row['text'])
            label = row['region']
            data[row['id']] = {
                'text': text,
                'label': label
            }
            
    with open(f'{relPath}/GeoLingIt/test_b_GOLD.tsv', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            latitude = math.floor(eval(row['latitude'])*100)/100.
            longitude = math.floor(eval(row['longitude'])*100)/100.
            data[row['id']]['latitude'] = latitude
            data[row['id']]['longitude'] = longitude         
                
    return data

In [4]:
data = get_data()
data = pd.DataFrame(data).T
data

Unnamed: 0,text,label,latitude,longitude
13670,"[USER] Mortacci, na roba che nse po' vede, Por...",Lazio,41.89,12.54
13671,[USER] In Liguria diciamo: CIU INLA’ U GHE CIO...,Liguria,43.9,8.0
13672,[USER] Uuuuuuaaaaa.... [USER] si nu bell cafè ...,Campania,40.85,14.24
13673,[USER] [USER] Boffe a dui a dui finu caddivien...,Sicilia,38.13,13.34
13674,[USER] Anvedi andò stai. La prossima volta dim...,Lazio,41.89,12.54
...,...,...,...,...
15035,"I PARADOSSO DI ROSARECCIA o bimbi, ancora un g...",Toscana,43.55,10.88
15036,Se allenass a juve o vincess pur io o scudett ...,Campania,40.68,14.53
15037,[USER] Grandi passi avanti nel trasporto pubbl...,Veneto,45.6,12.22
15038,[USER] [USER] NO VABBÈ! Io mi sono appena sveg...,Puglia,41.09,16.87


In [5]:
print('Num regions:', len(set(data['label'])))

# generate 20 very bright colors with numpy
colors =  px.colors.qualitative.Light24[:len(set(data['label']))]

# convert hex color to rgb
def hex_to_rgb(value):
    value = value.lstrip('#')
    lv = len(value)
    rgb = tuple(int(value[i:i + lv // 3], 16) for i in range(0, lv, lv // 3))
    return rgb

def change_brightness(rgb, factor):
    rgb = np.array(rgb)
    rgb = np.clip(rgb / factor, 0, 255).astype(int)
    return tuple(rgb)

mapper = {label: hex_to_rgb(color) for label, color in zip(set(data['label']), colors)}
mapper

Num regions: 20


{'Puglia': (253, 50, 22),
 'Marche': (0, 254, 53),
 'Veneto': (106, 118, 252),
 'Trentino-Alto Adige': (254, 212, 196),
 'Lazio': (254, 0, 206),
 'Emilia Romagna': (13, 249, 255),
 'Molise': (246, 249, 38),
 'Calabria': (255, 150, 22),
 'Sardegna': (71, 155, 85),
 'Liguria': (238, 166, 251),
 'Toscana': (220, 88, 125),
 'Piemonte': (214, 38, 255),
 'Sicilia': (110, 137, 156),
 'Lombardia': (0, 181, 247),
 'Campania': (182, 142, 0),
 'Basilicata': (201, 251, 229),
 'Abruzzo': (255, 0, 146),
 'Umbria': (34, 255, 167),
 "Valle d'Aosta": (227, 238, 158),
 'Friuli-Venezia Giulia': (134, 206, 0)}

In [6]:
limits_it_regions_geojson = "https://raw.githubusercontent.com/openpolis/geojson-italy/master/geojson/limits_IT_regions.geojson"

response = requests.get(limits_it_regions_geojson)
regions = json.loads(response.text)

# correct some names
correct_name = {
    'Trentino-Alto Adige/Südtirol': 'Trentino-Alto Adige',
    'Emilia-Romagna': 'Emilia Romagna',
    "Valle d'Aosta/Vallée d'Aoste": "Valle d'Aosta",
}

for region in regions['features']:
    if region['properties']['reg_name'] in correct_name:
        region['properties']['reg_name'] = correct_name[region['properties']['reg_name']]

In [7]:
# plot posts with map

fig = px.choropleth_mapbox(data,
                           geojson=regions,
                            locations="label",
                            featureidkey="properties.reg_name",
                            color="label",
                            color_discrete_map={l: f'rgb{c}' for l, c in mapper.items()},
                            mapbox_style="white-bg",
                            zoom=5.3,
                            center = {"lat": 42, "lon": 12.5},
                            opacity=0.2,
                            height=810,
                            width=900
                            )
fig.update_traces(marker_line_width=2)

fig.update_geos(fitbounds="locations", visible=False)

fig.update_layout(
    mapbox = {
        'center': {'lon': 12.5, 'lat': 42},
    },
    margin={"r":0,"t":0,"l":0,"b":0},
    legend=dict(
        title="Region",
    )
)

fig.add_trace(go.Scattermapbox(
    lat=data['latitude'],
    lon=data['longitude'],
    mode='markers',
    marker=go.scattermapbox.Marker(
        allowoverlap=True,
        size=4.5,
        opacity=1,
        # map colors
        color=[f'rgb{change_brightness(mapper[label], 1.4)}' for label in data['label']],
    ),
    showlegend=False,
))

fig.update_layout(legend=dict(
    font=dict(size=20)
))

# fig.show(config={'scrollZoom': False, 'displayModeBar': False})
fig.write_image(f"img/coordinate_distribution.png", scale=5)

In [8]:
region2count = data['label'].value_counts().to_dict()

colors = {region: region2count[sample['label']] for region, sample in data.iterrows()}

fig = px.choropleth_mapbox(data,
                            geojson=regions,
                             locations="label",
                             featureidkey="properties.reg_name",
                             color=colors,
                             color_continuous_scale='Viridis',
                             mapbox_style="white-bg",
                             zoom=5.3,
                             center = {"lat": 42, "lon": 12.5},
                             opacity=1,
                             height=810,
                             width=880
                             )

fig.update_traces(marker_line_width=1, marker_line_color='white')

fig.update_layout(
    mapbox = {
        'center': {'lon': 12.5, 'lat': 42},
    },
    margin={"r":0,"t":0,"l":0,"b":0},
    legend=dict(
        title="Number of posts",
    )
)

fig.update_layout(coloraxis_colorbar=dict(
    title_font=dict(size=20),
    tickfont=dict(size=20),
    title='Number of posts',    
))

# fig.show(config={'scrollZoom': False, 'displayModeBar': False})
fig.write_image(f"img/number_of_posts_per_region.png", scale=5)

In [9]:
# plot the distribution of the regions

fig = px.histogram(data.sort_values(by='label'), x="label", title="Region distribution", labels={'label': 'Region'}, width=800, height=500)

fig.update_layout(margin=dict(l=0, r=0, t=40, b=0), font=dict(size=20))
fig.update_xaxes(tickangle=-45, categoryorder='total descending', tickfont=dict(size=20))
fig.update_yaxes(range=[0, 7000], title_text='Number of samples')
fig.update_traces(texttemplate='%{y}', textposition='outside', marker_line_color='black', marker_line_width=1)

# fig.show(config={'scrollZoom': False, 'displayModeBar': False})
fig.write_image(f"img/region_distribution.png", scale=5)