In [75]:
# imports
import polars as pl
import polars.selectors as cs
import os
from statistics_utils import ne2cc, build_country_code_dicts
import plotly.express as px
import json
import plotly.graph_objects as go
import pandas
import altair as alt
import geopandas as gpd
#import contextily as cx
from shapely.geometry import Point, LineString, Polygon
pl.Config.set_tbl_rows(20)

polars.config.Config

In [76]:
def load_and_concat_parquet_files(path):
    dfs = []
    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        if file_path.endswith('.parquet'):
            dfs.append(convert_types(pl.read_parquet(file_path).with_columns([pl.lit(file[0:2]).alias("ReferringCountry")])))
    if not dfs:
        return pl.DataFrame()
    combined_df = dfs[0]
    for df in dfs[1:]:
        combined_df = pl.concat([combined_df,df], how="diagonal")
    return combined_df

def convert_types(df):
    for col in df.columns:
        if df[col].dtype != pl.String:
            df = df.with_columns(df[col].cast(pl.String))
    return df

path = "/home/simon/parliamint/ParlaMint_edgelists"
df = load_and_concat_parquet_files(path)
df = df.drop("[0.002s][warning][perf,memops] Cannot use file /tmp/hsperfdata_kopp/3612625 because it is locked by another process (errno = 11)")

df = df.with_columns(
    pl.col("Date").str.strptime(pl.Date, "%Y-%m-%d")
)
print(df)

shape: (1_835_116, 29)
┌───────────┬───────────┬───────────┬──────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ text_id   ┆ name_type ┆ position  ┆ entity   ┆ … ┆ Speaker_b ┆ target_co ┆ source_co ┆ Referring │
│ ---       ┆ ---       ┆ ---       ┆ ---      ┆   ┆ irth      ┆ untry     ┆ untry     ┆ Country   │
│ str       ┆ str       ┆ str       ┆ str      ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│           ┆           ┆           ┆          ┆   ┆ str       ┆ str       ┆ str       ┆ str       │
╞═══════════╪═══════════╪═══════════╪══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ ParlaMint ┆ LOC       ┆ ParlaMint ┆ europe   ┆ … ┆ null      ┆ europe    ┆ PT        ┆ PT        │
│ -PT_2015- ┆           ┆ -PT_2015- ┆          ┆   ┆           ┆           ┆           ┆           │
│ 03-20.u83 ┆           ┆ 03-20.seg ┆          ┆   ┆           ┆           ┆           ┆           │
│           ┆           ┆ 287…      ┆          ┆   ┆           ┆    

In [77]:
print(df.group_by("entity").len().sort("len", descending=True).filter())

shape: (270, 2)
┌────────────────────┬────────┐
│ entity             ┆ len    │
│ ---                ┆ ---    │
│ str                ┆ u32    │
╞════════════════════╪════════╡
│ europe             ┆ 341472 │
│ germany            ┆ 84756  │
│ russia             ┆ 81470  │
│ united states      ┆ 76337  │
│ ukraine            ┆ 60669  │
│ kosovo             ┆ 48419  │
│ spain              ┆ 46210  │
│ france             ┆ 45131  │
│ syria              ┆ 39171  │
│ china              ┆ 37814  │
│ …                  ┆ …      │
│ brunei darussalam  ┆ 3      │
│ east australia     ┆ 2      │
│ south west asia    ┆ 2      │
│ west america       ┆ 2      │
│ southern asia      ┆ 1      │
│ türkiye            ┆ 1      │
│ north east africa  ┆ 1      │
│ south west africa  ┆ 1      │
│ northern australia ┆ 1      │
│ norfolk island     ┆ 1      │
└────────────────────┴────────┘


In [78]:
dicts = build_country_code_dicts()
def ne2cc_s(x):
    return ne2cc(x, dicts[0], dicts[1])


# Neighbor investigation (Simon)

neighbor json file was created in `border_finder.py` using data from [naturalearthdata.com](https://www.naturalearthdata.com/downloads/110m-cultural-vectors/)

In [108]:
with open("/home/simon/parlianets/neighbors_iso2.json", "r") as f:
    neighbors_map = json.load(f)

# assume `df` already exists with "source_country" & "target_country"
# (b) pull your Polars DataFrame into Python dicts
records = df.to_dicts()

# (c) annotate each record
for rec in records:
    src = rec["source_country"]
    tgt = rec["target_country"]
    rec["is_neighbor"] = tgt in neighbors_map.get(src, [])

# (d) re-ingest into Polars
df2 = pl.DataFrame(records)

# (e) group & compute percentages
result = (
    df2
    .group_by("source_country", maintain_order=True)
    .agg([
      # fraction of neighbor-links, *100 and round(2)
      (pl.sum("is_neighbor") / pl.count() * 100).round(2)
        .alias("neighbors_rel"),
      # complement: non-neighbor %
      (((pl.count() - pl.sum("is_neighbor")) / pl.count()) * 100)
        .round(2)
        .alias("non_neighbors_rel")
    ])
)

print(result)

shape: (27, 3)
┌────────────────┬───────────────┬───────────────────┐
│ source_country ┆ neighbors_rel ┆ non_neighbors_rel │
│ ---            ┆ ---           ┆ ---               │
│ str            ┆ f64           ┆ f64               │
╞════════════════╪═══════════════╪═══════════════════╡
│ PT             ┆ 9.35          ┆ 90.65             │
│ RS             ┆ 20.83         ┆ 79.17             │
│ CZ             ┆ 23.92         ┆ 76.08             │
│ IS             ┆ 0.0           ┆ 100.0             │
│ GR             ┆ 6.08          ┆ 93.92             │
│ ES             ┆ 4.41          ┆ 95.59             │
│ BE             ┆ 17.0          ┆ 83.0              │
│ not found      ┆ 0.0           ┆ 100.0             │
│ LV             ┆ 37.7          ┆ 62.3              │
│ GB             ┆ 0.0           ┆ 100.0             │
│ …              ┆ …             ┆ …                 │
│ TR             ┆ 32.04         ┆ 67.96             │
│ FI             ┆ 24.19         ┆ 75.81          

  (pl.sum("is_neighbor") / pl.count() * 100).round(2)
  (((pl.count() - pl.sum("is_neighbor")) / pl.count()) * 100)


In [111]:
with open("/home/simon/parlianets/neighbors_iso2.json", "r") as f:
    neighbors_map = json.load(f)

# 1) convert to pandas
pdf = df.to_pandas()

# 2) row-wise apply
pdf["is_neighbor"] = pdf.apply(
    lambda r: r.target_country in neighbors_map.get(r.source_country, []),
    axis=1
)

# 3) back to Polars (if you still want Polars for grouping)
df2 = pl.from_pandas(pdf)

result = (
    df2
    .group_by("source_country", maintain_order=True)
    .agg([
      (pl.sum("is_neighbor") / pl.count() * 100).round(2).alias("neighbors_rel"),
      (((pl.count() - pl.sum("is_neighbor")) / pl.count()) * 100)
        .round(2).alias("non_neighbors_rel")
    ])
)

print(result)

shape: (27, 3)
┌────────────────┬───────────────┬───────────────────┐
│ source_country ┆ neighbors_rel ┆ non_neighbors_rel │
│ ---            ┆ ---           ┆ ---               │
│ str            ┆ f64           ┆ f64               │
╞════════════════╪═══════════════╪═══════════════════╡
│ PT             ┆ 0.0           ┆ 100.0             │
│ RS             ┆ 53.53         ┆ 46.47             │
│ CZ             ┆ 23.92         ┆ 76.08             │
│ IS             ┆ 0.0           ┆ 100.0             │
│ GR             ┆ 6.08          ┆ 93.92             │
│ ES             ┆ 0.0           ┆ 100.0             │
│ BE             ┆ 24.86         ┆ 75.14             │
│ not found      ┆ 0.0           ┆ 100.0             │
│ LV             ┆ 37.7          ┆ 62.3              │
│ GB             ┆ 0.0           ┆ 100.0             │
│ …              ┆ …             ┆ …                 │
│ TR             ┆ 32.04         ┆ 67.96             │
│ FI             ┆ 28.0          ┆ 72.0           

  (pl.sum("is_neighbor") / pl.count() * 100).round(2).alias("neighbors_rel"),
  (((pl.count() - pl.sum("is_neighbor")) / pl.count()) * 100)


In [79]:
df = df.with_columns([
    pl.col("entity").map_elements(ne2cc_s, return_dtype=pl.String).alias("ReferredCountry")
])

print(
df.filter(pl.col("ReferredCountry").is_null()).group_by("entity").agg(pl.count()).sort("count", descending=True)
)
df = df.filter(
    pl.col("ReferredCountry") != pl.col("ReferringCountry")
)

print(df)

shape: (53, 2)
┌────────────────────┬────────┐
│ entity             ┆ count  │
│ ---                ┆ ---    │
│ str                ┆ u32    │
╞════════════════════╪════════╡
│ europe             ┆ 341472 │
│ eu                 ┆ 23911  │
│ america            ┆ 23486  │
│ africa             ┆ 18319  │
│ asia               ┆ 5802   │
│ usa                ┆ 3273   │
│ north africa       ┆ 2820   │
│ south america      ┆ 1898   │
│ north america      ┆ 1235   │
│ west africa        ┆ 635    │
│ …                  ┆ …      │
│ northern africa    ┆ 4      │
│ brunei darussalam  ┆ 3      │
│ west america       ┆ 2      │
│ south west asia    ┆ 2      │
│ east australia     ┆ 2      │
│ north east africa  ┆ 1      │
│ southern asia      ┆ 1      │
│ northern australia ┆ 1      │
│ türkiye            ┆ 1      │
│ south west africa  ┆ 1      │
└────────────────────┴────────┘
shape: (1_384_795, 30)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬─────────

  df.filter(pl.col("ReferredCountry").is_null()).group_by("entity").agg(pl.count()).sort("count", descending=True)


In [80]:
print(df.group_by("ReferredCountry").len().sort("len", descending=True).filter())

shape: (214, 2)
┌─────────────────┬───────┐
│ ReferredCountry ┆ len   │
│ ---             ┆ ---   │
│ str             ┆ u32   │
╞═════════════════╪═══════╡
│ US              ┆ 95842 │
│ DE              ┆ 84756 │
│ RU              ┆ 81470 │
│ UA              ┆ 60669 │
│ XK              ┆ 48419 │
│ FR              ┆ 45131 │
│ SY              ┆ 39171 │
│ CN              ┆ 37814 │
│ SE              ┆ 35262 │
│ GR              ┆ 34344 │
│ …               ┆ …     │
│ SX              ┆ 23    │
│ CZ              ┆ 21    │
│ SZ              ┆ 20    │
│ SB              ┆ 15    │
│ CD              ┆ 14    │
│ AS              ┆ 11    │
│ NU              ┆ 8     │
│ FK              ┆ 6     │
│ CX              ┆ 5     │
│ NF              ┆ 1     │
└─────────────────┴───────┘


In [81]:
#alt.data_transformers.disable_max_rows()

df_yearly = df.group_by("ReferringCountry", "Date").len().sort("Date", descending=False)
df_yearly = df_yearly.with_columns(
    pl.col("Date").dt.truncate("1y").alias("year_start")
)

df_yearly = df_yearly.group_by("year_start", "ReferringCountry").agg([
    pl.col("len").sum().alias("year_sum")])
#df_yearly.plot.line(x="year_start", y="year_sum").properties(width=100,height=100).facet(
#    facet="ReferringCountry", columns=6
#)

In [82]:
alt.data_transformers.disable_max_rows()

df_yearly = df.group_by("Date").len().sort("Date", descending=False)
df_yearly = df_yearly.with_columns(
    pl.col("Date").dt.truncate("1y").alias("year_start")
)

"""
df_yearly = df_yearly.group_by("year_start").agg([
    pl.col("len").sum().alias("year_sum")])
df_yearly.plot.line(x="year_start", y="year_sum").properties(width=200,height=200)
"""

'\ndf_yearly = df_yearly.group_by("year_start").agg([\n    pl.col("len").sum().alias("year_sum")])\ndf_yearly.plot.line(x="year_start", y="year_sum").properties(width=200,height=200)\n'

In [83]:
df.group_by("ReferringCountry", "ReferredCountry").len().sort("len", descending=True)

ReferringCountry,ReferredCountry,len
str,str,u32
"""RS""","""XK""",40588
"""DK""","""GL""",20248
"""TR""","""SY""",12581
"""RS""","""ME""",11041
"""CZ""","""UA""",10945
"""NO""","""SE""",10889
"""NO""","""US""",10405
"""AT""","""DE""",10322
"""UA""","""RU""",8781
"""SE""","""RU""",8726


In [84]:
df.group_by("ReferringCountry", "ReferredCountry").len().sort("len", descending=True).filter(pl.col("ReferredCountry") == "UA")


ReferringCountry,ReferredCountry,len
str,str,u32
"""CZ""","""UA""",10945
"""NL""","""UA""",5656
"""SE""","""UA""",5303
"""PL""","""UA""",4990
"""EE""","""UA""",4819
"""DK""","""UA""",3492
"""HR""","""UA""",2712
"""BE""","""UA""",2589
"""HU""","""UA""",2504
"""ES""","""UA""",2350


In [85]:
total_counts = df.group_by("ReferringCountry").len().rename({"len": "total"})

# Step 2: Count pairs and compute proportion
result = (
    df.group_by("ReferringCountry", "ReferredCountry")
      .len()
      .join(total_counts, on="ReferringCountry")
      .with_columns(
          (pl.col("len") / pl.col("total")).alias("proportion")
      )
      .sort("proportion", descending=True)
)
result

ReferringCountry,ReferredCountry,len,total,proportion
str,str,u32,u32,f64
"""GB""","""LY""",1883,4218,0.44642
"""no""","""ES""",5226,12267,0.426021
"""RS""","""XK""",40588,124124,0.326996
"""IS""","""US""",6892,26678,0.25834
"""UA""","""RU""",8781,38478,0.228208
"""DK""","""GL""",20248,94485,0.214299
"""BA""","""HR""",2611,12445,0.209803
"""FI""","""SE""",3468,21380,0.162208
"""GB""","""SS""",666,4218,0.157895
"""BA""","""RS""",1935,12445,0.155484


In [86]:
df.group_by("ReferringCountry", "ReferredCountry", "Party_orientation").len().sort("len", descending=True)


ReferringCountry,ReferredCountry,Party_orientation,len
str,str,str,u32
"""RS""","""XK""","""Far-right""",12603
"""RS""","""XK""","""Centre-left""",8144
"""DK""","""GL""","""Left""",5941
"""RS""","""XK""","""Big tent""",5764
"""RS""","""XK""","""Right""",5209
"""RS""","""XK""","""-""",4069
"""DK""","""GL""","""Centre-right""",3910
"""DK""","""GL""","""Centre-left""",3902
"""TR""","""SY""","""Centre-left""",3781
"""RS""","""ME""","""Far-right""",3613


In [87]:
with open("restcountries_all.json", encoding="utf-8") as f:
    cntry_data = json.load(f)

def cca2ToGeo(ca):
    for country in cntry_data:
        if country["cca2"] == ca:
            return country["latlng"]
    return None

In [88]:
df_geo = result.with_columns([
    pl.col("ReferringCountry").map_elements(cca2ToGeo, return_dtype=pl.List(pl.Float64)).alias("ReferringCountryGeo"),
    pl.col("ReferredCountry").map_elements(cca2ToGeo, return_dtype=pl.List(pl.Float64)).alias("ReferredCountryGeo")
])


In [89]:
"""
import folium
df_geo_FI = df_geo.filter(pl.col("ReferringCountry") == "FI").to_pandas()
df_geo_FI["geometry"] =df_geo_FI.apply(
    lambda row: LineString([row["ReferringCountryGeo"].reverse(), row["ReferredCountryGeo"].reverse()]),
    axis=1
)
geo_df = gpd.GeoDataFrame(df_geo_FI, geometry='geometry', crs="EPSG:4326")
# Get center of your data for map start
geo_df.explore()
"""

'\nimport folium\ndf_geo_FI = df_geo.filter(pl.col("ReferringCountry") == "FI").to_pandas()\ndf_geo_FI["geometry"] =df_geo_FI.apply(\n    lambda row: LineString([row["ReferringCountryGeo"].reverse(), row["ReferredCountryGeo"].reverse()]),\n    axis=1\n)\ngeo_df = gpd.GeoDataFrame(df_geo_FI, geometry=\'geometry\', crs="EPSG:4326")\n# Get center of your data for map start\ngeo_df.explore()\n'