In [35]:
# imports
import polars as pl
import polars.selectors as cs
import os
from statistics_utils import ne2cc, build_country_code_dicts

# Let printed polars tables show up to 100 rows; the grouped summaries
# printed further down have a few hundred rows each.
pl.Config.set_tbl_rows(100)

polars.config.Config

In [36]:
def load_and_concat_parquet_files(path):
    """Read every ``.parquet`` file directly inside ``path`` into one DataFrame.

    Each file is read with ``pl.read_parquet`` and passed through
    ``convert_types`` before stacking. The frames are combined with a
    diagonal concat, so the result carries the union of all columns and a
    column that is absent from a file is null for that file's rows.

    Args:
        path: Directory to scan. Sub-directories are not descended into.

    Returns:
        The combined ``pl.DataFrame``, or an empty ``pl.DataFrame()`` when
        the directory holds no ``.parquet`` entries.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and the column order of the diagonal union) reproducible.
    frames = [
        convert_types(pl.read_parquet(os.path.join(path, name)))
        for name in sorted(os.listdir(path))
        if name.endswith('.parquet')
    ]
    if not frames:
        return pl.DataFrame()
    # A single concat over all frames avoids re-copying the accumulated
    # result once per file, which the pairwise loop did.
    return pl.concat(frames, how="diagonal")

def convert_types(df):
    """Return ``df`` with every column as ``pl.String``.

    Columns that are already strings pass through unchanged (casting
    String to String is a no-op); every other column is cast to
    ``pl.String``. Column names and order are preserved.

    Args:
        df: The ``pl.DataFrame`` to normalise.

    Returns:
        A new ``pl.DataFrame`` whose columns are all ``pl.String``.
    """
    # One expression for all columns instead of rebuilding the frame once
    # per column in a Python loop.
    return df.with_columns(pl.all().cast(pl.String))

# Relative location of the filtered parquet files.
path = "../../data/filtered/"

# Load all files, then discard one junk column. Its name looks like a stray
# JVM warning line; presumably it leaked into the header of one input file
# (TODO confirm against the export step).
df = load_and_concat_parquet_files(path).drop(
    "[0.002s][warning][perf,memops] Cannot use file /tmp/hsperfdata_kopp/3612625 because it is locked by another process (errno = 11)"
)

# Optional check for rows whose text_id does not start with "ParlaMint":
#print(df.filter(~pl.col("text_id").str.starts_with("ParlaMint")))
print(df.columns)

['text_id', 'name_type', 'position', 'entity', 'Text_ID', 'Title', 'Date', 'Body', 'Term', 'Session', 'Meeting', 'Sitting', 'Agenda', 'Subcorpus', 'Lang', 'Speaker_role', 'Speaker_MP', 'Speaker_minister', 'Speaker_party', 'Speaker_party_name', 'Party_status', 'Party_orientation', 'Speaker_ID', 'Speaker_name', 'Speaker_gender', 'Speaker_birth']


In [37]:
# Number of rows per raw entity string, most frequent first.
# (The former trailing `.filter()` had no predicate and filtered nothing.)
print(df.group_by("entity").len().sort("len", descending=True))

shape: (270, 2)
┌────────────────────┬────────┐
│ entity             ┆ len    │
│ ---                ┆ ---    │
│ str                ┆ u32    │
╞════════════════════╪════════╡
│ europe             ┆ 337559 │
│ germany            ┆ 84267  │
│ russia             ┆ 81258  │
│ united states      ┆ 75534  │
│ ukraine            ┆ 60249  │
│ kosovo             ┆ 48416  │
│ france             ┆ 44566  │
│ spain              ┆ 40984  │
│ syria              ┆ 39111  │
│ china              ┆ 37667  │
│ sweden             ┆ 35132  │
│ greece             ┆ 34233  │
│ italy              ┆ 31423  │
│ iraq               ┆ 26887  │
│ afghanistan        ┆ 24684  │
│ austria            ┆ 24248  │
│ eu                 ┆ 23911  │
│ poland             ┆ 23630  │
│ america            ┆ 23429  │
│ united kingdom     ┆ 23085  │
│ hungary            ┆ 22796  │
│ israel             ┆ 22427  │
│ greenland          ┆ 21761  │
│ finland            ┆ 21480  │
│ croatia            ┆ 21164  │
│ denmark            ┆ 2

In [38]:
# Build the lookup tables once; ne2cc_s reads them from module scope.
dicts = build_country_code_dicts()


def ne2cc_s(x):
    """Single-argument wrapper around ``ne2cc`` for use with ``map_elements``.

    Forwards ``x`` to ``ne2cc`` together with the first two items of the
    module-level ``dicts`` and returns whatever ``ne2cc`` returns.
    """
    first_table, second_table = dicts[0], dicts[1]
    return ne2cc(x, first_table, second_table)


In [39]:
# ReferredCountry:  result of ne2cc_s applied to each `entity` value.
# ReferringCountry: the two characters that follow "ParlaMint-" in `position`.
df = df.with_columns(
    ReferredCountry=pl.col("entity").map_elements(ne2cc_s, return_dtype=pl.String),
    ReferringCountry=pl.col("position").str.extract(r"ParlaMint\-(..)", 1),
)
print(df)

shape: (1_818_766, 28)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ text_id   ┆ name_type ┆ position  ┆ entity    ┆ … ┆ Speaker_g ┆ Speaker_b ┆ ReferredC ┆ Referrin │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ender     ┆ irth      ┆ ountry    ┆ gCountry │
│ str       ┆ str       ┆ str       ┆ str       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆ str       ┆ str       ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ ParlaMint ┆ LOC       ┆ ParlaMint ┆ europe    ┆ … ┆ M         ┆ null      ┆ null      ┆ PT       │
│ -PT_2015- ┆           ┆ -PT_2015- ┆           ┆   ┆           ┆           ┆           ┆          │
│ 03-20.u83 ┆           ┆ 03-20.seg ┆           ┆   ┆           ┆           ┆           ┆          │
│           ┆           ┆ 287…      ┆           ┆   ┆           ┆   

In [40]:
# Number of rows per mapped country code (null = entity not mapped), most
# frequent first. The former trailing `.filter()` had no predicate.
print(df.group_by("ReferredCountry").len().sort("len", descending=True))

shape: (215, 2)
┌─────────────────┬────────┐
│ ReferredCountry ┆ len    │
│ ---             ┆ ---    │
│ str             ┆ u32    │
╞═════════════════╪════════╡
│ null            ┆ 421997 │
│ US              ┆ 94978  │
│ DE              ┆ 84267  │
│ RU              ┆ 81258  │
│ UA              ┆ 60249  │
│ XK              ┆ 48416  │
│ FR              ┆ 44566  │
│ ES              ┆ 40984  │
│ SY              ┆ 39111  │
│ CN              ┆ 37667  │
│ SE              ┆ 35132  │
│ GR              ┆ 34233  │
│ IT              ┆ 31423  │
│ IQ              ┆ 26887  │
│ AF              ┆ 24684  │
│ AT              ┆ 24248  │
│ PL              ┆ 23630  │
│ GB              ┆ 23085  │
│ HU              ┆ 22796  │
│ IL              ┆ 22427  │
│ GL              ┆ 21761  │
│ FI              ┆ 21480  │
│ HR              ┆ 21164  │
│ DK              ┆ 20945  │
│ NL              ┆ 20891  │
│ NO              ┆ 18027  │
│ ME              ┆ 17942  │
│ CA              ┆ 14833  │
│ CH              ┆ 14830  

In [41]:

# Entities for which no country code was produced, most frequent first.
# pl.len() replaces the deprecated pl.count(); the alias keeps the output
# column named "count".
(
    df.filter(pl.col("ReferredCountry").is_null())
    .group_by("entity")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)


  df.filter(pl.col("ReferredCountry").is_null()).group_by("entity").agg(pl.count()).sort("count", descending=True)


entity,count
str,u32
"""europe""",337559
"""eu""",23911
"""america""",23429
"""africa""",18271
"""asia""",5775
"""usa""",3268
"""north africa""",2814
"""south america""",1886
"""north america""",1231
"""west africa""",635


In [45]:
# Count (referring, referred) country pairs, keep only pairs where both
# codes are known, order by frequency, then drop self-references.
(
    df.group_by("ReferringCountry", "ReferredCountry")
    .len()
    .filter(
        pl.col("ReferringCountry").is_not_null()
        & pl.col("ReferredCountry").is_not_null()
    )
    .sort("len", descending=True)
    .filter(pl.col("ReferredCountry").ne(pl.col("ReferringCountry")))
)


ReferringCountry,ReferredCountry,len
str,str,u32
"""RS""","""XK""",40588
"""DK""","""GL""",20248
"""RS""","""ME""",11041
"""CZ""","""UA""",10945
"""NO""","""SE""",10889
"""NO""","""US""",10405
"""AT""","""DE""",10322
"""UA""","""RU""",8781
"""NO""","""RU""",8161
"""DK""","""FO""",7811
