In [1]:
import pandas as pd
from pandas import MultiIndex
import itertools
from geo_kpe_multidoc import GEO_KPE_MULTIDOC_CACHE_PATH
import re
import os
import joblib

from geo_kpe_multidoc.geo.utils import load_topic_geo_locations, process_geo_associations_for_topics
from geo_kpe_multidoc.geo.measures import inv_dist
from loguru import logger

In [2]:
def get_files(path: str):
    """
    Yield the names of the per-topic geo cache files found directly in `path`.

    An entry qualifies only when it is a regular file and its *whole* name has
    the form ``dNN-mdkpe-geo.pkl`` (``NN`` being two digits). Names are yielded
    in sorted order so results do not depend on the arbitrary ordering
    returned by `os.listdir`.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).

    Yields
    ------
    str
        Bare file name, without the directory part.
    """
    geo_file_name_pattern = re.compile(r"d\d{2}-mdkpe-geo\.pkl")
    for file in sorted(os.listdir(path)):
        # fullmatch, not match: `match` only anchors at the start of the name
        # and would also accept e.g. "d01-mdkpe-geo.pkl.bak".
        if geo_file_name_pattern.fullmatch(file) and os.path.isfile(os.path.join(path, file)):
            yield file

In [5]:
# Load the geo coordinates of every document of every topic into one long
# frame indexed by (topic, doc) with a single "lat_long" column.

# Topics excluded from loading; the original note marks them as "errors".
TOPICS_WITH_ERRORS = {"d08", "d14", "d22", "d28", "d34", "d50", "d53", "d59"}

# Collect one frame per topic and concatenate once at the end: calling
# pd.concat inside the loop re-copies all previously loaded rows each time.
topic_frames = []

for filename in get_files(os.path.join(GEO_KPE_MULTIDOC_CACHE_PATH, "MKDUC01")):
    # Cache files are named "dNN-mdkpe-geo.pkl"; the first 3 chars are the topic id.
    topic_id = filename[:3]

    if topic_id in TOPICS_WITH_ERRORS:
        logger.info(f"Skiping topic {topic_id} processing.")
        continue

    # presumably load_topic_geo_locations returns {doc_id: [coordinates, ...]};
    # stack() moves the docs into the index and explode() yields one row per
    # coordinate — TODO confirm against the loader.
    df = pd.DataFrame.from_dict({
        topic_id: load_topic_geo_locations(topic_id)
    }, orient='index').stack().explode().to_frame()

    df.columns = ["lat_long"]
    df.index.names = ["topic", "doc"]

    topic_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# topic was loaded.
coordinates = pd.concat(topic_frames) if topic_frames else pd.DataFrame()


2023-03-28 10:12:16.552 | DEBUG    | geo_kpe_multidoc.datasets.process_mordecai:load_topic_geo_locations:137 - loading mordecai parsing from topic d41
2023-03-28 10:12:16.555 | DEBUG    | geo_kpe_multidoc.datasets.process_mordecai:locations_from_mordecai_parsing:152 - load mordecai geo parsing for LA051590-0065
2023-03-28 10:12:16.557 | DEBUG    | geo_kpe_multidoc.datasets.process_mordecai:locations_from_mordecai_parsing:152 - load mordecai geo parsing for AP881211-0027
2023-03-28 10:12:16.558 | DEBUG    | geo_kpe_multidoc.datasets.process_mordecai:locations_from_mordecai_parsing:152 - load mordecai geo parsing for LA081490-0030
2023-03-28 10:12:16.559 | DEBUG    | geo_kpe_multidoc.datasets.process_mordecai:locations_from_mordecai_parsing:152 - load mordecai geo parsing for AP890111-0227
2023-03-28 10:12:16.560 | DEBUG    | geo_kpe_multidoc.datasets.process_mordecai:locations_from_mordecai_parsing:152 - load mordecai geo parsing for AP890801-0025
2023-03-28 10:12:16.562 | DEBUG    | ge

# Debug process_geo_associations...

In [3]:
def add_gold_label(df, gold):
    """
    Mutate dataframe `df` adding a boolean "gold" column that tells whether
    each candidate is in the gold set of its own topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index has the topic id on level 0 and the candidate
        keyphrase on level 1. It is modified in place; any existing "gold"
        column is overwritten.
    gold : mapping
        Maps every topic id present in `df` to a collection of gold
        keyphrases (assumed hashable, e.g. strings).

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a topic present in `df` has no entry in `gold`.
    """
    topic_level = df.index.get_level_values(0)
    candidate_level = df.index.get_level_values(1)

    # One set per topic gives O(1) membership tests, and the label column is
    # built in a single pass instead of rescanning the whole index per topic.
    gold_by_topic = {topic: set(gold[topic]) for topic in topic_level.unique()}
    labels = [
        candidate in gold_by_topic[topic]
        for topic, candidate in zip(topic_level, candidate_level)
    ]

    # Assign positionally as a real bool column; the explicit dtype keeps the
    # column boolean even when `df` is empty.
    df["gold"] = pd.Series(labels, dtype=bool).to_numpy()

In [4]:
# Cached MKDUC01 artefacts: per-document data, per-topic candidate data,
# per-document coordinates and the gold keyphrases.
mkduc01_dir = os.path.join(GEO_KPE_MULTIDOC_CACHE_PATH, "MKDUC01")

docs_data = pd.read_parquet(
    os.path.join(mkduc01_dir, "MKDUC01-docs-data-20230324.parquet")
)
topic_data = pd.read_parquet(
    os.path.join(mkduc01_dir, "MKDUC01-topic-data-20230324.parquet")
)
# NOTE(review): unlike the other files this one is read from the cache root,
# not from the "MKDUC01" sub-folder — confirm this is intended.
topic_docs_coordinates = pd.read_parquet(
    os.path.join(GEO_KPE_MULTIDOC_CACHE_PATH, "MKDUC01-topic-doc-coordinates-20230329.parquet")
)

# joblib.load unpickles the file: only use it on trusted cache files.
gold_24 = joblib.load(os.path.join(mkduc01_dir, "MKDUC01-gold-20230324.pkl"))

# Adds the "gold" column to `topic_data` in place.
add_gold_label(topic_data, gold_24)

In [5]:
# Work on a copy so that `topic_data` itself is not handed to the geo processing.
df = topic_data.copy()

# Weighting function and its parameter, forwarded to the geo association step.
w_function = inv_dist
w_function_param = 1

# Kept as the last expression of the cell so its result is displayed.
process_geo_associations_for_topics(
    df,
    docs_data,
    doc_coordinate_data=topic_docs_coordinates,
    w_function=w_function,
    w_function_param=w_function_param,
    save_cache=False,
)

d56
d57
d54
d45
d44
d43
d41
d39
d32
d37
d31
d30
d27
d24
d19
d15
d13
d12
d11
d05
d06
d04


Unnamed: 0_level_0,Unnamed: 1_level_0,semantic_score,N,gold,moran_i,geary_c,getis_g
topic,keyphrase,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
d56,fat-saturate food common,0.757605,1,False,1.000000,0.000000,0.250000
d56,dr . michael stern,0.732934,1,False,1.000000,0.000000,0.250000
d56,low-income hispanics,0.726801,1,False,1.000000,0.000000,0.250000
d56,anglo-orient health education network,0.726532,1,False,1.000000,0.000000,0.200000
d56,heredity,0.722802,2,False,0.646204,0.303298,0.142881
...,...,...,...,...,...,...,...
d04,year-the,0.176340,1,False,1.000000,0.000000,0.111111
d04,andrew-would,0.173403,1,False,1.000000,0.000000,0.200000
d04,shares-although low initially-have,0.170057,1,False,1.000000,0.000000,0.111111
d04,eight-a quarter,0.160397,1,False,1.000000,0.000000,0.200000
