## Organizando y depurando tópicos

In [1]:
import pandas as pd
import textrazor, json
from tqdm import tqdm

In [2]:
data = pd.read_csv("data/semantic_web_project_data.csv")

Escogiendo los tópicos de **Babelfy** que cuenten con enlace a DBpedia y tengan un score de mínimo 0.5.

In [12]:
babelify_topics_new = []
for i in range(len(data)):
    tops = data["topics_babelify"].iloc[i]
    new_tops = []
    if tops is not None:
        try:
            tops = eval(tops)
        except:
            if math.isnan(tops):
                babelify_topics_new.append(new_tops)
                continue
        for t in tops:
            if t["DBpediaURL"] != "" and t["score"] > 0.5:
                new_tops.append(t)
    babelify_topics_new.append(new_tops)

data["topics_babelify"] = babelify_topics_new

In [13]:
data['topics_textrazor'] = data['topics_textrazor'].apply(lambda x: [] if pd.isna(x) else x)

Eliminar topicos repetidos.

In [14]:
def remove_repeated_topics_babelfy(topics_babelfy):
    try:
        topics_babelfy = eval(topics_babelfy)
    except:
        pass
    new_topics = []
    topics_urls = []
    for top in topics_babelfy:
        if top["DBpediaURL"] not in topics_urls:
            new_topics.append(top)
            topics_urls.append(top["DBpediaURL"])
    return new_topics 

def remove_repeated_topics_textrazor(topics_textrazor):
    try:
        topics_textrazor = eval(topics_textrazor)
    except:
        pass
    new_topics = []
    topics_ids = []
    for top in topics_textrazor:
        if top["topic_wikidata_id"] not in topics_ids:
            new_topics.append(top)
            topics_ids.append(top["topic_wikidata_id"])
    return new_topics

In [15]:
data["topics_babelify"] = data["topics_babelify"].apply(remove_repeated_topics_babelfy)
data["topics_textrazor"] = data["topics_textrazor"].apply(remove_repeated_topics_textrazor)

Estableciendo un umbral para el score de coherencia de **Babelfy.** Este representa qué tanto se conecta el fragmento con respecto a todos los demás obtenidos. Proponemos un umbral de 0.2.

In [18]:
def coherence_score_threshold(topics_babelfy):
    try:
        topics_babelfy = eval(topics_babelfy)
    except:
        pass
    new_topics = []
    for top in topics_babelfy:
        if top["coherenceScore"] >= 0.2:
            new_topics.append(top)
    return new_topics 

In [19]:
data["topics_babelify"] = data["topics_babelify"].apply(coherence_score_threshold)

Extracción adicional de papers que quedaron sin tópicos por parte de ambas plataformas.

In [20]:
mask_babelify = data["topics_babelify"].apply(lambda x: isinstance(x, list) and len(x) == 0)
mask_textrazor = data["topics_textrazor"].apply(lambda x: isinstance(x, list) and len(x) == 0)

mask_combined = mask_babelify & mask_textrazor
rows_with_empty_lists = data.loc[mask_combined]

In [22]:
rows_with_empty_lists[["semanticId", "topics_babelify", "topics_textrazor"]]

Unnamed: 0,semanticId,topics_babelify,topics_textrazor
754,,[],[]
1488,93031891bb1492c9533eceb77534b9192d8dc627,[],[]
2796,5bf171259e1ebfe1a438b54f1bc1056a363da75f,[],[]
2800,1cadd921cde096642c967a9a4fa9059848382cf0,[],[]
2806,c9ec302f2646c2421e7fd449dd63b2e3d81f70ad,[],[]
...,...,...,...
7962,1aa6a6574a5919fd1ce5b773c1651633e9ec0d3d,[],[]
7966,bf2868365117a379139076b0cf30557fc738df07,[],[]
8012,17449db4decff5fa3cc21829cadf3be366c420b9,[],[]
8028,2b2cee5161cb553014688f684ee4b245a11371ac,[],[]


In [23]:
rows_with_empty_lists.drop(["topics_babelify", "topics_textrazor"], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_with_empty_lists.drop(["topics_babelify", "topics_textrazor"], axis = 1, inplace = True)


Para estos vamos a obtener nuevamente sus tópicos bajando el umbral a mínimo un 0.2 de puntaje.

In [24]:
textrazor.api_key = "your_api_key_here"
client = textrazor.TextRazor(extractors=["topics"])
client.set_language_override("eng")
topics_textrazor_missing = []

for _, row in tqdm(rows_with_empty_lists.iterrows()):
    title = row["title"]
    abstract = row["abstract"]
    full_text = f"{title}\n{abstract}"
    response = client.analyze(full_text)
    if response.ok:
        textrazor_response = []
        for topic in response.topics():
            if topic.score > 0.2:
                textrazor_response.append({
                    "topic_name" : topic.label,
                    "topic_score" : topic.score,
                    "topic_wikidata_id" : topic.wikidata_id
                })
        topics_textrazor_missing.append({
                    "semanticId": row["semanticId"],
                    "topics_textrazor": textrazor_response
                })


393it [03:46,  1.73it/s]


In [25]:
rows_with_empty_lists = pd.merge(rows_with_empty_lists, pd.DataFrame(topics_textrazor_missing), on = "semanticId", how = "left")

Con los tópicos de **Babelfy** vamos a buscar el mismo umbral de 0.2.

In [26]:
topic_results = []
with open("data/babelify_topics.json", "r", encoding="utf-8") as input_file:
    for row in input_file:
        topic_results.append(json.loads(row))

In [27]:
rows_with_empty_lists = pd.merge(rows_with_empty_lists, pd.DataFrame(topic_results), on = "semanticId", how = "left")

In [28]:
babelify_topics_new = []
for i in range(len(rows_with_empty_lists)):
    try:
        tops = rows_with_empty_lists["topics_babelify"].iloc[i]
        new_tops = []
        if tops is not None:
            for t in tops:
                if t["DBpediaURL"] != "" and t["score"] > 0.2:
                    new_tops.append(t)
    except:
        new_tops = []
    babelify_topics_new.append(new_tops)

rows_with_empty_lists["topics_babelify"] = babelify_topics_new

Unimos y corroboramos la máscara otra vez. Solo la instancia con ``semanticId`` nulo permanece.

In [29]:
rows_with_empty_lists = rows_with_empty_lists[["semanticId", "topics_babelify", "topics_textrazor"]].dropna()
indexes_ = data[data["semanticId"].isin(rows_with_empty_lists["semanticId"])].index
for id_ in rows_with_empty_lists["semanticId"]:
    data.loc[indexes_, "topics_babelify"] = rows_with_empty_lists["topics_babelify"]
    data.loc[indexes_, "topics_textrazor"] = rows_with_empty_lists["topics_textrazor"]

In [30]:
mask_babelify = data["topics_babelify"].apply(lambda x: isinstance(x, list) and len(x) == 0)
mask_textrazor = data["topics_textrazor"].apply(lambda x: isinstance(x, list) and len(x) == 0)

mask_combined = mask_babelify & mask_textrazor
data.loc[mask_combined][["semanticId", "topics_babelify", "topics_textrazor"]]

Unnamed: 0,semanticId,topics_babelify,topics_textrazor
754,,[],[]


Guardamos los datos.

In [31]:
data.to_csv("data/semantic_web_project_data_depurated.csv", index = False)