In [141]:
import os
import pandas as pd
from pathlib import Path

try:
    import rispy
except ImportError:
    !pip install rispy --quiet

In [142]:
def get_abstract_and_title(path):
    """
    Get all abstracts and titles from a RIS file.
    :param path: Path to the RIS file.
    :return: Abstracts and titles as pandas Series.
    """
    title_list = []
    abstract_list = []
    with open(path, "r") as f:
        ris_data = rispy.load(f)
        for entry in ris_data:
            try:
                title_list.append(entry["primary_title"])
                abstract_list.append(entry["abstract"])
            except KeyError as e:
                print(f"KeyError: entry does not contain 'T1' or 'AB'. Keys: {entry.keys()}")
                raise e
    return pd.DataFrame({"title": title_list, "abstract": abstract_list})

In [143]:
sectors = os.listdir("raw_data")
full_dataset = []
for sector in sectors:
    print(f"Processing sector: {sector}")
    df_list = []

    # Get all files in the sector directory
    sector_files = os.listdir(os.path.join("raw_data", sector))
    sector_files = [Path(os.path.join("raw_data", sector, f)) for f in sector_files]
    sector_files_types = [f.suffix for f in sector_files]

    if ".csv" in sector_files_types:
        # Freight CSV Case
        for file in sector_files:
            print(f"Processing file: {file}")
            # If a CSV file is present, read it
            tmp_df = pd.read_csv(file, sep=";", usecols=["title", "abstract"], encoding_errors="replace")
            if "Excluded" in file.name:
                tmp_df["true_label"] = "Not About Sufficiency"
            else:
                tmp_df["true_label"] = "About Sufficiency"
            df_list.append(tmp_df)
    
    elif ".ris" in sector_files_types or ".txt" in sector_files_types:
        # RIS or TXT Case
        for file in sector_files:
            print(f"Processing file: {file}")
            tmp_df = get_abstract_and_title(file)
            if "Excluded" in file.name:
                tmp_df["true_label"] = "Not About Sufficiency"
            elif "Included" in file.name:
                tmp_df["true_label"] = "About Sufficiency"
            else:
                raise ValueError(f"File name does not contain 'Included' or 'Excluded': {file.name}")
            df_list.append(tmp_df)
    else:
        raise ValueError(f"No valid file types found in {sector}. Expected .csv, .ris, or .txt files.")

    full_dataset += df_list
    combined_df = pd.concat(df_list, ignore_index=True)

    output_path = os.path.join("transformed_data", f"{sector.lower()}_dataset.csv")
    combined_df.to_csv(output_path, index=False)
    print(f"Saved transformed data to {output_path}\n\n")
# Combine all datasets into one
full_dataset_df = pd.concat(full_dataset, ignore_index=True)


Processing sector: Urban_Governance
Processing file: raw_data/Urban_Governance/UrbanGovSuff Excluded.txt
Processing file: raw_data/Urban_Governance/UrbanGovSuff Included.txt
Saved transformed data to transformed_data/urban_governance_dataset.csv


Processing sector: Freight
Processing file: raw_data/Freight/Freight Excluded.csv
Processing file: raw_data/Freight/Freight_Included.csv
Saved transformed data to transformed_data/freight_dataset.csv


Processing sector: Digitalisation
Processing file: raw_data/Digitalisation/Digitalisation Excluded.txt
Processing file: raw_data/Digitalisation/Digitalisation Included.txt
Saved transformed data to transformed_data/digitalisation_dataset.csv


Processing sector: Nutrition
Processing file: raw_data/Nutrition/Nutrition Excluded.txt
Processing file: raw_data/Nutrition/Nutrition Included.txt
Saved transformed data to transformed_data/nutrition_dataset.csv


Processing sector: Urban_Ecology
Processing file: raw_data/Urban_Ecology/Urban_Ecology_Inclu

In [144]:
# Checking for duplicated entries
duplicated_entries = full_dataset_df[full_dataset_df.duplicated(subset=['title'], keep=False)]
duplicated_entries

Unnamed: 0,title,abstract,true_label
0,Author Correction: Pan-cancer analysis of whol...,"In the published version of this paper, the li...",Not About Sufficiency
179,SDG 7 requires post-growth energy sufficiency,Sustainable Development Goal 7 (SDG 7) aims to...,Not About Sufficiency
408,Decarbonization and social justice: The case f...,Artisanal and small-scale mining (ASM) is the ...,Not About Sufficiency
503,Green innovation dynamics: the mediating role ...,"This study investigates whether and, if so, ho...",Not About Sufficiency
532,Determinants of renewable energy consumption i...,This study aims to investigate the factors of ...,Not About Sufficiency
...,...,...,...
10124,Assessment of the Impact of Road Transport Cha...,"In the context of accelerating urbanisation, c...",About Sufficiency
10128,Waste as Resource for Pakistan: An Innovative ...,Municipal solid waste (MSW) management is a gl...,About Sufficiency
10133,The Road to Eliminating Energy Poverty: Does R...,Under the constraint of carbon neutrality targ...,About Sufficiency
10134,Legal Guarantee of Smart City Pilot and Green ...,Green and smart cities are based on clean ener...,About Sufficiency


In [145]:
# Check if duplicated entries have the same label
gb  = duplicated_entries.groupby('title')['true_label']
gb.count().value_counts()

true_label
2    435
3     30
4     12
5      1
Name: count, dtype: int64

* 30 articles apparaissent dans 2 secteurs.
* 12 articles apparaissent dans 3 secteurs.
* 1 article apparait dans 4 secteurs.

In [146]:
duplicated_entries_same_label = gb.nunique()
duplicated_entries_different_label_index = duplicated_entries_same_label[
    duplicated_entries_same_label > 1
].index
duplicated_entries_different_label = full_dataset_df[
    full_dataset_df["title"].isin(duplicated_entries_different_label_index)
].sort_values(
    by=["title", "true_label"], ascending=[False, False]
)
duplicated_entries_different_label.to_markdown(
    "duplicated_entries_different_label.md",
    index=False,
    tablefmt="github",
    colalign=("left", "left", "left"),
    stralign="left",
)
duplicated_entries_different_label.drop(columns='abstract').to_markdown(
    "duplicated_entries_different_label_title_only.md",
    index=False,
    tablefmt="github",
    colalign=("left", "left"),
    stralign="left",
)

In [147]:
duplicated_entries_different_label.head(20)

Unnamed: 0,title,abstract,true_label
8717,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,Not About Sufficiency
9049,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,Not About Sufficiency
9702,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,Not About Sufficiency
7235,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,About Sufficiency
8120,The hidden risk in China's cropland conversion...,Cropland conversion is a significant theme of ...,Not About Sufficiency
6992,The hidden risk in China's cropland conversion...,Cropland conversion is a significant theme of ...,About Sufficiency
9153,The battle to achieve Sustainable Development ...,The current period marked by addressing enviro...,Not About Sufficiency
1700,The battle to achieve Sustainable Development ...,The current period marked by addressing enviro...,About Sufficiency
7416,The Road to Eliminating Energy Poverty: Does R...,Under the constraint of carbon neutrality targ...,Not About Sufficiency
10133,The Road to Eliminating Energy Poverty: Does R...,Under the constraint of carbon neutrality targ...,About Sufficiency


Check the markdown files !! It's interesting to see the different labels

We need to drop duplicates in the all sector full_dataset.
I don't have time to treat the 13 articles that has ambiguous labels

In [148]:
full_dataset_df_cleaned = full_dataset_df.drop_duplicates(subset=["title"], keep="first")
output_path = os.path.join("transformed_data", "full_dataset.csv")
full_dataset_df.to_csv(output_path, index=False)