In [1]:
import os
import pandas as pd
from pathlib import Path

try:
    import rispy
except ImportError:
    !pip install rispy --quiet

In [2]:
def get_abstract_and_title(path):
    """
    Get all abstracts and titles from a RIS file.
    :param path: Path to the RIS file.
    :return: Abstracts and titles as pandas Series.
    """
    title_list = []
    abstract_list = []
    with open(path, "r") as f:
        ris_data = rispy.load(f)
        for entry in ris_data:
            try:
                title_list.append(entry["primary_title"])
                abstract_list.append(entry["abstract"])
            except KeyError as e:
                print(
                    f"KeyError: entry does not contain 'T1' or 'AB'. Keys: {entry.keys()}"
                )
                raise e
    return pd.DataFrame({"title": title_list, "abstract": abstract_list})

In [3]:
sectors = os.listdir("raw_data")
full_dataset = []
for sector in sectors:
    print(f"Processing sector: {sector}")
    df_list = []

    # Get all files in the sector directory
    sector_files = os.listdir(os.path.join("raw_data", sector))
    sector_files = [Path(os.path.join("raw_data", sector, f)) for f in sector_files]
    sector_files_types = [f.suffix for f in sector_files]

    if ".csv" in sector_files_types:
        # Freight CSV Case
        for file in sector_files:
            print(f"Processing file: {file}")
            # If a CSV file is present, read it
            tmp_df = pd.read_csv(
                file, sep=";", usecols=["title", "abstract"], encoding_errors="replace"
            )
            if "Excluded" in file.name:
                tmp_df["true_label"] = "Not About Sufficiency"
            else:
                tmp_df["true_label"] = "About Sufficiency"
            df_list.append(tmp_df)

    elif ".ris" in sector_files_types or ".txt" in sector_files_types:
        # RIS or TXT Case
        for file in sector_files:
            print(f"Processing file: {file}")
            tmp_df = get_abstract_and_title(file)
            if "Excluded" in file.name:
                tmp_df["true_label"] = "Not About Sufficiency"
            elif "Included" in file.name:
                tmp_df["true_label"] = "About Sufficiency"
            else:
                raise ValueError(
                    f"File name does not contain 'Included' or 'Excluded': {file.name}"
                )
            df_list.append(tmp_df)
    else:
        raise ValueError(
            f"No valid file types found in {sector}. Expected .csv, .ris, or .txt files."
        )

    combined_df = pd.concat(df_list, ignore_index=True)
    output_path = os.path.join("transformed_data", f"{sector.lower()}_dataset.csv")
    combined_df.to_csv(output_path, index=False)

    # Track sector information in the full dataset
    combined_df["sector"] = sector
    full_dataset.append(combined_df)

    print(f"Saved transformed data to {output_path}\n\n")

# Add mobility sector
mobility_df = pd.read_csv(
    "../../data/mobility_full_concat_dataset.csv",
    usecols=["primary_title", "abstract", "true_label"],
).rename(columns={"primary_title": "title"})
mobility_df["sector"] = "mobility"
full_dataset.append(mobility_df)

# Combine all datasets into one
full_dataset_df = pd.concat(full_dataset, ignore_index=True)


Processing sector: Urban_Governance
Processing file: raw_data/Urban_Governance/UrbanGovSuff Excluded.txt
Processing file: raw_data/Urban_Governance/UrbanGovSuff Included.txt
Saved transformed data to transformed_data/urban_governance_dataset.csv


Processing sector: Freight
Processing file: raw_data/Freight/Freight Excluded.csv
Processing file: raw_data/Freight/Freight_Included.csv
Saved transformed data to transformed_data/freight_dataset.csv


Processing sector: Digitalisation
Processing file: raw_data/Digitalisation/Digitalisation Excluded.txt
Processing file: raw_data/Digitalisation/Digitalisation Included.txt
Saved transformed data to transformed_data/digitalisation_dataset.csv


Processing sector: Buildings
Processing file: raw_data/Buildings/Buildings Included.txt
Processing file: raw_data/Buildings/Buildings Excluded.txt
Saved transformed data to transformed_data/buildings_dataset.csv


Processing sector: Nutrition
Processing file: raw_data/Nutrition/Nutrition Excluded.txt
Proc

In [4]:
# Checking for duplicated entries
duplicated_entries = full_dataset_df[
    full_dataset_df.duplicated(subset=["title"], keep=False)
]
duplicated_entries

Unnamed: 0,title,abstract,true_label,sector
0,Author Correction: Pan-cancer analysis of whol...,"In the published version of this paper, the li...",Not About Sufficiency,Urban_Governance
168,Introduction,,Not About Sufficiency,Urban_Governance
179,SDG 7 requires post-growth energy sufficiency,Sustainable Development Goal 7 (SDG 7) aims to...,Not About Sufficiency,Urban_Governance
268,Environmental Justice as Scalar Parity: Lesson...,The development of major infrastructure projec...,Not About Sufficiency,Urban_Governance
408,Decarbonization and social justice: The case f...,Artisanal and small-scale mining (ASM) is the ...,Not About Sufficiency,Urban_Governance
...,...,...,...,...
19319,"Residential location, urban form, and househol...",,About Sufficiency,mobility
19328,Relationship between Rural Built Environment a...,With the rapid rural urbanization and new rura...,About Sufficiency,mobility
19329,Segmentation of the current levels of passenge...,There is a clear need for debate about targete...,About Sufficiency,mobility
19354,Developing and Assessing Alternative Land-Use ...,"In this study, environmental sustainability im...",About Sufficiency,mobility


In [5]:
# Check if duplicated entries have the same label
gb = duplicated_entries.groupby("title")["true_label"]
gb.count().value_counts()

true_label
2    611
3     32
4     13
6      1
5      1
Name: count, dtype: int64

* 32 articles apparaissent dans 2 secteurs.
* 13 articles apparaissent dans 3 secteurs.
* 1 article apparait dans 4 secteurs.
* 1 article apparait dans 5 secteurs.

In [6]:
duplicated_entries_same_label = gb.nunique()
duplicated_entries_different_label_index = duplicated_entries_same_label[
    duplicated_entries_same_label > 1
].index
duplicated_entries_different_label = full_dataset_df[
    full_dataset_df["title"].isin(duplicated_entries_different_label_index)
].sort_values(by=["title", "true_label"], ascending=[False, False])
duplicated_entries_different_label.to_markdown(
    "duplicated_entries_different_label.md",
    index=False,
    tablefmt="github",
    colalign=("left", "left", "left"),
    stralign="left",
)
duplicated_entries_different_label.drop(columns="abstract").to_markdown(
    "duplicated_entries_different_label_title_only.md",
    index=False,
    tablefmt="github",
    colalign=("left", "left"),
    stralign="left",
)

In [7]:
duplicated_entries_different_label.head(20)

Unnamed: 0,title,abstract,true_label,sector
5690,Who Are Bicyclists? Why and How Much Are They ...,The factors influencing the decision to bicycl...,Not About Sufficiency,Buildings
19369,Who Are Bicyclists? Why and How Much Are They ...,The factors influencing the decision to bicycl...,About Sufficiency,mobility
15408,"Walking together: Exploring perspectives, atti...",Modifiable risk factors related to lifestyle c...,Not About Sufficiency,mobility
17186,"Walking together: Exploring perspectives, atti...",Modifiable risk factors related to lifestyle c...,About Sufficiency,mobility
12143,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,Not About Sufficiency,Urban_Infra
12475,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,Not About Sufficiency,Urban_Infra
13128,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,Not About Sufficiency,Urban_Infra
10661,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,About Sufficiency,Urban_Ecology
14389,"Using a real-world, project-based energy modul...",A project-based energy module has been taught ...,Not About Sufficiency,mobility
19004,"Using a real-world, project-based energy modul...",A project-based energy module has been taught ...,About Sufficiency,mobility


Check the markdown files !! It's interesting to see the different labels

We need to drop duplicates in the all sector full_dataset.
I don't have time to treat the 13 articles that has ambiguous labels

In [8]:
full_dataset_df_cleaned = full_dataset_df.drop_duplicates(
    subset=["title"], keep="first"
)
output_path = os.path.join("transformed_data", "full_dataset.csv")
full_dataset_df.to_csv(output_path, index=False)

In [9]:
full_dataset_df_cleaned['true_label'].value_counts()

true_label
Not About Sufficiency    12109
About Sufficiency         6545
Name: count, dtype: int64