In [1]:
import os
import pandas as pd
from pathlib import Path

try:
    import rispy
except ImportError:
    !pip install rispy --quiet

In [2]:
def get_abstract_and_title(path):
    """
    Get all abstracts and titles from a RIS file.
    :param path: Path to the RIS file.
    :return: Abstracts and titles as pandas Series.
    """
    title_list = []
    abstract_list = []
    with open(path, "r") as f:
        ris_data = rispy.load(f)
        for entry in ris_data:
            try:
                title_list.append(entry["primary_title"])
                abstract_list.append(entry["abstract"])
            except KeyError as e:
                print(
                    f"KeyError: entry does not contain 'T1' or 'AB'. Keys: {entry.keys()}"
                )
                raise e
    return pd.DataFrame({"title": title_list, "abstract": abstract_list})

In [3]:
sectors = os.listdir("raw_data")
full_dataset = []
for sector in sectors:
    print(f"Processing sector: {sector}")
    df_list = []

    # Get all files in the sector directory
    sector_files = os.listdir(os.path.join("raw_data", sector))
    sector_files = [Path(os.path.join("raw_data", sector, f)) for f in sector_files]
    sector_files_types = [f.suffix for f in sector_files]

    if ".csv" in sector_files_types:
        # Freight CSV Case
        for file in sector_files:
            print(f"Processing file: {file}")
            # If a CSV file is present, read it
            tmp_df = pd.read_csv(
                file, sep=";", usecols=["title", "abstract"], encoding_errors="replace"
            )
            if "Excluded" in file.name:
                tmp_df["true_label"] = "Not About Sufficiency"
            else:
                tmp_df["true_label"] = "About Sufficiency"
            df_list.append(tmp_df)

    elif ".ris" in sector_files_types or ".txt" in sector_files_types:
        # RIS or TXT Case
        for file in sector_files:
            print(f"Processing file: {file}")
            tmp_df = get_abstract_and_title(file)
            if "Excluded" in file.name:
                tmp_df["true_label"] = "Not About Sufficiency"
            elif "Included" in file.name:
                tmp_df["true_label"] = "About Sufficiency"
            else:
                raise ValueError(
                    f"File name does not contain 'Included' or 'Excluded': {file.name}"
                )
            df_list.append(tmp_df)
    else:
        raise ValueError(
            f"No valid file types found in {sector}. Expected .csv, .ris, or .txt files."
        )

    combined_df = pd.concat(df_list, ignore_index=True)
    output_path = os.path.join("transformed_data", f"{sector.lower()}_dataset.csv")
    combined_df.to_csv(output_path, index=False)

    # Track sector information in the full dataset
    combined_df["sector"] = sector
    full_dataset.append(combined_df)

    print(f"Saved transformed data to {output_path}\n\n")

# Add mobility sector
mobility_df = pd.read_csv(
    "../../data/mobility_full_concat_dataset.csv",
    usecols=["primary_title", "abstract", "true_label"],
).rename(columns={"primary_title": "title"})
mobility_df["sector"] = "mobility"
full_dataset.append(mobility_df)

# Combine all datasets into one
full_dataset_df = pd.concat(full_dataset, ignore_index=True)


Processing sector: Urban_Governance
Processing file: raw_data/Urban_Governance/UrbanGovSuff Excluded.txt
Processing file: raw_data/Urban_Governance/UrbanGovSuff Included.txt
Saved transformed data to transformed_data/urban_governance_dataset.csv


Processing sector: Freight
Processing file: raw_data/Freight/Freight Excluded.csv
Processing file: raw_data/Freight/Freight_Included.csv
Saved transformed data to transformed_data/freight_dataset.csv


Processing sector: Digitalisation
Processing file: raw_data/Digitalisation/Digitalisation Excluded.txt
Processing file: raw_data/Digitalisation/Digitalisation Included.txt
Saved transformed data to transformed_data/digitalisation_dataset.csv


Processing sector: Nutrition
Processing file: raw_data/Nutrition/Nutrition Excluded.txt
Processing file: raw_data/Nutrition/Nutrition Included.txt
Saved transformed data to transformed_data/nutrition_dataset.csv


Processing sector: Urban_Ecology
Processing file: raw_data/Urban_Ecology/Urban_Ecology_Inclu

In [4]:
combined_df

Unnamed: 0,title,abstract,true_label,sector
0,Social Equity and Environmental Risk,Social equity has become an important concern ...,Not About Sufficiency,Urban_Infra
1,Time to treat the climate and nature crisis as...,Damage to one subsystem can create feedback th...,Not About Sufficiency,Urban_Infra
2,Time to treat the climate and nature crisis as...,Damage to one subsystem can create feedback th...,Not About Sufficiency,Urban_Infra
3,Decarbonization will lead to more equitable ai...,Air quality associated public health co-benefi...,Not About Sufficiency,Urban_Infra
4,Assessment and Optimization of Ecological Netw...,"In the urbanization development trend, constru...",Not About Sufficiency,Urban_Infra
...,...,...,...,...
1477,Legal Guarantee of Smart City Pilot and Green ...,Green and smart cities are based on clean ener...,About Sufficiency,Urban_Infra
1478,Legal Guarantee of Smart City Pilot and Green ...,Green and smart cities are based on clean ener...,About Sufficiency,Urban_Infra
1479,Sustainable Urban Resource Management an Analy...,Conference Title: 2024 6th International Confe...,About Sufficiency,Urban_Infra
1480,Efficiency of green and low-carbon coordinated...,As a critical engine for national economic gro...,About Sufficiency,Urban_Infra


In [5]:
# Checking for duplicated entries
duplicated_entries = full_dataset_df[
    full_dataset_df.duplicated(subset=["title"], keep=False)
]
duplicated_entries

Unnamed: 0,title,abstract,true_label,sector
0,Author Correction: Pan-cancer analysis of whol...,"In the published version of this paper, the li...",Not About Sufficiency,Urban_Governance
168,Introduction,,Not About Sufficiency,Urban_Governance
179,SDG 7 requires post-growth energy sufficiency,Sustainable Development Goal 7 (SDG 7) aims to...,Not About Sufficiency,Urban_Governance
268,Environmental Justice as Scalar Parity: Lesson...,The development of major infrastructure projec...,Not About Sufficiency,Urban_Governance
408,Decarbonization and social justice: The case f...,Artisanal and small-scale mining (ASM) is the ...,Not About Sufficiency,Urban_Governance
...,...,...,...,...
15755,Is neighborhood satisfaction related to densit...,While sustainable land use planning ensures la...,About Sufficiency,mobility
15824,"Living Environment, Mobility, and Wellbeing am...",In view of the demographic profile of the olde...,About Sufficiency,mobility
15832,Transition engineering of transport in megacit...,Private automobiles have been wildly popular a...,About Sufficiency,mobility
15871,The relationship between regional compactness ...,Innovation has become a key driver of economic...,About Sufficiency,mobility


In [6]:
# Check if duplicated entries have the same label
gb = duplicated_entries.groupby("title")["true_label"]
gb.count().value_counts()

true_label
2    514
3     33
4     12
6      1
5      1
Name: count, dtype: int64

* 33 articles apparaissent dans 2 secteurs.
* 12 articles apparaissent dans 3 secteurs.
* 1 article apparait dans 4 secteurs.
* 1 article apparait dans 5 secteurs.

In [7]:
duplicated_entries_same_label = gb.nunique()
duplicated_entries_different_label_index = duplicated_entries_same_label[
    duplicated_entries_same_label > 1
].index
duplicated_entries_different_label = full_dataset_df[
    full_dataset_df["title"].isin(duplicated_entries_different_label_index)
].sort_values(by=["title", "true_label"], ascending=[False, False])
duplicated_entries_different_label.to_markdown(
    "duplicated_entries_different_label.md",
    index=False,
    tablefmt="github",
    colalign=("left", "left", "left"),
    stralign="left",
)
duplicated_entries_different_label.drop(columns="abstract").to_markdown(
    "duplicated_entries_different_label_title_only.md",
    index=False,
    tablefmt="github",
    colalign=("left", "left"),
    stralign="left",
)

In [8]:
duplicated_entries_different_label.head(20)

Unnamed: 0,title,abstract,true_label,sector
11982,"Walking together: Exploring perspectives, atti...",Modifiable risk factors related to lifestyle c...,Not About Sufficiency,mobility
13760,"Walking together: Exploring perspectives, atti...",Modifiable risk factors related to lifestyle c...,About Sufficiency,mobility
8717,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,Not About Sufficiency,Urban_Infra
9049,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,Not About Sufficiency,Urban_Infra
9702,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,Not About Sufficiency,Urban_Infra
7235,Utilization of ecosystem services in future vi...,In the face of uncertain future climate change...,About Sufficiency,Urban_Ecology
10963,"Using a real-world, project-based energy modul...",A project-based energy module has been taught ...,Not About Sufficiency,mobility
15578,"Using a real-world, project-based energy modul...",A project-based energy module has been taught ...,About Sufficiency,mobility
3142,Urban spatial structure and equity for urban s...,"Urban development, equity and sustainability c...",Not About Sufficiency,Freight
13398,Urban spatial structure and equity for urban s...,"Urban development, equity and sustainability c...",About Sufficiency,mobility


Check the markdown files !! It's interesting to see the different labels

We need to drop duplicates in the all sector full_dataset.
I don't have time to treat the 13 articles that has ambiguous labels

In [9]:
full_dataset_df_cleaned = full_dataset_df.drop_duplicates(
    subset=["title"], keep="first"
)
output_path = os.path.join("transformed_data", "full_dataset.csv")
full_dataset_df.to_csv(output_path, index=False)