In [26]:
import pandas as pd
import os
from pathlib import Path

try:
    import rispy
except ImportError:
    !pip install rispy --quiet

In [27]:
df = pd.read_csv(
    "../data/all_documents_general_query.csv",
    usecols=["doi", "title", "abstract", "concepts", "keywords"],
    dtype={"doi": str, "title": str, "abstract": str, "concepts": str, "keywords": str},
).drop_duplicates().reset_index(drop=True)

In [28]:
def parse_keywords(keywords):
    if pd.isna(keywords):
        return []
    keywords = keywords.split("|")
    keywords = [keyword.split(":")[0] for keyword in keywords]
    return keywords


# Parse the keywords column and change the dtype to categorical
df["keywords_cats"] = df["keywords"].apply(parse_keywords)

In [29]:
df.head()

Unnamed: 0,doi,title,abstract,concepts,keywords,keywords_cats
0,https://doi.org/10.1002/1096-8644(200007)112:3...,Age-dependent cortical bone loss in women from...,Age-dependent cortical bone loss was investiga...,Late 19th century:0.7359046|Cortical bone:0.56...,Late 19th century:0.7359046,[Late 19th century]
1,https://doi.org/10.1093/molbev/msv045,Model-Based Verification of Hypotheses on the ...,Various hypotheses for the peopling of the Jap...,Biology:0.8816396|Inference:0.6029327|Genome:0...,SNP:0.53888464,[SNP]
2,https://doi.org/10.3390/su13031436,Horticulture and Orchards as New Markets for M...,Animal manure management is a real challenge t...,Manure:0.8611909|Agriculture:0.64704436|Contex...,Valorisation:0.5449556|Manure management:0.482...,"[Valorisation, Manure management, Chicken manure]"
3,https://doi.org/10.1016/j.wace.2015.07.003,Why well yield matters for managing agricultur...,Groundwater-fed irrigation has supported growt...,Agriculture:0.695057|Climate change:0.6315455|...,,[]
4,https://doi.org/10.5210/spir.v2024i0.13937,INDUSTRY INFLUENCE ON CONTENT MODERATION REGUL...,As the EU Digital Services Act increasingly in...,Moderation:0.8861263|Civil society:0.6190976|P...,Moderation:0.8861263|Civil Society:0.6190976|C...,"[Moderation, Civil Society, Content (measure t..."


In [30]:
def get_all_keywords(path):
    """
    Get all keywords from a RIS file.
    :param path: Path to the RIS file.
    :return: Keywords as pandas Series.
    """
    keyword_list = []
    with open(path, "r") as f:
        ris_data = rispy.load(f)
        for entry in ris_data:
            try:
                keyword_list.append(entry["keywords"])
            except KeyError as e:
                print(
                    f"KeyError: entry does not contain 'keywords'. Keys: {entry.keys()}"
                )
                raise e
    return pd.DataFrame({"keywords": keyword_list})


sectors = os.listdir("ETL_all_sectors/raw_data")
full_dataset = []
for sector in sectors:
    print(f"Processing sector: {sector}")
    df_list = []

    # Get all files in the sector directory
    sector_files = os.listdir(os.path.join("ETL_all_sectors/raw_data", sector))
    sector_files = [
        Path(os.path.join("ETL_all_sectors/raw_data", sector, f)) for f in sector_files
    ]
    sector_files_types = [f.suffix for f in sector_files]

    if ".csv" in sector_files_types:
        # Freight CSV Case
        for file in sector_files:
            print(f"Processing file: {file}")
            # If a CSV file is present, read it
            tmp_df = pd.read_csv(
                file, sep=";", usecols=["keywords"], encoding_errors="replace"
            )
            df_list.append(tmp_df)

    elif ".ris" in sector_files_types or ".txt" in sector_files_types:
        # RIS or TXT Case
        for file in sector_files:
            print(f"Processing file: {file}")
            tmp_df = get_all_keywords(file)
            df_list.append(tmp_df)
    else:
        raise ValueError(
            f"No valid file types found in {sector}. Expected .csv, .ris, or .txt files."
        )

combined_df = pd.concat(df_list, ignore_index=True)

Processing sector: Nutrition
Processing file: ETL_all_sectors/raw_data/Nutrition/Nutrition Included.txt
Processing file: ETL_all_sectors/raw_data/Nutrition/Nutrition Excluded.txt
Processing sector: Urban_Infra
Processing file: ETL_all_sectors/raw_data/Urban_Infra/UrbanInfra Excluded.ris
Processing file: ETL_all_sectors/raw_data/Urban_Infra/UrbanInfra Included.ris
Processing sector: Digitalisation
Processing file: ETL_all_sectors/raw_data/Digitalisation/Digitalisation Included.txt
Processing file: ETL_all_sectors/raw_data/Digitalisation/Digitalisation Excluded.txt
Processing sector: Urban_Governance
Processing file: ETL_all_sectors/raw_data/Urban_Governance/UrbanGovSuff Included.txt
Processing file: ETL_all_sectors/raw_data/Urban_Governance/UrbanGovSuff Excluded.txt
Processing sector: Freight
Processing file: ETL_all_sectors/raw_data/Freight/Freight_Included.csv
Processing file: ETL_all_sectors/raw_data/Freight/Freight Excluded.csv
Processing sector: Urban_Ecology
Processing file: ETL_a

In [31]:
# check and count all the unique keywords using a pandas Series and value counts
unique_keywords = df["keywords_cats"].explode().tolist()
unique_keywords_set = set(unique_keywords)
keyword_prior_knowledge = combined_df["keywords"].explode().tolist()
keyword_prior_knowledge_set = set(keyword_prior_knowledge)
print(f"Number of unique keywords in the dataset: {len(unique_keywords_set)}")
print(
    f"Number of unique keywords in the prior knowledge: {len(keyword_prior_knowledge_set)}"
)

# Count the number of common keywords between the two sets
common_keywords = unique_keywords_set.intersection(keyword_prior_knowledge_set)
print(f"Number of common keywords: {len(common_keywords)}")

# Count the number of different keywords between the two sets
different_keywords = keyword_prior_knowledge_set.symmetric_difference(
    unique_keywords_set
)
print(f"Number of different keywords: {len(different_keywords)}")

Number of unique keywords in the dataset: 413803
Number of unique keywords in the prior knowledge: 16286
Number of common keywords: 373
Number of different keywords: 429343


In [32]:
common_keywords

{'Acculturation',
 'Active transport',
 'Adaptability',
 'Adaptive capacity',
 'Aerobic exercise',
 'Affect',
 'Affordable housing',
 'Agricultural policy',
 'Air filtration',
 'Air movement',
 'Altruism',
 'Analytic network process',
 'Annuity',
 'Answer Set Programming',
 'Appropriation',
 'Asbestos cement',
 'Asbestosis',
 'Asset turnover',
 'Attitude',
 'Austerity',
 'Backcasting',
 'Bank credit',
 'Bayesian Optimization',
 'Behaviour change',
 'Benchmarking',
 'Binary logit model',
 'Biomass fuels',
 'Black Sea',
 'Blue carbon',
 'Bounded rationality',
 'British Empire',
 'Building Information Modeling',
 'Building energy simulation',
 'Building envelope',
 'Building management',
 'Building science',
 'Bustard',
 'Cadastre',
 'Capacity Building',
 'Car ownership',
 'Car parking',
 'Carbon neutrality',
 'Caregiving',
 'Carrion',
 'Cash crop',
 'Chi-square test',
 'Chinese city',
 'Choice modelling',
 'Circular Economy',
 'Circular economy',
 'City centre',
 'CityGML',
 'Clean coal'