In [6]:
# Imports
import os
import pandas as pd
import numpy as np
import bibtexparser

# Constants
# Output schema: process_file() returns exactly these columns, in this order.
STANDARD_COLUMNS = [
    'Database',
    'DOI',
    'Title',
    'Author',
    'Year',
    'Abstract',
    'REVIEW_STAGE',
]

# Placeholder written into cells (and whole columns) that have no value.
NOT_FOUND_VALUE = 'NOT_FOUND'

# Value every record starts with in the 'REVIEW_STAGE' column.
DEFAULT_REVIEW_STAGE = 'unchecked'

# Standard column name -> source column names that may carry that field.
# map_columns() compares names case-insensitively and uses the first alias
# (in the order listed here) that is present in the data.
COLUMN_MAPPINGS = {
    'DOI': [
        'doi',
        'DOI',
        'digitalObjectIdentifier',
    ],
    'Title': [
        'title',
        'Article Title',
        'Document Title',
        'Article title',
        'Title',
    ],
    'Author': [
        'author',
        'Authors',
        'Author',
    ],
    'Year': [
        'year',
        'Publication Year',
        'Publication Date',
    ],
    'Abstract': [
        'Abstract',
        'abstract',
    ],
}

# File names to process
# Export files to merge; every path is relative to the notebook's working
# directory and lives in the 'results_review_incl_RAG' folder.
# NOTE(review): 'ebsohost_1-50.csv' and 'ebsohost_1-94.csv' look like
# overlapping record ranges - confirm this is intended, since nothing in the
# visible cells removes duplicate records.
file_list = [
    'results_review_incl_RAG/' + export_name
    for export_name in (
        'ieeexplore_85.bib',
        'ProQuestDocuments_1-100.xls',
        'ProQuestDocuments_101-150.xls',
        'ScienceDirect_citations_1-100.bib',
        'ScienceDirect_citations_101-184.bib',
        'ebsohost_1-50.csv',
        'ebsohost_1-94.csv',
        'TaylorFrancis_36.bib',
        'acm_126.bib',
        'webofscience_150.xls',
    )
]

# File-name prefix -> human-readable name of the database the export came from.
# Prefixes are compared case-sensitively against the start of the file name.
DATABASE_MAPPING = {
    'ProQuestDocuments': 'ProQuest',
    'ScienceDirect_citations': 'ScienceDirect',
    'TaylorFrancis': 'Taylor & Francis',
    'acm': 'ACM Digital Library',
    'ebsohost': 'EBSCOhost',
    'ieeexplore': 'IEEE Xplore',
    # Original note on this entry: "Adjusted for potential typo".
    'webofscience': 'Web of Science',
}


def get_database_name(filename):
    """Return the database name whose prefix starts ``filename``.

    Args:
        filename: Bare file name (no directory part), e.g. ``'acm_126.bib'``.

    Returns:
        The ``DATABASE_MAPPING`` value of the first prefix (in dictionary
        order) that ``filename`` starts with, or ``'Unknown'`` when no prefix
        matches.
    """
    matches = (
        database
        for prefix, database in DATABASE_MAPPING.items()
        if filename.startswith(prefix)
    )
    return next(matches, 'Unknown')

def process_file(filepath):
    """Load one export file and reduce it to the standard column schema.

    The reader is picked from the (lower-cased) file extension:
    ``.xls``/``.xlsx`` use ``pd.read_excel``, ``.csv`` uses ``pd.read_csv``
    with UTF-8 encoding, and ``.bib`` uses ``process_bibtex_file``.

    Args:
        filepath: Path to the export file. Only its base name is used to
            look up the source database.

    Returns:
        A DataFrame containing exactly ``STANDARD_COLUMNS``, in which missing
        values and empty strings are replaced by ``NOT_FOUND_VALUE``; or
        ``None`` (after printing a message) when the extension is not
        supported.
    """
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filename)[1].lower()
    database = get_database_name(filename)

    if extension == '.bib':
        records = process_bibtex_file(filepath)
    elif extension == '.csv':
        records = pd.read_csv(filepath, encoding='utf-8')
    elif extension in ('.xls', '.xlsx'):
        records = pd.read_excel(filepath)
    else:
        print(f"Unsupported file type: {filepath}")
        return None

    # Bring source-specific headers onto the standard names.
    records = map_columns(records)

    # Provenance and review bookkeeping are the same for every row of a file.
    records['Database'] = database
    records['REVIEW_STAGE'] = DEFAULT_REVIEW_STAGE

    # Any standard column the source did not provide becomes a placeholder column.
    absent_columns = [col for col in STANDARD_COLUMNS if col not in records.columns]
    for col in absent_columns:
        records[col] = NOT_FOUND_VALUE

    # Narrow to the standard schema, then turn NA / NaN / '' into the placeholder.
    missing_markers = {
        pd.NA: NOT_FOUND_VALUE,
        np.nan: NOT_FOUND_VALUE,
        '': NOT_FOUND_VALUE,
    }
    return records[STANDARD_COLUMNS].replace(missing_markers)

def process_bibtex_file(filepath):
    """Parse a BibTeX file into a DataFrame.

    Args:
        filepath: Path to a ``.bib`` file, read as UTF-8 text.

    Returns:
        A DataFrame built from the ``entries`` list that
        ``bibtexparser.load`` produces for the file.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        parsed = bibtexparser.load(handle)
    return pd.DataFrame(parsed.entries)

def map_columns(df, mappings=None):
    """Rename source-specific columns of ``df`` to the standard column names.

    For each standard column, the aliases are tried in order and compared
    case-insensitively against the column labels of ``df``; the first alias
    found decides which column is renamed.

    Args:
        df: DataFrame whose columns should be standardised. It is not
            modified.
        mappings: Optional dict of standard column name -> list of alias
            strings. Defaults to the module-level ``COLUMN_MAPPINGS``.

    Returns:
        A new DataFrame with the matched columns renamed. Columns that match
        no alias are kept unchanged.
    """
    if mappings is None:
        mappings = COLUMN_MAPPINGS

    # str() keeps non-string labels (e.g. integer headers from an Excel sheet)
    # from raising AttributeError on .lower().
    lower_columns = {str(col).lower(): col for col in df.columns}

    column_mapping = {}
    for standard_col, possible_names in mappings.items():
        # If the standard name is already a column, renaming a case variant
        # or alias onto it would create duplicate column labels.
        if standard_col in df.columns:
            continue
        for possible_name in possible_names:
            possible_name_lower = possible_name.lower()
            if possible_name_lower in lower_columns:
                column_mapping[lower_columns[possible_name_lower]] = standard_col
                break

    return df.rename(columns=column_mapping)


# The paths in file_list are relative, so show the directory they resolve against.
print(f"Current working directory: {os.getcwd()}")


Current working directory: C:\Users\FelixNeubauer\UniRepos\literature_review_thesis\scripts_and_outputs_incl_rag


In [7]:
# Standardise every export in file_list and write the merged table to Excel.
# NOTE(review): frames are concatenated as-is; this cell does not remove
# records that appear in more than one export.
all_dfs = []

for file in file_list:
    print(f"Processing {file}")
    df = process_file(file)
    if df is None:
        # Unsupported file type; process_file has already printed a message.
        continue
    all_dfs.append(df)

if not all_dfs:
    print("No dataframes to combine.")
else:
    # ignore_index gives the merged table one continuous row index.
    combined_df = pd.concat(all_dfs, ignore_index=True)
    combined_df.to_excel('combined_results.xlsx', index=False)
    print("Combined dataframe saved to 'combined_results.xlsx'")

Processing results_review_incl_RAG/ieeexplore_85.bib
Processing results_review_incl_RAG/ProQuestDocuments_1-100.xls
Processing results_review_incl_RAG/ProQuestDocuments_101-150.xls
Processing results_review_incl_RAG/ScienceDirect_citations_1-100.bib
Processing results_review_incl_RAG/ScienceDirect_citations_101-184.bib
Processing results_review_incl_RAG/ebsohost_1-50.csv
Processing results_review_incl_RAG/ebsohost_1-94.csv
Processing results_review_incl_RAG/TaylorFrancis_36.bib
Processing results_review_incl_RAG/acm_126.bib
Processing results_review_incl_RAG/webofscience_150.xls
Combined dataframe saved to 'combined_results.xlsx'
