In [1]:
import pandas as pd
import os


# Keywords

In [4]:
keywords = [
    # 'network',
    'network of belief',
    'networks of belief',
    'belief network',
    'beliefs network',
    'map of belief',
    'maps of belief',
    'relationship between belief',
    'cognitive map',
    'map of cognitive behavior',
    'map of opinion',
    'network of opinion',
    'networks of opinion',
    'relationship between opinion',
    'system of belief',
    'systems of belief',
    'belief system',
    'beliefs system',
    'set of opinion',
    'opinion set',
    'set of value',
    'values set',
    'system of opinion',
    'opinion system', 
    'opimions system',
    'system of value',
    'values system',
    'value system',
    'attitude network',
    'attitude map',
    'relationship between attitude',
    'attitude system',
    # 'attitude',
    'issue system'
    'factor system',
    'system of factor',
    'variable system',
    'system of variable',
    'crucial factor',
    # additional ones after 2023.11.20
    'correlational class analysis',
    'relational class analysis',
    'ideology system',
    'map of idea',
    'system of idea',
    # use things like * in scopus: "behav* network" would match behavior network and behavioural network both; from https://library.bath.ac.uk/scopus/keyword-searching
]


systematic_keywords = [
    'Belief* Network',
    'Belief* System',
    'Belief* Map',
    'Belief* Set',
    'Issue* Network',
    'Issue* System',
    'Issue* Map',
    'Issue* Set',
    'Opinion* Network',
    'Opinion* System', # keep an eye on this one
    'Opinion* Map',
    'Opinion* Set',
    # 'Value Network', catches valued networks too, physics heavy application
    # 'Values Network',
    'Value System',
    # 'Value* Map', business heavy application
    'Value Set',
    'Attitude* Network',
    'Attitude* System',
    'Attitude* Map',
    'Attitude* Set',
    # 'Cogni* Network', catches engineering and medical applications
    'Cogni* System',
    'Cogni* Map', # cognitive catches a lot of things, FUZZY exclude needed
    'Cogni* Set', 
    'Network of Belief',
    'Network of Issue',
    'Network of Opinion',
    # 'Network of Values', catching SBM stuff
    'Network of Attitudes',
    'Network of Cogni*', # FINISHED HERE 2023.11.20
    'System* of Beliefs',
    'System* of Issue',
    'System *of Opinion',
    'System* of Value',
    'System* of Attitude',
    # 'System* of Cogni*', catches neuroscience and dual process stuff
    'Map* of Belief',
    'Map* of Issue',
    'Map* of Opinion',
    # 'Map of Values',
    'Map* of Attitude',
    # 'Map of Cognitives', neuroscience heavy application
    # 'Set of Beliefs',
    # 'Set of Issues',
    # 'Set of Opinions',
    # 'Set of Values',
    # 'Set of Attitudes',
    # 'Set of Cognitives', lot of noise in all the "set" keywords
    'Relationship Between Belief',
    # 'Relationship Between of Issue' , ?
    'Relationship Between Opinion',
    'Relationship Between Value',
    'Relationship Between Attitude',
    # 'Relationship Between of Cognitives',
    "Mental Map",
]


joker_keywords = [
    
]


def generate_query(keywords):
    # Joining keywords with ' OR ' and enclosing each in quotation marks
    joined_keywords = ' OR '.join(f'"{keyword}"' for keyword in keywords)
    # Constructing the final query
    query = f'TITLE-ABS-KEY ({joined_keywords})'
    return query

query = generate_query(systematic_keywords)
print(query)

TITLE-ABS-KEY ("Belief* Network" OR "Belief* System" OR "Belief* Map" OR "Belief* Set" OR "Issue* Network" OR "Issue* System" OR "Issue* Map" OR "Issue* Set" OR "Opinion* Network" OR "Opinion* System" OR "Opinion* Map" OR "Opinion* Set" OR "Value System" OR "Value Set" OR "Attitude* Network" OR "Attitude* System" OR "Attitude* Map" OR "Attitude* Set" OR "Cogni* System" OR "Cogni* Map" OR "Cogni* Set" OR "Network of Belief" OR "Network of Issue" OR "Network of Opinion" OR "Network of Attitudes" OR "Network of Cogni*" OR "System* of Beliefs" OR "System* of Issue" OR "System *of Opinion" OR "System* of Value" OR "System* of Attitude" OR "Map* of Belief" OR "Map* of Issue" OR "Map* of Opinion" OR "Map* of Attitude" OR "Relationship Between Belief" OR "Relationship Between Opinion" OR "Relationship Between Value" OR "Relationship Between Attitude" OR "Mental Map")


In [28]:
# Mapping of file numbers to categories
categories = {
    # 1: "Computer Science Pre 2017",
    1: "Computer Science",
    # 2: "Computer Science Post 2017",
    2: "Computer Science",
    # 3: "Engineering Pre 2017",
    3: "Engineering",
    # 4: "Engineering Post 2017",
    4: "Engineering",
    5: "Social Sciences",
    6: "Mathematics",
    7: "Psychology",
    8: "Arts and Humanities",
    9: "Medicine",
    10: "Neuroscience",
    11: "Business Management and Accounting",
    12: "Environmental Science",
    13: "Decision Sciences",
    14: "Physics and Astronomy",
    15: "Agricultural and Biological Sciences",
    16: "Materials Science",
    17: "Economics Econometrics Finance",
    18: "Biochemistry Genetics and Molecular Biology",
    19: "Earth and Planetary Sciences",
    20: "Energy",
    21: "Health Professions",
    22: "Nursing",
    23: "Multidisciplinary",
    24: "Chemical Engineering",
    25: "Chemistry",
    26: "Pharmacology Toxicology and Pharmaceutics",
    27: "Immunology and Microbiology",
    28: "Veterinary",
    29: "Dentistry",
    30: "Undefined",
}

# Directory containing the files
directory = "data/by category/raw/"  # Replace with the path to your files

# List to hold all dataframes
dataframes = []

# Iterate over the files in the directory
for filename in os.listdir(directory):
    if filename.startswith("scopus") and filename.endswith(".csv"):
        # Extract the number from the filename
        file_number = int(filename.split('(')[1].split(')')[0])-1
        
        # Read the CSV file into a dataframe
        df = pd.read_csv(os.path.join(directory, filename))
        
        # Add the category column
        df['Category'] = categories.get(file_number, "Unknown")
        
        # Append the dataframe to the list
        dataframes.append(df)

# Now dataframes is a list of dataframes with the category column added
# concat all dataframes into one
papers = pd.concat(dataframes)

# number of unique papers 
print("number of unique papers")
print("by title: ", len(papers['Title'].unique()))
print("by DOI: ", len(papers['DOI'].unique()))
print("by EID: ", len(papers['EID'].unique()))



number of unique papers
by title:  40729
by DOI:  36169
by EID:  40938


In [None]:
# Group by a unique identifier (e.g., 'EID') and aggregate categories
unified_papers = papers.groupby('EID').agg({
    'Authors': 'first', 
    'Author full names': 'first',
    'Author(s) ID': 'first',
    'Title': 'first',
    'Year': 'first',
    'Source title': 'first',
    'Volume': 'first',
    'Issue': 'first',
    'Art. No.': 'first',
    'Page start': 'first',
    'Page end': 'first',
    'Page count': 'first',
    'Cited by': 'first',
    'DOI': 'first',
    'Link': 'first',
    'Abstract': 'first',
    'Author Keywords': 'first',
    'Index Keywords': 'first',
    'References': 'first',
    'Document Type': 'first',
    'Publication Stage': 'first',
    'Open Access': 'first',
    'Source': 'first',
    # Category column: aggregate into a string of unique categories
    'Category': lambda x: list(set(x))
}).reset_index()

unified_papers.to_pickle('data/by category/unified.pkl')


unified_papers