In [75]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta, datetime

In [76]:
random.seed(42)

# Values for data synthesis

In [113]:
cities = [
    "New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
    "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
    "Austin", "Jacksonville", "Fort Worth", "Columbus", "Charlotte",
    "San Francisco", "Indianapolis", "Seattle", "Denver", "Washington",
    "Boston", "El Paso", "Nashville", "Detroit", "Oklahoma City",
    "Portland", "Las Vegas", "Memphis", "Louisville", "Baltimore",
    "Milwaukee", "Albuquerque", "Tucson", "Fresno", "Mesa",
    "Sacramento", "Atlanta", "Kansas City", "Colorado Springs", "Miami",
    "Raleigh", "Omaha", "Long Beach", "Virginia Beach", "Oakland",
    "Minneapolis", "Tulsa", "Arlington", "Tampa", "New Orleans",
    "London", "Paris", "Tokyo", "Beijing", "Moscow",
    "Sydney", "Dubai", "Mexico City", "Sao Paulo", "Mumbai",
    "Cairo", "Istanbul", "Lagos", "Buenos Aires", "Seoul",
    "Bangkok", "Kolkata", "Tehran", "Berlin", "Bogota",
    "Lima", "Jakarta", "Riyadh", "Madrid", "Rome"
]

In [114]:
keywords = [
    "artificial intelligence", "climate change", "sustainable development",
    "quantum computing", "genetic engineering", "machine learning", "cybersecurity",
    "public health", "renewable energy", "data analytics", "blockchain", "nanotechnology",
    "neural networks", "ecosystem services", "cancer research", "autonomous vehicles",
    "internet of things", "big data", "vaccine development", "bioinformatics",
    "virtual reality", "augmented reality", "deep learning", "biodiversity",
    "mental health", "smart cities", "robotics", "3D printing", "cloud computing",
    "CRISPR", "agritech", "clean technology", "material science", "photovoltaics",
    "drug discovery", "astrophysics", "oceanography", "glaciology", "sociology",
    "economics", "political science", "urban planning", "microbiology", "quantum mechanics",
    "biochemistry", "particle physics", "organic chemistry", "computational biology",
    "environmental justice", "sustainable agriculture", "water scarcity", "air pollution",
    "soil degradation", "renewable resources", "energy storage", "machine ethics",
    "digital humanities", "biomedical engineering", "forensic science", "epidemiology",
    "neuroscience", "cognitive science", "psychology", "anthropology", "linguistics",
    "education technology", "space exploration", "conservation biology", "green chemistry", 
    "industrial automation", "wearable technology", "nutrition science",
    # mandatory keywords
    "data management", 
    "indexing", 
    "data modeling", 
    "big data", 
    "data processing", 
    "data storage",
    "data querying"
]

first_names = [
    "Alex", "Jamie", "Casey", "Morgan", "Taylor", "Jordan", "Riley", "Cameron", "Skyler", "Quinn", 
    "Pat", "Drew", "Sam", "Chris", "Robin", "Lee", "Dana", "Kelly", "Alexis", "Leslie"
]

last_names = [
    "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", 
    "Martinez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin", "Lee", "Perez", "Thompson"
]

# Functions & Load data #

In [115]:
def read_custom_file(file_path, low_memory=False):
    df = pd.read_csv(file_path, sep=';', header=None, low_memory=low_memory)
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])
    return df

In [116]:
path = '/Users/danilakokin/Desktop/UPC/Semester2/SDM/SDM_Project/joined'

proceeding_df = pd.read_csv(f'{path}/proceedings.csv', low_memory=False).sample(200)
article_df = pd.read_csv(f'{path}/article.csv', low_memory=False).sample(200)
book_df = pd.read_csv(f'{path}/book.csv', low_memory=False).sample(200)

journal_df = read_custom_file(f'{path}/journal.csv')
authors_df = read_custom_file(f'{path}/author.csv')

In [117]:
def generate_city():
    return np.random.choice(cities)
    
def generate_birth_date_vectorized(start='1945-01-01', end='1989-01-01', size=1):
    # Convert start and end dates to timestamps
    start_u = datetime.strptime(start, '%Y-%m-%d').timestamp()
    end_u = datetime.strptime(end, '%Y-%m-%d').timestamp()
    
    # Generate random dates
    random_dates_u = np.random.randint(start_u, end_u, size)
    random_dates = [datetime.utcfromtimestamp(date).strftime('%Y-%m-%d') for date in random_dates_u]
    
    return random_dates if size > 1 else random_dates[0]

def generate_proceeding_type():
    return np.random.choice(['WORKSHOP', 'CONFERENCE'], p=[0.4, 0.6])

def select_random_keywords(keywords=keywords, min_keywords=4, max_keywords=8):
    num_to_select = random.randint(min_keywords, max_keywords)
    selected_keywords = random.sample(keywords, num_to_select)
    return selected_keywords

In [118]:
# Helpers
def filter_rows_by_values(df, column_name, values_set):
    return df[df[column_name].isin(values_set)]

def make_mapping(df, key_column, value_column):
    return pd.Series(df[key_column], index=df[value_column]).to_dict()

def create_df_from_values_with_keys(values, value_col_name, key_col_name, key_pattern='ID-{:06d}'):
    df = pd.DataFrame({
        key_col_name: [key_pattern.format(i+1) for i in range(len(values))],
        value_col_name: values
    })
    return df

def collect_values_to_set(df, column_name):
    return set(df[column_name].explode().dropna().unique())


In [119]:
def add_citations_column(df):
    sorted_df = df.sort_values(by='year').copy()
    citations_dict = {index: [] for index in sorted_df.index}
    
    # Pre-compute the valid citation ids for each year to avoid doing it in the loop
    valid_citations_per_year = {year: sorted_df[sorted_df['year'] < year]['paper_id'].tolist() for year in sorted_df['year'].unique()}
    
    for index, row in sorted_df.iterrows():
        valid_citations = valid_citations_per_year[row['year']]
        
        if valid_citations:
            num_ids_to_select = min(np.random.randint(3, 7), len(valid_citations))
            selected_ids = np.random.choice(valid_citations, size=num_ids_to_select, replace=False)
            citations_dict[index] = selected_ids.tolist()
    
    sorted_df['citation_id'] = pd.Series(citations_dict).apply(lambda ids: ids if isinstance(ids, list) else pd.NA)
    return sorted_df[('citation_id')]


def choose_reviewers(authors, reviewers_set):
    authors_set = set(authors)
    all_reviewers = list(reviewers_set - authors_set)
    num_reviewers = np.random.choice([1, 2, 3, 4, 5], p=[0.05, 0.15, 0.6, 0.15, 0.05])
    
    if len(all_reviewers) >= num_reviewers:
        return np.random.choice(all_reviewers, size=num_reviewers, replace=False).tolist()
    
    chosen_reviewers = all_reviewers
    remaining_needed = num_reviewers - len(chosen_reviewers)
    
    if remaining_needed > 0:
        fill_ins = np.random.choice(list(reviewers_set - authors_set), size=remaining_needed, replace=False).tolist()
        chosen_reviewers.extend(fill_ins)
    
    return chosen_reviewers

def add_columns_from_authors(df, authors_col_name, reviewers_set):
    df['core_author'] = df[authors_col_name].str[0]
    df['co_author'] = df[authors_col_name].apply(lambda authors: authors[1:] if len(authors) > 1 else pd.NA)
    
    # Assuming 'authors' are lists, we can avoid an apply by directly using a list comprehension
    df['reviewer'] = [choose_reviewers(authors, reviewers_set) for authors in df[authors_col_name]]
    
    return df


In [120]:
def preprocess_authors(df):
    birth_dates = generate_birth_date_vectorized(size=len(df))
    
    df = df.dropna(subset=[':ID', 'author:string']).drop_duplicates(subset=[':ID'], keep='first')
    
    df['id'] = df[':ID'].astype(str)
    df['full_name'] = df['author:string']
    df['birth_date'] = birth_dates
    
    return df[['id', 'full_name', 'birth_date']]

def preprocess_journals(df):
    df = df.loc[:, [':ID', 'journal:string']].drop_duplicates(subset=[':ID'], keep='first').dropna()
    df.rename(columns={':ID': 'id', 'journal:string': 'name'}, inplace=True)
    
    return df

def preprocess_books(df):
    filtered_df = df[(df['author:string[]'].str.len() > 2) & (df['title:string'].str.len() > 2)]
    
    return (
        filtered_df.assign(
            abstract='Abstract',
            paper_id='PPR-' + filtered_df['book:ID'].astype(str),
            authors=filtered_df['author:string[]'].apply(lambda x: x.split('|') if pd.notnull(x) else pd.NA),
            title=filtered_df['title:string'],
            year=filtered_df['year:int']
        )
        .loc[:, ['paper_id', 'authors', 'title', 'abstract', 'year']]
        .dropna(subset=['authors', 'year'])
    )

def preprocess_proceedings(df):
    # Fill missing values
    df['editor:string[]'] = df['editor:string[]'].fillna('')
    df['publisher:string[]'] = df['publisher:string[]'].fillna('')
    
    # Filter rows based on content length criteria
    filtered_df = df[(df['editor:string[]'].str.len() > 2) & (df['publisher:string[]'].str.len() > 2)].copy()
    
    # Assuming 'create_df_from_values_with_keys' returns a DataFrame with unique values and an ID column
    unique_proceeding_df = create_df_from_values_with_keys(filtered_df['publisher:string[]'].unique(), 'proceeding_name', 'proceeding_id', key_pattern='PR-{:06d}')
    proceeding_name_to_id_map = dict(zip(unique_proceeding_df['proceeding_name'], unique_proceeding_df['proceeding_id']))
    
    # Map 'publisher:string[]' to 'proceeding_id'
    filtered_df.loc[:, 'proceeding_id'] = filtered_df['publisher:string[]'].map(proceeding_name_to_id_map)
    
    # Extract the first editor as chairman
    filtered_df.loc[:, 'chairman'] = filtered_df['editor:string[]'].apply(lambda x: x.split('|')[0] if x else pd.NA)
    
    unique_chairman_df = create_df_from_values_with_keys(filtered_df['chairman'].unique(), 'person_name', 'person_id', key_pattern='CH-{:06d}')
    chairman_name_to_id_map = dict(zip(unique_chairman_df['person_name'], unique_chairman_df['person_id']))
    
    # Map 'chairman' to 'person_id'
    filtered_df.loc[:, 'chairman'] = filtered_df['chairman'].map(chairman_name_to_id_map)
    
    # Additional transformations
    filtered_df.loc[:, 'edition_id'] = filtered_df['proceeding_id'].astype(str) + '-' + filtered_df['volume:string'].astype(str)
    
    filtered_df = filtered_df.assign(
        name=filtered_df['publisher:string[]'],
        edition_value=filtered_df['volume:string'],
        edition_date=filtered_df['mdate:date'],
        edition_city=[generate_city() for _ in range(len(filtered_df))]
    ).dropna(subset=['edition_value']).drop_duplicates(subset=['edition_id'])

    # Select and rename columns as necessary
    final_df = filtered_df.loc[:, ['proceeding_id', 'chairman', 'name', 'edition_date', 'edition_id', 'edition_city']]
    
    return final_df, unique_proceeding_df, unique_chairman_df

def preprocess_articles(df):
    filtered_df = df[(df['author:string[]'].str.len() > 2) & (df['title:string'].str.len() > 2)]
    
    return (
        filtered_df.assign(
            paper_id=filtered_df['article:ID'].astype(str),
            authors=filtered_df['author:string[]'].apply(lambda x: x.split('|') if pd.notnull(x) else pd.NA),
            journal=filtered_df['journal:string'],
            date=filtered_df['mdate:date'],
            title=filtered_df['title:string'],
            volume=filtered_df['volume:string'],
            year=filtered_df['year:int'].apply(lambda x: int(x) if pd.notnull(x) else pd.NA)
        )
        .dropna(subset=['authors', 'journal', 'volume'])
        .loc[:, ['paper_id', 'authors', 'journal', 'date', 'title', 'volume', 'year']]
    )


# Cleaning and filling #

In [121]:
authors_prep_df = preprocess_authors(authors_df)
journals_prep_df = preprocess_journals(journal_df)

  random_dates = [datetime.utcfromtimestamp(date).strftime('%Y-%m-%d') for date in random_dates_u]


In [122]:
author_to_id_map = dict(zip(authors_prep_df['full_name'], authors_prep_df['id']))
journal_to_id_map = dict(zip(journals_prep_df['name'], journals_prep_df['id']))

In [123]:
papers_from_proceeding_df = preprocess_books(book_df)
papers_from_journal_df = preprocess_articles(article_df)
prep_proceedings_df, proceeding_nodes_df, chairman_nodes_df = preprocess_proceedings(proceeding_df)

In [124]:
papers_from_proceeding_df['authors'] = papers_from_proceeding_df['authors'].apply(lambda x: [author_to_id_map.get(a) for a in x] if x is not pd.NA else pd.NA)
reviewers_set = set(papers_from_proceeding_df['authors'].dropna().explode().unique())
papers_from_proceeding_df = add_columns_from_authors(papers_from_proceeding_df, 'authors', reviewers_set)
papers_from_proceeding_df['citation_id'] = add_citations_column(papers_from_proceeding_df)
papers_from_proceeding_df['keywords'] = [select_random_keywords() for _ in range(len(papers_from_proceeding_df))]

# Prepare the final DataFrame for proceedings
proceeding_prep_df = papers_from_proceeding_df[['paper_id', 'title', 'abstract', 'core_author', 'co_author', 'citation_id', 'keywords']]

In [125]:
papers_from_proceeding_authors_set = collect_values_to_set(papers_from_proceeding_df, 'authors').intersection(set(authors_prep_df['full_name']))
papers_from_journal_authors_set = collect_values_to_set(papers_from_journal_df, 'authors').intersection(set(authors_prep_df['full_name']))
all_authors = papers_from_proceeding_authors_set.union(papers_from_journal_authors_set)

papers_from_proceeding_df = add_columns_from_authors(papers_from_proceeding_df, 'authors', all_authors).drop('authors', axis=1)
papers_from_journal_df = add_columns_from_authors(papers_from_journal_df, 'authors', all_authors).drop('authors', axis=1)

# limit authors df by all_authors
author_nodes_df = authors_prep_df[authors_prep_df['full_name'].isin(all_authors)]

In [126]:
def assign_event_type(df, id_column):
    unique_ids = df[id_column].unique()
    event_types = np.random.choice(['workshop', 'conference'], size=len(unique_ids))
    id_to_event_type = dict(zip(unique_ids, event_types))
    return df[id_column].map(id_to_event_type)

values = papers_from_proceeding_df['paper_id'].unique().tolist()
prep_proceedings_df['paper_id'] = prep_proceedings_df.apply(lambda _: list(np.random.choice(values, size=np.random.randint(2, 7), replace=False)), axis=1)
prep_proceedings_df['proceeding_type'] = assign_event_type(prep_proceedings_df, 'proceeding_id')

In [127]:
papers_from_journal_df['core_author'] = papers_from_journal_df['core_author'].apply(lambda x: author_to_id_map[x])
papers_from_journal_df['co_author'] = papers_from_journal_df['co_author'].apply(lambda x: [author_to_id_map.get(a) for a in x] if x is not pd.NA else pd.NA)
papers_from_journal_df['reviewer'] = papers_from_journal_df['reviewer'].apply(lambda x: [author_to_id_map.get(a) for a in x] if x is not pd.NA else pd.NA)
papers_from_journal_df['journal_id'] = papers_from_journal_df['journal'].apply(lambda x: journal_to_id_map[x])
papers_from_journal_df['volume_id'] = papers_from_journal_df['journal_id'] + '-' + papers_from_journal_df['volume'] + '-' + papers_from_journal_df['year'].astype(str)
papers_from_journal_df['keywords'] = [select_random_keywords() for _ in range(len(papers_from_journal_df))]
papers_from_journal_df['citation_id'] = add_citations_column(papers_from_journal_df)
papers_from_journal_df['abstract'] = "Abstract text"

In [128]:
journals_prep_df = journals_prep_df[journals_prep_df['id'].isin(papers_from_journal_df['journal_id'])]

In [129]:
def generate_editor(df, id_format):
    editor_names = [f"{np.random.choice(first_names)} {np.random.choice(last_names)}" for _ in range(len(df))]
    editor_ids = [id_format.format(i) for i in range(1, len(df) + 1)]
    
    editors_df = pd.DataFrame({'editor_id': editor_ids, 'editor_name': editor_names})
    
    return editor_ids, editors_df

journals_prep_df['editor_id'], editor_nodes_df = generate_editor(journals_prep_df, 'ED_{:06d}')

# Neo4j import folder path

In [130]:
neo4j_import_folder_path = '/Users/danilakokin/Library/Application Support/Neo4j Desktop/Application/relate-data/dbmss/dbms-d3231510-48ce-4369-b724-f487562fd7a8/import'

# Slice nodes #

In [131]:
papers_from_journal_nodes_df = papers_from_journal_df[['paper_id', 'title', 'keywords', 'abstract']].drop_duplicates(['paper_id', 'title', 'abstract'])
papers_from_proceeding_nodes_df = papers_from_proceeding_df[['paper_id', 'title', 'keywords', 'abstract']].drop_duplicates(['paper_id', 'title', 'abstract'])

journal_nodes_df = journals_prep_df[['id', 'name']].drop_duplicates(['id', 'name'])
proceeding_nodes_df = prep_proceedings_df[['proceeding_id', 'name']].drop_duplicates(['proceeding_id', 'name'])

volume_nodes_df = papers_from_journal_df[['volume_id', 'year']].drop_duplicates(['volume_id'])
edition_nodes_df = prep_proceedings_df[['edition_id', 'edition_city', 'edition_date']].drop_duplicates(['edition_id', 'edition_date'])

# Save nodes

In [132]:
author_nodes_df.to_csv(f'{neo4j_import_folder_path}/author_nodes.csv', index=False)
editor_nodes_df.to_csv(f'{neo4j_import_folder_path}/editor_nodes.csv', index=False)
chairman_nodes_df.to_csv(f'{neo4j_import_folder_path}/chairman_nodes.csv', index=False)

In [133]:
papers_from_journal_nodes_df.to_csv(f'{neo4j_import_folder_path}/papers_from_journal_nodes.csv', index=False)
papers_from_proceeding_nodes_df.to_csv(f'{neo4j_import_folder_path}/papers_from_proceeding_nodes.csv', index=False)

In [134]:
journal_nodes_df.to_csv(f'{neo4j_import_folder_path}/journal_nodes.csv', index=False)
proceeding_nodes_df.to_csv(f'{neo4j_import_folder_path}/proceeding_nodes.csv', index=False)

In [135]:
volume_nodes_df.to_csv(f'{neo4j_import_folder_path}/volume_nodes.csv', index=False)
edition_nodes_df.to_csv(f'{neo4j_import_folder_path}/edition_nodes.csv', index=False)

# Slice relations #

In [136]:
paper_CITES_paper_journal_df = papers_from_journal_df[['paper_id', 'citation_id']].explode('citation_id').dropna()
paper_CITES_paper_proceeding_df = papers_from_proceeding_df[['paper_id', 'citation_id']].explode('citation_id').dropna()

paper_PUBLISHED_IN_WORKSHOP_edition_proceeding_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'workshop'][['paper_id', 'edition_id']].explode('paper_id').drop_duplicates(['paper_id', 'edition_id'])
paper_PUBLISHED_IN_CONFERENCE_edition_proceeding_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'conference'][['paper_id', 'edition_id']].explode('paper_id').drop_duplicates(['paper_id', 'edition_id'])

paper_PUBLISHED_IN_VOLUME_journal_df = papers_from_journal_df[['paper_id', 'volume_id']].drop_duplicates(['paper_id', 'volume_id'])

author_IS_CORE_AUTHOR_paper_journal_df = papers_from_journal_df[['paper_id', 'core_author']]
author_IS_CORE_AUTHOR_paper_proceeding_df = papers_from_proceeding_df[['paper_id', 'core_author']]

author_IS_CO_AUTHOR_paper_journal_df = papers_from_journal_df[['paper_id', 'co_author']].explode('co_author').dropna()
author_IS_CO_AUTHOR_paper_proceeding_df = papers_from_proceeding_df[['paper_id', 'co_author']].explode('co_author').dropna()

author_REVIEWS_paper_journal_df = papers_from_journal_df[['paper_id', 'reviewer']].explode('reviewer')
author_REVIEWS_paper_proceeding_df = papers_from_proceeding_df[['paper_id', 'reviewer']].explode('reviewer')

proceeding_CONTAINS_WORKSHOP_edition_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'workshop'][['proceeding_id', 'edition_id']].drop_duplicates(['proceeding_id', 'edition_id'])
proceeding_CONTAINS_CONFERENCE_edition_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'conference'][['proceeding_id', 'edition_id']].drop_duplicates(['proceeding_id', 'edition_id'])
journal_CONTAINS_VOLUME_volume_df = papers_from_journal_df[['journal_id', 'volume_id']].drop_duplicates(['journal_id', 'volume_id'])

journal_HAS_EDITOR_person_df = journals_prep_df[['id', 'editor_id']].drop_duplicates(['id', 'editor_id'])
proceeding_HAS_WORKSHOP_CHAIRMAN_person_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'workshop'][['proceeding_id', 'chairman']].drop_duplicates(['proceeding_id', 'chairman'])
proceeding_HAS_CONFERENCE_CHAIRMAN_person_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'conference'][['proceeding_id', 'chairman']].drop_duplicates(['proceeding_id', 'chairman'])

# Save relations

In [137]:
paper_CITES_paper_journal_df.to_csv(f'{neo4j_import_folder_path}/paper_CITES_paper_journal.csv', index=False)
paper_CITES_paper_proceeding_df.to_csv(f'{neo4j_import_folder_path}/paper_CITES_paper_proceeding.csv', index=False)
paper_PUBLISHED_IN_WORKSHOP_edition_proceeding_df.to_csv(f'{neo4j_import_folder_path}/paper_PUBLISHED_IN_WORKSHOP_edition_proceeding.csv', index=False)
paper_PUBLISHED_IN_CONFERENCE_edition_proceeding_df.to_csv(f'{neo4j_import_folder_path}/paper_PUBLISHED_IN_CONFERENCE_edition_proceeding.csv', index=False)
paper_PUBLISHED_IN_VOLUME_journal_df.to_csv(f'{neo4j_import_folder_path}/paper_PUBLISHED_IN_VOLUME_journal.csv', index=False)
author_IS_CORE_AUTHOR_paper_journal_df.to_csv(f'{neo4j_import_folder_path}/author_IS_CORE_AUTHOR_paper_journal.csv', index=False)
author_IS_CORE_AUTHOR_paper_proceeding_df.to_csv(f'{neo4j_import_folder_path}/author_IS_CORE_AUTHOR_paper_proceeding.csv', index=False)
author_IS_CO_AUTHOR_paper_journal_df.to_csv(f'{neo4j_import_folder_path}/author_IS_CO_AUTHOR_paper_journal.csv', index=False)
author_IS_CO_AUTHOR_paper_proceeding_df.to_csv(f'{neo4j_import_folder_path}/author_IS_CO_AUTHOR_paper_proceeding.csv', index=False)
author_REVIEWS_paper_journal_df.to_csv(f'{neo4j_import_folder_path}/author_REVIEWS_paper_journal.csv', index=False)
author_REVIEWS_paper_proceeding_df.to_csv(f'{neo4j_import_folder_path}/author_REVIEWS_paper_proceeding.csv', index=False)
proceeding_CONTAINS_WORKSHOP_edition_df.to_csv(f'{neo4j_import_folder_path}/proceeding_CONTAINS_WORKSHOP_edition.csv', index=False)
proceeding_CONTAINS_CONFERENCE_edition_df.to_csv(f'{neo4j_import_folder_path}/proceeding_CONTAINS_CONFERENCE_edition.csv', index=False)
journal_CONTAINS_VOLUME_volume_df.to_csv(f'{neo4j_import_folder_path}/journal_CONTAINS_EDITION_edition.csv', index=False)
journal_HAS_EDITOR_person_df.to_csv(f'{neo4j_import_folder_path}/journal_HAS_EDITOR_person.csv', index=False)
proceeding_HAS_WORKSHOP_CHAIRMAN_person_df.to_csv(f'{neo4j_import_folder_path}/proceeding_HAS_WORKSHOP_CHAIRMAN_person.csv', index=False)
proceeding_HAS_CONFERENCE_CHAIRMAN_person_df.to_csv(f'{neo4j_import_folder_path}/proceeding_HAS_CONFERENCE_CHAIRMAN_person.csv', index=False)

# Evolve graph

In [138]:
universities = [
    "Massachusetts Institute of Technology (MIT)",
    "Stanford University",
    "Harvard University",
    "California Institute of Technology (Caltech)",
    "University of Oxford",
    "University of Cambridge",
    "ETH Zurich - Swiss Federal Institute of Technology",
    "Imperial College London",
    "University of Chicago",
    "UCL (University College London)",
    "National University of Singapore (NUS)",
    "Princeton University",
    "Nanyang Technological University, Singapore (NTU)",
    "Ecole Polytechnique Fédérale de Lausanne (EPFL)",
    "Tsinghua University"
]

company = [
    "Google (Alphabet Inc.)",
    "Meta Platforms, Inc. (formerly Facebook, Inc.)",
    "Apple Inc.",
    "Microsoft Corporation",
    "Amazon.com, Inc.",
    "Tesla, Inc.",
    "Berkshire Hathaway Inc.",
    "Visa Inc.",
    "JPMorgan Chase & Co.",
    "Johnson & Johnson",
    "Samsung Electronics",
    "Exxon Mobil Corporation",
    "Walmart Inc.",
    "Toyota Motor Corporation",
    "Volkswagen AG"
]

In [139]:
uni_df = create_df_from_values_with_keys(universities, 'org_name', 'org_id', 'UNI-{:04d}')
com_df = create_df_from_values_with_keys(company, 'org_name', 'org_id', 'COM-{:04d}')
organisation_nodes_df = pd.concat([uni_df, com_df], ignore_index=True)

In [140]:
organisation_nodes_df

Unnamed: 0,org_id,org_name
0,UNI-0001,Massachusetts Institute of Technology (MIT)
1,UNI-0002,Stanford University
2,UNI-0003,Harvard University
3,UNI-0004,California Institute of Technology (Caltech)
4,UNI-0005,University of Oxford
5,UNI-0006,University of Cambridge
6,UNI-0007,ETH Zurich - Swiss Federal Institute of Techno...
7,UNI-0008,Imperial College London
8,UNI-0009,University of Chicago
9,UNI-0010,UCL (University College London)


In [141]:
def generate_organisation(df, organisation_df):
    random_org_ids = np.random.choice(organisation_df['org_id'].values, size=len(df))
    return random_org_ids.tolist()

def generate_reviewer_decision(df):
    random_org_ids = np.random.choice(['approved', 'rejected'], size=len(df))
    return random_org_ids.tolist()

In [143]:
author_nodes_df['org_id'] = generate_organisation(author_nodes_df, organisation_nodes_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  author_nodes_df['org_id'] = generate_organisation(author_nodes_df, organisation_nodes_df)


In [144]:
author_AFFILIATED_UNIVERSITY_organisation_df = author_nodes_df[author_nodes_df['org_id'].str.startswith("UNI")][['id', 'org_id']]
author_AFFILIATED_COMPANY_organisation_df = author_nodes_df[author_nodes_df['org_id'].str.startswith("COM")][['id', 'org_id']]

In [145]:
author_REVIEWS_paper_journal_df['decision'] = generate_reviewer_decision(author_REVIEWS_paper_journal_df)
author_REVIEWS_paper_journal_df['description'] = 'Description placeholder'

In [146]:
author_REVIEWS_paper_proceeding_df['decision'] = generate_reviewer_decision(author_REVIEWS_paper_proceeding_df)
author_REVIEWS_paper_proceeding_df['description'] = 'Description placeholder'

# Save nodes and relations

In [147]:
# Nodes
organisation_nodes_df.to_csv(f'{neo4j_import_folder_path}/organisation_nodes.csv', index=False)

# Relations
author_AFFILIATED_UNIVERSITY_organisation_df.to_csv(f'{neo4j_import_folder_path}/author_AFFILIATED_UNIVERSITY_organisation.csv', index=False)
author_AFFILIATED_COMPANY_organisation_df.to_csv(f'{neo4j_import_folder_path}/author_AFFILIATED_COMPANY_organisation.csv', index=False)
author_REVIEWS_paper_journal_df.to_csv(f'{neo4j_import_folder_path}/author_REVIEWS_paper_journal_upd.csv', index=False)
author_REVIEWS_paper_proceeding_df.to_csv(f'{neo4j_import_folder_path}/author_REVIEWS_paper_proceeding_upd.csv', index=False)