In [498]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta, datetime

In [ ]:
random.seed(42)

# Values for data synthesis

In [499]:
cities = [
    "New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
    "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
    "Austin", "Jacksonville", "Fort Worth", "Columbus", "Charlotte",
    "San Francisco", "Indianapolis", "Seattle", "Denver", "Washington",
    "Boston", "El Paso", "Nashville", "Detroit", "Oklahoma City",
    "Portland", "Las Vegas", "Memphis", "Louisville", "Baltimore",
    "Milwaukee", "Albuquerque", "Tucson", "Fresno", "Mesa",
    "Sacramento", "Atlanta", "Kansas City", "Colorado Springs", "Miami",
    "Raleigh", "Omaha", "Long Beach", "Virginia Beach", "Oakland",
    "Minneapolis", "Tulsa", "Arlington", "Tampa", "New Orleans",
    "London", "Paris", "Tokyo", "Beijing", "Moscow",
    "Sydney", "Dubai", "Mexico City", "Sao Paulo", "Mumbai",
    "Cairo", "Istanbul", "Lagos", "Buenos Aires", "Seoul",
    "Bangkok", "Kolkata", "Tehran", "Berlin", "Bogota",
    "Lima", "Jakarta", "Riyadh", "Madrid", "Rome"
]

In [500]:
keywords = [
    "artificial intelligence", "climate change", "sustainable development",
    "quantum computing", "genetic engineering", "machine learning", "cybersecurity",
    "public health", "renewable energy", "data analytics", "blockchain", "nanotechnology",
    "neural networks", "ecosystem services", "cancer research", "autonomous vehicles",
    "internet of things", "big data", "vaccine development", "bioinformatics",
    "virtual reality", "augmented reality", "deep learning", "biodiversity",
    "mental health", "smart cities", "robotics", "3D printing", "cloud computing",
    "CRISPR", "agritech", "clean technology", "material science", "photovoltaics",
    "drug discovery", "astrophysics", "oceanography", "glaciology", "sociology",
    "economics", "political science", "urban planning", "microbiology", "quantum mechanics",
    "biochemistry", "particle physics", "organic chemistry", "computational biology",
    "environmental justice", "sustainable agriculture", "water scarcity", "air pollution",
    "soil degradation", "renewable resources", "energy storage", "machine ethics",
    "digital humanities", "biomedical engineering", "forensic science", "epidemiology",
    "neuroscience", "cognitive science", "psychology", "anthropology", "linguistics",
    "education technology", "space exploration", "conservation biology", "green chemistry", 
    "industrial automation", "wearable technology", "nutrition science",
    # mandatory keywords
    "data management", 
    "indexing", 
    "data modeling", 
    "big data", 
    "data processing", 
    "data storage",
    "data querying"
]

first_names = [
    "Alex", "Jamie", "Casey", "Morgan", "Taylor", "Jordan", "Riley", "Cameron", "Skyler", "Quinn", 
    "Pat", "Drew", "Sam", "Chris", "Robin", "Lee", "Dana", "Kelly", "Alexis", "Leslie"
]

last_names = [
    "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", 
    "Martinez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin", "Lee", "Perez", "Thompson"
]

# Functions & Load data #

In [501]:
def read_custom_file(file_path, low_memory=False):
    df = pd.read_csv(file_path, sep=';', header=None, low_memory=low_memory)
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])
    return df

In [502]:
path = '/Users/danilakokin/Desktop/UPC/Semester2/SDM/SDM_Project/joined'

proceeding_df = pd.read_csv(f'{path}/proceedings.csv', low_memory=False).sample(7500)
article_df = pd.read_csv(f'{path}/article.csv', low_memory=False).sample(7500)
book_df = pd.read_csv(f'{path}/book.csv', low_memory=False).sample(7500)

journal_df = read_custom_file(f'{path}/journal.csv')
authors_df = read_custom_file(f'{path}/author.csv')

In [503]:
def generate_city():
    return np.random.choice(cities)
    
def generate_birth_date_vectorized(start='1945-01-01', end='1989-01-01', size=1):
    # Convert start and end dates to timestamps
    start_u = datetime.strptime(start, '%Y-%m-%d').timestamp()
    end_u = datetime.strptime(end, '%Y-%m-%d').timestamp()
    
    # Generate random dates
    random_dates_u = np.random.randint(start_u, end_u, size)
    random_dates = [datetime.utcfromtimestamp(date).strftime('%Y-%m-%d') for date in random_dates_u]
    
    return random_dates if size > 1 else random_dates[0]

def generate_proceeding_type():
    return np.random.choice(['WORKSHOP', 'CONFERENCE'], p=[0.4, 0.6])

def select_random_keywords(keywords=keywords, min_keywords=4, max_keywords=8):
    num_to_select = random.randint(min_keywords, max_keywords)
    selected_keywords = random.sample(keywords, num_to_select)
    return selected_keywords

In [504]:
# Helpers
def filter_rows_by_values(df, column_name, values_set):
    return df[df[column_name].isin(values_set)]

def make_mapping(df, key_column, value_column):
    return pd.Series(df[key_column], index=df[value_column]).to_dict()

def create_df_from_values_with_keys(values, value_col_name, key_col_name, key_pattern='ID-{:06d}'):
    df = pd.DataFrame({
        key_col_name: [key_pattern.format(i+1) for i in range(len(values))],
        value_col_name: values
    })
    return df

def collect_values_to_set(df, column_name):
    return set(df[column_name].explode().dropna().unique())


In [505]:
def add_citations_column(df):
    sorted_df = df.sort_values(by='year').copy()
    citations_dict = {index: [] for index in sorted_df.index}
    
    # Pre-compute the valid citation ids for each year to avoid doing it in the loop
    valid_citations_per_year = {year: sorted_df[sorted_df['year'] < year]['paper_id'].tolist() for year in sorted_df['year'].unique()}
    
    for index, row in sorted_df.iterrows():
        valid_citations = valid_citations_per_year[row['year']]
        
        if valid_citations:
            num_ids_to_select = min(np.random.randint(3, 7), len(valid_citations))
            selected_ids = np.random.choice(valid_citations, size=num_ids_to_select, replace=False)
            citations_dict[index] = selected_ids.tolist()
    
    sorted_df['citation_id'] = pd.Series(citations_dict).apply(lambda ids: ids if isinstance(ids, list) else pd.NA)
    return sorted_df[('citation_id')]


def choose_reviewers(authors, reviewers_set):
    authors_set = set(authors)
    all_reviewers = list(reviewers_set - authors_set)
    num_reviewers = np.random.choice([1, 2, 3, 4, 5], p=[0.05, 0.15, 0.6, 0.15, 0.05])
    
    if len(all_reviewers) >= num_reviewers:
        return np.random.choice(all_reviewers, size=num_reviewers, replace=False).tolist()
    
    chosen_reviewers = all_reviewers
    remaining_needed = num_reviewers - len(chosen_reviewers)
    
    if remaining_needed > 0:
        fill_ins = np.random.choice(list(reviewers_set - authors_set), size=remaining_needed, replace=False).tolist()
        chosen_reviewers.extend(fill_ins)
    
    return chosen_reviewers

def add_columns_from_authors(df, authors_col_name, reviewers_set):
    df['core_author'] = df[authors_col_name].str[0]
    df['co_author'] = df[authors_col_name].apply(lambda authors: authors[1:] if len(authors) > 1 else pd.NA)
    
    # Assuming 'authors' are lists, we can avoid an apply by directly using a list comprehension
    df['reviewer'] = [choose_reviewers(authors, reviewers_set) for authors in df[authors_col_name]]
    
    return df


In [506]:
def preprocess_authors(df):
    birth_dates = generate_birth_date_vectorized(size=len(df))
    
    df = df.dropna(subset=[':ID', 'author:string']).drop_duplicates(subset=[':ID'], keep='first')
    
    df['id'] = df[':ID'].astype(str)
    df['full_name'] = df['author:string']
    df['birth_date'] = birth_dates
    
    return df[['id', 'full_name', 'birth_date']]

def preprocess_journals(df):
    df = df.loc[:, [':ID', 'journal:string']].drop_duplicates(subset=[':ID'], keep='first').dropna()
    df.rename(columns={':ID': 'id', 'journal:string': 'name'}, inplace=True)
    
    return df

def preprocess_books(df):
    filtered_df = df[(df['author:string[]'].str.len() > 2) & (df['title:string'].str.len() > 2)]
    
    return (
        filtered_df.assign(
            abstract='Abstract',
            paper_id='PPR-' + filtered_df['book:ID'].astype(str),
            authors=filtered_df['author:string[]'].apply(lambda x: x.split('|') if pd.notnull(x) else pd.NA),
            title=filtered_df['title:string'],
            year=filtered_df['year:int']
        )
        .loc[:, ['paper_id', 'authors', 'title', 'abstract', 'year']]
        .dropna(subset=['authors', 'year'])
    )

def preprocess_proceedings(df):
    # Fill missing values
    df['editor:string[]'] = df['editor:string[]'].fillna('')
    df['publisher:string[]'] = df['publisher:string[]'].fillna('')
    
    # Filter rows based on content length criteria
    filtered_df = df[(df['editor:string[]'].str.len() > 2) & (df['publisher:string[]'].str.len() > 2)].copy()
    
    # Assuming 'create_df_from_values_with_keys' returns a DataFrame with unique values and an ID column
    unique_proceeding_df = create_df_from_values_with_keys(filtered_df['publisher:string[]'].unique(), 'proceeding_name', 'proceeding_id', key_pattern='PR-{:06d}')
    proceeding_name_to_id_map = dict(zip(unique_proceeding_df['proceeding_name'], unique_proceeding_df['proceeding_id']))
    
    # Map 'publisher:string[]' to 'proceeding_id'
    filtered_df.loc[:, 'proceeding_id'] = filtered_df['publisher:string[]'].map(proceeding_name_to_id_map)
    
    # Extract the first editor as chairman
    filtered_df.loc[:, 'chairman'] = filtered_df['editor:string[]'].apply(lambda x: x.split('|')[0] if x else pd.NA)
    
    unique_chairman_df = create_df_from_values_with_keys(filtered_df['chairman'].unique(), 'person_name', 'person_id', key_pattern='CH-{:06d}')
    chairman_name_to_id_map = dict(zip(unique_chairman_df['person_name'], unique_chairman_df['person_id']))
    
    # Map 'chairman' to 'person_id'
    filtered_df.loc[:, 'chairman'] = filtered_df['chairman'].map(chairman_name_to_id_map)
    
    # Additional transformations
    filtered_df.loc[:, 'edition_id'] = filtered_df['proceeding_id'].astype(str) + '-' + filtered_df['volume:string'].astype(str)
    
    filtered_df = filtered_df.assign(
        name=filtered_df['publisher:string[]'],
        edition_value=filtered_df['volume:string'],
        edition_date=filtered_df['mdate:date'],
        edition_city=[generate_city() for _ in range(len(filtered_df))]
    ).dropna(subset=['edition_value']).drop_duplicates(subset=['edition_id'])

    # Select and rename columns as necessary
    final_df = filtered_df.loc[:, ['proceeding_id', 'chairman', 'name', 'edition_date', 'edition_id', 'edition_city']]
    
    return final_df, unique_proceeding_df, unique_chairman_df

def preprocess_articles(df):
    filtered_df = df[(df['author:string[]'].str.len() > 2) & (df['title:string'].str.len() > 2)]
    
    return (
        filtered_df.assign(
            paper_id=filtered_df['article:ID'].astype(str),
            authors=filtered_df['author:string[]'].apply(lambda x: x.split('|') if pd.notnull(x) else pd.NA),
            journal=filtered_df['journal:string'],
            date=filtered_df['mdate:date'],
            title=filtered_df['title:string'],
            volume=filtered_df['volume:string'],
            year=filtered_df['year:int'].apply(lambda x: int(x) if pd.notnull(x) else pd.NA)
        )
        .dropna(subset=['authors', 'journal', 'volume'])
        .loc[:, ['paper_id', 'authors', 'journal', 'date', 'title', 'volume', 'year']]
    )


# Cleaning and filling #

In [507]:
authors_prep_df = preprocess_authors(authors_df)
journals_prep_df = preprocess_journals(journal_df)

  random_dates = [datetime.utcfromtimestamp(date).strftime('%Y-%m-%d') for date in random_dates_u]


In [508]:
author_to_id_map = dict(zip(authors_prep_df['full_name'], authors_prep_df['id']))
journal_to_id_map = dict(zip(journals_prep_df['name'], journals_prep_df['id']))

In [509]:
papers_from_proceeding_df = preprocess_books(book_df)
papers_from_journal_df = preprocess_articles(article_df)
prep_proceedings_df, proceeding_nodes_df, chairman_nodes_df = preprocess_proceedings(proceeding_df)

In [510]:
papers_from_proceeding_df['authors'] = papers_from_proceeding_df['authors'].apply(lambda x: [author_to_id_map.get(a) for a in x] if x is not pd.NA else pd.NA)
reviewers_set = set(papers_from_proceeding_df['authors'].dropna().explode().unique())
papers_from_proceeding_df = add_columns_from_authors(papers_from_proceeding_df, 'authors', reviewers_set)
papers_from_proceeding_df['citation_id'] = add_citations_column(papers_from_proceeding_df)
papers_from_proceeding_df['keywords'] = [select_random_keywords() for _ in range(len(papers_from_proceeding_df))]

# Prepare the final DataFrame for proceedings
proceeding_prep_df = papers_from_proceeding_df[['paper_id', 'title', 'abstract', 'core_author', 'co_author', 'citation_id', 'keywords']]

In [542]:
papers_from_proceeding_df

Unnamed: 0,paper_id,title,abstract,year,core_author,co_author,reviewer,citation_id,keywords
17817,PPR-10461382,Security Patterns - Integrating Security and S...,Abstract,2005,10755193,"[14002639, 11040723, 10626027, 11517468]",[Dylan P. Losey],"[PPR-10459656, PPR-10452675, PPR-10460568, PPR...","[quantum computing, astrophysics, clean techno..."
4991,PPR-10429543,From Genomic to Functional models.,Abstract,2009,11105548,,"[Tingyang Xu, Yunzhan Zhou, Darya Klyamer]","[PPR-10457480, PPR-10430299, PPR-10454385, PPR...","[ecosystem services, industrial automation, na..."
1257,PPR-34349,Multiple Fuzzy Classification Systems,Abstract,2012,10613895,,"[Imed Romdhani, Wenkai Zhang, Paolo Bientinesi...","[PPR-10462870, PPR-10461817, PPR-10459994]","[quantum computing, nanotechnology, 3D printin..."
11794,PPR-10454641,Computational Rhythm Description - a Review an...,Abstract,2008,12979937,,"[Xiaofei Wang, Glen Kramer, Atta Oveisi]","[PPR-10454740, PPR-33306, PPR-10454293, PPR-10...","[data storage, quantum computing, nutrition sc..."
11366,PPR-10454173,"Mobile Computing - Grundlagen, Technik, Konzepte.",Abstract,2002,10832144,,"[Ranjan K. Mallik, Seok-Lyong Lee, Michael Rob...","[PPR-10457814, PPR-10457116, PPR-10455561, PPR...","[astrophysics, artificial intelligence, virtua..."
...,...,...,...,...,...,...,...,...,...
17732,PPR-10461291,Computational Matrix Analysis.,Abstract,2012,10805568,,"[Chanchal K. Roy, Baocang Ding, Ravi G. Patel]","[PPR-10452575, PPR-10461111, PPR-10457972]","[renewable energy, big data, cancer research, ..."
7084,PPR-10436584,Convex Analysis,Abstract,1970,10819748,,[Andrea Albright],"[PPR-10454907, PPR-10461664, PPR-10461070]","[deep learning, quantum mechanics, data queryi..."
7933,PPR-10439061,Introduction to Functional Differential Equations,Abstract,1993,13499553,[13848719],"[Xiangbo Shu, Shotaro Kataoka, Yongxuan Zhang]","[PPR-10459866, PPR-10430373, PPR-10462943, PPR...","[data analytics, biochemistry, energy storage,..."
5493,PPR-10430045,Une contribution à l'allocation des ressources...,Abstract,2014,10709944,,"[Chuansheng Chen, Magnus Egerstedt, Martin Mai...","[PPR-10460544, PPR-10430995, PPR-10437645, PPR...","[robotics, cybersecurity, cognitive science, m..."


In [511]:
papers_from_proceeding_authors_set = collect_values_to_set(papers_from_proceeding_df, 'authors').intersection(set(authors_prep_df['full_name']))
papers_from_journal_authors_set = collect_values_to_set(papers_from_journal_df, 'authors').intersection(set(authors_prep_df['full_name']))
all_authors = papers_from_proceeding_authors_set.union(papers_from_journal_authors_set)

papers_from_proceeding_df = add_columns_from_authors(papers_from_proceeding_df, 'authors', all_authors).drop('authors', axis=1)
papers_from_journal_df = add_columns_from_authors(papers_from_journal_df, 'authors', all_authors).drop('authors', axis=1)

# limit authors df by all_authors
author_nodes_df = authors_prep_df[authors_prep_df['full_name'].isin(all_authors)]

In [512]:
def assign_event_type(df, id_column):
    unique_ids = df[id_column].unique()
    event_types = np.random.choice(['workshop', 'conference'], size=len(unique_ids))
    id_to_event_type = dict(zip(unique_ids, event_types))
    return df[id_column].map(id_to_event_type)

values = papers_from_proceeding_df['paper_id'].unique().tolist()
prep_proceedings_df['paper_id'] = prep_proceedings_df.apply(lambda _: list(np.random.choice(values, size=np.random.randint(2, 7), replace=False)), axis=1)
prep_proceedings_df['proceeding_type'] = assign_event_type(prep_proceedings_df, 'proceeding_id')

In [513]:
papers_from_journal_df['core_author'] = papers_from_journal_df['core_author'].apply(lambda x: author_to_id_map[x])
papers_from_journal_df['co_author'] = papers_from_journal_df['co_author'].apply(lambda x: [author_to_id_map.get(a) for a in x] if x is not pd.NA else pd.NA)
papers_from_journal_df['reviewer'] = papers_from_journal_df['reviewer'].apply(lambda x: [author_to_id_map.get(a) for a in x] if x is not pd.NA else pd.NA)
papers_from_journal_df['journal_id'] = papers_from_journal_df['journal'].apply(lambda x: journal_to_id_map[x])
papers_from_journal_df['volume_id'] = papers_from_journal_df['journal_id'] + '-' + papers_from_journal_df['volume'] + '-' + papers_from_journal_df['year'].astype(str)
papers_from_journal_df['keywords'] = [select_random_keywords() for _ in range(len(papers_from_journal_df))]
papers_from_journal_df['citation_id'] = add_citations_column(papers_from_journal_df)
papers_from_journal_df['abstract'] = "Abstract text"

In [514]:
journals_prep_df = journals_prep_df[journals_prep_df['id'].isin(papers_from_journal_df['journal_id'])]

In [515]:
def generate_editor(df, id_format):
    editor_names = [f"{np.random.choice(first_names)} {np.random.choice(last_names)}" for _ in range(len(df))]
    editor_ids = [id_format.format(i) for i in range(1, len(df) + 1)]
    
    editors_df = pd.DataFrame({'editor_id': editor_ids, 'editor_name': editor_names})
    
    return editor_ids, editors_df

journals_prep_df['editor_id'], editor_nodes_df = generate_editor(journals_prep_df, 'ED_{:06d}')

In [516]:
proceeding_prep_df

Unnamed: 0,paper_id,title,abstract,core_author,co_author,citation_id,keywords
17817,PPR-10461382,Security Patterns - Integrating Security and S...,Abstract,10755193,"[14002639, 11040723, 10626027, 11517468]","[PPR-10459656, PPR-10452675, PPR-10460568, PPR...","[quantum computing, astrophysics, clean techno..."
4991,PPR-10429543,From Genomic to Functional models.,Abstract,11105548,,"[PPR-10457480, PPR-10430299, PPR-10454385, PPR...","[ecosystem services, industrial automation, na..."
1257,PPR-34349,Multiple Fuzzy Classification Systems,Abstract,10613895,,"[PPR-10462870, PPR-10461817, PPR-10459994]","[quantum computing, nanotechnology, 3D printin..."
11794,PPR-10454641,Computational Rhythm Description - a Review an...,Abstract,12979937,,"[PPR-10454740, PPR-33306, PPR-10454293, PPR-10...","[data storage, quantum computing, nutrition sc..."
11366,PPR-10454173,"Mobile Computing - Grundlagen, Technik, Konzepte.",Abstract,10832144,,"[PPR-10457814, PPR-10457116, PPR-10455561, PPR...","[astrophysics, artificial intelligence, virtua..."
...,...,...,...,...,...,...,...
17732,PPR-10461291,Computational Matrix Analysis.,Abstract,10805568,,"[PPR-10452575, PPR-10461111, PPR-10457972]","[renewable energy, big data, cancer research, ..."
7084,PPR-10436584,Convex Analysis,Abstract,10819748,,"[PPR-10454907, PPR-10461664, PPR-10461070]","[deep learning, quantum mechanics, data queryi..."
7933,PPR-10439061,Introduction to Functional Differential Equations,Abstract,13499553,[13848719],"[PPR-10459866, PPR-10430373, PPR-10462943, PPR...","[data analytics, biochemistry, energy storage,..."
5493,PPR-10430045,Une contribution à l'allocation des ressources...,Abstract,10709944,,"[PPR-10460544, PPR-10430995, PPR-10437645, PPR...","[robotics, cybersecurity, cognitive science, m..."


In [517]:
prep_proceedings_df

Unnamed: 0,proceeding_id,chairman,name,edition_date,edition_id,edition_city,paper_id,proceeding_type
2338,PR-000001,CH-000001,Springer,2023-01-18,PR-000001-13392,Charlotte,"[PPR-10436203, PPR-10458199, PPR-10455000, PPR...",workshop
36675,PR-000001,CH-000004,Springer,2020-11-19,PR-000001-398,Baltimore,"[PPR-10460955, PPR-10459975]",workshop
2477,PR-000005,CH-000007,Universitäsbibliothek Essen,2019-07-03,PR-000005-14,Beijing,"[PPR-10429137, PPR-32837, PPR-10462380, PPR-10...",conference
7671,PR-000001,CH-000008,Springer,2019-05-14,PR-000001-3717,Lagos,"[PPR-10442196, PPR-10457668, PPR-10462377, PPR...",workshop
4270,PR-000001,CH-000009,Springer,2019-05-14,PR-000001-4048,Tulsa,"[PPR-10462783, PPR-10458813, PPR-10460105]",workshop
...,...,...,...,...,...,...,...,...
41807,PR-000010,CH-004351,Elsevier,2022-11-22,PR-000010-303,Kolkata,"[PPR-10456761, PPR-10442034]",conference
51719,PR-000001,CH-000100,Springer,2019-07-02,PR-000001-11621,Denver,"[PPR-10456509, PPR-10453868, PPR-10431102]",workshop
22747,PR-000010,CH-001852,Elsevier,2022-11-25,PR-000010-283,Tokyo,"[PPR-10455313, PPR-49620, PPR-10456199, PPR-10...",conference
10564,PR-000001,CH-004353,Springer,2019-08-22,PR-000001-11776,Miami,"[PPR-10461458, PPR-10461860, PPR-10462772, PPR...",workshop


In [518]:
papers_from_journal_df

Unnamed: 0,paper_id,journal,date,title,volume,year,core_author,co_author,reviewer,journal_id,volume_id,keywords,citation_id,abstract
2784723,9785958,Complex.,2021-04-29,Finite-Time Lyapunov Functions and Impulsive C...,2020,2020,10812532,[10922620],"[12363059, 10976860, 11026952, 10626394, 11973...",14156835,14156835-2020-2020,"[sociology, CRISPR, deep learning, machine eth...","[7696533, 9393366, 10165031, 9763089]",Abstract text
3059211,10062207,CoRR,2020-03-10,Stability and error estimates for the variable...,abs/2003.03534,2020,13673279,"[13145331, 10708795]","[10832418, 10856913]",14156859,14156859-abs/2003.03534-2020,"[oceanography, neural networks, wearable techn...","[7487874, 7434692, 7178519, 8966880, 7817648]",Abstract text
1222888,8222501,IEEE Trans. Geosci. Remote. Sens.,2023-09-24,Remote-Sensing Interpretation for Soil Element...,61,2023,14100535,"[10976001, 11056861, 10603499, 10685983, 10962...","[11300592, 10672159]",14155794,14155794-61-2023,"[deep learning, indexing, sociology, renewable...","[9005504, 7807650, 8871217]",Abstract text
1610349,8610901,IEEE Access,2023-09-30,Bounds on Topological Descriptors of the Coron...,7,2019,13198705,"[13517777, 13945265, 11336081, 11478170, 11138...","[12101728, 10819700, 11220473]",14156045,14156045-7-2019,"[machine learning, clean technology, renewable...","[9571257, 8551761, 7382165, 9388226]",Abstract text
855916,7853440,Games Econ. Behav.,2020-02-22,Agenda control as a cheap talk game: Theory an...,72,2011,11769534,,"[13639285, 12986876, 10723498]",14155560,14155560-72-2011,"[epidemiology, data processing, neural network...","[8770264, 7475319, 7109740, 9344030, 8310635]",Abstract text
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2192295,9193003,RFC,2019-05-14,IPv6 over Social Networks.,5514,2009,10857529,,"[13627705, 13939134, 12121301]",14156403,14156403-5514-2009,"[cybersecurity, political science, drug discov...","[7270955, 10128650, 8417591]",Abstract text
2305004,9305728,Expert Syst. Appl.,2022-12-05,FacialSCDnet: A deep learning approach for the...,210,2022,10605666,"[10605667, 10609484, 12565452, 10609498, 10609...","[10655099, 11155077]",14156486,14156486-210-2022,"[cloud computing, cognitive science, augmented...","[7280349, 8839855, 9216237, 7757715, 8565677]",Abstract text
2556321,9557307,Computer,2023-08-28,Fifteen Years to Open Source.,49,2016,10600693,,"[11246708, 11293777, 10999501, 12190126]",14156679,14156679-49-2016,"[nutrition science, space exploration, digital...","[8383771, 7910156, 8460091]",Abstract text
1951317,8952008,Z. für Medienpsychologie,2020-05-13,Verkaufsstart von,17,2005,11254670,,"[10939793, 10728935, 11599617]",14156217,14156217-17-2005,"[cybersecurity, organic chemistry, air polluti...","[7585647, 9223617, 7550847, 8966880]",Abstract text


In [519]:
papers_from_proceeding_df

Unnamed: 0,paper_id,title,abstract,year,core_author,co_author,reviewer,citation_id,keywords
17817,PPR-10461382,Security Patterns - Integrating Security and S...,Abstract,2005,10755193,"[14002639, 11040723, 10626027, 11517468]",[Dylan P. Losey],"[PPR-10459656, PPR-10452675, PPR-10460568, PPR...","[quantum computing, astrophysics, clean techno..."
4991,PPR-10429543,From Genomic to Functional models.,Abstract,2009,11105548,,"[Tingyang Xu, Yunzhan Zhou, Darya Klyamer]","[PPR-10457480, PPR-10430299, PPR-10454385, PPR...","[ecosystem services, industrial automation, na..."
1257,PPR-34349,Multiple Fuzzy Classification Systems,Abstract,2012,10613895,,"[Imed Romdhani, Wenkai Zhang, Paolo Bientinesi...","[PPR-10462870, PPR-10461817, PPR-10459994]","[quantum computing, nanotechnology, 3D printin..."
11794,PPR-10454641,Computational Rhythm Description - a Review an...,Abstract,2008,12979937,,"[Xiaofei Wang, Glen Kramer, Atta Oveisi]","[PPR-10454740, PPR-33306, PPR-10454293, PPR-10...","[data storage, quantum computing, nutrition sc..."
11366,PPR-10454173,"Mobile Computing - Grundlagen, Technik, Konzepte.",Abstract,2002,10832144,,"[Ranjan K. Mallik, Seok-Lyong Lee, Michael Rob...","[PPR-10457814, PPR-10457116, PPR-10455561, PPR...","[astrophysics, artificial intelligence, virtua..."
...,...,...,...,...,...,...,...,...,...
17732,PPR-10461291,Computational Matrix Analysis.,Abstract,2012,10805568,,"[Chanchal K. Roy, Baocang Ding, Ravi G. Patel]","[PPR-10452575, PPR-10461111, PPR-10457972]","[renewable energy, big data, cancer research, ..."
7084,PPR-10436584,Convex Analysis,Abstract,1970,10819748,,[Andrea Albright],"[PPR-10454907, PPR-10461664, PPR-10461070]","[deep learning, quantum mechanics, data queryi..."
7933,PPR-10439061,Introduction to Functional Differential Equations,Abstract,1993,13499553,[13848719],"[Xiangbo Shu, Shotaro Kataoka, Yongxuan Zhang]","[PPR-10459866, PPR-10430373, PPR-10462943, PPR...","[data analytics, biochemistry, energy storage,..."
5493,PPR-10430045,Une contribution à l'allocation des ressources...,Abstract,2014,10709944,,"[Chuansheng Chen, Magnus Egerstedt, Martin Mai...","[PPR-10460544, PPR-10430995, PPR-10437645, PPR...","[robotics, cybersecurity, cognitive science, m..."


In [520]:
journals_prep_df

Unnamed: 0,id,name,editor_id
3,14154891,SIGMOD Rec.,ED_000001
4,14154892,World Wide Web,ED_000002
6,14154894,IEEE Trans. Educ.,ED_000003
7,14154895,Int. J. Neural Syst.,ED_000004
8,14154896,Comput. Optim. Appl.,ED_000005
...,...,...,...
1986,14156874,Scientometrics,ED_001231
1988,14156876,Serv. Oriented Comput. Appl.,ED_001232
1989,14156877,Netw. Secur.,ED_001233
1990,14156878,Int. J. Web Inf. Syst.,ED_001234


In [521]:
author_nodes_df

Unnamed: 0,id,full_name,birth_date
4,10597996,Daniel C. Alexander,1988-03-18
11,10598003,Zhengyou Zhang,1956-03-19
18,10598010,Szymon Rusinkiewicz,1963-07-27
19,10598011,Amit K. Agrawal,1973-11-14
24,10598016,Matti Pietikäinen,1945-04-06
...,...,...,...
3556454,14154446,Kurtulus B. Öner,1988-12-27
3556547,14154539,António Pais Antunes,1973-04-04
3556683,14154675,Christoph Hartmann 0003,1975-10-26
3556806,14154798,Weidong Gao 0001,1978-05-09


In [522]:
chairman_nodes_df

Unnamed: 0,person_id,person_name
0,CH-000001,Gerhard Schwabe
1,CH-000002,Oliver Deussen
2,CH-000003,Anita de Waard
3,CH-000004,Andrea Polini
4,CH-000005,Bill Lin 0001
...,...,...
4348,CH-004349,Carlos Delgado Kloos
4349,CH-004350,Henri Gilbert
4350,CH-004351,Cai Wingfield
4351,CH-004352,Anthony Gar-On Yeh


In [523]:
editor_nodes_df

Unnamed: 0,editor_id,editor_name
0,ED_000001,Morgan Smith
1,ED_000002,Leslie Taylor
2,ED_000003,Riley Wilson
3,ED_000004,Casey Jones
4,ED_000005,Jordan Perez
...,...,...
1230,ED_001231,Alex Thompson
1231,ED_001232,Dana Jones
1232,ED_001233,Quinn Martinez
1233,ED_001234,Alex Thomas


# Neo4j import folder path

In [524]:
neo4j_import_folder_path = '/Users/danilakokin/Library/Application Support/Neo4j Desktop/Application/relate-data/dbmss/dbms-d3231510-48ce-4369-b724-f487562fd7a8/import'

# Slice nodes #

In [525]:
papers_from_journal_nodes_df = papers_from_journal_df[['paper_id', 'title', 'keywords', 'abstract']].drop_duplicates(['paper_id', 'title', 'abstract'])
papers_from_proceeding_nodes_df = papers_from_proceeding_df[['paper_id', 'title', 'keywords', 'abstract']].drop_duplicates(['paper_id', 'title', 'abstract'])

journal_nodes_df = journals_prep_df[['id', 'name']].drop_duplicates(['id', 'name'])
proceeding_nodes_df = prep_proceedings_df[['proceeding_id', 'name']].drop_duplicates(['proceeding_id', 'name'])

volume_nodes_df = papers_from_journal_df[['volume_id', 'year']].drop_duplicates(['volume_id'])
edition_nodes_df = prep_proceedings_df[['edition_id', 'edition_city', 'edition_date']].drop_duplicates(['edition_id', 'edition_date'])

# Save nodes

In [526]:
author_nodes_df.to_csv(f'{neo4j_import_folder_path}/author_nodes.csv', index=False)
editor_nodes_df.to_csv(f'{neo4j_import_folder_path}/editor_nodes.csv', index=False)
chairman_nodes_df.to_csv(f'{neo4j_import_folder_path}/chairman_nodes.csv', index=False)

In [527]:
papers_from_journal_nodes_df.to_csv(f'{neo4j_import_folder_path}/papers_from_journal_nodes.csv', index=False)
papers_from_proceeding_nodes_df.to_csv(f'{neo4j_import_folder_path}/papers_from_proceeding_nodes.csv', index=False)

In [528]:
journal_nodes_df.to_csv(f'{neo4j_import_folder_path}/journal_nodes.csv', index=False)
proceeding_nodes_df.to_csv(f'{neo4j_import_folder_path}/proceeding_nodes.csv', index=False)

In [529]:
volume_nodes_df.to_csv(f'{neo4j_import_folder_path}/volume_nodes.csv', index=False)
edition_nodes_df.to_csv(f'{neo4j_import_folder_path}/edition_nodes.csv', index=False)

# Slice relations #

In [535]:
paper_CITES_paper_journal_df = papers_from_journal_df[['paper_id', 'citation_id']].explode('citation_id').dropna()
paper_CITES_paper_proceeding_df = papers_from_proceeding_df[['paper_id', 'citation_id']].explode('citation_id').dropna()

paper_PUBLISHED_IN_WORKSHOP_edition_proceeding_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'workshop'][['paper_id', 'edition_id']].explode('paper_id').drop_duplicates(['paper_id', 'edition_id'])
paper_PUBLISHED_IN_CONFERENCE_edition_proceeding_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'conference'][['paper_id', 'edition_id']].explode('paper_id').drop_duplicates(['paper_id', 'edition_id'])

paper_PUBLISHED_IN_VOLUME_journal_df = papers_from_journal_df[['paper_id', 'volume_id']].drop_duplicates(['paper_id', 'volume_id'])

author_IS_CORE_AUTHOR_paper_journal_df = papers_from_journal_df[['paper_id', 'core_author']]
author_IS_CORE_AUTHOR_paper_proceeding_df = papers_from_proceeding_df[['paper_id', 'core_author']]

author_IS_CO_AUTHOR_paper_journal_df = papers_from_journal_df[['paper_id', 'co_author']].explode('co_author').dropna()
author_IS_CO_AUTHOR_paper_proceeding_df = papers_from_proceeding_df[['paper_id', 'co_author']].explode('co_author').dropna()

author_REVIEWS_paper_journal_df = papers_from_journal_df[['paper_id', 'reviewer']].explode('reviewer')
author_REVIEWS_paper_proceeding_df = papers_from_proceeding_df[['paper_id', 'reviewer']].explode('reviewer')

# proceeding_CONTAINS_WORKSHOP_edition_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'workshop'][['proceeding_id', 'edition_id']].drop_duplicates(['proceeding_id', 'edition_id'])
proceeding_CONTAINS_CONFERENCE_edition_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'conference'][['proceeding_id', 'edition_id']].drop_duplicates(['proceeding_id', 'edition_id'])
journal_CONTAINS_VOLUME_volume_df = papers_from_journal_df[['journal_id', 'volume_id']].drop_duplicates(['journal_id', 'volume_id'])

journal_HAS_EDITOR_person_df = journals_prep_df[['id', 'editor_id']].drop_duplicates(['id', 'editor_id'])
proceeding_HAS_WORKSHOP_CHAIRMAN_person_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'workshop'][['proceeding_id', 'chairman']].drop_duplicates(['proceeding_id', 'chairman'])
proceeding_HAS_CONFERENCE_CHAIRMAN_person_df = prep_proceedings_df[prep_proceedings_df['proceeding_type'] == 'conference'][['proceeding_id', 'chairman']].drop_duplicates(['proceeding_id', 'chairman'])

In [536]:
paper_PUBLISHED_IN_WORKSHOP_edition_proceeding_df

Unnamed: 0,paper_id,edition_id
2338,PPR-10436203,PR-000001-13392
2338,PPR-10458199,PR-000001-13392
2338,PPR-10455000,PR-000001-13392
2338,PPR-10429779,PR-000001-13392
2338,PPR-10455323,PR-000001-13392
...,...,...
10564,PPR-10461458,PR-000001-11776
10564,PPR-10461860,PR-000001-11776
10564,PPR-10462772,PR-000001-11776
10564,PPR-10453845,PR-000001-11776


In [537]:
paper_PUBLISHED_IN_CONFERENCE_edition_proceeding_df.to_csv('aa.csv', index=False)

In [531]:
paper_CITES_paper_journal_df.to_csv(f'{neo4j_import_folder_path}/paper_CITES_paper_journal.csv', index=False)
paper_CITES_paper_proceeding_df.to_csv(f'{neo4j_import_folder_path}/paper_CITES_paper_proceeding.csv', index=False)
paper_PUBLISHED_IN_WORKSHOP_edition_proceeding_df.to_csv(f'{neo4j_import_folder_path}/paper_PUBLISHED_IN_WORKSHOP_edition_proceeding.csv', index=False)
paper_PUBLISHED_IN_CONFERENCE_edition_proceeding_df.to_csv(f'{neo4j_import_folder_path}/paper_PUBLISHED_IN_CONFERENCE_edition_proceeding.csv', index=False)
paper_PUBLISHED_IN_VOLUME_journal_df.to_csv(f'{neo4j_import_folder_path}/paper_PUBLISHED_IN_VOLUME_journal.csv', index=False)
author_IS_CORE_AUTHOR_paper_journal_df.to_csv(f'{neo4j_import_folder_path}/author_IS_CORE_AUTHOR_paper_journal.csv', index=False)
author_IS_CORE_AUTHOR_paper_proceeding_df.to_csv(f'{neo4j_import_folder_path}/author_IS_CORE_AUTHOR_paper_proceeding.csv', index=False)
author_IS_CO_AUTHOR_paper_journal_df.to_csv(f'{neo4j_import_folder_path}/author_IS_CO_AUTHOR_paper_journal.csv', index=False)
author_IS_CO_AUTHOR_paper_proceeding_df.to_csv(f'{neo4j_import_folder_path}/author_IS_CO_AUTHOR_paper_proceeding.csv', index=False)
author_REVIEWS_paper_journal_df.to_csv(f'{neo4j_import_folder_path}/author_REVIEWS_paper_journal.csv', index=False)
author_REVIEWS_paper_proceeding_df.to_csv(f'{neo4j_import_folder_path}/author_REVIEWS_paper_proceeding.csv', index=False)
proceeding_CONTAINS_WORKSHOP_edition_df.to_csv(f'{neo4j_import_folder_path}/proceeding_CONTAINS_WORKSHOP_edition.csv', index=False)
proceeding_CONTAINS_CONFERENCE_edition_df.to_csv(f'{neo4j_import_folder_path}/proceeding_CONTAINS_CONFERENCE_edition.csv', index=False)
journal_CONTAINS_VOLUME_volume_df.to_csv(f'{neo4j_import_folder_path}/journal_CONTAINS_EDITION_edition.csv', index=False)
journal_HAS_EDITOR_person_df.to_csv(f'{neo4j_import_folder_path}/journal_HAS_EDITOR_person.csv', index=False)
proceeding_HAS_WORKSHOP_CHAIRMAN_person_df.to_csv(f'{neo4j_import_folder_path}/proceeding_HAS_WORKSHOP_CHAIRMAN_person.csv', index=False)
proceeding_HAS_CONFERENCE_CHAIRMAN_person_df.to_csv(f'{neo4j_import_folder_path}/proceeding_HAS_CONFERENCE_CHAIRMAN_person.csv', index=False)