# INDEX

- [MODELING, LOADING, EVOLVING](#modeling-loading-evolving)

    1. [Modeling](#modeling)

    2. [Loading](#loading)

    3. [Evolving](#evolving)

- [QUERYING](#querying)
    
    1. [Query 1](#query-1:-find-the-top-3-most-cited-papers-of-each-conference.)

    2. [Query 2](#query-2:-for-each-conference-find-its-community:-i.e.,-those-authors-that-have-published-papers-on-that-conference-in,-at-least,-4-different-editions.)

    3. [Query 3](#query-3:-find-the-impact-factors-of-the-journals-in-your-graph.)

    4. [Query 4](#query-4:-find-the-h-indexes-of-the-authors-in-your-graph.)

# IMPORTS

In [8]:
import json
import os
import random

import pandas as pd
from faker import Faker
from neo4j import GraphDatabase

In [9]:
# Connection settings for the Neo4j instance. Each value can be overridden
# through an environment variable; the fallbacks are the values this notebook
# was originally written with, so it keeps working unchanged.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
# NOTE(review): the fallback password is committed in plain text. Prefer
# setting NEO4J_PASSWORD and removing the literal before sharing this file.
password = os.environ.get("NEO4J_PASSWORD", "gwaB$3DMlab2")

# Single driver object shared by every cell that opens a session.
driver = GraphDatabase.driver(uri, auth=(user, password))

# MODELING, LOADING, EVOLVING

[Back to Index](#INDEX)

## Modeling

[Back to Index](#INDEX)

## Loading

[Back to Index](#INDEX)

In [10]:
# Seed both sources of randomness so the generated dataset is reproducible:
# `random` drives the sampling in the generation cell, while Faker produces
# the names, companies, words and text.
random.seed(168)
Faker.seed(168)

faker = Faker()

# Size settings for the synthetic dataset.
num_authors = 100
num_papers = 200
num_conferences = 5
num_journals = 3
num_topics = 10
num_keywords = 50
num_reviews_per_paper = 3
num_venues = 20
# Number of consecutive years, ending in 2023, that get a Year entry.
num_years = 50

In [11]:
# Build the synthetic bibliographic dataset from the size settings defined in
# the previous cell. Every collection is a list of plain dicts so it can be
# dumped to JSON and converted to CSV later on.

# Earliest year that receives a Year entry. Editions and volumes must stay
# inside [first_year, 2023], otherwise the loader cannot match their year.
first_year = 2024 - num_years

# Generate Authors
authors = [
    {
        "name": faker.name(),
        "affiliation": faker.company()
    }
    for _ in range(num_authors)
]

# Generate Topics and Keywords
topics = [{"topic": topic} for topic in [
    "Machine Learning",
    "Natural Language Processing",
    "Computer Vision",
    "Reinforcement Learning",
    "Robotics",
    "Data Mining",
    "Database Management",
    "Neural Networks",
    "Time-Series Analysis",
    "Software Engineering"
]]

# Five keywords per topic, each remembering the topic it belongs to.
keywords = [
    {"word": faker.word(), "topic": topic["topic"]}
    for topic in topics
    for _ in range(5)
]

# Generate Venues
cities = [
    "New York",
    "London",
    "Tokyo",
    "Paris",
    "Sydney",
    "Toronto",
    "Berlin",
    "Singapore",
    "Hong Kong",
    "Dubai",
    "Los Angeles",
    "Barcelona",
    "Moscow",
    "São Paulo",
    "Johannesburg",
    "Rome",
    "Istanbul",
    "Beijing",
    "Mumbai",
    "Buenos Aires"
]

venues = [{"city": city} for city in cities]

# Generate Years (stored as strings, matching how they are read from CSV)
years = [{"year": str(year)} for year in range(first_year, 2024)]

# Generate Papers
# These lookups do not change between papers, so build them once.
list_of_authors_names = [author["name"] for author in authors]
list_of_years = [year["year"] for year in years]
prefix = "https://doi.org/"

papers = []
for _ in range(num_papers):
    num_authors_per_paper = random.randint(2, 5)
    author_list = random.sample(list_of_authors_names, k=num_authors_per_paper)
    # The corresponding author is stored separately and removed from the
    # plain author list, so each name appears in only one of the two fields.
    corresponding_author = random.choice(author_list)
    author_list.remove(corresponding_author)
    num_keywords_per_paper = random.randint(3, 7)
    keyword_list = random.sample(keywords, k=num_keywords_per_paper)

    year_paper = random.choice(list_of_years)

    # DOI-shaped identifier: prefix + directory.journal.year.article
    directory = random.randint(10, 99)
    journal_number = random.randint(1000000, 9999999)
    article_number = random.randint(1000000, 9999999)

    paper = {
        "title": faker.sentence(nb_words=6),
        "abstract": faker.paragraph(nb_sentences=5),
        "year": year_paper,
        "pages": random.randint(5, 35),
        "DOI": f"{prefix}{directory}.{journal_number}.{year_paper}.{article_number}",
        "authors": author_list,
        "corresponding_author": corresponding_author,
        "keywords": keyword_list
    }
    papers.append(paper)

# Generate Conferences and Editions
conferences = [{"name": f"Conference of {faker.word()}"} for _ in range(num_conferences)]

editions = []
for conf in conferences:
    num_editions = random.randint(1, 20)
    # Start from first_year (not a hard-coded 1970) so that every edition
    # falls on a year that exists in `years`.
    starting_year = random.randint(first_year, first_year + num_years - num_editions)
    for i in range(num_editions):
        edition = {
            "conference_name": conf["name"],
            "attendees": random.randint(100, 1000),
            "venue": random.choice(venues)["city"],
            "year": str(starting_year + i)
        }
        editions.append(edition)

# Generate Journals and Volumes
journals = [{"name": f"Journal {i+1}"} for i in range(num_journals)]

volumes = []
for journal in journals:
    num_volumes = random.randint(1, 20)
    # One volume per consecutive year, kept inside the generated year range.
    starting_year = random.randint(first_year, first_year + num_years - num_volumes)
    for i in range(num_volumes):
        volume = {
            # journal_name and year are the columns the volumes loader
            # matches on; without them no volume can be attached.
            "journal_name": journal["name"],
            "volume_number": i + 1,
            "pages": random.randint(50, 300),
            "year": str(starting_year + i)
        }
        volumes.append(volume)

# Generate Reviews as Relationships
reviews = []
for paper in papers:
    # Papers store author *names*, so compare by name, and also exclude the
    # corresponding author, who is kept outside paper["authors"].
    paper_author_names = set(paper["authors"]) | {paper["corresponding_author"]}
    eligible_reviewers = [
        author for author in authors
        if author["name"] not in paper_author_names
    ]
    reviewers = random.sample(eligible_reviewers, k=num_reviews_per_paper)
    for reviewer in reviewers:
        review = {
            "paper_DOI": paper["DOI"],
            "reviewer_name": reviewer["name"]
        }
        reviews.append(review)

In [12]:
# Bundle every generated collection under the key it is stored with in the
# JSON dump (and looked up with when the CSV files are written).
data = dict(
    authors=authors,
    papers=papers,
    conferences=conferences,
    editions=editions,
    journals=journals,
    volumes=volumes,
    keywords=keywords,
    topics=topics,
    reviews=reviews,
    venues=venues,
    years=years,
)

# Keep a JSON snapshot of the dataset next to the notebook.
with open("faked_data.json", "w") as f:
    json.dump(data, f, indent=4)

print("Faked data generated and saved to faked_data.json")

Faked data generated and saved to faked_data.json


In [13]:
""" we convert the json file to csv """

# Directory the CSV files are written to. The load step reads them through
# 'file:///<name>.csv' URLs, so this should be the import folder of the target
# Neo4j DBMS. The default is the folder this notebook was written against;
# set NEO4J_IMPORT_DIR to use a different installation without editing code.
pathname_import = os.environ.get(
    "NEO4J_IMPORT_DIR",
    "/Users/guillemmirabentrubinat/Library/Application Support/Neo4j Desktop/Application/relate-data/dbmss/dbms-85682a6c-ffc5-4848-8ae3-81bf5af53bcf/import",
)

# Collections that map one-to-one onto a CSV file named after their key.
# The DataFrames are deliberately not bound to `papers`, `editions`,
# `volumes` or `reviews`: overwriting those lists would break a re-run of
# the cell that builds `data` and dumps it to JSON.
for table_name in (
    'authors',
    'topics',
    'keywords',
    'venues',
    'years',
    'papers',
    'conferences',
    'editions',
    'journals',
    'volumes',
    'reviews',
):
    pd.DataFrame(data[table_name]).to_csv(f'{pathname_import}/{table_name}.csv', index=False)

# Save paper authors: one row per (paper, non-corresponding author) pair
paper_authors = [
    {'DOI': paper['DOI'], 'author_name': author}
    for paper in data['papers']
    for author in paper['authors']
]
pd.DataFrame(paper_authors).to_csv(f'{pathname_import}/paper_authors.csv', index=False)

# Save corresponding authors: exactly one row per paper
paper_corresponding_authors = [
    {'DOI': paper['DOI'], 'corresponding_author': paper['corresponding_author']}
    for paper in data['papers']
]
pd.DataFrame(paper_corresponding_authors).to_csv(f'{pathname_import}/paper_corresponding_authors.csv', index=False)

# Save paper keywords: one row per (paper, keyword) pair, keyed by the word
paper_keywords = [
    {'DOI': paper['DOI'], 'keyword': keyword['word']}
    for paper in data['papers']
    for keyword in paper['keywords']
]
pd.DataFrame(paper_keywords).to_csv(f'{pathname_import}/paper_keywords.csv', index=False)

In [14]:
def clear_database(tx):
    """Remove every node, together with its relationships, from the graph.

    Args:
        tx: Transaction-like object exposing ``run(query)``.
    """
    wipe_everything = "MATCH (n) DETACH DELETE n"
    tx.run(wipe_everything)

def load_data(tx):
    """Populate the graph from the CSV files via one LOAD CSV per file.

    The statements run in dependency order: nodes are created before the
    statements that MATCH them to attach relationships. Rows whose MATCH
    finds no node create nothing.

    Args:
        tx: Transaction-like object exposing ``run(query)``.
    """
    statements = (
        # Authors
        """
        LOAD CSV WITH HEADERS FROM 'file:///authors.csv' AS row
        CREATE (:Author {name: row.name, affiliation: row.affiliation});
        """,
        # Topics
        """
        LOAD CSV WITH HEADERS FROM 'file:///topics.csv' AS row
        CREATE (:Topic {topic: row.topic});
        """,
        # Keywords, each attached to its topic
        """
        LOAD CSV WITH HEADERS FROM 'file:///keywords.csv' AS row
        MATCH (t:Topic {topic: row.topic})
        CREATE (k:Keyword {word: row.word})-[:BELONGS_TO]->(t);
        """,
        # Venues
        """
        LOAD CSV WITH HEADERS FROM 'file:///venues.csv' AS row
        CREATE (:Venue {city: row.city});
        """,
        # Years
        """
        LOAD CSV WITH HEADERS FROM 'file:///years.csv' AS row
        CREATE (:Year {year: row.year});
        """,
        # Papers
        """
        LOAD CSV WITH HEADERS FROM 'file:///papers.csv' AS row
        CREATE (p:Paper {title: row.title, abstract: row.abstract, year: row.year, pages: row.pages, DOI: row.DOI});
        """,
        # Paper -> author
        """
        LOAD CSV WITH HEADERS FROM 'file:///paper_authors.csv' AS row
        MATCH (p:Paper {DOI: row.DOI}), (a:Author {name: row.author_name})
        CREATE (p)-[:AUTHORED_BY]->(a);
        """,
        # Paper -> corresponding author
        """
        LOAD CSV WITH HEADERS FROM 'file:///paper_corresponding_authors.csv' AS row
        MATCH (p:Paper {DOI: row.DOI}), (a:Author {name: row.corresponding_author})
        CREATE (p)-[:CORRESPONDING_AUTHOR]->(a);
        """,
        # Paper -> keyword
        """
        LOAD CSV WITH HEADERS FROM 'file:///paper_keywords.csv' AS row
        MATCH (p:Paper {DOI: row.DOI}), (k:Keyword {word: row.keyword})
        CREATE (p)-[:HAS_KEYWORD]->(k);
        """,
        # Conferences
        """
        LOAD CSV WITH HEADERS FROM 'file:///conferences.csv' AS row
        CREATE (:Conference {name: row.name});
        """,
        # Editions. Every relationship starts at the Edition node; chaining
        # them in one path would instead link Venue -> Year -> Conference.
        """
        LOAD CSV WITH HEADERS FROM 'file:///editions.csv' AS row
        MATCH (c:Conference {name: row.conference_name}), (v:Venue {city: row.venue}), (y:Year {year: row.year})
        CREATE (e:Edition {conference_name: row.conference_name, attendees: row.attendees}),
               (e)-[:HELD_AT]->(v),
               (e)-[:HELD_IN_YEAR]->(y),
               (e)-[:PUBLISHED_IN_EDITION]->(c);
        """,
        # Journals
        """
        LOAD CSV WITH HEADERS FROM 'file:///journals.csv' AS row
        CREATE (:Journal {name: row.name});
        """,
        # Volumes. Relies on the journal_name and year columns of
        # volumes.csv. As with editions, both relationships start at the
        # Volume node rather than being chained through the Year node.
        """
        LOAD CSV WITH HEADERS FROM 'file:///volumes.csv' AS row
        MATCH (j:Journal {name: row.journal_name}), (y:Year {year: row.year})
        CREATE (v:Volume {volume_number: row.volume_number, pages: row.pages}),
               (v)-[:PUBLISHED_IN_YEAR]->(y),
               (v)-[:PUBLISHED_IN_JOURNAL]->(j);
        """,
        # Reviews, modelled as Paper -> reviewer relationships
        """
        LOAD CSV WITH HEADERS FROM 'file:///reviews.csv' AS row
        MATCH (p:Paper {DOI: row.paper_DOI}), (a:Author {name: row.reviewer_name})
        CREATE (p)-[:REVIEWED_BY]->(a);
        """,
    )

    for statement in statements:
        tx.run(statement)

# Rebuild the graph from scratch: wipe it first, then import the CSV files.
# Each step is handed to its own execute_write call.
with driver.session() as session:
    for step in (clear_database, load_data):
        session.execute_write(step)

print("Data loaded into Neo4j")

Data loaded into Neo4j


## Evolving

[Back to Index](#INDEX)

# QUERYING

[Back to Index](#INDEX)

## Query 1: Find the top 3 most cited papers of each conference.

[Back to Index](#INDEX)

## Query 2: For each conference find its community: i.e., those authors that have published papers on that conference in, at least, 4 different editions.

[Back to Index](#INDEX)

## Query 3: Find the impact factors of the journals in your graph.

[Back to Index](#INDEX)

## Query 4: Find the h-indexes of the authors in your graph.

[Back to Index](#INDEX)