In [3]:
import re
import ijson

In [5]:
# Define a function to fix a line
def fix_line(line):
    return re.sub(r'NumberInt\((\d+)\)', r'\1', line)

# Process the file line by line
with open('dblpv13.json', 'r') as infile, open('dblpv13_fixed.json', 'w') as outfile:
    for line in infile:
        outfile.write(fix_line(line))

In [6]:
def validate_large_json(filename):
    with open(filename, 'r') as f:
        parser = ijson.parse(f)
        try:
            for prefix, event, value in parser:
                pass
            print("The JSON file is valid.")
        except ijson.JSONError as e:
            print(f"Invalid JSON file: {e}")

validate_large_json('dblpv13_fixed.json')

KeyboardInterrupt: 

docker run --name advdaba_labo2 `
-p7474:7474 `
-p7687:7687 `
-v $HOME/neo4j/advDB-lab2/logs:/logs `
-v $HOME/neo4j/advDB-lab2/data:/data `
-v $HOME/neo4j/advDB-lab2/import:/var/lib/neo4j/import `
--memory="3g" `
--env NEO4J_AUTH=neo4j/testtest `
neo4j:latest

In [9]:
import json
from neo4j import GraphDatabase
import re

max_line_to_load = 10
filename = "dblpv13.json"
uri = "bolt://0.0.0.0:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "testtest"))

def stream_json_objects(file):
    depth = 0
    obj = []
    for line in file:
        stripped = line.strip()
        if not stripped:
            continue

        if stripped[0] == '{':
            depth += 1
        if stripped[-1] == '}':
            depth -= 1
        
        obj.append(stripped)

        if depth == 0:
            yield json.loads(''.join(obj))
            obj = []

def corrected_json_lines(file):
    for line in file:
        yield re.sub(r'NumberInt\((\d+)\)', r'\1', line)

def add_article_and_related_data(tx, article):
    # Create ARTICLE node
    article_node = tx.run("MERGE (a:Article {_id: $id, title: $title}) RETURN a",
                          id=article["_id"], title=article["title"]).single()["a"]

    # Create AUTHORED relationships
    for author_name in article.get("authors", []):
        author_node = tx.run("MERGE (a:Author {_id: $name, name: $name}) RETURN a", 
                             name=author_name).single()["a"]
        tx.run("MERGE (a)-[:AUTHORED]->(b)", a=author_node, b=article_node)

    # Create CITES relationships
    for cited_article_id in article.get("references", []):
        tx.run("""
        MATCH (a:Article), (b:Article)
        WHERE a._id = $id AND b._id = $ref_id
        MERGE (a)-[:CITES]->(b)
        """, id=article["_id"], ref_id=cited_article_id)

print("Start processing...")
with open(filename, 'r') as file:
    articles = stream_json_objects(file)
    with driver.session() as session:
        for idx, item in enumerate(articles):
            if idx >= max_line_to_load:
                break
            if "title" not in item:
                print(f"Article {item['_id']} does not have a title. Skipping.")
                continue
            try:
                session.write_transaction(add_article_and_related_data, item)
            except Exception as e:
                print(f"Error processing article {item['_id']}: {e}")

driver.close()


Start processing...


JSONDecodeError: Expecting value: line 1 column 2 (char 1)

In [None]:
import json
import re

def corrected_json_lines(file):
    for line in file:
        yield re.sub(r'NumberInt\((\d+)\)', r'\1', line)

with open('dblpv13.json', 'r') as json_file:
    corrected_lines = corrected_json_lines(json_file)
    
    for line in corrected_lines:
        try:
            data = json.loads(line)
            # Process the JSON data here
            print(data)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")


In [22]:
import pandas as pd
import json

filename = 'biggertest.json'

# Load the data from the file
with open(filename, 'r') as file:
    data = json.load(file)

# Create a list to hold consolidated data
consolidated_data = []

for article in data:
    article_id = article['_id']
    article_title = article['title']
    authors = article.get('authors', [])
    references = article.get('references', [])
    
    for author in authors:
        # Handle the case when author name or _id is missing
        author_name = author.get('name', None)  # Use None if 'name' key is missing
        author_id = author.get('_id', None)  # Use None if '_id' key is missing
        
        # If no references, append article and author details only
        if not references:
            consolidated_data.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': author_id,
                'author_name': author_name,
                'cited_article_id': None
            })
        else:
            for reference in references:
                consolidated_data.append({
                    'article_id': article_id,
                    'article_title': article_title,
                    'author_id': author_id,
                    'author_name': author_name,
                    'cited_article_id': reference
                })
    
    # Handle case where there are no authors but there are references
    if not authors and references:
        for reference in references:
            consolidated_data.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': None,
                'author_name': None,
                'cited_article_id': reference
            })

df = pd.DataFrame(consolidated_data)
print(df)


                   article_id  \
0    53e99784b7602d9701f3e133   
1    53e99784b7602d9701f3e133   
2    53e99784b7602d9701f3e133   
3    53e99784b7602d9701f3e133   
4    53e99784b7602d9701f3e133   
..                        ...   
139  53e99784b7602d9701f3f95a   
140  53e99784b7602d9701f3f95a   
141  53e99784b7602d9701f3f95a   
142  53e99784b7602d9701f3f95a   
143  53e99784b7602d9701f3f95b   

                                         article_title  \
0    The relationship between canopy parameters and...   
1    The relationship between canopy parameters and...   
2    The relationship between canopy parameters and...   
3    The relationship between canopy parameters and...   
4    The relationship between canopy parameters and...   
..                                                 ...   
139                                             FACETS   
140                                             FACETS   
141                                             FACETS   
142                    

In [25]:
import pandas as pd
import ijson

filename = 'dblpv13.json'

# Function to process an individual article from the JSON file
def process_article(article):
    data_list = []
    article_id = article['_id']
    article_title = article['title']
    authors = article.get('authors', [])
    references = article.get('references', [])

    for author in authors:
        author_name = author.get('name', None)
        author_id = author.get('_id', None)

        if not references:
            data_list.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': author_id,
                'author_name': author_name,
                'cited_article_id': None
            })
        else:
            for reference in references:
                data_list.append({
                    'article_id': article_id,
                    'article_title': article_title,
                    'author_id': author_id,
                    'author_name': author_name,
                    'cited_article_id': reference
                })

    if not authors and references:
        for reference in references:
            data_list.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': None,
                'author_name': None,
                'cited_article_id': reference
            })

    return data_list

# Iteratively read and process the JSON file
consolidated_data = []
with open(filename, 'r') as file:
    articles = ijson.items(file, 'item')
    for article in articles:
        consolidated_data.extend(process_article(article))

df = pd.DataFrame(consolidated_data)
display(df)



UnexpectedSymbol: Unexpected symbol 'N' at 103

In [34]:
import pandas as pd
import ijson
import re

filename = 'dblpv13.json'

# Function to preprocess JSON lines
def corrected_json_lines(file):
    for line in file:
        yield re.sub(r'NumberInt\((\d+)\)', r'\1', line)

# Function to process an individual article from the JSON file
def process_article(article):
    data_list = []
    article_id = article['_id']
    article_title = article['title']
    authors = article.get('authors', [])
    references = article.get('references', [])

    for author in authors:
        author_name = author.get('name', None)
        author_id = author.get('_id', None)

        if not references:
            data_list.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': author_id,
                'author_name': author_name,
                'cited_article_id': None
            })
        else:
            for reference in references:
                data_list.append({
                    'article_id': article_id,
                    'article_title': article_title,
                    'author_id': author_id,
                    'author_name': author_name,
                    'cited_article_id': reference
                })

    if not authors and references:
        for reference in references:
            data_list.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': None,
                'author_name': None,
                'cited_article_id': reference
            })

    return data_list

# Iteratively read and process the JSON file
consolidated_data = []
with open(filename, 'r') as file:
    # Apply preprocessing on each line for MongoDB's NumberInt
    corrected_file = corrected_json_lines(file)
    
    articles = ijson.items(corrected_file, 'item')
    for article in articles:
        print(article)
        consolidated_data.extend(process_article(article))

# df = pd.DataFrame(consolidated_data)
# display(df)


ValueError: not enough values to unpack (expected 3, got 2)

In [36]:
import pandas as pd
import json

filename = 'biggertest.json'

# Load the data from the file
with open(filename, 'r') as file:
    data = json.load(file)

# Create a list to hold consolidated data
consolidated_data = []

for article in data:
    article_id = article['_id']
    article_title = article['title']
    authors = article.get('authors', [])
    references = article.get('references', [])
    
    for author in authors:
        # Handle the case when author name or _id is missing
        author_name = author.get('name', None)  # Use None if 'name' key is missing
        author_id = author.get('_id', None)  # Use None if '_id' key is missing
        
        # If no references, append article and author details only
        if not references:
            consolidated_data.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': author_id,
                'author_name': author_name,
                'cited_article_id': None
            })
        else:
            for reference in references:
                consolidated_data.append({
                    'article_id': article_id,
                    'article_title': article_title,
                    'author_id': author_id,
                    'author_name': author_name,
                    'cited_article_id': reference
                })
    
    # Handle case where there are no authors but there are references
    if not authors and references:
        for reference in references:
            consolidated_data.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': None,
                'author_name': None,
                'cited_article_id': reference
            })

df = pd.DataFrame(consolidated_data)
display(df)


Unnamed: 0,article_id,article_title,author_id,author_name,cited_article_id
0,53e99784b7602d9701f3e133,The relationship between canopy parameters and...,53f45728dabfaec09f209538,Peijuan Wang,
1,53e99784b7602d9701f3e133,The relationship between canopy parameters and...,5601754345cedb3395e59457,Jiahua Zhang,
2,53e99784b7602d9701f3e133,The relationship between canopy parameters and...,53f38438dabfae4b34a08928,Donghui Xie,
3,53e99784b7602d9701f3e133,The relationship between canopy parameters and...,5601754345cedb3395e5945a,Yanyan Xu,
4,53e99784b7602d9701f3e133,The relationship between canopy parameters and...,53f43d25dabfaeecd6995149,Yun Xu,
...,...,...,...,...,...
139,53e99784b7602d9701f3f95a,FACETS,53f45ab1dabfaeb22f511541,Richard Groebner,
140,53e99784b7602d9701f3f95a,FACETS,53f4359fdabfaeee229a4700,Satish Balay,
141,53e99784b7602d9701f3f95a,FACETS,53f44990dabfaee4dc7ddf84,Lois C. McInnes,
142,53e99784b7602d9701f3f95a,FACETS,562cb37445cedb3398c9befe,Hong Zhang,


In [2]:
import pandas as pd
import ijson
import re

filename = 'dblpv13.json'

# Function to preprocess JSON lines
def corrected_json_content(file):
    corrected_lines = list(corrected_json_lines(file))
    return '\n'.join(corrected_lines)

def corrected_json_lines(file):
    for line in file:
        yield re.sub(r'NumberInt\((\d+)\)', r'\1', line)

# Function to process an individual article from the JSON file
def process_article(article):
    data_list = []
    article_id = article['_id']
    article_title = article['title']
    authors = article.get('authors', [])
    references = article.get('references', [])

    for author in authors:
        author_name = author.get('name', None)  # Use None if 'name' key is missing
        author_id = author.get('_id', None)  # Use None if '_id' key is missing

        # If no references, append article and author details only
        if not references:
            data_list.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': author_id,
                'author_name': author_name,
                'cited_article_id': None
            })
        else:
            for reference in references:
                data_list.append({
                    'article_id': article_id,
                    'article_title': article_title,
                    'author_id': author_id,
                    'author_name': author_name,
                    'cited_article_id': reference
                })

    # Handle case where there are no authors but there are references
    if not authors and references:
        for reference in references:
            data_list.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': None,
                'author_name': None,
                'cited_article_id': reference
            })

    return data_list

# Iteratively read, preprocess, and process the JSON file
consolidated_data = []
with open(filename, 'r') as file:
    corrected_content = corrected_json_content(file)
    articles = ijson.items(corrected_content, 'item')
    for article in articles:
        # consolidated_data.extend(process_article(article))
        print(article)

# df = pd.DataFrame(consolidated_data)
# display(df)


KeyboardInterrupt: 

In [2]:
import pandas as pd
import ijson
import re
from neo4j import GraphDatabase
import gc

filename = 'dblpv13.json'
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "testtest"))

# Function to preprocess JSON lines
def corrected_json_content(file):
    corrected_lines = list(corrected_json_lines(file))
    return '\n'.join(corrected_lines)

def corrected_json_lines(file):
    for line in file:
        yield re.sub(r'NumberInt\((\d+)\)', r'\1', line)

# Function to process an individual article from the JSON file
def process_article(article):
    data_list = []
    article_id = article['_id']
    article_title = article['title']
    authors = article.get('authors', [])
    references = article.get('references', [])

    for author in authors:
        author_name = author.get('name', None)
        author_id = author.get('_id', None)

        # If no references, append article and author details only
        if not references:
            data_list.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': author_id,
                'author_name': author_name,
                'cited_article_id': None
            })
        else:
            for reference in references:
                data_list.append({
                    'article_id': article_id,
                    'article_title': article_title,
                    'author_id': author_id,
                    'author_name': author_name,
                    'cited_article_id': reference
                })

    # Handle case where there are no authors but there are references
    if not authors and references:
        for reference in references:
            data_list.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': None,
                'author_name': None,
                'cited_article_id': reference
            })

    return data_list

def add_article(tx, article_id, title, authors, cited_articles):
    # Merge the current ARTICLE node with its title and _id properties
    tx.run("MERGE (a:Article {_id: $id}) SET a.title = $title", id=article_id, title=title)
    
    for author in authors:
        if author:  
            # Merge the AUTHOR node with its name and _id properties
            tx.run("MERGE (a:Author {_id: $author_id, name: $name})", author_id=author["_id"], name=author["name"])
            # Merge the AUTHORED relationship between AUTHOR and ARTICLE
            tx.run("""
                MATCH (a:Author {_id: $author_id}), (b:Article {_id: $article_id})
                MERGE (a)-[:AUTHORED]->(b)
            """, author_id=author["_id"], article_id=article_id)
            
    for cited_article_id in cited_articles:
        tx.run("""
            MERGE (a:Article {_id: $article_id})
            MERGE (b:Article {_id: $cited_article_id})
            MERGE (a)-[:CITES]->(b)
        """, article_id=article_id, cited_article_id=cited_article_id)



def send_chunk_to_neo4j(chunk):
    with driver.session() as session:
        for _, row in chunk.iterrows():
            if row['author_name'] and row['author_id']:
                authors = [{"name": row['author_name'], "_id": row['author_id']}]
                if row['cited_article_id']:  # check if there's a valid cited_article_id
                    session.execute_write(add_article, row['article_id'], row['article_title'], authors, [row['cited_article_id']])
                else:
                    session.execute_write(add_article, row['article_id'], row['article_title'], authors, [])

chunk_size = 10  # adjust this based on your memory and performance needs
buffered_data = []

with open(filename, 'r') as file:
    articles = ijson.items(corrected_json_content(file), 'item')
    for article in articles:
        buffered_data.extend(process_article(article))
        if len(buffered_data) >= chunk_size:
            chunk_df = pd.DataFrame(buffered_data)
            send_chunk_to_neo4j(chunk_df)
            buffered_data = []  # reset the buffer
            del chunk_df  # delete the DataFrame
            gc.collect()  # collect garbage

# Send remaining buffered data (if any)
if buffered_data:
    chunk_df = pd.DataFrame(buffered_data)
    send_chunk_to_neo4j(chunk_df)
    del chunk_df  # delete the DataFrame
    gc.collect()  # collect garbage



KeyboardInterrupt: 

In [None]:
import ijson
import re
from neo4j import GraphDatabase

filename = 'dblpv13.json'
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "testtest"))

# Function to preprocess JSON lines
def corrected_json_content(file):
    corrected_lines = list(corrected_json_lines(file))
    return '\n'.join(corrected_lines)

def corrected_json_lines(file):
    for line in file:
        yield re.sub(r'NumberInt\((\d+)\)', r'\1', line)

# Function to process an individual article from the JSON file
def process_article(article):
    data_list = []
    article_id = article['_id']
    article_title = article['title']
    authors = article.get('authors', [])
    references = article.get('references', [])

    for author in authors:
        author_name = author.get('name', None)
        author_id = author.get('_id', None)

        # If no references, append article and author details only
        if not references:
            data_list.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': author_id,
                'author_name': author_name,
                'cited_article_id': None
            })
        else:
            for reference in references:
                data_list.append({
                    'article_id': article_id,
                    'article_title': article_title,
                    'author_id': author_id,
                    'author_name': author_name,
                    'cited_article_id': reference
                })

    # Handle case where there are no authors but there are references
    if not authors and references:
        for reference in references:
            data_list.append({
                'article_id': article_id,
                'article_title': article_title,
                'author_id': None,
                'author_name': None,
                'cited_article_id': reference
            })

    return data_list

def add_article(tx, article_id, title, authors, cited_articles):
    # Merge the current ARTICLE node with its title and _id properties
    tx.run("MERGE (a:Article {_id: $id}) SET a.title = $title", id=article_id, title=title)
    
    for author in authors:
        if author:  
            # Merge the AUTHOR node with its name and _id properties
            tx.run("MERGE (a:Author {_id: $author_id, name: $name})", author_id=author["_id"], name=author["name"])
            # Merge the AUTHORED relationship between AUTHOR and ARTICLE
            tx.run("""
                MATCH (a:Author {_id: $author_id}), (b:Article {_id: $article_id})
                MERGE (a)-[:AUTHORED]->(b)
            """, author_id=author["_id"], article_id=article_id)
            
    for cited_article_id in cited_articles:
        tx.run("""
            MERGE (a:Article {_id: $article_id})
            MERGE (b:Article {_id: $cited_article_id})
            MERGE (a)-[:CITES]->(b)
        """, article_id=article_id, cited_article_id=cited_article_id)

def articles_generator(filename):
    with open(filename, 'r') as file:
        articles = ijson.items(corrected_json_content(file), 'item')
        for article in articles:
            yield article

def process_and_send_to_neo4j(article):
    with driver.session() as session:
        data_list = process_article(article)
        for data in data_list:
            if data['author_name'] and data['author_id']:
                authors = [{"name": data['author_name'], "_id": data['author_id']}]
                if data['cited_article_id']:
                    session.execute_write(add_article, data['article_id'], data['article_title'], authors, [data['cited_article_id']])
                else:
                    session.execute_write(add_article, data['article_id'], data['article_title'], authors, [])

for article in articles_generator(filename):
    process_and_send_to_neo4j(article)
