#### Test on one file

In [64]:
import json

def load_json(file_path):
    """Parse the JSON document stored at ``file_path``.

    The file is opened as UTF-8 text and is closed again once parsing is done.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
        
# Path of the single article file used to spot-check the loader
file_path = 'saved_json/article_92181.json'

# Parse that one file; `res` holds the decoded JSON content for inspection
res = load_json(file_path)

### Process all files

In [None]:
import json
import pandas as pd
import os
from tqdm import tqdm

def load_json(file_path):
    """Return the decoded content of the UTF-8 JSON file at ``file_path``."""
    with open(file_path, encoding='utf-8') as source:
        return json.load(source)

def _format_author_name(author):
    """Return a display name for one author entry, or '' if none can be built."""
    if not isinstance(author, dict):
        return ''
    # `or ''` covers keys that are present but hold None; formatting None
    # directly would put the literal text "None" into the name.
    fore_name = author.get('ForeName') or ''
    last_name = author.get('LastName') or ''
    full_name = f"{fore_name} {last_name}".strip()
    if full_name:
        return full_name
    # Group/consortium authors carry a CollectiveName instead of name parts.
    return str(author.get('CollectiveName') or '').strip()


def extract_authors(article):
    """Collect the author names of one article record.

    Args:
        article: Parsed article JSON; authors are read from
            ``PubmedArticle -> MedlineCitation -> Article -> AuthorList -> Author``,
            which may be a list of author dicts or a single author dict.

    Returns:
        A list of "ForeName LastName" strings (falling back to
        ``CollectiveName`` when an entry has no personal name). Entries that
        yield no name at all are skipped. If the author path is missing,
        malformed, or yields no usable name, the list is
        ``["No author info available"]``.
    """
    try:
        authors = article['PubmedArticle']['MedlineCitation']['Article']['AuthorList']['Author']
    except (KeyError, TypeError):
        # TypeError: an intermediate level is None or not a mapping. Treat it
        # like missing data so the caller does not drop the whole article.
        return ["No author info available"]

    if not isinstance(authors, list):  # Single author case
        authors = [authors]

    names = [name for name in map(_format_author_name, authors) if name]
    return names or ["No author info available"]

def extract_year(article):
    """Return the publication year of one article record.

    Args:
        article: Parsed article JSON; the date is read from
            ``PubmedArticle -> MedlineCitation -> Article -> Journal ->
            JournalIssue -> PubDate``.

    Returns:
        The ``Year`` value when present and non-empty. Otherwise the first
        four-digit token of ``MedlineDate`` (e.g. ``"1998"`` for
        ``"1998 Dec-1999 Jan"``). ``"No year available"`` when neither can be
        found or the record is malformed.
    """
    try:
        pub_date = article['PubmedArticle']['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
    except (KeyError, TypeError):
        # TypeError: an intermediate level is None or not a mapping.
        return "No year available"

    if not isinstance(pub_date, dict):
        return "No year available"

    year = pub_date.get('Year')
    if year:
        return year

    # Dates that do not fit the Year/Month/Day pattern are stored as free
    # text in MedlineDate; pull the first year-looking token out of it.
    medline_date = pub_date.get('MedlineDate')
    if isinstance(medline_date, str):
        for token in medline_date.replace('-', ' ').split():
            if len(token) == 4 and token.isdigit():
                return token

    return "No year available"

def _abstract_part_text(part):
    """Return the plain text of one AbstractText entry ('' when it has none)."""
    if isinstance(part, dict):
        # Entries that carry attributes are dicts whose text sits under
        # '#text' (the same convention this file uses to read the PMID).
        part = part.get('#text')
    return '' if part is None else str(part)


def extract_abstract(article):
    """Return the abstract of one article record as a single string.

    Args:
        article: Parsed article JSON; the abstract is read from
            ``PubmedArticle -> MedlineCitation -> Article -> Abstract ->
            AbstractText``, which may be a string, a dict with a ``'#text'``
            entry, or a list mixing both.

    Returns:
        The abstract text, with multiple sections joined by single spaces.
        ``"No abstract available"`` when the abstract is missing, empty, or
        the record is malformed.
    """
    try:
        abstract = article['PubmedArticle']['MedlineCitation']['Article']['Abstract']['AbstractText']
    except (KeyError, TypeError):
        # TypeError: an intermediate level is None or not a mapping.
        return "No abstract available"

    sections = abstract if isinstance(abstract, list) else [abstract]
    # Drop empty sections so they do not leave stray separators in the text.
    text = " ".join(filter(None, map(_abstract_part_text, sections)))
    return text or "No abstract available"

def create_article_dataframe(directory, num_files):
    """Load numbered article files into one DataFrame.

    Tries ``article_0.json`` through ``article_{num_files - 1}.json`` inside
    ``directory``. A file that cannot be read, or that lacks the PMID, title
    or journal fields, is reported on stdout and left out of the result.

    Args:
        directory: Folder that holds the ``article_<i>.json`` files.
        num_files: Number of consecutive file indices to try, starting at 0.

    Returns:
        A DataFrame with one row per successfully loaded article and the
        columns PMID, Title, Journal, Authors, Publication Year and Abstract.
    """
    # One list per output column; every successful article appends to all six.
    columns = {
        'PMID': [],
        'Title': [],
        'Journal': [],
        'Authors': [],
        'Publication Year': [],
        'Abstract': [],
    }

    # tqdm wraps the index range to display a progress bar
    for index in tqdm(range(num_files), desc="Loading articles"):
        file_path = os.path.join(directory, f'article_{index}.json')
        try:
            article = load_json(file_path)

            # Mandatory fields: a missing key here skips the article
            citation = article['PubmedArticle']['MedlineCitation']
            pmid = citation['PMID']['#text']
            details = citation['Article']
            title = details['ArticleTitle']
            journal_name = details['Journal']['Title']

            # Optional fields: the helpers supply placeholder text
            authors_list = extract_authors(article)
            year = extract_year(article)
            abstract = extract_abstract(article)

            columns['PMID'].append(pmid)
            columns['Title'].append(title)
            columns['Journal'].append(journal_name)
            columns['Authors'].append(", ".join(authors_list))
            columns['Publication Year'].append(year)
            columns['Abstract'].append(abstract)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    return pd.DataFrame(columns)

# Directory that holds the article_<i>.json files
directory = 'saved_json'
# Number of consecutive file indices to try (article_0.json .. article_399999.json);
# indices with no readable file are reported and skipped by the loader
num_files = 400000

# Build the article table and print it (pandas shows a truncated preview for large frames)
article_df = create_article_dataframe(directory, num_files)
print(article_df)


In [66]:
# Save the full article table to final.csv in the working directory
article_df.to_csv('final.csv')

In [75]:
# Work on a copy so the full table stays untouched
article_df_sample = article_df.copy()
# Keep only rows whose Abstract is not the 'No abstract available' placeholder
article_df_sample = article_df_sample[article_df_sample['Abstract']!='No abstract available']

In [92]:
# Draw 10,000 rows with a fixed seed (random_state=42) and write them to homeo_sample.csv
# NOTE(review): sampling without replacement presumably fails if fewer than
# 10,000 rows remain after filtering; confirm the filtered table is large enough
article_df_sample.sample(10000, random_state=42).to_csv('homeo_sample.csv')