In [4]:
import json
import pandas as pd
from collections import Counter

# Load data
with open('pdf-info.json', 'r') as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data['papers'])

print(f"Total number of papers: {len(df)}")
df.head()

Total number of papers: 115


Unnamed: 0,paper_id,doi,title,published_year,author_list,countries,purpose_of_work,keywords,pages,keyword-page,sections,tables,figures
0,1,https://doi.org/10.3390/life14111380,A Real-Life Study in Patients Newly Diagnosed ...,2024,"[Ana Valea, Mihai Costachescu, Mihaela Stanciu...",[Romania],To analyze the thyroid panel in newly diagnose...,"[antibody, autoimmune, asthenia, thyroid, thyr...",18,2.0,"[Abstract, introduction, Materials and Methods...",7,8
1,2,,A Prospective Study to Evaluate the Possible R...,2022,"[Biva Bhakat, Sumit Kumar Chakraborty, Sukdeb ...",[India],To assess the therapeutic role of Vitamin D in...,"[Randomised Controlled Trial, Negative Correla...",4,1.0,"[AIMS AND OBJECTIVES, MATERIALS AND METHODS, O...",0,1
2,3,10.1097/MD.0000000000035720,Hashimoto's thyroiditis-related myopathy in a ...,2023,"[Zheng Cong Lee, Yu Jun Eugene Wong, Lian Lian...",[Singapore],This study investigates the link between SARS-...,"[case report, COVID-19, creatine kinase, Hashi...",6,1.0,"[Introduction, Case repor, Discussion, Conclu...",2,2
3,4,10.1093/ajcp/aqz145,Hashimoto Thyroiditis in Primary Thyroid Non-H...,2020,"[Antonio Travaglino, Mirella Pace, Silvia Varr...",[Italy],To assess the prevalence of Hashimoto thyroidi...,"[Thyroid, Lymphoma, MALT, Hashimoto, Thyroiditis]",9,1.0,"[ABSTRACT, Materials and Methods, Results, Dis...",1,6
4,5,https://doi.org/10.1016/j.intimp.2025.114069,Advanced oxidation protein products induce apo...,2025,"[Jie Tan, Ruoting Ding, Shitong Yu, Kewu Tu, J...",[China],To investigate the role of advanced oxidation ...,"[Advanced oxidation protein products, Apoptosi...",11,1.0,"[ABSTRACT, Introduction, Materials and Methods...",0,6


In [5]:
print("=" * 50)
print("Metadata OVERVIEW")
print("=" * 50)
print(f"\nTotal Papers: {len(df)}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nData Types:\n{df.dtypes}")
print(f"\nMissing Values:\n{df.isnull().sum()}")

Metadata OVERVIEW

Total Papers: 115

Columns: ['paper_id', 'doi', 'title', 'published_year', 'author_list', 'countries', 'purpose_of_work', 'keywords', 'pages', 'keyword-page', 'sections', 'tables', 'figures']

Data Types:
paper_id            object
doi                 object
title               object
published_year       int64
author_list         object
countries           object
purpose_of_work     object
keywords            object
pages                int64
keyword-page       float64
sections            object
tables               int64
figures              int64
dtype: object

Missing Values:
paper_id           0
doi                6
title              0
published_year     0
author_list        0
countries          0
purpose_of_work    0
keywords           0
pages              0
keyword-page       8
sections           0
tables             0
figures            0
dtype: int64


In [None]:
print("=" * 50)
print("PUBLISHED YEAR ANALYSIS")
print("=" * 50)

valid_years = df['published_year']

print(f"Earliest year: {int(valid_years.min())}")
print(f"Latest year: {int(valid_years.max())}")
print(f"Year range: {int(valid_years.max() - valid_years.min())} years")

print(f"\n--- Count for Each Year ---")
year_value_counts = valid_years.value_counts().sort_index()
for year, count in year_value_counts.items():
    percentage = (count / len(valid_years)) * 100
    print(f"Papers from {int(year)}: {count} ({percentage:.1f}%)")

PUBLISHED YEAR ANALYSIS
Earliest year: 2013
Latest year: 2025
Year range: 12 years
Total papers with valid years: 115

--- Count for Each Year ---
Papers from 2013: 1 (0.9%)
Papers from 2014: 1 (0.9%)
Papers from 2015: 2 (1.7%)
Papers from 2016: 1 (0.9%)
Papers from 2017: 5 (4.3%)
Papers from 2018: 3 (2.6%)
Papers from 2019: 6 (5.2%)
Papers from 2020: 8 (7.0%)
Papers from 2021: 4 (3.5%)
Papers from 2022: 14 (12.2%)
Papers from 2023: 5 (4.3%)
Papers from 2024: 40 (34.8%)
Papers from 2025: 25 (21.7%)


In [None]:
print("=" * 50)
print("PAGE RANGE ANALYSIS")
print("=" * 50)

print(f"Min pages: {df['pages'].min()}")
print(f"Max pages: {df['pages'].max()}")

print(f"\n--- Count for Each Number of Tables ---")
table_value_counts = df['pages'].value_counts().sort_index()
for num_tables, count in table_value_counts.items():
    percentage = (count / len(df)) * 100
    print(f"Papers with {num_tables} pages(s): {count} ({percentage:.1f}%)")


PAGE RANGE ANALYSIS
Min pages: 1
Max pages: 37

--- Count for Each Number of Tables ---
Papers with 1 pages(s): 1 (0.9%)
Papers with 2 pages(s): 2 (1.7%)
Papers with 3 pages(s): 1 (0.9%)
Papers with 4 pages(s): 6 (5.2%)
Papers with 5 pages(s): 3 (2.6%)
Papers with 6 pages(s): 7 (6.1%)
Papers with 7 pages(s): 10 (8.7%)
Papers with 8 pages(s): 10 (8.7%)
Papers with 9 pages(s): 17 (14.8%)
Papers with 10 pages(s): 14 (12.2%)
Papers with 11 pages(s): 8 (7.0%)
Papers with 12 pages(s): 7 (6.1%)
Papers with 13 pages(s): 3 (2.6%)
Papers with 14 pages(s): 3 (2.6%)
Papers with 15 pages(s): 4 (3.5%)
Papers with 16 pages(s): 2 (1.7%)
Papers with 17 pages(s): 3 (2.6%)
Papers with 18 pages(s): 6 (5.2%)
Papers with 19 pages(s): 2 (1.7%)
Papers with 20 pages(s): 1 (0.9%)
Papers with 22 pages(s): 1 (0.9%)
Papers with 29 pages(s): 1 (0.9%)
Papers with 32 pages(s): 2 (1.7%)
Papers with 37 pages(s): 1 (0.9%)


In [None]:
print("=" * 50)
print("TABLES ANALYSIS")
print("=" * 50)

print(f"Min tables: {df['tables'].min()}")
print(f"Max tables: {df['tables'].max()}")

print(f"\n--- Count for Each Number of Tables ---")
table_value_counts = df['tables'].value_counts().sort_index()
for num_tables, count in table_value_counts.items():
    percentage = (count / len(df)) * 100
    print(f"Papers with {num_tables} table(s): {count} ({percentage:.1f}%)")


TABLES ANALYSIS
Min tables: 0
Max tables: 9

--- Count for Each Number of Tables ---
Papers with 0 table(s): 22 (19.1%)
Papers with 1 table(s): 25 (21.7%)
Papers with 2 table(s): 28 (24.3%)
Papers with 3 table(s): 20 (17.4%)
Papers with 4 table(s): 8 (7.0%)
Papers with 5 table(s): 4 (3.5%)
Papers with 6 table(s): 3 (2.6%)
Papers with 7 table(s): 4 (3.5%)
Papers with 9 table(s): 1 (0.9%)


In [None]:
print("=" * 50)
print("FIGURES ANALYSIS")
print("=" * 50)

print(f"\nFigure Statistics:")
print(f"Min figures: {df['figures'].min()}")
print(f"Max figures: {df['figures'].max()}")

print(f"\n--- Count for Each Number of Tables ---")
table_value_counts = df['figures'].value_counts().sort_index()
for num_tables, count in table_value_counts.items():
    percentage = (count / len(df)) * 100
    print(f"Papers with {num_tables} figures(s): {count} ({percentage:.1f}%)")

FIGURES ANALYSIS

Figure Statistics:
Min figures: 0
Max figures: 13

--- Count for Each Number of Tables ---
Papers with 0 figures(s): 19 (16.5%)
Papers with 1 figures(s): 18 (15.7%)
Papers with 2 figures(s): 22 (19.1%)
Papers with 3 figures(s): 15 (13.0%)
Papers with 4 figures(s): 17 (14.8%)
Papers with 5 figures(s): 11 (9.6%)
Papers with 6 figures(s): 7 (6.1%)
Papers with 7 figures(s): 2 (1.7%)
Papers with 8 figures(s): 1 (0.9%)
Papers with 9 figures(s): 1 (0.9%)
Papers with 13 figures(s): 2 (1.7%)


In [None]:
print("=" * 50)
print("KEYWORD PAGE LOCATION ANALYSIS")
print("=" * 50)

# Categorize keyword page locations
def categorize_keyword_page(page):
    if pd.isna(page) or page is None:
        return 'No keyword'
    elif page == 1:
        return 'First page'
    elif page == 2:
        return 'Second page'
    else:
        return f'Page {page}'

df['keyword_location'] = df['keyword-page'].apply(categorize_keyword_page)

print("\nKeyword Location Distribution:")
keyword_dist = df['keyword_location'].value_counts()
print(keyword_dist)

KEYWORD PAGE LOCATION ANALYSIS

Keyword Location Distribution:
keyword_location
First page     98
Second page     8
No keyword      8
Page 7.0        1
Name: count, dtype: int64


In [None]:
print("=" * 50)
print("DOI ANALYSIS")
print("=" * 50)

# Check for null/missing DOIs
df['has_doi'] = df['doi'].notna() & (df['doi'] != '') & (df['doi'] != 'null')

print(f"\nPapers with DOI: {df['has_doi'].sum()} ({df['has_doi'].sum()/len(df)*100:.1f}%)")
print(f"Papers without DOI: {(~df['has_doi']).sum()} ({(~df['has_doi']).sum()/len(df)*100:.1f}%)")

DOI ANALYSIS

Papers with DOI: 109 (94.8%)
Papers without DOI: 6 (5.2%)


In [None]:
print("=" * 50)
print("COUNTRY DISTRIBUTION ANALYSIS")
print("=" * 50)

# Flatten country lists
all_countries = []
for countries in df['countries']:
    if isinstance(countries, list):
        all_countries.extend(countries)
    elif pd.notna(countries):
        all_countries.append(countries)

country_counts = Counter(all_countries)

print(f"\nTotal unique countries: {len(country_counts)}")
print(f"Total country appearances: {sum(country_counts.values())}")

print(f"\n--- All Countries (sorted by count) ---")
for country, count in country_counts.most_common():
    percentage = (count / len(df)) * 100
    print(f"{country}: {count} papers ({percentage:.1f}%)")

# Papers per country collaboration
df['num_countries'] = df['countries'].apply(lambda x: len(x) if isinstance(x, list) else 0)
print(f"\n--- Multi-country Collaboration ---")
print(f"Single country: {(df['num_countries'] == 1).sum()} papers")
print(f"Two countries: {(df['num_countries'] == 2).sum()} papers")
print(f"Three countries: {(df['num_countries'] == 3).sum()} papers")
print(f"Four+ countries: {(df['num_countries'] >= 4).sum()} papers")

# Detailed breakdown of number of countries per paper
print(f"\n--- Papers by Number of Collaborating Countries ---")
collab_counts = df['num_countries'].value_counts().sort_index()
for num_countries, count in collab_counts.items():
    percentage = (count / len(df)) * 100
    print(f"{num_countries} country/countries: {count} papers ({percentage:.1f}%)")

COUNTRY DISTRIBUTION ANALYSIS

Total unique countries: 39
Total country appearances: 131

--- All Countries (sorted by count) ---
China: 41 papers (35.7%)
Italy: 9 papers (7.8%)
USA: 9 papers (7.8%)
Poland: 7 papers (6.1%)
UK: 5 papers (4.3%)
Japan: 5 papers (4.3%)
Russia: 4 papers (3.5%)
Egypt: 3 papers (2.6%)
Germany: 3 papers (2.6%)
Iran: 3 papers (2.6%)
Brazil: 3 papers (2.6%)
Austria: 3 papers (2.6%)
Romania: 2 papers (1.7%)
India: 2 papers (1.7%)
United States: 2 papers (1.7%)
Saudi Arabia: 2 papers (1.7%)
Croatia: 2 papers (1.7%)
Spain: 2 papers (1.7%)
Greece: 2 papers (1.7%)
Turkey: 2 papers (1.7%)
Portugal: 2 papers (1.7%)
Singapore: 1 papers (0.9%)
United Kingdom: 1 papers (0.9%)
Indonesia: 1 papers (0.9%)
Belgium: 1 papers (0.9%)
South Korea: 1 papers (0.9%)
United Arab Emirates: 1 papers (0.9%)
TÃ¼rkiye: 1 papers (0.9%)
Republic of Korea: 1 papers (0.9%)
Palestine: 1 papers (0.9%)
Nepal: 1 papers (0.9%)
Norway: 1 papers (0.9%)
Colombia: 1 papers (0.9%)
Pakistan: 1 papers (0