In [72]:
import os
import json
import pandas as pd

In [None]:
def extract_field(data, field_path, default=None):
    """Safely extract nested fields from JSON."""
    keys = field_path.split('.')
    for key in keys:
        if isinstance(data, dict) and key in data:
            data = data[key]
        else:
            return default
    return data


def extract_plain_text(items):
    """Extract plain text from idxterms or classificationgroup."""
    if isinstance(items, list):
        return ", ".join([item.get('$', 'Unknown') for item in items if isinstance(item, dict)])
    return 'Unknown'

def extract_classifications(classification_group):
    """Extract only text-based classifications from the classification group."""
    if not isinstance(classification_group, dict):  
        return "unknown"

    classifications = classification_group.get('classifications', [])
    if not isinstance(classifications, list):  
        return "unknown"

   
    classification_texts = []
    for classification in classifications:
        if isinstance(classification, dict):
            classification_value = classification.get('classification', {})
            
            if isinstance(classification_value, dict):
                desc = classification_value.get('classification-description', '').strip()
                if desc and not desc.isdigit():  # Ensure it's text-based
                    classification_texts.append(desc.lower())

           
            elif isinstance(classification_value, list):
                for item in classification_value:
                    if isinstance(item, dict):
                        value = item.get('$', '').strip()
                        if value and not value.isdigit():  # Exclude numeric values
                            classification_texts.append(value.lower())

           
            elif isinstance(classification_value, str) and not classification_value.isdigit():
                classification_texts.append(classification_value.strip().lower())
                
    return ", ".join(classification_texts) if classification_texts else "unknown"


In [None]:
def process_json_file(file_path):
    """Process a single JSON file to extract fields."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        abstracts_data = data.get('abstracts-retrieval-response', {})
        coredata = abstracts_data.get('coredata', {})
        authors = abstracts_data.get('authors', {}).get('author', [])
        idx_terms = abstracts_data.get('idxterms', {}).get('mainterm', [])
        auth_keywords = abstracts_data.get('authkeywords', {}).get('author-keyword', [])
        enhancement = abstracts_data.get('item', {}).get('bibrecord', {}).get('head', {}).get('enhancement', {})
        classification_group = enhancement.get('classificationgroup', {})
        subject_areas = abstracts_data.get('subject-areas', {}).get('subject-area', [])

        return {
            "Title": extract_field(coredata, 'dc:title', 'Unknown'),
            "Authors": ", ".join([author.get('ce:indexed-name', 'Unknown') for author in authors]),
            "Publication Date": extract_field(coredata, 'prism:coverDate', 'Unknown'),
            "Keywords": extract_plain_text(idx_terms) + ", " + extract_plain_text(auth_keywords),
            "Abstract": extract_field(coredata, 'dc:description', 'Unknown'),
            "Classification": extract_classifications(classification_group),  
            "Subject Areas": extract_plain_text(subject_areas),
        }
        
    except Exception as e:
        print(f"ERROR: Failed to process {file_path}. Reason: {e}")
        return None


In [None]:
def process_directory(directory_path, output_csv_path):
    """Process all JSON files in a directory and save to CSV."""
    combined_data = []
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):  # Ensure it's a file
            processed_data = process_json_file(file_path)
            if processed_data:
                combined_data.append(processed_data)

    # Convert to DataFrame and save to CSV
    df = pd.DataFrame(combined_data)
    df.to_csv(output_csv_path, index=False, encoding='utf-8')
    print(f"Cleaned data saved to {output_csv_path}")


In [83]:
if __name__ == "__main__":
    input_directory = "DataSciProject/All/2023"  # Replace with the path to your 2018 folder
    output_csv = "Cleaned_2023_Papers.csv"  # Replace with desired output CSV file name
    process_directory(input_directory, output_csv)

ERROR: Failed to process DataSciProject/All/2023/202302851. Reason: 'NoneType' object has no attribute 'get'
ERROR: Failed to process DataSciProject/All/2023/202301882. Reason: 'NoneType' object has no attribute 'get'
ERROR: Failed to process DataSciProject/All/2023/202302409. Reason: 'NoneType' object has no attribute 'get'
ERROR: Failed to process DataSciProject/All/2023/202301876. Reason: 'NoneType' object has no attribute 'get'
ERROR: Failed to process DataSciProject/All/2023/202301278. Reason: 'NoneType' object has no attribute 'get'
ERROR: Failed to process DataSciProject/All/2023/202302099. Reason: 'NoneType' object has no attribute 'get'
ERROR: Failed to process DataSciProject/All/2023/202302436. Reason: 'NoneType' object has no attribute 'get'
ERROR: Failed to process DataSciProject/All/2023/202300153. Reason: 'NoneType' object has no attribute 'get'
ERROR: Failed to process DataSciProject/All/2023/202301081. Reason: 'NoneType' object has no attribute 'get'
ERROR: Failed to pr

In [32]:
output_csv = "Cleaned_2018_Papers.csv"  # Replace with your actual file path
df = pd.read_csv(output_csv)
print(df.head())  # Display the first few rows
print(f"Total rows with 'unknown': {len(df[df['Classification'] == 'unknown'])}")
print(f"Total rows in file: {len(df)}")


                                               Title  \
0  The benefit of punishment sensitivity on motor...   
1  Memory T cell subsets in healthy gingiva and p...   
2  Rational use of paracetamol among out-patients...   
3  Effectiveness of Tuberculosis Screening Techno...   
4  Non uniform exponential bounds on normal appro...   

                                             Authors Publication Date  \
0  Manley H., Beattie S., Roberts R., Lawrence G....       2018-06-01   
1  Mahanonda R., Champaiboon C., Subbalekha K., S...       2018-01-01   
2               Dorji T., Gyeltshen K., Pongpirul K.       2018-09-10   
3  Khumsri J., Hiransuthikul N., Hanvoravongchai ...       2018-09-01   
4      Kamjornkittikoon K., Neammanee K., Chaidee N.       2018-03-04   

                                            Keywords  \
0  Adaptation, Psychological, Adult, Cognition, F...   
1  CD4-Positive T-Lymphocytes, CD8-Positive T-Lym...   
2  Acetaminophen, Adolescent, Adult, Aged, Analge...   


In [26]:
df = pd.read_csv("Cleaned_2018_Papers.csv")  # Replace with your actual file path
print(df)  # Display rows with 'unknown'
print(f"Total rows with 'unknown': {len(df[df['Classification'] == 'unknown'])}")
print(f"Total rows in file: {len(df)}")



                                                Title  \
0   The benefit of punishment sensitivity on motor...   
1   Memory T cell subsets in healthy gingiva and p...   
2   Rational use of paracetamol among out-patients...   
3   Effectiveness of Tuberculosis Screening Techno...   
4   Non uniform exponential bounds on normal appro...   
..                                                ...   
80  Acanthus ebracteatus leaf extract provides neu...   
81  Dietary cation and anion difference: Effects o...   
82  Carbon dioxide and methane emission rates from...   
83  Monitoring minimal residual disease in canine ...   
84  Enhancement of biocompatibility on aligned ele...   

                                              Authors Publication Date  \
0   Manley H., Beattie S., Roberts R., Lawrence G....       2018-06-01   
1   Mahanonda R., Champaiboon C., Subbalekha K., S...       2018-01-01   
2                Dorji T., Gyeltshen K., Pongpirul K.       2018-09-10   
3   Khumsri J., Hir