In [1]:
import os
import json
import pandas as pd

# Directories containing JSON files
directories = ['2018']

# List to store extracted data
extracted_data = []

# Iterate through each directory
for directory in directories:
    # Iterate through each file in the current directory
    for filename in os.listdir(directory):
        if filename.endswith('.json'):  # Ensure it's a JSON file
            filepath = os.path.join(directory, filename)
            
            # Open and load the JSON file
            with open(filepath, 'r') as file:
                data = json.load(file)
                
                # Extract the relevant part of the JSON
                response = data.get('abstracts-retrieval-response', {})
                
                # Extract specific fields
                title = response.get('coredata', {}).get('dc:title', None)
                publicationName = response.get('coredata', {}).get('prism:publicationName', None)
                abstract = response.get('item', {}).get('bibrecord', {}).get('head', {}).get('abstracts', None)
                
                subject_area_list = response.get('subject-areas', {}).get('subject-area', [])
                subjectArea = subject_area_list[0].get('@abbrev', None) if subject_area_list else None

                # Extract author keywords and combine into a single string
                # Extract author keywords and combine into a single string
                auth_keywords = response.get('authkeywords', {})
                if isinstance(auth_keywords, dict):  # Ensure it's a dictionary
                    keywords_list = auth_keywords.get('author-keyword', [])
                    if isinstance(keywords_list, list):  # Ensure 'author-keyword' is a list
                        combined_keywords = " ".join(
                        [kw.get('$', '') for kw in keywords_list if isinstance(kw, dict)]
                        )
                else:
                    combined_keywords = None  # Handle cases where auth_keywords is not a dictionary

                # Append the extracted data as a dictionary
                extracted_data.append({
                    'title': title,
                    'publicationName': publicationName,
                    'abstract' : abstract,
                    'keywords': combined_keywords,
                    'subjectArea': subjectArea
                })

# Convert the extracted data into a DataFrame
df = pd.DataFrame(extracted_data)


In [2]:
df

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,EIS Microstructure Pitting corrosion Polarizat...,MATE
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,Critical factors Innovation creation Public un...,BUSI
2,Is the occiput-wall distance valid and reliabl...,Musculoskeletal Science and Practice,© 2018Background: Hyperkyphosis may be frequen...,Cobb angle Dowager's hump Round back Spine,HEAL
3,Comparison of soil composition between farmlan...,Eurasian Journal of Analytical Chemistry,© 2018 Society for Innovative Research. All ri...,Agriculture land management Conserved area Soi...,CHEM
4,The impact of wire caliber on ERCP outcomes: a...,Gastrointestinal Endoscopy,© 2018Background and Aims: Wire-guided biliary...,,MEDI
...,...,...,...,...,...
2787,Estimating actual evapotranspiration from NDVI...,Frontiers in Artificial Intelligence and Appli...,© 2018 The authors and IOS Press. All rights r...,Actual evapotranspiration Landsat 8 NDVI Remot...,COMP
2788,Genome-wide association study identified new s...,Scientific Reports,© 2018 The Author(s).We have performed a genom...,,MULT
2789,Effects of transcranial direct current stimula...,Journal of the Medical Association of Thailand,"© 2018, Medical Association of Thailand. All r...",Motor cortex Rehabilitation Stroke Transcrania...,MEDI
2790,Prevalence and risk factors for canine cogniti...,Thai Journal of Veterinary Medicine,© 2018 Chulalongkorn University Printing House...,Canine cognitive dysfunction syndrome Prevalen...,VETE


In [3]:
df.shape

(2792, 5)

In [4]:
#df.shape

In [5]:
df.isnull().sum()

title                0
publicationName      0
abstract           106
keywords           555
subjectArea          0
dtype: int64

In [6]:
df.dropna(inplace=True) #axis=0 

In [7]:
df.shape

(2223, 5)

Cleannnnn Abstract

In [8]:
'''
import re

def clean_abstract(abstract):
    # Remove copyright and year text (e.g., "© 2019")
    cleaned_abstract = re.sub(r'© \\d{4}', '', abstract)

    # Remove any unwanted text like reference markers or citation brackets
    cleaned_abstract = re.sub(r'\\[.*?\\]', '', cleaned_abstract)

    # Remove special characters, keeping only alphabets and spaces
    cleaned_abstract = re.sub(r'[^a-zA-Z\\s]', '', cleaned_abstract)

    # Remove extra spaces and make text lowercase
    cleaned_abstract = re.sub(r'\\s+', ' ', cleaned_abstract).strip().lower()

    return cleaned_abstract
'''


'\nimport re\n\ndef clean_abstract(abstract):\n    # Remove copyright and year text (e.g., "© 2019")\n    cleaned_abstract = re.sub(r\'© \\d{4}\', \'\', abstract)\n\n    # Remove any unwanted text like reference markers or citation brackets\n    cleaned_abstract = re.sub(r\'\\[.*?\\]\', \'\', cleaned_abstract)\n\n    # Remove special characters, keeping only alphabets and spaces\n    cleaned_abstract = re.sub(r\'[^a-zA-Z\\s]\', \'\', cleaned_abstract)\n\n    # Remove extra spaces and make text lowercase\n    cleaned_abstract = re.sub(r\'\\s+\', \' \', cleaned_abstract).strip().lower()\n\n    return cleaned_abstract\n'

In [9]:
abstracts = ["© 2019 2019 Aungsuroch et al., published by Sciendo.There is no single study that has examined nursing research priorities in Vietnam. This study aimed to gain consensus from experts on the nursing research priorities in Vietnam. A three-round modified Delphi study was used in this study. A focus group discussion among experts was conducted in round I to identify the nursing research priorities (n=23). Data in round I were analyzed using content analysis. In round II, participants were invited to rate the importance of each nursing priority topic in a 5-point Likert scale questionnaire, which had a 74% (n=17) response rate. In round III, the questionnaire was returned to the experts (n=17) until consensus was reached. Data from round II and round III were analyzed to produce mean score and final rank. The top 12 research priority lists were identified, which included subthemes and areas of possible investigations. All priorities were classified into three groups in the rank order, namely: (i) nursing management and leadership, which included (1) nursing care quality, (2) management and leadership of nurse managers, (3) nursing image, (4) professional nurse competency, and (5) human resource management; (ii) nursing education, which included (1) knowledge-specific domain, (2) the linkage between education and practice, and (3) nurse teacher workforce; and (iii) nursing service, which included (1) adult nursing concern, (2) patient safety, (3) public health nursing concern, and (4) quality of life of patients and nurses. Consensus among experts was achiever, and the findings are considered as the basis of resources to the most essential research needs in Vietnam.",
]

In [10]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Function to clean abstract text
def clean_abstract(abstract):
    # Remove copyright symbols and years (like "© 2019")
    abstract = re.sub(r'©\s*\d{4}', '', abstract)
    
    # Process text with spaCy NLP model
    doc = nlp(abstract)

    # Keep only alphanumeric words (remove punctuation, digits, etc.)
    cleaned_text = " ".join([token.text.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_digit])

    # Optional: Perform lemmatization if necessary
    lemmatized_text = " ".join([token.lemma_ for token in nlp(cleaned_text)])

    # Optional: Truncate text to a maximum length
    max_length = 300
    if len(lemmatized_text) > max_length:
        lemmatized_text = lemmatized_text[:max_length] + "..."
    
    return lemmatized_text

# Example abstract
abstract = "© 2019 2019 Aungsuroch et al., published by Sciendo. There is no single study that has examined nursing research priorities in Vietnam..."

# Clean the abstract
cleaned_abstract = clean_abstract(abstract)

# Output cleaned abstract
print(cleaned_abstract)


   aungsuroch et al publish sciendo single study examine nursing research priority vietnam


In [11]:
df['abstract']

0       © 2018The microstructure and corrosion behavio...
1       Copyright © 2018 Inderscience Enterprises Ltd....
2       © 2018Background: Hyperkyphosis may be frequen...
3       © 2018 Society for Innovative Research. All ri...
5       © 2017, NATCO. All rights reserved.Introductio...
                              ...                        
2784    © 2018, The Author(s).There is no recent natio...
2786    © 2018 Indian Journal of Critical Care Medicin...
2787    © 2018 The authors and IOS Press. All rights r...
2789    © 2018, Medical Association of Thailand. All r...
2790    © 2018 Chulalongkorn University Printing House...
Name: abstract, Length: 2223, dtype: object

In [12]:
# ไม่เวิคมั้ง
# df['abstract_cleaned'] = [clean_abstract(a) for a in df['abstract'].to_list()]

In [17]:
df['abstract_cleaned'] = [clean_abstract(a) for a in df['abstract'].to_list()]


In [18]:
df['combined'] = df['title'] + " " + df['publicationName'] + " " + df['abstract_cleaned'] + " " + df['keywords']
df['combined'].head()

0    Effects of iron content on the microstructure ...
1    The critical factors of research and innovatio...
2    Is the occiput-wall distance valid and reliabl...
3    Comparison of soil composition between farmlan...
5    The influence of neighbor effect and urbanizat...
Name: combined, dtype: object

In [None]:
# check

In [25]:
df['title'].iloc[30]

'The nanoporous carbon derived from melamine based polybenzoxazine and NaCl templating'

In [27]:
df['publicationName'].iloc[30]

'Key Engineering Materials'

In [24]:
df['keywords'].iloc[30]

'Acid catalysis Melamine Polybenzoxazine Porous carbon Ring-opening polymerization Templating'

In [20]:
df['abstract_cleaned'].iloc[30]

'   trans tech publication switzerlandnanoporous carbon successfully prepare polybenzoxazine synthesize bisphenol melamine formaldehyde precursor vary hcl amount add pre polymer solution catalyst ring open polymerization reaction trace ftir dsc addition degradation behavior study tga textural propert...'

In [19]:
df['combined'].iloc[30]

'The nanoporous carbon derived from melamine based polybenzoxazine and NaCl templating Key Engineering Materials    trans tech publication switzerlandnanoporous carbon successfully prepare polybenzoxazine synthesize bisphenol melamine formaldehyde precursor vary hcl amount add pre polymer solution catalyst ring open polymerization reaction trace ftir dsc addition degradation behavior study tga textural propert... Acid catalysis Melamine Polybenzoxazine Porous carbon Ring-opening polymerization Templating'