<a href="https://colab.research.google.com/github/galenos-project/literature-mining/blob/main/07_22_openalex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import requests
import nltk
from collections import namedtuple
from nltk.corpus import brown
import time  # Import the time module for pausing between requests
import re  # Import the regex module

# Ensure NLTK resources are downloaded
nltk.download('brown')

# Define helper functions
def toAbstract(p):
    # Placeholder function, replace with actual implementation
    return p.get('abstract', '')

def is_en(text, bc_brown, threshold=0.2):
    """
    Function to determine the papers' language
    PARAM text: receiving some text (should be title + abstract text)
    RETURN language: language code, an exception or empty
    RETURN return_code: PASS, FAIL or EMPTY
    """
    if text is None or text == "":
        language = 'NA'
        return_code = 'EMPTY'
    else:
        try:
            tok = set(re.findall(r"\w+", text.lower()))
            if len(tok) <= 0:
                language = 'NA'
                return_code = 'EMPTY'
            elif len(tok & bc_brown.value) / len(tok) <= threshold:
                language = 'non-en'
                return_code = 'PASS'
            else:
                language = 'en'
                return_code = 'PASS'
        except Exception as e:
            language = f"{e}"
            return_code = 'FAIL'
    return return_code, language

def getCitations(c):
    if c is None or c == '':
        return ''
    return '|'.join(c).replace('https://openalex.org/W', '')

def getAuthors(Authors):
    ret = ''
    for a in Authors:
        if ret == '':
            if 'id' in a['author']:
                ret = str(a['author']['id']).replace('https://openalex.org/A', '')
        else:
            if 'id' in a['author']:
                ret += '|' + str(a['author']['id']).replace('https://openalex.org/A', '')
    return ret

def getConcepts(Concepts):
    ret = ''
    for c in Concepts:
        if ret == '':
            ret = c['id'].replace('https://openalex.org/C', '')
        else:
            ret += '|' + c['id'].replace('https://openalex.org/C', '')
    return ret

def calculate_total_pages(total_results, page_size):
    return total_results // page_size + (total_results % page_size > 0)

# Initialize parameters
cursor = '*'
sample_size = 50000
papersDict = []
current_page = 1

# Prepare for API requests
Brown = namedtuple("Brown", field_names=['value'])
bc_brown = Brown(value=set([x.lower() for x in brown.words()]))
headers = {'email': 'avahomiar@gmail.com'}
search_term = '(((mood) OR (depress*) OR (affective disorder)) OR ((psychosis) OR (schizo*) OR (psychotic))) AND ((psychedelic) OR (hallucinogen) OR (entheogen) OR (hallucinogenic) OR (psychotropic)) AND (2014/2024[Date - Publication])'
encoded_search_term = requests.utils.quote(search_term)
baseUrl = f'https://api.openalex.org/works?search={encoded_search_term}'

# Create the directory if it does not exist
output_dir = '/content'
os.makedirs(output_dir, exist_ok=True)

# Fetch and process articles
while cursor:
    print(f"Processing page {current_page}...")

    r = requests.get(url=f"{baseUrl}&per_page=100&cursor={cursor}", headers=headers)

    if r.status_code == 200:
        try:
            theJson = r.json()
        except requests.exceptions.JSONDecodeError:
            print(f"Error decoding JSON response. Status code: {r.status_code}")
            print("Response content:", r.text)  # Print the raw response for debugging
            break  # Exit the loop if JSON decoding fails

        if 'next_cursor' in theJson['meta']:
            cursor = theJson['meta']['next_cursor']
            if 'results' in theJson and theJson['results']:
                for p in theJson['results']:
                    abstract = toAbstract(p)
                    title = p.get('title', '').replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
                    pub_date = p.get('publication_date', '')
                    pub_year = p.get('publication_year', '')
                    passed, lang = is_en(title + ' ' + abstract, bc_brown)
                    if lang != 'non-en':
                        papersDict.append({
                            'PaperId': p['id'].replace('https://openalex.org/W', ''),
                            'PaperTitle': title,
                            'Citations': getCitations(p.get('referenced_works', [])),
                            'coFoS': getConcepts(p.get('concepts', [])),
                            'Authors': getAuthors(p.get('authorships', [])),
                            'Abstract': abstract,
                            'Lang': 'en',
                            'PubYear': pub_year,
                            'PubDate': pub_date
                        })
            else:
                cursor = None
        else:
            cursor = None

        # Pause for 1 second between requests to avoid rate limiting
        time.sleep(1)
    else:
        print(f"Request failed with status code {r.status_code}")
        break

df2 = pd.DataFrame(papersDict)
output_file_path = os.path.join(output_dir, 'moodpsychosispsychadelics.csv')
df2.to_csv(output_file_path, sep=',', index=False)

print(f'Done. File saved to {output_file_path}')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


Processing page 1...
Done. File saved to /content/moodpsychosispsychadelics.csv


https://openalex.org/works?page=1&filter=title_and_abstract.search%3A%28%28mood%29%20OR%20%28depress%2a%29%20OR%20%28affective%20disorder%29%29%20AND%20%28%28psychosis%29%20OR%20%28schizo%2a%29%20OR%20%28psychotic%29%29%20AND%20%28%28psychedelic%29%20OR%20%28hallucinogen%29%20OR%20%28entheogen%29%20OR%20%28hallucinogenic%29%20OR%20%28psychotropic%29%29,publication_year%3A2014-2024&sort=publication_year%3Adesc

In [None]:
import requests
import pandas as pd
import time
import urllib.parse

# Define search parameters
search_term = '((((mood) OR (depress*) OR (affective disorder)) OR ((psychosis) OR (schizo*) OR (psychotic)) OR ((anxiety) OR (anxious))) AND ((psychedelic) OR (hallucinogen) OR (entheogen) OR (hallucinogenic) OR (psychotropic)))'
publication_year_range = '2014-2024'
base_url = 'https://api.openalex.org/works'

# URL encode the search term
encoded_search_term = urllib.parse.quote(search_term)

# Construct the search URL
search_url = f'{base_url}?filter=title_and_abstract.search:{encoded_search_term},publication_year:{publication_year_range}&sort=publication_year:desc'

# Initialize variables for pagination
cursor = '*'
all_results = []

# Fetch and process articles
while cursor:
    full_url = f"{search_url}&per_page=100&cursor={cursor}"
    print(f"Fetching results from: {full_url}")

    # Make the request to the OpenAlex API
    response = requests.get(full_url)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()

        # Extract results and add to the list
        if 'results' in data and data['results']:
            all_results.extend(data['results'])
        else:
            print("No results found or end of results.")
            break

        # Check for the next cursor
        cursor = data['meta'].get('next_cursor', None)

        # Sleep to avoid rate limits
        time.sleep(1)
    else:
        print(f"Request failed with status code {response.status_code}")
        break

# Process the collected results
for result in all_results:
    print(result.get('title', 'No Title'))

print(f"Total number of results fetched: {len(all_results)}")

df = pd.DataFrame(all_results)
output_file_path = '/content/psychadelics.csv'
df.to_csv(output_file_path, index=False)

print(f'Done. File saved to {output_file_path}')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Lifetime use of MDMA/ecstasy and psilocybin is associated with reduced odds of major depressive episodes
National Tele-Mental Health Program in India: A step towards mental health care for all?
Brief Behavioral Therapy and Bupropion for Sleep and Fatigue in Young Adults With Crohn’s Disease: An Exploratory Open Trial Study
Hypnotic medication use among inpatients with schizophrenia and major depressive disorder: results of a nationwide study
Impact of COVID-19 restrictions on behavioural and psychological symptoms in home-dwelling people with dementia: a prospective cohort study (PAN.DEM).
Prescription Psychedelics: The Road from FDA Approval to Clinical Practice
Aberrant default mode connectivity in adolescents with early-onset psychosis: A resting state fMRI study
Multi-centre classification of functional neurological disorders based on resting-state functional connectivity
Use of Benzodiazepines and Antipsychotic Drugs

In [None]:
import os
import pandas as pd
import requests
import nltk
from collections import namedtuple
from nltk.corpus import brown
import time
import re
import urllib.parse

nltk.download('brown')

# Define helper functions
def toAbstract(p):
    """
    Extract the abstract from the paper.
    """
    return p.get('abstract', '')

def is_en(text, bc_brown, threshold=0.2):
    """
    Function to determine the papers' language.
    """
    if text is None or text == "":
        return 'EMPTY', 'NA'
    try:
        tok = set(re.findall(r"\w+", text.lower()))
        if len(tok) == 0:
            return 'EMPTY', 'NA'
        if len(tok & bc_brown.value) / len(tok) <= threshold:
            return 'PASS', 'non-en'
        else:
            return 'PASS', 'en'
    except Exception as e:
        return 'FAIL', str(e)

def getCitations(c):
    """
    Get citations from the paper.
    """
    if not c:
        return ''
    return '|'.join(c).replace('https://openalex.org/W', '')

def getAuthors(Authors):
    """
    Get authors from the paper.
    """
    return '|'.join([a['author']['id'].replace('https://openalex.org/A', '') for a in Authors if 'id' in a['author']])

def getConcepts(Concepts):
    """
    Get concepts from the paper.
    """
    return '|'.join([c['id'].replace('https://openalex.org/C', '') for c in Concepts])

# Initialize parameters
cursor = '*'
papersDict = []

# Prepare for API requests
Brown = namedtuple("Brown", field_names=['value'])
bc_brown = Brown(value=set([x.lower() for x in brown.words()]))
headers = {'email': 'avahomiar@gmail.com'}
search_term = '((((mood) OR (depress*) OR (affective disorder)) OR ((psychosis) OR (schizo*) OR (psychotic)) OR ((anxiety) OR (anxious))) AND ((psychedelic) OR (hallucinogen) OR (entheogen) OR (hallucinogenic) OR (psychotropic)))'
publication_year_range = '2014-2024'
encoded_search_term = urllib.parse.quote(search_term)
baseUrl = f'https://api.openalex.org/works?filter=title_and_abstract.search:{encoded_search_term},publication_year:{publication_year_range}&sort=publication_year:desc'

# Create the directory
output_dir = '/content'
os.makedirs(output_dir, exist_ok=True)

# Fetch and process articles
while cursor:
    full_url = f"{baseUrl}&per_page=100&cursor={cursor}"
    print(f"Fetching results from: {full_url}")

    response = requests.get(full_url, headers=headers)

    if response.status_code == 200:
        try:
            theJson = response.json()
        except requests.exceptions.JSONDecodeError:
            print(f"Error decoding JSON response. Status code: {response.status_code}")
            print("Response content:", response.text)
            break

        if 'results' in theJson and theJson['results']:
            for p in theJson['results']:
                abstract = toAbstract(p)
                title = p.get('title', '')
                if title is None:
                    title = ''
                title = title.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
                pub_date = p.get('publication_date', '')
                pub_year = p.get('publication_year', '')
                passed, lang = is_en(title + ' ' + abstract, bc_brown)
                if lang != 'non-en':
                    papersDict.append({
                        'PaperId': p['id'].replace('https://openalex.org/W', ''),
                        'PaperTitle': title,
                        'Citations': getCitations(p.get('referenced_works', [])),
                        'coFoS': getConcepts(p.get('concepts', [])),
                        'Authors': getAuthors(p.get('authorships', [])),
                        'Abstract': abstract,
                        'Lang': 'en',
                        'PubYear': pub_year,
                        'PubDate': pub_date
                    })
            cursor = theJson['meta'].get('next_cursor', None)
        else:
            cursor = None

        # Pause to avoid rate limiting
        time.sleep(1)
    else:
        print(f"Request failed with status code {response.status_code}")
        break

df2 = pd.DataFrame(papersDict)
output_file_path = os.path.join(output_dir, 'moodpsychosispsychadelics.csv')
df2.to_csv(output_file_path, index=False)

print(f'Done. File saved to {output_file_path}')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Fetching results from: https://api.openalex.org/works?filter=title_and_abstract.search:%28%28%28%28mood%29%20OR%20%28depress%2A%29%20OR%20%28affective%20disorder%29%29%20OR%20%28%28psychosis%29%20OR%20%28schizo%2A%29%20OR%20%28psychotic%29%29%20OR%20%28%28anxiety%29%20OR%20%28anxious%29%29%29%20AND%20%28%28psychedelic%29%20OR%20%28hallucinogen%29%20OR%20%28entheogen%29%20OR%20%28hallucinogenic%29%20OR%20%28psychotropic%29%29%29,publication_year:2014-2024&sort=publication_year:desc&per_page=100&cursor=*
Fetching results from: https://api.openalex.org/works?filter=title_and_abstract.search:%28%28%28%28mood%29%20OR%20%28depress%2A%29%20OR%20%28affective%20disorder%29%29%20OR%20%28%28psychosis%29%20OR%20%28schizo%2A%29%20OR%20%28psychotic%29%29%20OR%20%28%28anxiety%29%20OR%20%28anxious%29%29%29%20AND%20%28%28psychedelic%29%20OR%20%28hallucinogen%29%20OR%20%28entheogen%29%20OR%20%28hallucinogenic%29%20OR%20%28psychotropic%29%29%29,publication_year:2014-2024&sort=publication_year:desc&per_pa

original openalex code


In [5]:
import pandas as pd
import requests
import json
from nltk.corpus import brown
from collections import namedtuple
import nltk

In [9]:
def toAbstract(s):
    if s == None:
        return ''
    if not 'abstract_inverted_index' in s:
        return ''
    s = s['abstract_inverted_index']
    vocab = []
    #s = str(json.loads(s))
    try:
        invIdx = s #["InvertedIndex"]
        for iidx in invIdx.items():
            for i in iidx[1]:
                vocab.append((iidx[0], i))
        vocab.sort(key=lambda x:x[1])
        retS = ''
        for x in range(len(vocab)):
            retS = retS + vocab[x][0] + ' '
        #return re.sub(r'''(?x)(?:\s+)''', " ", " ".join([x[0] for x in vocab]))
        return retS.replace('\n\r', ' ').replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
    except:
        return ''

def is_en(text, bc_brown, threshold=0.2):
    """
    Function to determine the papers' language
    PARAM text: receiving some text (should be title + abstract text)
    RETURN language: language code, an exception or empty
    RETURN return_code: PASS, FAIL or EMPTY
    """

    if text is None or text == "":
        # text is an empty stirng
        language = 'NA'
        return_code = 'EMPTY'
    else:
        # text is a non empty string
        try:
            tok = set(re.findall(r"\w+", text.lower()))

            if len(tok) <= 0:
                # no tokens extracted - parse as empty
                language = 'NA'
                return_code = 'EMPTY'
            elif len(tok & bc_brown.value) / len(tok) <= threshold:
                # tokens extracted but too few are english terms
                language = 'non-en'
                return_code = 'PASS'
            else:
                # tokens extracted and enough are english terms
                language = 'en'
                return_code = 'PASS'

        except Exception as e:
            # something bad happened
            language = f"{e}"
            return_code = 'FAIL'

    return return_code, language

def getCitations(c):
    if c == None or c == '':
        return ''
    return str('|').join(c).replace('https://openalex.org/W', '')

def getAuthors(Authors):
    ret = ''
    for a in Authors:
        if ret == '':
            if 'id' in a['author']:
                ret = str(a['author']['id']).replace('https://openalex.org/A', '')
        else:
            if 'id' in a['author']:
                ret += '|' + str(a['author']['id']).replace('https://openalex.org/A', '')
    return ret

def getConcepts(Concepts):
    ret = ''
    for c in Concepts:
        if ret == '':
            ret = c['id'].replace('https://openalex.org/C', '')
        else:
            ret += '|' + c['id'].replace('https://openalex.org/C', '')
    return ret

def calculate_total_pages(total_results, page_size):
    return total_results // page_size + (total_results % page_size > 0)

In [10]:
import os
import pandas as pd
import requests
import nltk
from collections import namedtuple
from nltk.corpus import brown
import time
import re

nltk.download('brown')

# Define helper functions
def toAbstract(p):
    return p.get('abstract', '')

def is_en(text, bc_brown, threshold=0.2):
    if text is None or text == "":
        return 'EMPTY', 'NA'
    try:
        tok = set(re.findall(r"\w+", text.lower()))
        if len(tok) == 0:
            return 'EMPTY', 'NA'
        if len(tok & bc_brown.value) / len(tok) <= threshold:
            return 'PASS', 'non-en'
        else:
            return 'PASS', 'en'
    except Exception as e:
        return 'FAIL', str(e)

def getCitations(c):
    if not c:
        return ''
    return '|'.join(c).replace('https://openalex.org/W', '')

def getAuthors(Authors):
    return '|'.join([a['author']['id'].replace('https://openalex.org/A', '') for a in Authors if 'id' in a['author']])

def getConcepts(Concepts):
    return '|'.join([c['id'].replace('https://openalex.org/C', '') for c in Concepts])

# Initialize parameters
cursor = '*'
sample_size = 50000
papersDict = []
current_page = 1

Brown = namedtuple("Brown", field_names=['value'])
bc_brown = Brown(value=set([x.lower() for x in brown.words()]))
headers = {'email': 'avahomiar@gmail.com'}
baseUrl = 'https://openalex.org/works?page=1&filter=title_and_abstract.search%3A%28%28mood%29%20OR%20%28depress%2a%29%20OR%20%28affective%20disorder%29%29%20AND%20%28%28psychosis%29%20OR%20%28schizo%2a%29%20OR%20%28psychotic%29%29%20AND%20%28%28psychedelic%29%20OR%20%28hallucinogen%29%20OR%20%28entheogen%29%20OR%20%28hallucinogenic%29%20OR%20%28psychotropic%29%29,publication_year%3A2014-2024&sort=publication_year%3Adesc'

# Create the directory if it does not exist
output_dir = './output'
os.makedirs(output_dir, exist_ok=True)

# Fetch and process articles
while cursor:
    full_url = f"{baseUrl}&per_page=100&cursor={cursor}"
    print(f"Fetching results from: {full_url}")

    r = requests.get(full_url, headers=headers)

    if r.status_code == 200:
        try:
            theJson = r.json()
        except requests.exceptions.JSONDecodeError:
            print(f"Error decoding JSON response. Status code: {r.status_code}")
            print("Response content:", r.text)
            break

        if 'results' in theJson and theJson['results']:
            for p in theJson['results']:
                abstract = toAbstract(p)
                title = p.get('title', '')
                if title is None:
                    title = ''
                title = title.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
                pub_date = p.get('publication_date', '')
                pub_year = p.get('publication_year', '')
                passed, lang = is_en(title + ' ' + abstract, bc_brown)
                if lang != 'non-en':
                    papersDict.append({
                        'PaperId': p['id'].replace('https://openalex.org/W', ''),
                        'PaperTitle': title,
                        'Citations': getCitations(p.get('referenced_works', [])),
                        'coFoS': getConcepts(p.get('concepts', [])),
                        'Authors': getAuthors(p.get('authorships', [])),
                        'Abstract': abstract,
                        'Lang': 'en',
                        'PubYear': pub_year,
                        'PubDate': pub_date
                    })
            cursor = theJson['meta'].get('next_cursor', None)
        else:
            cursor = None

        # Pause to avoid rate limiting
        time.sleep(1)
    else:
        print(f"Request failed with status code {r.status_code}")
        break

df2 = pd.DataFrame(papersDict)
output_file_path = os.path.join(output_dir, 'openalex_moodpsychosispsychadelics.csv'
df2.to_csv(output_file_path, index=False)

print(f'Done. File saved to {output_file_path}')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Fetching results from: https://openalex.org/works?page=1&filter=title_and_abstract.search%3A%28%28mood%29%20OR%20%28depress%2a%29%20OR%20%28affective%20disorder%29%29%20AND%20%28%28psychosis%29%20OR%20%28schizo%2a%29%20OR%20%28psychotic%29%29%20AND%20%28%28psychedelic%29%20OR%20%28hallucinogen%29%20OR%20%28entheogen%29%20OR%20%28hallucinogenic%29%20OR%20%28psychotropic%29%29,publication_year%3A2014-2024&sort=publication_year%3Adesc&per_page=100&cursor=*
Error decoding JSON response. Status code: 200
Response content: <!doctype html><html lang="en"><head><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width,initial-scale=1"><link rel="icon" href="/favicon.png"><title>OpenAlex</title><link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:100,300,400,500,700,900"><link href="https://fonts.googleapis.com/css2?family=Dosis:wght@100;200;300;400;500;600;700;800;900&display=swap" rel="stylesheet"><scrip