In [1]:
import requests
import pandas as pd
from tqdm import tqdm
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# API endpoints
ENDPOINT = 'https://api.core.ac.uk/v3/search/works/'

# Credentials: the CORE API key is read from the CORE_API_KEY environment
# variable so that the secret is never stored in (or committed with) the notebook.
# NOTE(review): a key was previously hard-coded in this cell; treat it as
# exposed and rotate it.
_core_api_key = os.environ.get('CORE_API_KEY', '').strip()

# When no key is configured the Authorization header is omitted entirely rather
# than sending an empty bearer token.
headers = {'Authorization': f'Bearer {_core_api_key}'} if _core_api_key else {}
if not _core_api_key:
    print('CORE_API_KEY is not set; requests will be sent without an Authorization header.')

In [None]:
# Probe the search endpoint once, without query parameters, only to read the
# rate-limit headers returned with the response.
# The timeout keeps a stalled connection from blocking the kernel indefinitely.
response = requests.get(ENDPOINT, headers=headers, timeout=30)

if response.status_code == 200:
    # Each value is None when the server did not send the corresponding header.
    rate_limit_remaining = response.headers.get('X-RateLimit-Remaining')
    rate_limit_retry_after = response.headers.get('X-RateLimit-Retry-After')
    rate_limit_limit = response.headers.get('X-RateLimit-Limit')

    print(f"Rate Limit Remaining: {rate_limit_remaining}")
    print(f"Rate Limit Retry After: {rate_limit_retry_after}")
    print(f"Rate Limit: {rate_limit_limit}")
else:
    print(f"Failed to fetch data: {response.status_code}")


In [None]:
# Earliest publication year to include; also reused later in output file names.
yearPublished_start = '2019'

# Search query: works published from `yearPublished_start` onwards, in English,
# whose title matches one of the listed economic-complexity phrasings.
# NOTE(review): only a single page (offset 0, up to `limit` results) is
# requested; compare data['totalHits'] with len(data['results']) to see whether
# further pages would be needed.
params_search = {
    'q' : rf"(yearPublished>={yearPublished_start}) AND (language:English) AND (title:economic+complexity OR title:economics+complexity OR title:complexity+economics OR title:economy+complexity OR title:complexity+economy OR title:complex+economics OR title:economics+complex OR title:complex+systems+economics OR title:complexity+theory+economics OR title:complex+adaptive+systems+economics)",
    'limit' : 2000,
    'offset' : 0
}

# The timeout prevents the cell from hanging on a stalled connection.
response = requests.get(ENDPOINT, params=params_search, headers=headers, timeout=120)
# Fail here, with the HTTP status in the error, instead of parsing an error
# payload and hitting a confusing KeyError in a later cell.
response.raise_for_status()
data = response.json()

In [None]:
# Display the 'totalHits' field of the parsed search response.
# NOTE(review): presumably the total number of matching works, which may be
# larger than len(data['results']) — confirm against the API documentation.
data['totalHits']

In [None]:
# Output columns of the corpus frame, in display order.
CORPUS_COLUMNS = [
    'COREid',
    'title',
    'author(s)',
    'publication_year',
    'abstract',
    'doi',
    'lang',
    'publisher',
    'text',
]


def build_corpus(results):
    """Turn a list of search-result dicts into the corpus DataFrame.

    Only results that carry a full text are kept: entries whose 'fullText'
    is missing or None are skipped. All other fields are optional and default
    to None; a missing or null 'authors' entry yields an empty author list.

    Parameters
    ----------
    results : list of dict
        The entries of the search response's 'results' list.

    Returns
    -------
    pandas.DataFrame
        One row per kept result, with the columns in CORPUS_COLUMNS. The
        frame is empty (but still has those columns) when nothing is kept.
    """
    records = []
    for paper in tqdm(results, total=len(results)):
        full_text = paper.get('fullText')
        if full_text is None:
            # No full text available: the paper cannot contribute to the corpus.
            continue
        records.append({
            'COREid': paper.get('id'),
            'title': paper.get('title'),
            # `or []` covers both an absent key and an explicit null value.
            'author(s)': [author.get('name') for author in (paper.get('authors') or [])],
            'publication_year': paper.get('yearPublished'),
            'abstract': paper.get('abstract'),
            'doi': paper.get('doi'),
            'lang': paper.get('language'),
            'publisher': paper.get('publisher'),
            'text': full_text,
        })
    # Passing `columns` keeps the schema stable even when `records` is empty.
    return pd.DataFrame(records, columns=CORPUS_COLUMNS)


corpus = build_corpus(data['results'])

In [None]:
# Inspect the corpus DataFrame assembled from the search results.
corpus

In [None]:
# Keep the first row for every distinct title, then discard any row that is
# missing its title or its full text.
corpus_unique_clean = (
    corpus
    .drop_duplicates(subset=['title'], keep='first')
    .dropna(axis=0, how='any', subset=['title', 'text'])
)

In [None]:
# Inspect the corpus after de-duplicating on title and dropping rows with a
# missing title or text.
corpus_unique_clean

In [None]:
# Combine each paper's title and full text into one document string, with the
# title first and a blank line separating it from the body.
corpus_unique_clean['title_text'] = list(
    map('{}\n\n{}'.format, corpus_unique_clean['title'], corpus_unique_clean['text'])
)

In [None]:
# Cache for the stop-word set so the NLTK corpus is read only once per kernel.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split into tokens with NLTK's `word_tokenize`; every token
    whose lower-cased form appears in NLTK's English stop-word list is
    dropped, and the remaining tokens are joined with single spaces. The
    original casing of the kept tokens is preserved.

    Parameters
    ----------
    text : str
        The document to filter.

    Returns
    -------
    str
        The kept tokens joined by single spaces (an empty string when no
        token survives).

    Notes
    -----
    The NLTK stop-word corpus and tokenizer model must already be available
    locally; NLTK raises `LookupError` when a required resource is missing.
    """
    stop_words = _english_stop_words()
    kept_tokens = [token for token in word_tokenize(text) if token.lower() not in stop_words]
    return ' '.join(kept_tokens)

# Derive the stop-word-free frame as a NEW DataFrame. `assign` returns a copy,
# so `corpus_unique_clean` is left untouched; a plain `=` would only create a
# second name for the same object and add the column to both.
corpus_unique_clean_nostop = corpus_unique_clean.assign(
    title_text_nostop=corpus_unique_clean['title_text'].apply(remove_stopwords)
)

In [None]:
# Inspect the frame that now carries the 'title_text_nostop' column.
corpus_unique_clean_nostop

In [None]:
# Persist the processed corpus as a tab-separated file. The file name encodes
# the start year and the number of documents; the DataFrame index is written
# out under the column name 'id'.
corpus_tsv_path = f'economic_complexity_corpus_{yearPublished_start}-2024_{len(corpus_unique_clean)}.tsv'
corpus_unique_clean_nostop.to_csv(
    corpus_tsv_path,
    sep='\t',
    escapechar='\\',
    index_label='id',
)

In [None]:
# Reload the corpus from the TSV written by the export cell, then write one
# plain-text file per document containing its stop-word-free title + text.
df = pd.read_table(f'economic_complexity_corpus_{yearPublished_start}-2024_{len(corpus_unique_clean_nostop)}.tsv') 

output_dir = f'corpus_txt/{yearPublished_start}-2024_txt'
# Create the (nested) output directory up front; open(..., 'w') does not
# create missing parent directories and would raise FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)

for _, row in df.iterrows():
    # File name pattern: <row id>_<CORE id>_<publication year>.txt
    filename = f"{row['id']}_{row['COREid']}_{row['publication_year']}.txt"
    filepath = os.path.join(output_dir, filename)
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(f"{row['title_text_nostop']}")

print(f"Saved {len(df)} files in {output_dir} directory.")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Number of papers per publication year, in the order value_counts() returns them
year_counts = df['publication_year'].value_counts()
years = year_counts.index.tolist()
counts = year_counts.tolist()

# Scale the counts to [0, 1] relative to the largest one and map them onto the
# Blues colormap, so fuller years get darker bars
norm_counts = np.asarray(counts) / max(counts)
colors = plt.cm.Blues(norm_counts)

# Bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(years, counts, color=colors)

# Axis labels, with one tick per publication year
ax.set_xlabel('Publication Year')
ax.set_ylabel('Number of Papers')
ax.set_xticks(years)

# Print each bar's count just above it (change `fontsize` to taste)
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, height + 0.2, int(height), ha='center', va='bottom', fontsize=10)

# Show plot
plt.show()
