# Data Collection - Academic Journal Data

### Semantic Scholar API 
"S2ORC is the largest publicly-available collection of full text for open-access scientific papers." <br>
Description of database content: https://www.semanticscholar.org/reader/cb92a7f9d9dbcf9145e32fdfa0e70e2a6b828eb1,
https://aclanthology.org/2020.acl-main.447.pdf

S2ORC = Full text <br>
S2AG = Metadata (citations, nodes, etc.)

In [1]:
import requests
import pandas as pd
import bs4

# PDF stuff
import pdfminer.pdfinterp
import pdfminer.converter
import pdfminer.layout
import pdfminer.pdfpage

# More
import re #for regexs
import urllib.parse #For joining urls
import io #for making http requests look like files
import json #For Tumblr API responses
import os.path #For checking if files exist
import os #For making directories

import math
import numpy as np

# Week 3
import lucem_illud
import sklearn
import sklearn.feature_extraction.text
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.datasets
import sklearn.cluster
import sklearn.decomposition
import sklearn.metrics
# import scipy #For hierarchical clustering and some visuals
# #import scipy.cluster.hierarchy
# import gensim#For topic modeling

### Retrieve Abstracts via API

In [178]:
# Set Semantic Scholar API key
S2_API_KEY = os.environ.get('S2_API_KEY')

In [179]:
# Sample code to CHECK CONTENTS OF PAPER
# Define the paper search endpoint URL

# import time

url = 'https://api.semanticscholar.org/graph/v1/paper/search/bulk'

headers = {
        'X-API-KEY': S2_API_KEY,
    }

# Define the required query parameter and its value (in this case, the keyword we want to search for)
query_params = {
    'query': 'holocaust survivor interview',
    'limit': 100,
    'offset': 1100,
    'fields': 'title,abstract,year'
    # 'fields': 'title,publicationTypes,publicationDate'
}

# time.sleep(1.1)

# Make the GET request with the URL and query parameters
searchResponse = requests.get(url, params=query_params)

In [181]:
searchResponse.text



In [182]:
# Convert the string to a dictionary
response_d1 = json.loads(searchResponse.text)

# Now, data_dict is a Python dictionary
print(response_d1)



**View results**

In [183]:
response_d1['data'][:5]

[{'paperId': '00557a0bd108e22aba0476fa6706a0d8fd821d9a',
  'title': 'Archiving the Memory of the Holocaust',
  'abstract': None,
  'year': 2020},
 {'paperId': '01299e0a5c913f413ec1add7ec1c407d52e3d904',
  'title': 'Forgotten victims of World War II: Hungarian women in Soviet forced labour camps.',
  'abstract': 'Tra la fine del 1944 e l’inizio del 1945 migliaia di giovani donne dai 15 anni in su, incluse donne incinte, furono deportate dall’Ungheria orientale verso campi di lavoro sovietici. Furono vittime innocenti, per “essere state nel posto sbagliato al momento sbagliato”. Per periodi dai due ai quattro anni furono obbligate a vivere nelle condizioni più primitive ed atroci, lavorando in miniere di carbone ed in fattorie collettive. La maggior partesi queste donne non sopravvisse. Quelle che vi riuscirono ritornarono con infermità e malattie fisiche e psicologiche che le afflissero per il resto della vita. Al loro ritorno furono trattate dal regime comunista come criminali di guerr

In [184]:
abstracts = {}
for paper in response_d1['data']:
        if paper['abstract']:
                abstracts[paper['paperId']] = paper['abstract']

len(abstracts)

610

In [185]:
# Save to CSV
import csv

csv_file = 's2orc_abstracts_dict6_610.csv'

# Open the CSV file in write mode
with open(csv_file, 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=abstracts.keys())

    # Write the header
    writer.writeheader()

    # Write the data
    writer.writerow(abstracts)

In [239]:
# Read in
# Specify the CSV file path
csv_files = ['s2orc_abstracts_dict1.csv', 's2orc_abstracts_dict2_88.csv',
             's2orc_abstracts_dict3_74.csv', 's2orc_abstracts_dict4_88.csv',
             's2orc_abstracts_dict5_93.csv', 's2orc_abstracts_dict6_610.csv']

abstracts_text_data = pd.DataFrame()

for file in csv_files:
    df = pd.read_csv(file, header=None, index_col=0)
    df = df.transpose()
    df.columns = ['paper_ID', 'abstract']
    abstracts_text_data = pd.concat([abstracts_text_data, df], axis=0)

In [240]:
abstracts_text_data.shape

(1036, 2)

In [242]:
# Check for redundant paper_IDs
duplicates_in_paper_ID = abstracts_text_data[abstracts_text_data.duplicated('paper_ID')]
len(duplicates_in_paper_ID)

270

In [243]:
# Remove duplicate papers
abstracts_text_data = abstracts_text_data.drop_duplicates(subset=['paper_ID'])
abstracts_text_data.shape # As expected

(766, 2)

In [245]:
# Explort data 
# abstracts_text_data.to_csv('abstracts_text_data_766.csv')

### Embed abstracts

In [257]:
# from openai import OpenAi
import os
import openai

# Embedding generation
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed # Parallelize

# Embedding functions
# import embedding_utils as utils

In [260]:
import getpass

# OPENAI_API_KEY = getpass.getpass()
# openai.api_key = OPENAI_API_KEY

os.environ["OPENAI_API_KEY"] = "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"
client = openai.OpenAI(api_key = os.environ['OPENAI_API_KEY'])

In [262]:
# Functions for embedding - embedding_utils.py is READY to be used, but somehow
# I couldn't get it to work remotely. 

def get_embedding(text, model="text-embedding-3-small"):
    # Maximum number of tokens allowed by the GPT model
    max_tokens = 8192
    
    # If the text is shorter than the maximum tokens, use it directly
    if len(text.split()) <= max_tokens:
        return client.embeddings.create(input = [text], model=model).data[0].embedding
    
    # Calculate the start and end indices for the middle chunk
    start_index = (len(text.split()) - max_tokens) // 2
    end_index = start_index + max_tokens
    
    # Extract the middle chunk of text
    middle_chunk = ' '.join(text.split()[start_index:end_index])
    
    # Generate the embedding for the middle chunk
    return client.embeddings.create(input=[middle_chunk], model=model).data[0].embedding


# https://platform.openai.com/docs/guides/embeddings/use-cases

# Function to be executed in parallel
def get_embedding_for_text(row, col_to_embed, index_name):
    text_index = row[index_name]
    text = row[col_to_embed]
    embedding = get_embedding(text)  # Make the API call to get the embedding
    return text_index, embedding


def create_text_embedding_json_parallel(df, col_to_embed, index_name, output_file_name="text_embeddings_parallel.json", workers=8):
    with open(output_file_name, 'w') as outfile, ThreadPoolExecutor(max_workers=workers) as executor:
        # Use a dictionary to keep track of futures, with the song title as the key
        future_to_text = {executor.submit(get_embedding_for_text, row, col_to_embed, index_name): row[col_to_embed] for _, row in df.iterrows()}
        
        # Process completed futures as they complete
        for future in tqdm(as_completed(future_to_text), total=len(df), desc="Processing text"):
            text_title = future_to_text[future]
            try:
                # Get the result from the future
                text_index, embedding = future.result()
                # Create a JSON object for the current song and its embedding
                text_embedding_json = json.dumps({text_index: embedding})
                # Write the JSON object to the file on a new line
                outfile.write(text_embedding_json + '\n')
            except Exception as exc:
                print(f'{text_title} generated an exception: {exc}')

    print(f"Embeddings saved to {output_file_name}")

In [263]:
# Test sample
# df_test = abstracts_text_data.sample(30)
# create_text_embedding_json_parallel(df_test, "abstract", "paper_ID", "test_abstract_embeddings_parallel.json", workers=8)

Processing text: 100%|██████████| 30/30 [00:01<00:00, 27.53it/s]

Embeddings saved to test_abstract_embeddings_parallel.json





In [264]:
# Make all embeddings
create_text_embedding_json_parallel(abstracts_text_data, "abstract", "paper_ID", "abstracts_embeddings_all.json", workers=8)

Processing text: 100%|██████████| 766/766 [00:22<00:00, 33.73it/s]

Embeddings saved to abstracts_embeddings_all.json





In [265]:
# Append embeddings to all other data

# Read in json
def read_text_embeddings_from_json(file_name):
    # Create an empty dictionary to store the embeddings
    embeddings = {}
    
    # Open the JSON file and read each line
    with open(file_name, 'r') as infile:
        for line in infile:
            # Parse the JSON object from the line
            text_embedding = json.loads(line)
            
            # Update the embeddings dictionary with the song and its embedding
            embeddings.update(text_embedding)
    
    return embeddings

In [270]:
abstracts_dict = read_text_embeddings_from_json("abstracts_embeddings_all.json")

In [290]:
# Reset the index of the DataFrame
abstracts_text_data.reset_index(drop=True, inplace=True)

In [291]:
# Match key with paper_ID

# Iterate over the dictionary items
for key, value in abstracts_dict.items():
    # Find the index of the row where the key matches the 'ID' column
    index = abstracts_text_data.index[abstracts_text_data['paper_ID'] == key].tolist()
    # If the key exists in the DataFrame
    if index:
        # Update the corresponding row in the specified column with the dictionary value
        abstracts_text_data.at[index[0], 'embedding'] = value

In [293]:
print(abstracts_text_data.shape)
abstracts_text_data[:3]

(766, 3)


Unnamed: 0,paper_ID,abstract,embedding
0,329f5441ffcbdc970ea5868ad27aae13c212ea08,"In April 1983, the first American Gathering of...","[0.025907130911946297, 0.012855629436671734, 0..."
1,4cf5d504f0ccca7da2a65900d2c48f5a1b99f620,Deviating from foundational assumptions regard...,"[0.05267952382564545, 0.08909037709236145, -0...."
2,19fb1c14084496b10bb5be6cb86d2650d72f9ae7,Holocaust survivors are an integral part to Ho...,"[0.0017098549287766218, 0.03559383749961853, 0..."


In [294]:
# Export 
# abstracts_text_data.to_csv('abstracts.csv')