In [None]:
import xml.etree.ElementTree as ET

def extract_title_and_abstract(file_path):
    """Extract the main title and the abstract text from a TEI XML file.

    Args:
        file_path: Anything ``ET.parse`` accepts (a path or a file object)
            that yields a TEI document using the
            ``http://www.tei-c.org/ns/1.0`` namespace.

    Returns:
        A ``(title, abstract)`` tuple of strings.

        * ``title`` is the text of the first ``titleStmt/title`` element with
          ``level="a"`` and ``type="main"``, or ``"Title not found"`` when the
          element is missing or has no text.
        * ``abstract`` is the text of every ``<p>`` found inside an
          ``<abstract>`` element, joined with single spaces, or
          ``"Abstract not found"`` when there is no non-empty paragraph.

    Raises:
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
        OSError: If ``file_path`` names a file that cannot be opened.
    """
    # Parse the XML file
    root = ET.parse(file_path).getroot()

    # Define the XML namespaces
    namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # Extract the title
    title_elem = root.find('.//tei:titleStmt/tei:title[@level="a"][@type="main"]', namespaces)
    title = title_elem.text if title_elem is not None and title_elem.text else "Title not found"

    # Extract the abstract.
    # The search is restricted to <abstract> so that a paper without an
    # abstract does not report its first body paragraph instead.
    paragraphs = []
    for paragraph in root.findall('.//tei:abstract//tei:p', namespaces):
        # itertext() also collects text that follows inline children such as
        # <ref>; a plain .text stops at the first child and may be None.
        text = "".join(paragraph.itertext()).strip()
        if text:
            paragraphs.append(text)
    abstract = " ".join(paragraphs) if paragraphs else "Abstract not found"

    return title, abstract

# Usage: run the extractor on a single sample paper and show the result.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable data directory.
file_path = "/Users/gabesmithline/Desktop/gnn_project/data/paper-xml/5a4aef6f17c44a2190f7877f.xml"
title, abstract = extract_title_and_abstract(file_path)
print(f"Title: {title}")
print(f"\nAbstract: {abstract}")

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load the tokenizer and the encoder from the same pre-trained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Encode "<title> <abstract>" as one sequence, truncated to at most 512 tokens
inputs = tokenizer(
    " ".join([title, abstract]),
    return_tensors="pt",
    truncation=True,
    max_length=512,
    padding=True,
)

# Forward pass only; gradient tracking is switched off
with torch.no_grad():
    outputs = model(**inputs)

# Keep the model's pooler_output as the single vector for the whole input
pooled_output = outputs.pooler_output

print("BERT embedding shape:", pooled_output.shape)
print("First few values of the embedding:", pooled_output[0][:5])


In [None]:
import os
from tqdm import tqdm

def count_tokens(text):
    """Return how many token ids the module-level ``tokenizer`` yields for ``text``.

    The text is encoded with ``add_special_tokens=True``, so the tokenizer's
    special tokens are part of the count.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=True)
    return len(token_ids)

def process_xml_files(directory):
    """Tally how many abstracts in ``directory`` fit within 512 tokens.

    Every directory entry whose name ends in ``.xml`` is passed to
    ``extract_title_and_abstract`` and its abstract is measured with
    ``count_tokens``. A file that raises is reported on stdout and is left
    out of all three counters.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        ``(under_512, over_512, total_files)``: the number of abstracts with
        at most 512 tokens, the number with more than 512 tokens, and the
        number of files that were counted.
    """
    under_512 = over_512 = total_files = 0

    for filename in tqdm(os.listdir(directory)):
        # Anything that is not an XML file is ignored
        if not filename.endswith(".xml"):
            continue

        try:
            _, abstract = extract_title_and_abstract(os.path.join(directory, filename))
            fits = count_tokens(abstract) <= 512
        except Exception as exc:
            print(f"Error processing {filename}: {str(exc)}")
            continue

        if fits:
            under_512 += 1
        else:
            over_512 += 1
        total_files += 1

    return under_512, over_512, total_files

# Process all XML files in the paper-xml directory
paper_xml_dir = "paper-xml"
under_512, over_512, total_files = process_xml_files(paper_xml_dir)

print(f"Total files processed: {total_files}")
print(f"Abstracts with 512 or fewer tokens: {under_512}")
print(f"Abstracts with more than 512 tokens: {over_512}")
if total_files:
    print(f"Percentage of abstracts over 512 tokens: {(over_512 / total_files) * 100:.2f}%")
else:
    # Nothing was counted (empty directory, no .xml files, or every file
    # failed), so the ratio is undefined; avoid a ZeroDivisionError.
    print("Percentage of abstracts over 512 tokens: n/a (no files processed)")


In [None]:
import pandas as pd
import torch
from tqdm import tqdm

def embed_text(text):
    """Embed ``text`` with the module-level BERT ``tokenizer`` and ``model``.

    The text is tokenized with truncation at 512 tokens and run through the
    model with gradient tracking disabled. The model's ``pooler_output`` is
    returned after ``squeeze()`` as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        pooled = model(**encoded).pooler_output
    return pooled.squeeze().numpy()

def process_and_embed_xml_files(directory):
    """Embed the title and abstract of every ``.xml`` file in ``directory``.

    Each file is read with ``extract_title_and_abstract`` and both strings are
    embedded with ``embed_text``. A file that raises at any step is reported
    on stdout and contributes to neither dictionary, so both results always
    share the same set of keys.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        ``(title_embeddings, abstract_embeddings)``: two dicts keyed by the
        file name without its ``.xml`` extension, holding the value returned
        by ``embed_text`` for the title and for the abstract respectively.
    """
    abstract_embeddings = {}
    title_embeddings = {}

    for filename in tqdm(os.listdir(directory)):
        if not filename.endswith(".xml"):
            continue

        file_path = os.path.join(directory, filename)
        paper_id = os.path.splitext(filename)[0]
        try:
            title, abstract = extract_title_and_abstract(file_path)

            # Compute both vectors before storing either one: if the abstract
            # fails after the title succeeded, a half-registered paper would
            # leave the two dicts with different keys.
            title_embedding = embed_text(title)
            abstract_embedding = embed_text(abstract)
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

        title_embeddings[paper_id] = title_embedding
        abstract_embeddings[paper_id] = abstract_embedding

    return title_embeddings, abstract_embeddings

# Embed every paper found in the paper-xml directory
paper_xml_dir = "paper-xml"
title_embeddings, abstract_embeddings = process_and_embed_xml_files(paper_xml_dir)

# Turn each {paper id: vector} dict into a table with one row per paper id
title_df, abstract_df = (
    pd.DataFrame.from_dict(embeddings, orient='index')
    for embeddings in (title_embeddings, abstract_embeddings)
)

# Write both tables to disk before reporting success
title_df.to_csv('title_embeddings.csv')
abstract_df.to_csv('abstract_embeddings.csv')

print(
    "Title embeddings saved to 'title_embeddings.csv'",
    "Abstract embeddings saved to 'abstract_embeddings.csv'",
    sep="\n",
)


In [None]:
import torch

# Random non-negative scores: 200 rows with 10 entries each
a = torch.rand(200, 10)
# Divide every row by its L1 norm so the row can be read as probabilities
a_normalized = torch.nn.functional.normalize(a, p=1, dim=1)

# Temperature for the concrete distribution.
# NOTE(review): nothing below reads this value; the draw that follows is the
# hard Gumbel-max argmax, not a temperature-relaxed sample.
temperature = 0.1

# Gumbel-max trick: add Gumbel(0, 1) noise to the log-probabilities and take
# the arg-max of each row to pick one category per row
uniform_draws = torch.rand_like(a_normalized)
gumbel_noise = -torch.log(-torch.log(uniform_draws))
perturbed_log_probs = a_normalized.log() + gumbel_noise
gumbel_max_samples = perturbed_log_probs.argmax(dim=1)

# One-hot encode the chosen column of every row
samples = torch.zeros_like(a_normalized)
samples[torch.arange(samples.shape[0]), gumbel_max_samples] = 1

print("Shape of samples:", samples.shape)
print("Sum of each row:", samples.sum(dim=1))


In [None]:
# Display the one-hot `samples` tensor as this cell's output
samples