# data_exploration.ipynb

# Install necessary libraries if you haven't already

In [None]:
!pip install matplotlib seaborn wordcloud PyPDF2

import os
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from PyPDF2 import PdfReader

# Load the PDFs and extract their text


In [None]:
# Folder that is scanned for PDF files; it is passed to pdf_to_text below.
pdf_folder = '/home/cmejo/arxiv-dataset/pdf'  # adjust the path as necessary

def pdf_to_text(pdf_folder):
    """Extract the text of every PDF file located directly in a folder.

    Args:
        pdf_folder: Path of the directory to scan (sub-folders are not
            descended into).

    Returns:
        A list with one dict per PDF, ordered by file name. Each dict has
        the keys ``'title'`` (the file name) and ``'text'`` (the text of all
        pages concatenated in page order).
    """
    texts = []
    # os.listdir yields entries in arbitrary order; sort so that repeated
    # runs produce the documents in the same order.
    for pdf_file in sorted(os.listdir(pdf_folder)):
        if not pdf_file.endswith('.pdf'):
            continue
        pdf_path = os.path.join(pdf_folder, pdf_file)
        # A directory that merely carries a ".pdf" name is not a document.
        if not os.path.isfile(pdf_path):
            continue
        reader = PdfReader(pdf_path)
        # Join once instead of growing a string page by page; "or ''" keeps
        # the join safe should a page yield no text object.
        text = ''.join(page.extract_text() or '' for page in reader.pages)
        texts.append({'title': pdf_file, 'text': text})
    return texts

# Run the extraction once; each entry of `texts` is a dict of the form
# {'title': <PDF file name>, 'text': <extracted text>}.
texts = pdf_to_text(pdf_folder)

# Basic statistics about the dataset

In [None]:
# `texts` holds one entry per extracted PDF, so its length is the paper count.
num_papers = len(texts)
print(f"Number of research papers: {num_papers}")

# Length of each document

In [None]:
# Word count of each document, using whitespace-separated tokens.
doc_lengths = [len(doc['text'].split()) for doc in texts]
if doc_lengths:
    print(f"Average document length: {sum(doc_lengths) / len(doc_lengths):.2f} words")
else:
    # With no documents loaded the mean is undefined; report that instead of
    # failing with ZeroDivisionError.
    print("No documents found; average document length is undefined.")

# Plotting the distribution of document lengths


In [None]:
# Histogram of the per-document word counts with a KDE curve overlaid,
# drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(doc_lengths, kde=True, ax=ax)
ax.set_title('Distribution of Document Lengths')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
plt.show()

# Word Cloud of the entire dataset

In [None]:
# Merge every document into one space-separated string and build the cloud
# from it.
all_text = " ".join(doc['text'] for doc in texts)
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate(all_text)

# Show the rendered cloud as an image, with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Research Papers')
plt.show()

# Inspect some documents

In [None]:
# Preview the first three documents: file name, then the leading 500 characters.
for position, doc in enumerate(texts[:3], start=1):
    title = doc['title']
    preview = doc['text'][:500]
    print(f"\nDocument {position}: {title}")
    print(preview)