In [5]:
import json
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Download required NLTK resources.
# Newer NLTK releases (3.9+) load the 'punkt_tab' tables inside word_tokenize,
# while older releases load 'punkt'; requesting both keeps the notebook
# runnable on a fresh environment with either version.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# File paths
file_path = "arxiv-metadata-oai-snapshot.json"         # input, parsed as one JSON value per line
output_file_path = "cleaned_lemmatized_abstracts.txt"  # output, one cleaned abstract per line

# Setup
stop_words = {word.lower() for word in stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

# Data holders
filtered_tokens_all = []            # every kept token, across all abstracts
list_of_abstract_token_lists = []   # one token list per kept abstract
abstract_count = 0                  # number of abstracts kept so far
max_abstracts = 10000               # stop reading once this many abstracts are kept

# Read the metadata file line by line, clean each abstract, and write the
# cleaned abstracts out (one line per abstract).
#
# The input is opened first so that a missing/unreadable input file raises
# before the output file is truncated by mode "w".
with open(file_path, "r", encoding="utf-8") as file, \
        open(output_file_path, "w", encoding="utf-8") as output_file:
    for line in file:
        if abstract_count >= max_abstracts:
            break

        # Only the parse itself is allowed to fail silently; lines that are
        # not valid JSON are skipped.
        try:
            json_object = json.loads(line)
        except json.JSONDecodeError:
            continue

        # Skip anything that is not a record with a non-blank string abstract
        # (e.g. a null abstract or a line holding a JSON list/number), instead
        # of crashing with AttributeError/TypeError part-way through the file.
        if not isinstance(json_object, dict):
            continue
        abstract = json_object.get("abstract")
        if not isinstance(abstract, str) or not abstract.strip():
            continue

        # Tokenize
        tokens = word_tokenize(abstract)

        # Clean and lemmatize: keep purely alphabetic, non-stopword tokens,
        # lower-cased and lemmatized as nouns.
        filtered = [
            lemmatizer.lemmatize(word.lower(), pos='n')
            for word in tokens
            if word.isalpha() and word.lower() not in stop_words
        ]

        # An abstract that yields no tokens is not counted or written.
        if not filtered:
            continue

        # Add to data holders
        filtered_tokens_all.extend(filtered)
        list_of_abstract_token_lists.append(filtered)
        abstract_count += 1

        # Write to file (one line per abstract)
        output_file.write(" ".join(filtered) + "\n")

# Single-row summary frame: one cell holds the full token list, the other the
# same tokens joined into one space-separated string.
summary_columns = {
    'filtered_word_tokens': [filtered_tokens_all],
    'combined_abstract_cleaned': [" ".join(filtered_tokens_all)],
}
df_filtered = pd.DataFrame(data=summary_columns)

[nltk_data] Downloading package punkt to /home/thoyavan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/thoyavan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/thoyavan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Count word frequencies from the file of cleaned, lemmatized abstracts
# (tokens on each line are whitespace-separated).
cleaned_file_path = "cleaned_lemmatized_abstracts.txt"

word_freq = Counter()

with open(cleaned_file_path, "r", encoding="utf-8") as cleaned_file:
    for abstract_line in cleaned_file:
        # split() with no arguments already ignores leading/trailing whitespace
        word_freq.update(abstract_line.split())

# Report the 50 most frequent words, most common first
print("\nTop 50 most common words:")
top_words = word_freq.most_common(50)
for word, count in top_words:
    print(f"{word}: {count}")


Top 50 most common words:
model: 5064
result: 3731
field: 3464
system: 3290
show: 3213
state: 2802
also: 2708
two: 2616
energy: 2602
using: 2438
theory: 2424
function: 2418
study: 2399
mass: 2331
quantum: 2291
one: 2136
present: 2032
data: 2017
star: 1998
equation: 1880
case: 1856
time: 1833
method: 1811
effect: 1776
paper: 1769
find: 1716
density: 1710
structure: 1709
phase: 1690
new: 1670
space: 1656
galaxy: 1656
property: 1642
parameter: 1640
distribution: 1588
number: 1557
order: 1536
solution: 1524
magnetic: 1481
large: 1444
temperature: 1439
spectrum: 1426
problem: 1393
different: 1387
based: 1350
observed: 1313
group: 1303
analysis: 1286
first: 1282
term: 1257
