# Load preprocessed data

In [1]:
import json
import pandas as pd

In [2]:
# Parse the preprocessed JSON file into `data`.
# The encoding is pinned to UTF-8 so decoding does not fall back to the
# platform's locale default, which differs between operating systems.
with open('preprocessed_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Pull the two preprocessed abstract collections out of the loaded JSON:
# `cc` for the climate-change entry, `nlp` for the NLP entry.
cc, nlp = (
    data[key]
    for key in ('climate_change_abstract', 'nlp_abstract')
)

In [4]:
# Read the two complete arXiv tables from parquet using the pyarrow engine.
cc_f, nlp_f = (
    pd.read_parquet(source, engine='pyarrow')
    for source in ('./arxiv_climate_change.parquet', './arxiv_nlp.parquet')
)

# Combine data

In [5]:
# Attach each set of cleaned abstracts to its full dataframe as a new
# 'abstract_cleaned' column (climate change first, then NLP).
cc_f['abstract_cleaned'], nlp_f['abstract_cleaned'] = cc, nlp

In [6]:
# Inspect the column labels; 'abstract_cleaned' should now be listed last.
cc_f.columns

Index(['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'abstract', 'versions',
       'update_date', 'authors_parsed', 'abstract_cleaned'],
      dtype='object')

# Sort and split by time

In [7]:
# Order both tables by their 'update_date' column; the inputs are left as-is
# because sort_values returns new frames.
cc_sorted, nlp_sorted = (
    frame.sort_values(by='update_date') for frame in (cc_f, nlp_f)
)

In [8]:
# Year of each record: the first four characters of 'update_date', cast to an
# integer. The vectorised .str accessor replaces a per-row Python lambda.
# Assumes 'update_date' holds strings that begin with a four-digit year.
cc_t = cc_sorted['update_date'].str[:4].astype('int64')
nlp_t = nlp_sorted['update_date'].str[:4].astype('int64')

In [9]:
# Index labels of the climate-change rows for three periods:
# up to 2012, 2013 through 2018 (both inclusive), and 2019 onward.
cc_id1 = cc_t[cc_t.lt(2013)].index
cc_id2 = cc_t[cc_t.between(2013, 2018)].index
cc_id3 = cc_t[cc_t.ge(2019)].index

In [10]:
# Index labels of the NLP rows for three periods:
# up to 2012, 2013 through 2018 (both inclusive), and 2019 onward.
nlp_id1 = nlp_t[nlp_t.lt(2013)].index
nlp_id2 = nlp_t[nlp_t.between(2013, 2018)].index
nlp_id3 = nlp_t[nlp_t.ge(2019)].index

In [11]:
# Select the three climate-change period slices by their index labels.
cc1, cc2, cc3 = (
    cc_sorted.loc[labels] for labels in (cc_id1, cc_id2, cc_id3)
)

In [12]:
# Select the three NLP period slices by their index labels.
nlp1, nlp2, nlp3 = (
    nlp_sorted.loc[labels] for labels in (nlp_id1, nlp_id2, nlp_id3)
)

# Save data

In [13]:
# Write the three climate-change period slices to parquet; the dataframe
# index is not stored.
cc1.to_parquet(
    'climate_change_t1.parquet',
    index=False,
    engine='pyarrow',
)
cc2.to_parquet(
    'climate_change_t2.parquet',
    index=False,
    engine='pyarrow',
)
cc3.to_parquet(
    'climate_change_t3.parquet',
    index=False,
    engine='pyarrow',
)

In [14]:
# Write the three NLP period slices to parquet; the dataframe index is not
# stored.
nlp1.to_parquet(
    'nlp_t1.parquet',
    index=False,
    engine='pyarrow',
)
nlp2.to_parquet(
    'nlp_t2.parquet',
    index=False,
    engine='pyarrow',
)
nlp3.to_parquet(
    'nlp_t3.parquet',
    index=False,
    engine='pyarrow',
)

In [15]:
# Write the full (unsplit) tables, including the 'abstract_cleaned' column,
# to parquet; the dataframe index is not stored.
cc_f.to_parquet(
    'climate_change_full.parquet',
    index=False,
    engine='pyarrow',
)
nlp_f.to_parquet(
    'nlp_full.parquet',
    index=False,
    engine='pyarrow',
)