# Import Data

In [1]:
import pandas as pd

In [2]:
# Read ./arxiv_nlp.parquet into a DataFrame using the pyarrow engine.
nlp_dataset = pd.read_parquet(
    "./arxiv_nlp.parquet",
    engine="pyarrow",
)

In [3]:
# Read ./arxiv_climate_change.parquet into a DataFrame using the pyarrow engine.
cc_dataset = pd.read_parquet(
    "./arxiv_climate_change.parquet",
    engine="pyarrow",
)

# Preprocess Data

In [4]:
# Standard library
import re

# Third-party
import gensim
import nltk
import numpy as np
import spacy
from gensim.utils import simple_preprocess

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/moyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Stop-word list: NLTK's English list followed by a few extra tokens to drop.
from nltk.corpus import stopwords

stop_words = [
    *stopwords.words('english'),
    'from', 'subject', 're', 'edu', 'use',
]

In [6]:
# Pull the title and abstract columns of each DataFrame out as plain Python lists.
cc_ttl = cc_dataset["title"].values.tolist()
cc_abs = cc_dataset["abstract"].values.tolist()
nlp_ttl = nlp_dataset["title"].values.tolist()
nlp_abs = nlp_dataset["abstract"].values.tolist()

## Clean texts

In [7]:
# functions

# Tokenize words and cleanup the text
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` before tokenizing, and one token
    list is yielded per item, in input order.
    """
    for raw_doc in sentences:
        # NOTE(review): in gensim, ``deacc`` controls accent-mark removal rather
        # than punctuation removal -- confirm against the gensim docs.
        tokens = gensim.utils.simple_preprocess(str(raw_doc), deacc=True)
        yield tokens

# Remove stopwords, make bigrams and lemmatize
# Define functions for stopword removal, bigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document.

    Args:
        texts: iterable of documents. Each item is converted with ``str()``
            and tokenized again with ``simple_preprocess`` before filtering
            (``text_preprocess`` passes token lists here).

    Returns:
        list[list[str]]: the remaining tokens of each document, in input order.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # every token against the module-level ``stop_words`` list is a linear scan.
    stop_word_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_word_set]
        for doc in texts
    ]


def make_bigrams(texts, min_count=5, threshold=100):
    """Merge frequently co-occurring word pairs into single bigram tokens.

    Args:
        texts: iterable of token lists, one per document. It is materialised
            into a list because it is traversed twice (once to fit the phrase
            model, once to apply it); a one-shot iterator would otherwise be
            exhausted after fitting and produce an empty result.
        min_count: forwarded to ``gensim.models.Phrases``.
        threshold: forwarded to ``gensim.models.Phrases``; a higher threshold
            yields fewer phrases.

    Returns:
        list: one transformed token sequence per input document, in input order.
    """
    texts = list(texts)
    if not texts:
        # Nothing to learn from or transform.
        return []

    # Build the bigram model
    bigram = gensim.models.Phrases(texts, min_count=min_count, threshold=threshold)

    # Faster way to get a sentence clubbed as a bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in texts]


def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    See https://spacy.io/api/annotation for the annotation scheme.

    Args:
        texts: iterable of token lists. The tokens of each document are joined
            with single spaces before being handed to spaCy.
        allowed_postags: collection of ``token.pos_`` values to keep. The
            default is a tuple so that no mutable default object is shared
            between calls.

    Returns:
        list[list[str]]: the lemmas of the retained tokens of each document,
        in input order.
    """
    joined_docs = [" ".join(sent) for sent in texts]
    if not joined_docs:
        # Nothing to process: skip loading the spaCy model altogether.
        return []

    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # issuing one nlp(...) call per document; results come back in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]


def text_preprocess(docs):
    """Run the full cleaning pipeline on raw documents.

    Steps, in order: collapse whitespace runs, strip single quotes, tokenize,
    remove stop words, merge bigrams, and lemmatize while keeping only nouns,
    adjectives, verbs and adverbs.

    Args:
        docs: list of str.

    Returns:
        The output of ``lemmatization``: one list of lemmas per document.
    """
    # Collapse every whitespace run (this covers newline characters) into one
    # space, then remove distracting single quotes. The pattern is a raw
    # string because '\s' in a plain literal is an invalid escape sequence.
    docs_normalized = [re.sub(r'\s+', ' ', doc).replace("'", "") for doc in docs]

    docs_words = list(sent_to_words(docs_normalized))

    # Call preprocessing functions in order
    # Remove Stop Words
    docs_words_nostops = remove_stopwords(docs_words)

    # Form Bigrams
    docs_words_bigrams = make_bigrams(docs_words_nostops)

    # Do lemmatization keeping only noun, adj, vb, adv
    return lemmatization(docs_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [8]:
# Clean the texts in cc_abs, then re-join each token list into one space-separated string.
cc_abs_words_cleaned = text_preprocess(cc_abs)
cc_abs_cleaned = [' '.join(tokens) for tokens in cc_abs_words_cleaned]

In [9]:
# Clean the texts in cc_ttl, then re-join each token list into one space-separated string.
cc_ttl_words_cleaned = text_preprocess(cc_ttl)
cc_ttl_cleaned = [' '.join(tokens) for tokens in cc_ttl_words_cleaned]

In [10]:
# Clean the texts in nlp_abs, then re-join each token list into one space-separated string.
nlp_abs_words_cleaned = text_preprocess(nlp_abs)
nlp_abs_cleaned = [' '.join(tokens) for tokens in nlp_abs_words_cleaned]

In [11]:
# Clean the texts in nlp_ttl, then re-join each token list into one space-separated string.
nlp_ttl_words_cleaned = text_preprocess(nlp_ttl)
nlp_ttl_cleaned = [' '.join(tokens) for tokens in nlp_ttl_words_cleaned]

## Save preprocessed data

In [12]:
import json

In [13]:
# Cleaned texts keyed by corpus and field, gathered into one dict for saving.
save_data_dic = {
    "climate_change_abstract": cc_abs_cleaned,
    "climate_change_title": cc_ttl_cleaned,
    "nlp_abstract": nlp_abs_cleaned,
    "nlp_title": nlp_ttl_cleaned,
}


In [14]:
# Write the dict as UTF-8 JSON with a 4-space indent; ensure_ascii=False keeps
# non-ASCII characters unescaped in the file.
with open("preprocessed_data.json", mode='w', encoding='utf-8') as fl:
    json.dump(
        save_data_dic,
        fl,
        indent=4,
        ensure_ascii=False,
    )