In [3]:
# -*- coding: utf-8 -*-
import os
import warnings
warnings.filterwarnings('ignore')
import gensim
from gensim.utils import simple_preprocess
from gensim import corpora
from gensim.models import CoherenceModel
import spacy
import argparse
import re
import time

In [4]:
# Load the spaCy "en_core_web_sm" pipeline with its 'parser' and 'ner'
# components disabled. The cells shown here only use the pipeline for its
# default stop-word set and for per-token POS tags and lemmas.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

Data Preprocessing:    

    0. Load txt files.   
    1. Transform sentences into words.     
    2. Remove digits and short words (3 characters or fewer).
    3. Form bigrams to find proper nouns.
    4. Remove stop words.
    5. Lemmatize words and keep only nouns, verbs, adjectives and adverbs.

In [5]:
# load txt files from folder and delete copyright and source info
def load_files(path='plaintext_articles'):
    """Load every ``.txt`` file in ``path`` as a list of stripped lines.

    For each article the first line (copyright notice) and the four lines
    that precede the last line (retrieval/source info) are dropped.

    Args:
        path: Folder that contains the plain-text articles.

    Returns:
        A list with one entry per ``.txt`` file, in sorted file-name order.
        Each entry is the list of remaining lines with surrounding
        whitespace stripped. Very short or empty files yield an empty list.

    Raises:
        OSError: If ``path`` itself cannot be listed.
    """
    data = []
    # os.listdir returns names in arbitrary order; sorting keeps the document
    # order reproducible between runs and machines.
    for file in sorted(os.listdir(path)):
        if os.path.splitext(file)[1] != '.txt':
            continue
        file_path = os.path.join(path, file)
        try:
            with open(file_path, "r", encoding='utf-8') as f:
                txt = [line.strip() for line in f]
        except (OSError, UnicodeDecodeError) as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print("Skipping {}: {}".format(file_path, e))
            continue
        # NOTE(review): this slice stops *before* the final line, so the last
        # line of every article is kept -- confirm that is intended.
        del txt[-5:-1]  # retrieved info
        # Slice deletion is a no-op on an empty list, unlike `del txt[0]`,
        # which raised IndexError for empty files.
        del txt[:1]     # copyright line
        data.append(txt)
    print("Finish loading data.")
    return data

In [30]:
# for each article, split words from sentences, digits are removed
def sent_to_words(sentences):
    """Lazily tokenize each article into a list of word tokens.

    Args:
        sentences: Iterable of articles. Each article is either a single
            string or a list/tuple of lines (the shape produced by
            ``load_files``).

    Yields:
        One list of tokens per article, as returned by
        ``gensim.utils.simple_preprocess(..., deacc=True)``.

    Note:
        This is a generator: the "Finish" message is printed only once the
        caller has consumed every article.
    """
    for sentence in sentences:
        # An article that is a list of lines must be joined into real text.
        # str() on a list yields its repr, where non-printable characters
        # (non-breaking space, zero-width space, ...) are spelled out as
        # escape sequences such as \xa0 or \u200b; the tokenizer then turns
        # those into junk tokens or glues them onto the following word.
        if isinstance(sentence, str):
            text = sentence
        elif isinstance(sentence, (list, tuple)):
            text = " ".join(str(line) for line in sentence)
        else:
            text = str(sentence)
        # deacc=True strips accent marks from tokens; punctuation is dropped
        # by the tokenizer itself regardless of this flag.
        yield gensim.utils.simple_preprocess(text, deacc=True)
    print("Finish transforming sentences to words.")

In [7]:
# remove purely numeric tokens and short words
def clean_digit_short(texts, min_len=4):
    """Drop numeric tokens and tokens shorter than ``min_len`` characters.

    Args:
        texts: List of documents, each a list of string tokens.
        min_len: Minimum token length to keep. The default of 4 keeps the
            previous hard-coded behaviour (``len(token) > 3``).

    Returns:
        A new list of documents. The input lists are not modified.
    """
    texts = [[token for token in doc
              if len(token) >= min_len and not token.isdigit()]
             for doc in texts]
    print("Finish cleaning digits and short words.")
    return texts

In [8]:
def form_bigrams(texts, min_count=5, threshold=100.0):
    """Train a gensim ``Phrases`` model on ``texts`` and apply it to them.

    Args:
        texts: Iterable of documents, each a list of string tokens.
        min_count: Passed to ``gensim.models.Phrases`` as ``min_count``.
        threshold: Passed to ``gensim.models.Phrases`` as ``threshold``.
            The defaults reproduce the previously hard-coded values.

    Returns:
        A ``(bigram_texts, bigram_model)`` tuple: the documents after the
        trained model has been applied to each of them, and the trained
        ``Phrases`` model itself.
    """
    # The documents are traversed twice (training, then applying), so make
    # sure a one-shot iterator is not exhausted after the first pass.
    texts = list(texts)
    bigram = gensim.models.Phrases(texts, min_count=min_count, threshold=threshold)
    print("Finish creating bigrams.")
    return [bigram[doc] for doc in texts], bigram

In [9]:
def remove_stopwords(texts):
    """Remove spaCy's default stop words plus a few custom ones.

    Args:
        texts: Iterable of documents, each a list of string tokens.

    Returns:
        A new list of documents with the stop words removed. Every document
        is passed through ``simple_preprocess`` again before filtering.
    """
    # Build a *new* set with `|`. The in-place `|=` added the custom words to
    # spaCy's shared `nlp.Defaults.stop_words` set as a hidden side effect.
    # NOTE(review): 'march' is absent from the month names -- confirm that is
    # deliberate.
    stopwords = nlp.Defaults.stop_words | {
        'know', 'second', 'include', 'january', 'february',
        'april', 'may', 'june', 'july', 'august', 'september',
        'october', 'november', 'december', 'monday', 'tuesday',
        'wednesday', 'thursday', 'friday', 'saturday', 'sunday'}
    # str(doc) is the repr of the token list; simple_preprocess re-tokenizes
    # it with its default settings. Kept as-is so the output is unchanged.
    filtered = [[word for word in simple_preprocess(str(doc))
                 if word not in stopwords] for doc in texts]
    # Report completion only after the filtering has actually run.
    print("Finish removing stop words.")
    return filtered

In [10]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Args:
        texts: Iterable of documents, each a list of string tokens.
        allowed_postags: Container of spaCy coarse POS tags (``token.pos_``
            values) to keep. The default is an immutable tuple so that it
            cannot be altered between calls; lists are still accepted.

    Returns:
        A list with one entry per document, holding the ``lemma_`` of every
        token whose ``pos_`` is in ``allowed_postags``.
    """
    texts_out = []
    for sent in texts:
        # The tokens are joined back into one string because the spaCy
        # pipeline is called on text, not on a token list.
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    print("Finish lemmatization.")
    return texts_out

In [11]:
# Run the preprocessing pipeline end to end on the raw articles.
data = load_files()

# Tokenize, then drop short tokens.
data_words = clean_digit_short(list(sent_to_words(data)))
# Stop words are removed *before* bigram detection; only the transformed
# texts (element 0) are kept, the trained Phrases model is discarded.
data_words = form_bigrams(remove_stopwords(data_words))[0]
data_words = lemmatization(
    data_words,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

Finish loading data.
Finish transforming sentences to words.
Finish cleaning digits and short words.
Finish removing stop words.
Finish creating bigrams.
Finish lemmatization.


In [12]:
# Save data result into txt file
# One document per line, each written as str() of its token list.
with open("data_words.txt", "w", encoding='utf-8') as f:
    # The `with` block closes the file on exit, so no explicit close() call.
    f.writelines(str(item) + "\n" for item in data_words)