# Notebook for extracting data

In [82]:
import numpy as np
import pandas as pd
import time
import stanza

In [83]:
# Fetch the English stanza resources, then build a pipeline that only
# tokenizes and lemmatizes.
# NOTE(review): tokenize() constructs its own Pipeline instead of reading
# this `nlp` variable, so nothing visible in this notebook uses it.
stanza.download('en')
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,lemma',
    verbose=False,
)

In [84]:

def _lemma_pipeline():
    """Return the shared tokenize+lemma stanza pipeline, building it on first use."""
    cached = getattr(_lemma_pipeline, "_cached", None)
    if cached is None:
        # Constructing a Pipeline loads its models, so do it once and keep
        # the instance on the function object for every later call.
        cached = stanza.Pipeline(lang='en', processors='tokenize,lemma', verbose=False)
        _lemma_pipeline._cached = cached
    return cached


def tokenize(data):
    """Tokenize and lemmatize a text
    Inputs:
        data : str
            A string containing the article text
    Outputs:
        out : list
            List of lower-cased lemmas, one per word, in document order
    """
    # Reuse one pipeline across calls instead of rebuilding it per text;
    # extract_data calls this function twice for every article.
    doc = _lemma_pipeline()(data)
    # NOTE(review): the fallback to the surface form guards against a word
    # whose lemma is None -- confirm whether stanza can actually produce that.
    return [
        (word.lemma if word.lemma is not None else word.text).lower()
        for sent in doc.sentences
        for word in sent.words
    ]

In [85]:
def load_data(folder_name, max_words):
    '''
    Read the 'title' and 'content' columns of a csv file into a two-column array.

    Rows with a missing title or content are dropped, and every content string
    is cut down to its first max_words whitespace-separated words, re-joined
    with single spaces.

    Inputs:
    folder_name : str
        path of the csv file (handed to pd.read_csv)
    max_words : int
        number of leading words to keep from each text

    Outputs:
    ret : numpy array
        shape (N,2); column 0 holds the titles, column 1 the truncated texts
    '''
    def first_words(body):
        # Splitting on any whitespace also collapses repeated spaces/newlines.
        return " ".join(body.split()[:max_words])

    articles = pd.read_csv(folder_name)[['title', 'content']].dropna()
    headlines = articles['title'].values
    snippets = articles['content'].apply(first_words)

    # Stacking gives shape (2, N); transpose so each row is one article.
    return np.array([headlines, snippets]).T

In [86]:
def extract_data(file, max_words):
    '''
    Main function for extracting data

    Inputs:
    file : str
        path of the csv file, passed on to load_data
    max_words : int
        number of words to take from text

    Outputs:
    data1: numpy array
        The (N,2) array returned by load_data, with every title (column 0) and
        every text truncated to max_words words (column 1) replaced in place
        by the list that tokenize returns for it.
    '''
    # Keep the timer local: the progress report below used to read a global
    # `start` and raised NameError whenever the caller had not defined one.
    start = time.time()
    data1 = load_data(file, max_words)

    for i, (title, text) in enumerate(data1):
        data1[i, 0] = tokenize(title)
        data1[i, 1] = tokenize(text)
        # Progress report every 1000 rows: row index, then elapsed seconds.
        if i % 1000 == 0:
            print(i)
            print(time.time() - start)

    return data1

### Cells for running extraction

Each extraction run below takes about 1.5 hours.

In [87]:
#start = time.time()
#extract = extract_data('data/articles1.csv',50)
#np.save('data/extracted1.npy', extract)

In [88]:
#start = time.time()
#extract = extract_data('data/articles2.csv',50)
#np.save('data/extracted2.npy', extract)

In [89]:
#start = time.time()
#extract = extract_data('data/articles3.csv',50)
#np.save('data/extracted3.npy', extract)

### Combining the tokenized data

In [68]:
# Load the first extracted array from disk. allow_pickle=True lets np.load
# unpickle object arrays; unpickling can execute arbitrary code, so only
# load files from a trusted source.
hopo1 = np.load('data/extracted1.npy', allow_pickle=True)

In [69]:
# Load the second extracted array. allow_pickle=True unpickles object
# arrays, which is only safe for files from a trusted source.
hopo2 = np.load('data/extracted2.npy', allow_pickle=True)

In [70]:
# Load the third extracted array. allow_pickle=True unpickles object
# arrays, which is only safe for files from a trusted source.
hopo3 = np.load('data/extracted3.npy', allow_pickle=True)

In [71]:
# Join the three loaded arrays along the first axis (rows) into one array.
hopo_combined = np.concatenate((hopo1,hopo2,hopo3))

In [74]:
# Display the shape of the combined array as a quick sanity check.
hopo_combined.shape

(142568, 2)

In [78]:
# Write the combined array to a single .npy file.
np.save('data/extracted_comb.npy', hopo_combined)