# Notebook for extracting data

In [37]:
import numpy as np
import pandas as pd
import time
import stanza

In [38]:
stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma',  verbose=False)

In [39]:

def tokenize(data):
    """Tokenize words
    Inputs:
        data : str
            A string containing the article text
    Outputs:
        out : list
            List of tokenized and lemmatized words
    """
    nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma',  verbose=False)
    doc = nlp(data)
    out = []
    for sent in doc.sentences:
        for word in sent.words:
            out.append(word.lemma.lower())
            
    return out

In [40]:
def load_data(folder_name, max_words):
    '''
    Loads title and text from the given folder and stores it to a numpy array. Takes max_words amount of words from the text
    
    Inputs:
    folder_name : str
        destination of csv file
    max_words : int
        number of words to take from text
        
    Outputs:
    ret : numpy array
        shape(N,2), where first column is title and second is corresponding text
    '''
    df = pd.read_csv(folder_name)
    titles = df.dropna()['title'].values
    text = df.dropna()['content'].apply(lambda x: " ".join(x.split()[0:max_words]))
    ret = np.array([titles, text]).T
    
    return ret

In [41]:
def extract_data(file, max_words):
    ''' 
    Main function for extracting data
    
    Inputs:
    file : str
        destination of csv file
    max_words : int
        number of words to take from text
    
    Outputs:
    data1: numpy array
        An array of shape (N,2), where on each row there is a tokenized and lemmatized title of the article and a 50 word long
        tokenized and lemmatized text.
    '''
    data1 = load_data(file, max_words)
    
    for i in range(data1.shape[0]):
        data1[i,0] = tokenize(data1[i,0])
        data1[i,1] = tokenize(data1[i,1])
        if i%1000 == 0:
            print(i)
            print(time.time()-start)
    
    return data1

In [107]:
start = time.time()
extract = extract_data('data/articles1.csv',50)
np.save('data/extracted1.npy', extract)

0
3.33554744720459
1000
137.579115152359
2000
273.49046564102173
3000
410.3901255130768
4000
544.7796211242676
5000
680.9490840435028
6000
815.8964951038361
7000
950.2463464736938
8000
1085.5610930919647
9000
1223.2059874534607
10000
1363.0494391918182
11000
1503.3645045757294
12000
1641.325650691986
13000
1780.1372473239899
14000
1919.0838537216187
15000
2058.2279257774353
16000
2196.8848943710327
17000
2337.1308863162994
18000
2479.0618538856506
19000
2623.171090364456
20000
2765.259790420532
21000
2906.0800247192383
22000
3047.417018890381
23000
3188.270562171936
24000
3328.5184848308563
25000
3474.5871415138245
26000
3614.1262538433075
27000
3750.920711517334
28000
3889.0353696346283
29000
4026.547479391098
30000
4164.067597150803
31000
4301.309461355209
32000
4435.953962802887
33000
4566.564794063568
34000
4697.069528341293
35000
4826.099922180176
36000
4955.368531942368
37000
5085.814213037491
38000
5214.623405218124
39000
5344.567630290985
40000
5474.161535978317
41000
5603.8225

In [None]:
start = time.time()
extract = extract_data('data/articles2.csv',50)
np.save('data/extracted2.npy', extract)

In [None]:
start = time.time()
extract = extract_data('data/articles3.csv',50)
np.save('data/extracted3.npy', extract)