# Notebook for extracting data

In [84]:
import numpy as np
import pandas as pd
import time

In [67]:
def _first_words(value, max_words):
    '''
    Returns the first max_words whitespace-separated words of value,
    joined by single spaces. Missing values (NaN/None) give an empty string.
    '''
    # pandas represents an empty CSV cell as a float NaN, which has no .split()
    if pd.isna(value):
        return ''
    return ' '.join(str(value).split()[:max_words])


def load_data(folder_name, max_words):
    '''
    Loads title and text from the given csv file and stores it to a numpy array.
    Takes max_words amount of words from the text.

    Inputs:
    folder_name : str
        path of the csv file (anything pandas.read_csv accepts); the file
        must contain the columns 'title' and 'content'
    max_words : int
        number of words to take from text

    Outputs:
    ret : numpy array
        shape(N,2), dtype object, where first column is title and second is
        the corresponding text cut to at most max_words words. Runs of
        whitespace in the text are collapsed to single spaces. Rows whose
        content is missing get an empty string as text.
    '''
    df = pd.read_csv(folder_name)

    # Fill a preallocated object array column by column, so the result is
    # always (N, 2) with dtype object, even when the file has no rows.
    ret = np.empty((len(df), 2), dtype=object)
    ret[:, 0] = df['title'].to_numpy(dtype=object)
    ret[:, 1] = [_first_words(content, max_words) for content in df['content']]

    return ret

In [89]:
def extract_data(max_words, paths=('data/articles1.csv',
                                   'data/articles2.csv',
                                   'data/articles3.csv')):
    '''
    Main function for extracting data.

    Loads every csv file in paths with load_data and stacks the results
    row-wise, in the order the paths are given.

    Inputs:
    max_words : int
        number of words to take from each text (passed on to load_data)
    paths : iterable of str, optional
        csv files to load; defaults to the three article files under data/

    Outputs:
    ret : numpy array
        shape(N,2), the concatenated rows returned by load_data for each file
    '''
    parts = [load_data(path, max_words) for path in paths]

    # np.concatenate raises ValueError on an empty sequence, so return an
    # empty (0, 2) table when no paths were given.
    if not parts:
        return np.empty((0, 2), dtype=object)

    return np.concatenate(parts)

In [90]:
# Time the full extraction, keeping the first 50 words of every article.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments like time.time() can.
start = time.perf_counter()
hopo_ret = extract_data(50)
print(time.perf_counter() - start)

11.015007257461548


In [91]:
# Sanity check: one row per loaded article, two columns (title, truncated text).
hopo_ret.shape

(142570, 2)