In [1]:
import random
import time

In [2]:
def texts_to_tokens( text , stop_words ):
    """ Convert text to a list of lower-cased, stemmed word tokens

    The text is split on runs of word characters (which drops punctuation),
    lower-cased, reduced to purely alphabetic tokens, stemmed with the Porter
    stemmer and stripped of stop words.

    A token is dropped when either its lower-cased surface form or its stem
    appears in `stop_words`, so a stop list works whether it holds plain
    words ("this") or already stemmed forms ("thi"). Tokens are lower-cased
    before the comparison, so stop words should be given in lower case.

    Args:
        text ([string]) : [the text you want to convert]
        stop_words ([string array]) : [words you want to remove, None means no stop words]

    Return:
        [string list] : [the stemmed words of the converted text, in their original order]

    Examples:
        >>> texts_to_tokens( "the end of the string:", ["the", "of"] )
        ['end', 'string']
    """

    from nltk.stem.porter import PorterStemmer
    from nltk.tokenize import RegexpTokenizer

    # a set gives constant-time membership tests for every token
    stop_set = set(stop_words) if stop_words is not None else set()
    stemmer = PorterStemmer()
    # matching only \w+ runs discards punctuation
    tokenizer = RegexpTokenizer(r'\w+')

    tokens = [token.lower() for token in tokenizer.tokenize(text)]
    # keep alphabetic tokens only and drop stop words by their surface form,
    # because stemming can change a stop word so it no longer matches the list
    words = [token for token in tokens if token.isalpha() and token not in stop_set]
    stems = (stemmer.stem(word) for word in words)
    # second pass keeps supporting stop lists that contain stemmed forms
    return [stem for stem in stems if stem not in stop_set]

In [6]:
def htmltotext(text):
    """ Collapse text extracted from html into one non-empty phrase per line

        Every line is stripped, split wherever two consecutive spaces occur,
        and the resulting pieces are stripped again; empty pieces are dropped
        and the remaining ones are joined with newlines.

        Args:
            text ([string]) : [text]

        Return:
            [string] : [the phrases of the text, separated by newlines]

        Examples:
            >>> htmltotext("     for   the    music       ")
            'for\\nthe\\nmusic'
    """

    phrases = []
    for raw_line in text.splitlines():
        # a double space is treated as the boundary between two phrases
        for fragment in raw_line.strip().split("  "):
            fragment = fragment.strip()
            # skip the empty pieces produced by blank lines and longer space runs
            if fragment:
                phrases.append(fragment)
    return '\n'.join(phrases)

In [None]:
def some_actions( browser ):
    """ Scroll the page step by step and move the window via injected javascript

        A step size between 15 and 25 is drawn once. The scroll position then
        walks from 400 up to (but excluding) 600 in that step size; before each
        scroll there is a 0.2 second pause, and once the position is past
        480 + step a window.moveTo call is issued first.
        NOTE(review): presumably meant to imitate a human visitor - confirm.

        Args:
            browser ([selenium webdriver]) : [object whose execute_script method receives the javascript]
    """
    import random
    import time

    stride = random.randint(15, 25)
    scroll_offset = 50
    move_threshold = 480 + stride
    # the moveTo arguments depend only on the stride, so build the script once
    move_script = "window.moveTo({},{})".format(stride + 2, stride)

    position = 400
    while position < 600:
        if position > move_threshold:
            browser.execute_script(move_script)

        time.sleep(0.2)
        browser.execute_script("window.scrollTo(0, {})".format(scroll_offset + position))
        position += stride

In [None]:
def query_url(url, browser, wait_time=None):
    """ Load a url in the browser and return the resulting page source

        Errors raised while loading or interacting with the page are not
        propagated: an error whose message contains "timeout:" is ignored
        silently, any other error message is printed. In both cases the page
        source the browser holds at that moment is still returned.

        Args:
            url ([string]) : [the url to send the get query to]
            browser ([selenium webdriver]) : [driver used to send the query]
            wait_time (float, optional) : [page load timeout handed to the driver, None leaves the driver's timeout untouched]

        Return:
            [string] : [html text with every backslash removed]
    """

    # identity check: only an omitted wait_time skips the call, 0 is still applied
    if wait_time is not None:
        browser.set_page_load_timeout(wait_time)

    try:
        browser.get(url)
        some_actions( browser )
        # read the current cookies and hand each one back to the browser
        for cookie in browser.get_cookies():
            browser.add_cookie(cookie)
    except Exception as e:
        # best effort: a page load timeout is expected and stays silent,
        # anything else is reported but must not stop the caller
        message = str(e)
        if "timeout:" not in message:
            print( message )

    html = browser.page_source
    # drop backslashes so that html attributes can be cleaned correctly later
    html = html.replace( "\\", "" )

    return html

In [10]:
def check_new_articles( articles_url_name_new, articles_url_name ):
    """ Check whether new articles appeared

        The result keeps the order in which the urls first occur in
        `articles_url_name_new` and lists every url only once, so repeated
        calls with the same input give the same output.

        Args:
            articles_url_name_new ([string array]) : [the new array of urls]
            articles_url_name ([string array]) : [the old array of urls]

        Return:
            [string list] : [the urls present in the new array but not in the old one]

        Examples:
            >>> check_new_articles(['a', 'b', 'c'], ['b', 'c'])
            ['a']
    """

    known_urls = set( articles_url_name )

    new_articles_urls = []
    for url in articles_url_name_new:
        # urls already reported are added to known_urls, which removes duplicates
        if url not in known_urls:
            known_urls.add( url )
            new_articles_urls.append( url )

    return new_articles_urls

In [None]:
def send_query( url, data, timeout=None ):
    """ Send data to a server with a POST request and return the response

        Args:
            url ([string]) : [the url the request is sent to]
            data ([json serializable object]) : [payload, serialized with json.dumps before sending]
            timeout (float, optional) : [passed to requests.post, None keeps the library default of waiting without limit]

        Return:
            [requests response] : [whatever requests.post returns, so the caller can inspect the outcome]
    """
    import requests
    import json

    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    # NOTE(review): `json=` serializes its argument itself, so handing it the
    # output of json.dumps sends a JSON string literal rather than an object.
    # Kept unchanged because the receiving server is not visible from here -
    # confirm what it expects before switching to `json=data`.
    payload = json.dumps(data)

    return requests.post(url, json = payload, headers = headers, timeout = timeout)

In [None]:
def save_data( data, path, chunk_size=150 ):
    """ Write the first rows of data to a time-stamped csv file and return the rest

        The file name is `path`, a dash, the current UTC time formatted as
        year-month-day-hour-minute, and the ".csv" extension. The time stamp
        has minute resolution, so two calls with the same path within one
        minute target the same file name.

        Args:
            data ([pandas DataFrame]) : [the rows to save, must support slicing and to_csv]
            path ([string]) : [prefix of the file name, including the directory]
            chunk_size (int, optional) : [how many leading rows are written, 150 by default]

        Return:
            [pandas DataFrame] : [the rows after the first chunk_size rows, which were not written]
    """
    from time import gmtime, strftime

    stamp = strftime("%Y-%m-%d-%H-%M", gmtime())
    file_name = path + "-" + stamp + ".csv"

    data[:chunk_size].to_csv( file_name, index=False )
    return data[chunk_size:]