In [1]:
from catboost import CatBoostClassifier
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import pickle
import warnings
warnings.filterwarnings('ignore')

In [22]:
def mean_li(soup):
    """Return the average number of ``<li>`` tags per ``<ul>``, truncated to int.

    Every ``<ul>`` returned by ``soup.find_all('ul')`` is considered, and the
    ``<li>`` count of a list is whatever ``find_all('li')`` returns for that
    list. Lists with no ``<li>`` are left out of the average.

    Args:
        soup: Parsed document (or tag) exposing ``find_all``.

    Returns:
        int: ``int(np.mean(counts))`` over the non-empty lists, or 0 when no
        ``<ul>`` contains an ``<li>``.
    """
    # Query each <ul> tag directly. Serialising it and building a second
    # BeautifulSoup (with a guessed parser) only to count its <li> tags
    # re-parsed every list on the page.
    li_counts = [len(ul.find_all('li')) for ul in soup.find_all('ul')]
    non_empty = [count for count in li_counts if count]
    if not non_empty:
        return 0
    return int(np.mean(non_empty))
def href_counter(html_page):
    """Count ``href=`` targets in ``html_page.text`` that do not look like assets.

    A target is dropped when the exclusion pattern matches anywhere inside it
    (php, css, json, xml, mailto, jpg, png, jpeg).

    NOTE(review): the leading ``.`` in the exclusion pattern is an unescaped
    wildcard, so ``.php`` means "any character followed by php".
    NOTE(review): get_feats passes a BeautifulSoup object here; its ``.text``
    is presumably the tag-stripped text rather than raw markup, in which case
    few ``href=`` attributes would be visible. Confirm this is what the model
    was trained on before changing it.

    Args:
        html_page: Object with a ``text`` string attribute.

    Returns:
        int: Number of href targets that survive the exclusion filter.
    """
    excluded = re.compile(r'.php|css|json|xml|mailto|jpg|png|jpeg')
    targets = re.findall(r'href=[\'"]?([^\'" >]+)', html_page.text)
    return sum(1 for target in targets if excluded.search(target) is None)
def get_feats(soup_page):
    """Build the 22-element structural feature vector for a parsed page.

    Layout of the returned list (the order is part of the contract, since
    prepare_html_feats reshapes the vector to (1, 22)):

    * 14 tag counts, in the order listed in ``tag_names`` below;
    * ``mean_li(soup_page)`` and ``href_counter(soup_page)``;
    * 6 regex match counts over ``str(soup_page.html)``, in the order listed
      in ``markup_patterns`` below.

    Args:
        soup_page: Parsed page exposing ``find_all`` and ``html``.

    Returns:
        list: 22 integers in the layout described above.
    """
    # NOTE(review): 'h' is not a standard HTML element, so this count is
    # presumably always 0; it is kept so the feature positions do not shift.
    tag_names = (
        'a', 'div', 'li', 'ul', 'nav',
        'script', 'h', 'h1', 'h2', 'img',
        'p', 'link', 'button', 'span',
    )
    # The '.*' parts are greedy and '.' does not cross newlines, so each
    # pattern matches at most once per line of markup.
    markup_patterns = (
        r'<.*news.*>',                  # news_counter
        r'<a.*>[\S\n\t\v ]?<p',         # a_and_p_regex
        r'<div.*>[\S\n\t\v ]?<h.?',     # div_and_h_regex
        r'<a.*>[\S\n\t\v ]?<div',       # a_and_div_regex
        r'<h.*>[\S\n\t\v ]?<a',         # h_and_a_regex
        r'<img.*>[\S\n\t\v ]?<a',       # img_and_a_regex
    )

    tag_counts = [len(soup_page.find_all(tag)) for tag in tag_names]
    li_mean = mean_li(soup_page)
    href_count = href_counter(soup_page)

    # Serialise the document once instead of once per pattern.
    markup = str(soup_page.html)
    regex_counts = [len(re.findall(pattern, markup)) for pattern in markup_patterns]

    return tag_counts + [li_mean, href_count] + regex_counts
def get_pattern(page):
    """Collect the distinct ``title="..."`` attribute values from a page.

    The page's ``html`` attribute is converted to a string and searched with a
    non-greedy pattern, so each value ends at the first following double quote.

    Args:
        page: Object with an ``html`` attribute.

    Returns:
        list: Unique captured values, in set (arbitrary) order.
    """
    markup = str(page.html)
    return list(set(re.findall(r'title="([\S\n\t\v ]*?)"', markup)))
def len_2(x):
    """Return True when ``x`` has at least one element, else False."""
    return len(x) > 0


def len_3(x):
    """Return True when ``x`` has more than two elements, else False."""
    return len(x) > 2
def remove_special_characters(text):
    """Lower-case ``text`` and delete every character outside the keep-list.

    Kept: Latin letters, digits, Cyrillic letters in the range А-я,
    whitespace and the punctuation ``. , ! ? / : ; " ' ) ( « »``.

    The Latin upper-case range is ``A-Z``. The earlier ``A-z`` spelling also
    covered the ASCII characters between ``Z`` and ``a``, so ``[``, a
    backslash, ``]``, ``^``, ``_`` and a backtick were kept.

    Args:
        text: Input string.

    Returns:
        str: The lower-cased string with all other characters removed.
    """
    # Negated class: anything NOT listed here is deleted.
    pat = r'[^a-zA-Z0-9а-яА-Я.,!?/:;\"\'\s)(«»]'
    return re.sub(pat, '', text.lower())
def make_words_list(items):
    """Split every string on whitespace and return the distinct tokens.

    Args:
        items: Iterable of strings.

    Returns:
        list: Unique tokens, in set (arbitrary) order.
    """
    tokens = [token for item in items for token in item.split()]
    return list(set(tokens))
def get_title_text(soup_page):
    """Return words longer than two characters taken from ``title`` attributes.

    Title values come from get_pattern, are cleaned with
    remove_special_characters and split into distinct words by
    make_words_list. Only the first 300 words of that list are examined, and
    the length filter is applied after the cut.

    NOTE(review): make_words_list returns set-ordered words, so which 300 are
    examined is presumably not stable between interpreter runs.

    Args:
        soup_page: Parsed page exposing ``html``.

    Returns:
        list: Words from the first 300 for which len_3 is true.
    """
    cleaned_titles = [remove_special_characters(title) for title in get_pattern(soup_page)]
    vocabulary = make_words_list(cleaned_titles)
    return [word for word in vocabulary[:300] if len_3(word)]
def get_img_text(soup):
    """Return lower-cased words longer than two characters from ``soup.text``.

    Despite the name, no ``<img>`` data is used: the words come from the whole
    ``soup.text`` string. Newlines, tabs, digits and the characters
    ``. - ) ( " « »`` are turned into spaces before splitting. Only the first
    300 tokens are examined, and the length filter is applied after the cut.

    Args:
        soup: Object with a ``text`` string attribute.

    Returns:
        list: Tokens among the first 300 whose length exceeds 2.
    """
    # The former `soup.find_all('img')` lookup was never read, so it is gone.
    cleaned = re.sub(r'[\n\t\d\.\-)("«»]', ' ', soup.text)
    tokens = cleaned.lower().split()
    return [token for token in tokens[:300] if len(token) > 2]
def clean_html(soup):
    """Return ``str(soup.html)`` flattened onto one line.

    Newlines, tabs, dots and round brackets are replaced by spaces, then runs
    of whitespace are collapsed to single spaces and the ends are trimmed.
    The replacement applies to the whole markup, attribute values included.

    Args:
        soup: Object with an ``html`` attribute.

    Returns:
        str: The normalised markup.
    """
    markup = str(soup.html)
    return ' '.join(re.sub(r'[\n\t\.)(]', ' ', markup).split())

In [26]:
def get_html_and_text_data(url, max_html_size=100000, timeout=10):
    """Download a page and extract its structural and textual features.

    Args:
        url: Page address; ``https://`` is prepended when it does not start
            with ``http``.
        max_html_size: Maximum number of characters of cleaned markup that is
            kept; longer markup is cut at this length and parsed again.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        tuple: ``(html_feats, words)`` where ``html_feats`` is the list from
        get_feats and ``words`` is get_title_text + get_img_text.

    Raises:
        requests.RequestException: When the page cannot be fetched. Earlier the
            handler printed an undefined ``idx`` and then fell through to an
            unbound ``page``, so a failed request crashed with an unrelated
            error.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    if not url.startswith('http'):
        url = 'https://' + url
    try:
        # NOTE(review): verify=False disables TLS certificate checking; it is
        # kept so hosts with broken certificates still load.
        page = requests.get(url, verify=False, timeout=timeout, headers=headers)
    except requests.RequestException:
        print('request failed', url, '--'*10)
        raise

    # No parser is named on purpose: BeautifulSoup picks one itself, and
    # naming a different one could change the features the model receives.
    soup = BeautifulSoup(page.text)
    soup = BeautifulSoup(clean_html(soup))
    markup = str(soup.html)
    if len(markup) > max_html_size:
        # Cap the work done on very large pages.
        soup = BeautifulSoup(markup[:max_html_size])

    html_feats = get_feats(soup)
    title_text = get_title_text(soup)
    image_text = get_img_text(soup)
    return html_feats, title_text + image_text

In [43]:
def prepare_html_feats(html_feats):
    """Turn the tag feature list into a single-row ``(1, 22)`` array.

    Args:
        html_feats: Sequence holding exactly 22 values.

    Returns:
        numpy.ndarray: Array of shape ``(1, 22)``.

    Raises:
        ValueError: From numpy when the input does not hold 22 values.
    """
    return np.array(html_feats).reshape(1, 22)

In [44]:
def prepare_text_feats(text, countvect, tfidf):
    """Vectorise a list of words into a dense TF-IDF row.

    The words are joined with single spaces into one document, passed through
    ``countvect.transform`` and then ``tfidf.transform``, and the result is
    densified.

    Args:
        text: Iterable of word strings.
        countvect: Object whose ``transform`` accepts a list of documents.
        tfidf: Object whose ``transform`` returns something with ``todense()``.

    Returns:
        numpy.ndarray: ``np.array`` of the densified TF-IDF output.
    """
    document = ' '.join(text)
    counts = countvect.transform([document])
    weighted = tfidf.transform(counts)
    return np.array(weighted.todense())

In [45]:
def make_predict(model, text_feats, tag_feats):
    """Score one sample made of text features followed by tag features.

    Args:
        model: Object exposing ``predict_proba``.
        text_feats: Text feature array; placed first in the stacked vector.
        tag_feats: Tag feature array; placed after the text features.

    Returns:
        Whatever ``model.predict_proba`` returns for the stacked features.
    """
    return model.predict_proba(np.hstack((text_feats, tag_feats)))

In [50]:
def predict_pipeline(url, model, counter, tfidf):
    """Fetch ``url``, build its features and return the model's probabilities.

    Args:
        url: Page address handed to get_html_and_text_data.
        model: Classifier handed to make_predict.
        counter: Count vectoriser handed to prepare_text_feats.
        tfidf: TF-IDF transformer handed to prepare_text_feats.

    Returns:
        The value returned by make_predict for this page.
    """
    raw_tags, raw_words = get_html_and_text_data(url)
    tag_matrix = prepare_html_feats(raw_tags)
    text_matrix = prepare_text_feats(raw_words, counter, tfidf)
    return make_predict(model, text_matrix, tag_matrix)

In [2]:
# Load the two pickled text-vectorisation objects. The files are opened in
# `with` blocks so the handles are closed once loading finishes.
# NOTE(review): pickle.load can execute code embedded in the file; only load
# artifacts that come from a trusted source.
with open("countvectorizer_NEWS.pickle", "rb") as countvectorizer_file:
    countvectorizer_NEWS = pickle.load(countvectorizer_file)
with open("tfidf_NEWS.pickle", "rb") as tfidf_file:
    tfidf_NEWS = pickle.load(tfidf_file)

In [4]:
# Smoke check: push a single one-word document through both loaded
# transformers and keep the dense result in test_tfidf.
transformed_test = countvectorizer_NEWS.transform(['новости'])
test_tfidf_sparse = tfidf_NEWS.transform(transformed_test)
test_tfidf = np.array(test_tfidf_sparse.todense())

In [14]:
# Create an empty classifier and load the saved model file into it. The
# load_model call is the cell's last expression, so its return value is what
# the cell displays.
catboost = CatBoostClassifier()      # parameters not required.
catboost.load_model('catboost_model_news_class')

<catboost.core.CatBoostClassifier at 0x1a25d3f6d0>

In [36]:
# Read the labelled links; later cells use the url_link_source and target columns.
data = pd.read_csv('SOURCE_NEWS_LINKS.csv')

In [37]:
# Preview the first ten rows of the labelled links.
data.head(10)

Unnamed: 0.1,Unnamed: 0,url_link_source,target
0,0,https://new.rah.ru/events/articles/,1.0
1,1,https://new.rah.ru/events/news/,1.0
2,2,https://new.rah.ru/education/novosti_sobytiya/...,1.0
3,3,https://medkareta.ru/,0.0
4,4,https://medkareta.ru/news/,1.0
5,5,https://medkareta.ru/uslugi/,0.0
6,6,http://autoshinsnab.com/about/news,1.0
7,7,http://autoshinsnab.com/catalog/,0.0
8,8,http://autoshinsnab.com/producers,0.0
9,9,http://autoshinsnab.com/producers/,0.0


In [40]:
# Single page used for the one-off pipeline run in the next cell.
url = 'https://medkareta.ru/news/'

In [56]:
# One-off run of the full pipeline for the current `url`; the result is stored
# in `asd` and is not read again in the cells shown here.
asd = predict_pipeline(url, catboost, countvectorizer_NEWS, tfidf_NEWS)

In [64]:
# Score the first ten labelled links and print the prediction beside the true
# label. Percentages are truncated by int(), so a pair may sum to 99.
for idx, row in data[:10].iterrows():
    url = row['url_link_source']
    y_true = row['target']
    if y_true:
        result = 'NEWS'
    else:
        result = 'NOT NEWS'
    probabilities = predict_pipeline(url, catboost, countvectorizer_NEWS, tfidf_NEWS)
    zero_prob, one_prob = probabilities[0]
    print(f"URL: {url} : news prob: {int(100*one_prob)}%, not news: {int(100*zero_prob)}% TRUE: {result}")

URL: https://new.rah.ru/events/articles/ : news prob: 93%, not news: 6% TRUE: NEWS
URL: https://new.rah.ru/events/news/ : news prob: 96%, not news: 3% TRUE: NEWS
URL: https://new.rah.ru/education/novosti_sobytiya/index.php : news prob: 95%, not news: 4% TRUE: NEWS
URL: https://medkareta.ru/ : news prob: 12%, not news: 87% TRUE: NOT NEWS
URL: https://medkareta.ru/news/ : news prob: 81%, not news: 18% TRUE: NEWS
URL: https://medkareta.ru/uslugi/ : news prob: 27%, not news: 72% TRUE: NOT NEWS
URL: http://autoshinsnab.com/about/news : news prob: 77%, not news: 22% TRUE: NEWS
URL: http://autoshinsnab.com/catalog/ : news prob: 25%, not news: 74% TRUE: NOT NEWS
URL: http://autoshinsnab.com/producers : news prob: 24%, not news: 75% TRUE: NOT NEWS
URL: http://autoshinsnab.com/producers/ : news prob: 24%, not news: 75% TRUE: NOT NEWS


In [65]:
# Spot-check sites that are all expected to be classified as news.
urls = ['lenta.ru', 'tjournal.ru', 'meduza.io']
# The expected label is the same for every URL in this cell.
y_true = 1
result = 'NEWS' if y_true else 'NOT NEWS'
for url in urls:
    probabilities = predict_pipeline(url, catboost, countvectorizer_NEWS, tfidf_NEWS)
    zero_prob, one_prob = probabilities[0]
    print(f"URL: {url} : news prob: {int(100*one_prob)}%, not news: {int(100*zero_prob)}% TRUE: {result}")

URL: lenta.ru : news prob: 91%, not news: 8% TRUE: NEWS
URL: tjournal.ru : news prob: 54%, not news: 45% TRUE: NEWS
URL: meduza.io : news prob: 77%, not news: 22% TRUE: NEWS


In [66]:
# Spot-check sites that are all expected to be classified as not news.
urls = ['ozon.ru', 'livetex.ru/', 'www.kiprinform.com/']
# The expected label is the same for every URL in this cell.
y_true = 0
result = 'NEWS' if y_true else 'NOT NEWS'
for url in urls:
    probabilities = predict_pipeline(url, catboost, countvectorizer_NEWS, tfidf_NEWS)
    zero_prob, one_prob = probabilities[0]
    print(f"URL: {url} : news prob: {int(100*one_prob)}%, not news: {int(100*zero_prob)}% TRUE: {result}")

URL: ozon.ru : news prob: 5%, not news: 94% TRUE: NOT NEWS
URL: livetex.ru/ : news prob: 58%, not news: 41% TRUE: NOT NEWS
URL: www.kiprinform.com/ : news prob: 74%, not news: 25% TRUE: NOT NEWS
