In [1]:
import re
import nltk
import datetime
import requests
from lxml import html

In [2]:
class Article:
    """Value holder for a single scraped news article.

    Required fields are passed positionally; ``domain`` and ``summary`` are
    optional keyword arguments that default to an empty string. Any other
    keyword argument is accepted and ignored.
    """

    def __init__(self, url, title, text, tags, published_at, **kwargs):
        self.url, self.title, self.text = url, title, text
        self.tags, self.published_at = tags, published_at
        # Optional metadata: fall back to '' when the caller did not supply it.
        self.domain = kwargs['domain'] if 'domain' in kwargs else ''
        self.summary = kwargs['summary'] if 'summary' in kwargs else ''

class NewsPageParserBase(object):
    """Template for turning one article page into an ``Article``.

    ``get_article`` drives the extraction; subclasses override the
    ``get_*`` hooks they support. A hook that is not overridden yields
    ``None`` for that field.
    """

    def __init__(self):
        """Nothing to initialise; present so subclasses can chain to it."""

    def get_article(self, page, url):
        """Parse ``page`` and assemble an ``Article`` for ``url``.

        The hooks are invoked in a fixed order: title, text, tags,
        publication date, summary.
        """
        parsed = self.get_page_tree(page)
        headline = self.get_title(parsed)
        body = self.get_article_text(parsed)
        labels = self.get_tags(parsed)
        published = self.get_date_published(parsed)
        abstract = self.get_summary(parsed)
        return Article(url, headline, body, labels, published, summary=abstract)

    def get_page_tree(self, page):
        """Build the lxml element tree for the raw page markup."""
        return html.fromstring(page)

    def get_article_text(self, tree):
        """Hook: article body text. The base implementation returns None."""
        return None

    def get_title(self, tree):
        """Hook: article title. The base implementation returns None."""
        return None

    def get_tags(self, tree):
        """Hook: article tags. The base implementation returns None."""
        return None

    def get_date_published(self, tree):
        """Hook: publication date. The base implementation returns None."""
        return None

    def get_summary(self, tree):
        """Hook: article summary. The base implementation returns None."""
        return None

    
class NewsArchivePageParserBase(object):
    """Base for parsers that pull article URLs out of an archive page.

    Stores the site's base URL for subclasses to use when building links.
    """

    def __init__(self, base_url):
        self.base_url = base_url

    def get_news_urls(self, page):
        """Hook: article URLs found in ``page``. The base implementation returns None."""
        return None

    
class NewsClientBase(object):
    """HTTP client base for a news site.

    Downloads article pages and per-date archive pages. Subclasses supply
    ``get_archive_url`` to map a date onto the site's archive URL scheme.
    """

    # Seconds to wait for the server. Without an explicit timeout
    # ``requests.get`` can block indefinitely on an unresponsive host.
    # Subclasses or instances may override this value.
    request_timeout = 30

    def __init__(self, base_url, archive_url):
        self.base_url = base_url
        self.archive_url = archive_url

    def get_article_page(self, url):
        """Download ``url`` and return the raw response body."""
        return self._fetch(url)

    def get_archive_page(self, date):
        """Download the archive page for ``date`` and return the raw response body."""
        return self._fetch(self.get_archive_url(date))

    def get_archive_url(self, date):
        """Hook: archive URL for ``date``. The base implementation returns None."""
        pass

    def _fetch(self, url):
        """GET ``url`` with the configured timeout and return ``response.content``."""
        response = requests.get(url, timeout=self.request_timeout)
        return response.content

    
class NewsRequestManagerBase(object):
    """Pairs an HTTP client with an article-page parser.

    Both collaborators are stored as given, as ``client`` and
    ``news_page_parser``.
    """

    def __init__(self, client, news_page_parser):
        self.client, self.news_page_parser = client, news_page_parser


In [64]:
class PravdaConfig(object):
    """Static URL settings for pravda.com.ua."""

    base_url = 'http://www.pravda.com.ua'
    archive_url = 'http://www.pravda.com.ua/archives/'

    @staticmethod
    def get_base_url(language='ua'):
        """Return the site root for ``language``.

        Only ``'rus'`` gets a language suffix (``/rus``); every other value
        maps to the plain base URL.
        """
        suffix = '/rus' if language == 'rus' else ''
        return PravdaConfig.base_url + suffix


class PravdaNewsPageParser(NewsPageParserBase):
    """Article-page parser for pravda.com.ua.

    Each ``get_*`` method runs one XPath query against the parsed page tree.
    ``get_summary`` is not overridden, so the inherited implementation is used.
    """

    def __init__(self):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(PravdaNewsPageParser, self).__init__()

    def get_article_text(self, tree):
        """Return the text nodes of every <p> under div.text, joined by single spaces."""
        paragraphs = tree.xpath('//div[@class="text"]//p/text()')
        return ' '.join(paragraphs)

    def get_title(self, tree):
        """Return the first h1.title text node, or '' when there is none."""
        title = tree.xpath('//h1[@class="title"]/text()')
        return title[0] if title else ''

    def get_tags(self, tree):
        """Return the list of link texts found under p.tags."""
        return tree.xpath('//p[@class="tags"]//a/text()')

    def get_date_published(self, tree):
        """Return the raw XPath result (a list of text nodes) for span.dt2.

        The value is not parsed into a datetime here.
        """
        return tree.xpath('//span[@class="dt2"]/text()')

    
class PravdaArchivePageParser(NewsArchivePageParserBase):
    """Archive-page parser for pravda.com.ua.

    Collects links from the ``dl.news4`` block and keeps those whose
    relative path matches the news-URL pattern.
    """

    def __init__(self, language):
        # Relative paths of the form /news/<digits>/.../ (one or more numeric segments).
        self._news_relative_urls_regex = r'^\/news(\/\d{1,}){1,}\/$'
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(PravdaArchivePageParser, self).__init__(PravdaConfig.get_base_url(language))

    def get_news_urls(self, page):
        """Return absolute URLs of the news articles linked from ``page``.

        Duplicates are removed and the remaining URLs keep the order in which
        they appear on the page.
        """
        urls = self._get_news_block_urls(page)
        return [self.base_url + url
                for url in urls
                if re.match(self._news_relative_urls_regex, url)]

    def _get_news_block_urls(self, page):
        """Return the de-duplicated hrefs inside dl.news4, in document order."""
        tree = html.fromstring(page)
        links = tree.xpath('//dl[@class="news4"]//a/@href')
        # An order-preserving de-duplication keeps results deterministic, so
        # a caller slicing the list gets the same articles on every run.
        seen = set()
        unique_links = []
        for link in links:
            if link not in seen:
                seen.add(link)
                unique_links.append(link)
        return unique_links


class PravdaClient(NewsClientBase):
    """HTTP client configured for pravda.com.ua in the given language."""

    def __init__(self, language):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(PravdaClient, self).__init__(PravdaConfig.get_base_url(language), PravdaConfig.archive_url)

    def get_archive_url(self, date):
        """Return the archive URL for ``date``: archive root + 'date_' + DDMMYYYY."""
        return self.archive_url + 'date_' + date.strftime("%d%m%Y")

       
class PravdaRequestManager(NewsRequestManagerBase):
    """Fetches and parses pravda.com.ua articles for a given date."""

    def __init__(self, language='rus'):
        client = PravdaClient(language=language)
        news_page_parser = PravdaNewsPageParser()
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(PravdaRequestManager, self).__init__(client, news_page_parser)
        self._archive_page_parser = PravdaArchivePageParser(language=language)

    def get_news(self, date, limit = 0):
        """Return the parsed articles listed on the archive page for ``date``.

        ``limit`` caps the number of articles fetched; a value <= 0 means
        no cap. All pages are downloaded before any of them is parsed.
        """
        archive_page = self.client.get_archive_page(date)
        urls = self._archive_page_parser.get_news_urls(archive_page)
        if limit > 0:
            urls = urls[:limit]
        # Keep each downloaded page paired with the URL it came from.
        pages_with_urls = [(self.client.get_article_page(url), url) for url in urls]
        return [self.news_page_parser.get_article(page, url) for page, url in pages_with_urls]


In [5]:
class RussiaTodayNewsPageParser(NewsPageParserBase):
    """Article-page parser for russian.rt.com.

    Each ``get_*`` method runs one XPath query keyed on ``itemprop``
    attributes. ``get_tags`` is not overridden, so the inherited
    implementation is used.
    """

    def __init__(self):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(RussiaTodayNewsPageParser, self).__init__()

    def get_article_text(self, tree):
        """Return the text nodes of the direct <p> children of the article body, space-joined."""
        paragraphs = tree.xpath('//div[@itemprop="articleBody"]/p/text()')
        return ' '.join(paragraphs)

    def get_title(self, tree):
        """Return the first title text node, or '' when the page has none."""
        title = tree.xpath('//h1[@itemprop="name"]/strong/text()')
        # Guard the lookup: indexing an empty result would raise IndexError.
        return title[0] if title else ''

    def get_summary(self, tree):
        """Return the first description text node, or '' when the page has none."""
        description = tree.xpath('//div[@itemprop="headline description"]/p/text()')
        return description[0] if description else ''

    def get_date_published(self, tree):
        """Return the publication time as a ``datetime``, or None when absent.

        Raises ValueError if the attribute does not match ``%Y-%m-%dT%H:%M``.
        """
        date_published = tree.xpath('//time[@itemprop="datePublished"]/@datetime')
        if not date_published:
            return None
        # xpath() returns a list; strptime needs the string itself.
        # NOTE(review): the format assumes minute precision with no timezone
        # suffix - confirm against the live markup.
        return datetime.datetime.strptime(date_published[0].strip(), "%Y-%m-%dT%H:%M")

        
class RussiaTodayArchivePageParser(NewsArchivePageParserBase):
    """Archive-page parser for russian.rt.com."""

    def __init__(self, base_url = 'http://russian.rt.com/' ):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(RussiaTodayArchivePageParser, self).__init__(base_url)

    def get_news_urls(self, page):
        """Return article URLs from ``page``, each prefixed with ``base_url``.

        Duplicates are removed and the remaining URLs keep the order in which
        they appear on the page.
        """
        # NOTE(review): base_url ends with '/'; if the hrefs also start with
        # '/', the result contains a double slash - confirm against the markup.
        return [self.base_url + url for url in self._get_news_block_urls(page)]

    def _get_news_block_urls(self, page):
        """Return the de-duplicated article hrefs of the #news section, in document order."""
        tree = html.fromstring(page)
        links = tree.xpath('//section[@id="news"]/article/noindex/a/@href')
        # An order-preserving de-duplication keeps results deterministic, so
        # a caller slicing the list gets the same articles on every run.
        seen = set()
        unique_links = []
        for link in links:
            if link not in seen:
                seen.add(link)
                unique_links.append(link)
        return unique_links


class RussianTodayClient(NewsClientBase):
    """HTTP client preconfigured for russian.rt.com."""

    def __init__(self):
        site_root = 'http://russian.rt.com/'
        # The archive lives under the 'all/' path of the site root.
        super(RussianTodayClient, self).__init__(site_root, site_root + 'all/')

    def get_archive_url(self, date):
        """Return the archive URL for ``date``: archive root + YYYY/MM/DD."""
        day_path = date.strftime("%Y/%m/%d")
        return self.archive_url + day_path

    
class RussiaTodayRequestManager(NewsRequestManagerBase):
    """Fetches and parses russian.rt.com articles for a given date."""

    def __init__(self):
        client = RussianTodayClient()
        news_page_parser = RussiaTodayNewsPageParser()
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(RussiaTodayRequestManager, self).__init__(client, news_page_parser)
        self._archive_page_parser = RussiaTodayArchivePageParser()

    def get_news(self, date, limit = 0):
        """Return the parsed articles listed on the archive page for ``date``.

        ``limit`` caps the number of articles fetched; a value <= 0 means
        no cap. All pages are downloaded before any of them is parsed.
        """
        archive_page = self.client.get_archive_page(date)
        urls = self._archive_page_parser.get_news_urls(archive_page)
        if limit > 0:
            urls = urls[:limit]
        pages = [self.client.get_article_page(url) for url in urls]
        # Pair every page with its own URL. The previous code referenced a
        # `url` name that the comprehension never bound, so every article got
        # the last URL on Python 2 and a NameError on Python 3.
        return [self.news_page_parser.get_article(page, url) for page, url in zip(pages, urls)]

    
# Module-level manager instance. Construction only builds the client and
# parser objects; no HTTP request is made until get_news() is called.
rt_manager = RussiaTodayRequestManager()

In [6]:
# rt_news = rt_manager.get_news(datetime.datetime.now())
# print len(rt_news)

In [7]:
# Scratch cell (Python 2 print statements): converts the Russian month name in
# a sample "weekday, day month year, time" string into its month number.
date_published = u'Воскресенье, 28 июня 2015, 04:42'
# First split is only used to obtain the weekday (element 0).
date = date_published.split(', ')
# Removes the weekday text only; the ', ' separator that followed it stays,
# so the string now starts with ', ' (visible in the printed output).
date_published = date_published.replace(date[0], '')
# Genitive-case Russian month names, January..December; enumerate(months, 1)
# below turns a position in this tuple into a 1-based month number.
months = (u'января', u'февраля', u'марта', u'апреля', u'мая', u'июня', u'июля', u'августа', u'сентября', u'октября', u'ноября', u'декабря')
# Lookup against a hard-coded month; `month` is not read again in this cell.
# Note that `name in <string>` is a substring test, not an equality test.
month = next(i for i,name in enumerate(months,1) if name in u'мая')
# Re-split after the weekday was removed: element 0 is now the empty string
# and element 1 is the "day month year" part.
date = date_published.split(', ')
# Second space-separated token of "day month year" is the month name.
ru_month =  date[1].split(' ')[1]
print ru_month
# First month whose name occurs inside ru_month; next() raises StopIteration
# when no month name matches.
numeric_month = next(i for i,name in enumerate(months,1) if name in ru_month)
print numeric_month
# Prints the string with the month name replaced by its number; the result is
# not stored, so date_published itself is unchanged (see the next print).
print date_published.replace(ru_month, str(numeric_month))
print date_published

# Bare last expression: the notebook displays the tuple's repr.
months

июня
6
, 28 6 2015, 04:42
, 28 июня 2015, 04:42


(u'\u044f\u043d\u0432\u0430\u0440\u044f',
 u'\u0444\u0435\u0432\u0440\u0430\u043b\u044f',
 u'\u043c\u0430\u0440\u0442\u0430',
 u'\u0430\u043f\u0440\u0435\u043b\u044f',
 u'\u043c\u0430\u044f',
 u'\u0438\u044e\u043d\u044f',
 u'\u0438\u044e\u043b\u044f',
 u'\u0430\u0432\u0433\u0443\u0441\u0442\u0430',
 u'\u0441\u0435\u043d\u0442\u044f\u0431\u0440\u044f',
 u'\u043e\u043a\u0442\u044f\u0431\u0440\u044f',
 u'\u043d\u043e\u044f\u0431\u0440\u044f',
 u'\u0434\u0435\u043a\u0430\u0431\u0440\u044f')