In [1]:
import re
import nltk
import requests

from datetime import datetime, timedelta
from lxml import html

In [3]:
class PravdaRussianDateParser(object):
    """Converts Russian-language date strings into ``datetime`` objects.

    The Russian month name is swapped for its English equivalent, every
    remaining non-ASCII character is dropped, and the result is parsed with
    ``datetime.strptime``.
    """

    # Both lists are in calendar order: months_rus[i] corresponds to
    # months_eng[i], and i + 1 is the month number.
    months_rus = [u'января', u'февраля', u'марта', u'апреля', u'мая',
                  u'июня', u'июля', u'августа',
                  u'сентября', u'октября', u'ноября', u'декабря']
    months_eng = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
                  'August', 'September', 'October', 'November', 'December']

    def __init__(self):
        pass

    @staticmethod
    def get_month_number(month):
        """Return the 1-based month number for a month name.

        ``month`` may be one of the names in ``months_rus`` or ``months_eng``.

        Raises:
            ValueError: if ``month`` is in neither list.
        """
        for names in (PravdaRussianDateParser.months_rus,
                      PravdaRussianDateParser.months_eng):
            if month in names:
                return names.index(month) + 1
        raise ValueError('unknown month name: %r' % (month,))

    @staticmethod
    def get_date_time(date_time):
        """Parse a date string containing a Russian month name.

        After translation and ASCII filtering the text must match
        ``", %d %B %Y, %H:%M"``.
        NOTE(review): the leading ", " presumably remains from a weekday name
        that the ASCII filter removes - confirm against the live page markup.

        Returns:
            A ``datetime`` on success. On a format mismatch the filtered
            string is printed and ``None`` is returned.
        """
        translated = date_time
        for rus_name, eng_name in zip(PravdaRussianDateParser.months_rus,
                                      PravdaRussianDateParser.months_eng):
            translated = translated.replace(rus_name, eng_name)

        # Drop whatever is still non-ASCII, then decode back to text so that
        # strptime receives a string on both Python 2 and Python 3.
        ascii_only = translated.encode('ascii', 'ignore').decode('ascii')

        try:
            return datetime.strptime(ascii_only, ", %d %B %Y, %H:%M")
        except ValueError:
            # Best effort: surface the unparsable text and carry on.
            print(ascii_only)
            return None

In [4]:
class Article:
    """Plain record describing one scraped news article.

    ``url``, ``title``, ``text``, ``tags`` and ``published_at`` are stored
    as given. ``domain`` and ``summary`` are optional keyword arguments that
    default to an empty string; any other keyword argument is ignored.
    """

    def __init__(self, url, title, text, tags, published_at, **kwargs):
        # Optional extras, falling back to '' when not supplied.
        self.domain = kwargs['domain'] if 'domain' in kwargs else ''
        self.summary = kwargs['summary'] if 'summary' in kwargs else ''

        # Mandatory fields.
        self.url, self.title, self.text = url, title, text
        self.tags, self.published_at = tags, published_at

class NewsPageParserBase(object):
    """Skeleton for site-specific article page parsers.

    ``get_article`` drives the extraction; the ``get_*`` hooks below return
    ``None`` here and are meant to be overridden by subclasses.
    """

    def __init__(self):
        pass

    def get_article(self, page, url):
        """Parse raw ``page`` markup and return an ``Article`` for ``url``."""
        parsed = self.get_page_tree(page)
        # Call arguments are evaluated left to right, so the hooks run in the
        # order: title, text, tags, date, summary.
        return Article(url,
                       self.get_title(parsed),
                       self.get_article_text(parsed),
                       self.get_tags(parsed),
                       self.get_date_published(parsed),
                       summary=self.get_summary(parsed))

    def get_page_tree(self, page):
        """Build an lxml HTML tree from the raw page content."""
        return html.fromstring(page)

    def get_article_text(self, tree):
        """Hook: article body text. Returns ``None`` in the base class."""
        return None

    def get_title(self, tree):
        """Hook: article title. Returns ``None`` in the base class."""
        return None

    def get_tags(self, tree):
        """Hook: article tags. Returns ``None`` in the base class."""
        return None

    def get_date_published(self, tree):
        """Hook: publication date. Returns ``None`` in the base class."""
        return None

    def get_summary(self, tree):
        """Hook: article summary. Returns ``None`` in the base class."""
        return None
    
class NewsArchivePageParserBase(object):
    """Skeleton for parsers that pull article links out of an archive page."""

    def __init__(self, base_url):
        # Stored for subclasses, which prefix it to the links they extract.
        self.base_url = base_url

    def get_news_urls(self, page):
        """Hook: article URLs found in ``page``. Returns ``None`` here."""
        return None
    
class NewsClientBase(object):
    """Minimal HTTP client for a news site.

    Subclasses supply ``get_archive_url``; the base implementation of that
    hook returns ``None``.
    """

    def __init__(self, base_url, archive_url):
        self.base_url, self.archive_url = base_url, archive_url

    def get_article_page(self, url):
        """GET ``url`` and return the raw response body."""
        return requests.get(url).content

    def get_archive_page(self, date):
        """GET the archive listing for ``date`` and return the raw body."""
        return requests.get(self.get_archive_url(date)).content

    def get_archive_url(self, date):
        """Hook: archive URL for ``date``. Returns ``None`` here."""
        return None
    
class NewsRequestManagerBase(object):
    """Ties a client, an article parser and an archive parser together."""

    def __init__(self, client, news_page_parser, archive_page_parser):
        self.client = client
        self.archive_page_parser = archive_page_parser
        self.news_page_parser = news_page_parser

    def get_news(self, date, limit = 0):
        """Return the parsed articles listed on the archive page for ``date``.

        A ``limit`` greater than zero keeps only the first ``limit`` URLs;
        zero or a negative value means no limit.
        """
        listing = self.client.get_archive_page(date)
        article_urls = self.archive_page_parser.get_news_urls(listing)
        if not limit <= 0:
            article_urls = article_urls[:limit]

        # Phase 1: download every article page before any parsing starts.
        downloaded = []
        for article_url in article_urls:
            body = self.client.get_article_page(article_url)
            downloaded.append((article_url, body))

        # Phase 2: parse the downloaded pages in the same order.
        articles = []
        for article_url, body in downloaded:
            articles.append(self.news_page_parser.get_article(body, article_url))
        return articles

In [5]:
class PravdaConfig(object):
    """Static URL configuration for pravda.com.ua."""

    base_url = 'http://www.pravda.com.ua'
    archive_url = 'http://www.pravda.com.ua/archives/'

    @staticmethod
    def get_base_url(language='ua'):
        """Return the site root for ``language``.

        Only ``'rus'`` gets a suffix (``/rus``); every other value yields the
        plain ``base_url``.
        """
        suffix = '/rus' if language == 'rus' else ''
        return PravdaConfig.base_url + suffix

class PravdaNewsPageParser(NewsPageParserBase):
    """Extracts article fields from a Pravda article page tree."""

    def __init__(self):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(PravdaNewsPageParser, self).__init__()

    def get_article_text(self, tree):
        """Return the article body: direct text nodes of every ``<p>`` under
        ``div.text``, joined with single spaces."""
        paragraphs = tree.xpath('//div[@class="text"]//p/text()')
        return ' '.join(paragraphs)

    def get_title(self, tree):
        """Return the text of the first ``h1.title``, or ``''`` if absent."""
        title = tree.xpath('//h1[@class="title"]/text()')
        return title[0] if len(title) > 0 else ''

    def get_tags(self, tree):
        """Return the link texts found inside ``p.tags`` as a list."""
        return tree.xpath('//p[@class="tags"]//a/text()')

    def get_date_published(self, tree):
        """Return the publication ``datetime`` taken from ``span.dt2``.

        Returns ``None`` when the span is missing; otherwise returns whatever
        ``PravdaRussianDateParser.get_date_time`` produces for its text.
        """
        date_published = tree.xpath('//span[@class="dt2"]/text()')
        if len(date_published) == 0:
            return None

        # Leading whitespace would break the parser's fixed format string.
        return PravdaRussianDateParser.get_date_time(date_published[0].lstrip())
    
class PravdaArchivePageParser(NewsArchivePageParserBase):
    """Extracts article URLs from a Pravda archive page."""

    def __init__(self, language):
        # Relative links accepted as articles: "/news/" followed by one or
        # more all-digit path segments, with a trailing slash.
        self._news_relative_urls_regex = r'^\/news(\/\d{1,}){1,}\/$'
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(PravdaArchivePageParser, self).__init__(PravdaConfig.get_base_url(language))

    def get_news_urls(self, page):
        """Return absolute article URLs found in ``page``.

        Links are kept only if they match the news-URL pattern. The result
        has no duplicates and follows the order of first appearance on the
        page.
        """
        urls = self._get_news_block_urls(page)
        return [self.base_url + url
                for url in urls
                if re.match(self._news_relative_urls_regex, url)]

    def _get_news_block_urls(self, page):
        """Return the distinct ``href`` values under ``dl.news4`` in page order."""
        tree = html.fromstring(page)
        links = tree.xpath('//dl[@class="news4"]//a/@href')

        # Order-preserving de-duplication. A plain set() would shuffle the
        # links, making a `limit` applied downstream pick arbitrary articles.
        seen = set()
        unique_links = []
        for link in links:
            if link not in seen:
                seen.add(link)
                unique_links.append(link)
        return unique_links

class PravdaClient(NewsClientBase):
    """HTTP client configured with the Pravda base and archive URLs."""

    def __init__(self, language):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(PravdaClient, self).__init__(PravdaConfig.get_base_url(language),
                                           PravdaConfig.archive_url)

    def get_archive_url(self, date):
        """Return the archive URL for ``date``: ``<archive_url>date_DDMMYYYY``."""
        return self.archive_url + 'date_' + date.strftime("%d%m%Y")
       
class PravdaRequestManager(NewsRequestManagerBase):
    """Request manager wired with the Pravda client and parsers."""

    def __init__(self, language='rus'):
        client = PravdaClient(language=language)
        news_page_parser = PravdaNewsPageParser()
        archive_page_parser = PravdaArchivePageParser(language=language)
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(PravdaRequestManager, self).__init__(client, news_page_parser, archive_page_parser)

In [17]:
# pravda_rm = PravdaRequestManager()
# news = pravda_rm.get_news(datetime.now() - timedelta(days=3))
# print len(news)

# base = datetime.today()
# date_list = [base - timedelta(days=x) for x in range(0, 10)]

# news = []
# for day in date_list:
#     print day
#     articles = pravda_rm.get_news(day)
#     news = news + articles
    
# articles = [article.__dict__ for article in news]
# len(articles)  

78


In [6]:
class RussiaTodayNewsPageParser(NewsPageParserBase):
    """Extracts article fields from an RT (russian.rt.com) article page tree."""

    def __init__(self):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(RussiaTodayNewsPageParser, self).__init__()

    def get_article_text(self, tree):
        """Return the article body: direct text nodes of each ``<p>`` that is
        a child of the ``articleBody`` div, joined with single spaces."""
        paragraphs = tree.xpath('//div[@itemprop="articleBody"]/p/text()')
        return ' '.join(paragraphs)

    def get_title(self, tree):
        """Return the first ``h1[itemprop=name]/strong`` text, or ``''``."""
        title = tree.xpath('//h1[@itemprop="name"]/strong/text()')
        return title[0] if len(title) > 0 else ''

    def get_summary(self, tree):
        """Return the first paragraph text of the headline-description div,
        or ``''`` if there is none."""
        description = tree.xpath('//div[@itemprop="headline description"]/p/text()')
        return description[0] if len(description) > 0 else ''

    def get_date_published(self, tree):
        """Return the publication ``datetime`` from the ``<time>`` element.

        Returns ``None`` when the ``datetime`` attribute is missing or empty.

        Raises:
            ValueError: if the attribute does not match ``%Y-%m-%dT%H:%M``.
        """
        date_published = tree.xpath('//time[@itemprop="datePublished"]/@datetime')
        date_published = date_published[0] if len(date_published) > 0 else ''
        if date_published == '':
            return None

        return datetime.strptime(date_published, "%Y-%m-%dT%H:%M")
        
class RussiaTodayArchivePageParser(NewsArchivePageParserBase):
    """Extracts article URLs from an RT archive page."""

    def __init__(self, base_url = 'http://russian.rt.com/' ):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(RussiaTodayArchivePageParser, self).__init__(base_url)

    def get_news_urls(self, page):
        """Return article URLs found in ``page``, each prefixed with
        ``base_url``; no duplicates, in order of first appearance."""
        urls = self._get_news_block_urls(page)
        return [self.base_url + url for url in urls]

    def _get_news_block_urls(self, page):
        """Return the distinct article ``href`` values of the news section,
        in page order."""
        tree = html.fromstring(page)
        links = tree.xpath('//section[@id="news"]/article/noindex/a/@href')

        # Order-preserving de-duplication. A plain set() would shuffle the
        # links, making a `limit` applied downstream pick arbitrary articles.
        seen = set()
        unique_links = []
        for link in links:
            if link not in seen:
                seen.add(link)
                unique_links.append(link)
        return unique_links

class RussianTodayClient(NewsClientBase):
    """HTTP client configured with the RT site root and its ``all/`` archive."""

    def __init__(self):
        site_root = 'http://russian.rt.com/'
        super(RussianTodayClient, self).__init__(site_root, site_root + 'all/')

    def get_archive_url(self, date):
        """Return the archive URL for ``date``: ``<archive_url>YYYY/MM/DD``."""
        day_path = date.strftime("%Y/%m/%d")
        return self.archive_url + day_path
    
class RussiaTodayRequestManager(NewsRequestManagerBase):
    """Request manager wired with the RT client and parsers."""

    def __init__(self):
        client = RussianTodayClient()
        news_page_parser = RussiaTodayNewsPageParser()
        archive_page_parser = RussiaTodayArchivePageParser()
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(RussiaTodayRequestManager, self).__init__(client, news_page_parser, archive_page_parser)
    
# Notebook-level RT manager instance; the (currently commented-out) fetch
# cell that follows refers to it by this name. Constructing it only stores
# the client and parser objects.
rt_manager = RussiaTodayRequestManager()

In [100]:
# news = rt_manager.get_news(datetime.now())
# print len(news)

22
