In [125]:
import re
import json
import datetime
import requests

# Load the raw letters exported to raw_letters.json.
# The encoding is given explicitly: without it open() uses the locale's default
# encoding, so non-ASCII (e.g. Cyrillic) text in the file could be decoded
# incorrectly or fail to decode on systems whose locale is not UTF-8.
with open("raw_letters.json", 'r', encoding="utf-8") as raw_letters_file:
    raw_letters = json.load(raw_letters_file)

In [132]:
# Deadline phrase: "по"/"до", a space, then either "<day> <month name>" or a
# numeric "dd.mm" date with an optional ".yy" / ".yyyy" year.
# The dot before the optional year is escaped on purpose: a bare "." matches any
# character and would swallow unrelated digits after the date
# (e.g. the "10" in "до 15.07 10 товаров") as if they were a year.
timestamp_regex = r"(по|до) (((\d\d?) (января|февраля|марта|апреля|мая|июня|июля|августа|сентября|октября|ноября|декабря))|(\d\d?\.\d\d?(\.(\d\d)?\d\d)?))"

# Russian month names, in the form they take after a day number, mapped to
# two-digit month numbers.
month_to_number = {"января":"01", "февраля":"02", "марта":"03",
                   "апреля":"04", "мая":"05", "июня":"06", "июля":"07", "августа":"08",
                   "сентября":"09", "октября":"10", "ноября":"11", "декабря":"12"}

def extract_date(text, start, end, default_year="2018"):
    """Normalise the deadline found at text[start:end] to a "d.m.yyyy" string.

    Args:
        text: Text containing the deadline phrase.
        start: Start index of the phrase (a ``timestamp_regex`` match start).
        end: End index of the phrase (a ``timestamp_regex`` match end).
        default_year: Year used when the phrase does not contain one. Defaults
            to "2018", the value that used to be hard-coded here.

    Returns:
        The date as a dot-separated string. The day is not zero-padded. Input
        that does not look like a date is returned essentially unchanged.
    """
    year = str(default_year)

    # Drop the three-character prefix ("по " / "до ").
    str_date = text[start:end][3:]
    for month_name, month_number in month_to_number.items():
        str_date = str_date.replace(month_name, month_number)

    words = str_date.split()
    if len(words) == 2:
        # "<day> <month>" form: join with dots and add the year.
        str_date = ".".join(words + [year])
    elif len(words) == 1 and str_date.count('.') == 1:
        # Numeric "dd.mm" without a year: add the year so that the result can
        # be parsed with the "%d.%m.%Y" format.
        str_date = str_date + "." + year

    parts = str_date.split('.')
    if len(parts) > 3:
        # Keep only the last three dot-separated components.
        parts = parts[-3:]
    if len(parts) == 3 and len(parts[-1]) == 2 and parts[-1].isdigit():
        # Two-digit year ("18") -> four digits ("2018").
        parts[-1] = "20" + parts[-1]
    return '.'.join(parts)

def find_discounts(text):
    """Find percentage discounts ("5%", "30%", ...) mentioned in ``text``.

    Args:
        text: Text to scan.

    Returns:
        A tuple ``(max_discount, count)``. ``max_discount`` is the largest
        discount as it appears in the text (e.g. "30%") and ``count`` is the
        number of distinct discount strings found. Returns ``(0, 0)`` when the
        text contains no discount.
    """
    # The lookbehind stops the pattern from matching the tail of a longer
    # number: without it "100%" is reported as a "00%" discount.
    discount_regex = r"(?<!\d)\d\d?%"
    discount_match = re.findall(discount_regex, text)
    if not discount_match:
        return 0, 0

    # Compare numerically: a plain max() on strings is lexicographic and
    # would rank "5%" above "30%".
    max_discount = max(discount_match, key=lambda found: int(found[:-1]))
    count = len(set(discount_match))
    return max_discount, count

def get_letter_data(raw_letter):
    """Build the offer summary for one raw letter.

    Reads ``raw_letter["date"]`` (passed to ``datetime.fromtimestamp``) and
    ``raw_letter["body"]["text"]``.

    Returns:
        ``None`` when the body contains no match for ``timestamp_regex``.
        Otherwise a dict with the keys:
            "from": the letter date formatted as dd.mm.yyyy,
            "to": the deadline produced by ``extract_date``,
            "text": a summary line with the count from ``find_discounts``,
            "discount": the discount returned by ``find_discounts``.
    """
    sent_on = datetime.datetime.fromtimestamp(raw_letter["date"]).strftime('%d.%m.%Y')
    body = raw_letter["body"]["text"]

    deadline_match = re.search(timestamp_regex, body)
    if not deadline_match:
        # No deadline phrase in the letter: nothing to report.
        return None

    deadline = extract_date(body, *deadline_match.span())
    best_discount, offers_count = find_discounts(body)
    summary = "Количество текущих предложений: {}".format(offers_count)
    return {"from": sent_on, "to": deadline, "text": summary, "discount": best_discount}


def get_direct_letter_link(raw_letter):
    """Return the "https://e.mail.ru/message/<id>" URL for ``raw_letter``.

    ``raw_letter["id"]`` must be a string; it is appended to the URL prefix
    as is.
    """
    message_id = raw_letter["id"]
    return "".join(("https://e.mail.ru/message/", message_id))

import base64
from io import BytesIO

def get_company_info(raw_letter):
    """Return the sender's name and a rewritten avatar URL.

    Reads ``raw_letter["from"]["name"]`` and
    ``raw_letter["from"]["avatars"]["default"]``. The avatar URL's query string
    is reduced to the arguments containing "email=", "width=" or "height=" and
    re-attached to the fixed base "https://conf-photos.ru/pic?".

    Returns:
        A tuple ``(company, picture)``.
    """
    sender = raw_letter["from"]
    company = sender["name"]
    avatar_url = sender["avatars"]["default"]

    # partition() never fails to unpack: a URL without "?" yields an empty
    # query, and any further "?" characters simply stay inside the query.
    # Unpacking split("?") raised ValueError in both of these cases.
    _, _, query = avatar_url.partition("?")
    kept_args = [
        arg for arg in query.split("&")
        if "email=" in arg or "width=" in arg or "height=" in arg
    ]
    picture = "https://conf-photos.ru/pic?" + "&".join(kept_args)

    return company, picture

def get_labels(raw_letter):
    """Return the labels that apply to the letter's body text.

    The body (``raw_letter["body"]["text"]``) is lower-cased and searched for
    fixed phrases; each phrase found contributes its label, in the order the
    rules are listed below.
    """
    lowered_body = raw_letter["body"]["text"].lower()
    # (phrase to look for, label to attach)
    label_rules = (
        ("на все", "На все товары"),
        ("промокод", "Есть промокод"),
    )
    return [label for phrase, label in label_rules if phrase in lowered_body]

In [133]:
def process_letters(raw_letters):
    """Turn raw letters into the final list of offers.

    Each raw letter is converted to a dict with the keys "company", "picture",
    "data", "link", "subject" and "labels". The resulting list is then passed
    through ``filter_dates``, ``order_by_companies`` and ``merge_discounts``,
    in that order, and the output of the last step is returned.
    """
    def build_letter(raw_letter):
        # Dict values are evaluated top to bottom, so the helpers run in the
        # order the keys are listed.
        company, picture = get_company_info(raw_letter)
        return {
            "company": company,
            "picture": picture,
            "data": get_letter_data(raw_letter),
            "link": get_direct_letter_link(raw_letter),
            "subject": raw_letter["subject"],
            "labels": get_labels(raw_letter),
        }

    collected = [build_letter(raw_letter) for raw_letter in raw_letters]
    current = filter_dates(collected)
    by_company = order_by_companies(current)
    return merge_discounts(by_company)

def filter_dates(letters):
    """Keep only letters that describe an offer which has not expired yet.

    A letter is kept when it has a truthy "company", a truthy "data" dict with
    a truthy "discount", and a "to" deadline (format dd.mm.yyyy) that is today
    or later.

    Args:
        letters: List of letter dicts with "company" and "data" keys.

    Returns:
        A new list with the letters that passed the checks, in input order.
    """
    today = datetime.datetime.now().date()

    def is_current(letter):
        if not (letter["company"] and letter["data"] and letter["data"]["discount"]):
            return False
        try:
            deadline = datetime.datetime.strptime(letter["data"]["to"], "%d.%m.%Y").date()
        except (TypeError, ValueError):
            # A deadline that cannot be parsed cannot be shown as current;
            # drop this letter instead of aborting the whole batch.
            return False
        # Compare calendar days: an offer valid through a given date is still
        # current on that date. Comparing the deadline's midnight with "now"
        # would drop it as soon as its last day starts.
        return deadline >= today

    return [letter for letter in letters if is_current(letter)]

def order_by_companies(letters):
    """Group letters by their "company" value.

    Args:
        letters: List of letter dicts, each with a "company" key.

    Returns:
        A dict mapping each company to the list of its letters. Companies
        appear in the order they are first seen in ``letters`` and each list
        keeps the input order, so the result is the same on every run
        (seeding the keys from a set made the order depend on string hashing).
    """
    ordered = {}
    for letter in letters:
        ordered.setdefault(letter["company"], []).append(letter)
    return ordered

def merge_discounts(letters):
    """Keep one letter per (company, deadline): the one with the largest discount.

    Args:
        letters: Dict mapping a company to a list of its letter dicts. Each
            letter needs ``["data"]["to"]`` (the deadline) and
            ``["data"]["discount"]`` (a string such as "50%").

    Returns:
        A flat list of the selected letters. Companies follow the dict's
        iteration order and, within a company, deadlines follow the order in
        which they first appear, so the output order is deterministic. When
        several letters share the largest discount the earliest one wins.
    """
    def percent(letter):
        # "50%" -> 50
        return int(letter["data"]["discount"][:-1])

    merged = []
    for discounts in letters.values():
        # One pass per company instead of re-filtering and sorting the whole
        # list for every deadline.
        best_by_end = {}
        for letter in discounts:
            end = letter["data"]["to"]
            best = best_by_end.get(end)
            if best is None or percent(letter) > percent(best):
                best_by_end[end] = letter
        merged.extend(best_by_end.values())
    return merged

In [134]:
# Run the pipeline over the loaded letters and store the result as JSON.
letters = process_letters(raw_letters)
with open("letters.json", "w") as output_file:
    json.dump(letters, fp=output_file)

1521147600.0

In [135]:
# Show the processed letters as this cell's output.
letters

[{'company': 'Reebok',
  'data': {'discount': '50%',
   'from': '28.06.2018',
   'text': 'Количество текущих предложений: 1',
   'to': '16.07.2018'},
  'labels': [],
  'link': 'https://e.mail.ru/message/15302083720000002549',
  'picture': 'https://conf-photos.ru/pic?email=reebok@reebok%2dnews.reebok.com&width=90&height=90',
  'subject': 'Бестселлеры распродажи | Успей купить, пока они есть в наличии'},
 {'company': 'Ivan Ivanov',
  'data': {'discount': '50%',
   'from': '15.07.2018',
   'text': 'Количество текущих предложений: 1',
   'to': '16.07.2018'},
  'labels': [],
  'link': 'https://e.mail.ru/message/15316554350000000392',
  'picture': 'https://conf-photos.ru/pic?email=smartmail_team3@mail.ru&width=90&height=90',
  'subject': 'Re: Бестселлеры распродажи | Успей купить, пока они есть в наличии'},
 {'company': 'MediaMarkt',
  'data': {'discount': '90%',
   'from': '06.07.2018',
   'text': 'Количество текущих предложений: 1',
   'to': '20.07.2018'},
  'labels': [],
  'link': 'https: