In [1]:
import re
import json
import datetime

# Load the raw letters from disk. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so non-ASCII letter text is decoded the same way on
# every platform instead of depending on the locale default.
with open("raw_letters.json", 'r', encoding="utf-8") as raw_letters_file:
    raw_letters = json.load(raw_letters_file)

In [2]:
# Pattern for an expiry phrase: one of two prefix words, a space, and then either
#   * "<1-2 digit day> <month name>", or
#   * a numeric "d.m" date with an optional ".yy" / ".yyyy" year.
# The separator in front of the year is an escaped literal dot; an unescaped "."
# would accept any character there and swallow e.g. the hour of "15.07 23:59".
timestamp_regex = r"(–ø–æ|–¥–æ) (((\d\d?) (—è–Ω–≤–∞—Ä—è|—Ñ–µ–≤—Ä–∞–ª—è|–º–∞—Ä—Ç–∞|–∞–ø—Ä–µ–ª—è|–º–∞—è|–∏—é–Ω—è|–∏—é–ª—è|–∞–≤–≥—É—Å—Ç–∞|—Å–µ–Ω—Ç—è–±—Ä—è|–æ–∫—Ç—è–±—Ä—è|–Ω–æ—è–±—Ä—è|–¥–µ–∫–∞–±—Ä—è))|(\d\d?\.\d\d?(\.(\d\d)?\d\d)?))"

# Month-name token (as it appears in timestamp_regex) -> two-digit month number.
month_to_number = {"—è–Ω–≤–∞—Ä—è":"01", "—Ñ–µ–≤—Ä–∞–ª—è":"02", "–º–∞—Ä—Ç–∞":"03",
                   "–∞–ø—Ä–µ–ª—è":"04", "–º–∞—è":"05", "–∏—é–Ω—è":"06", "–∏—é–ª—è":"07", "–∞–≤–≥—É—Å—Ç–∞":"08",
                   "—Å–µ–Ω—Ç—è–±—Ä—è":"09", "–æ–∫—Ç—è–±—Ä—è":"10", "–Ω–æ—è–±—Ä—è":"11", "–¥–µ–∫–∞–±—Ä—è":"12"}

def extract_date(text, start, end, default_year=2018):
    """Convert the matched expiry phrase ``text[start:end]`` into a ``d.m.Y`` string.

    Args:
        text: Full text in which the phrase was found.
        start: Start offset of the match within ``text``.
        end: End offset of the match within ``text``.
        default_year: Year used when the phrase carries none; its leading
            digits are also used as the century for a two-digit year.

    Returns:
        The date as dot-separated day, month and four-digit year, suitable for
        ``strptime(..., "%d.%m.%Y")``. Day and month are not zero-padded
        beyond what the phrase itself contains.
    """
    # Drop the first three characters of the match.
    # NOTE(review): presumably a two-character prefix word plus the following
    # space - confirm against timestamp_regex.
    str_date = text[start:end][3:]
    for month_name, month_number in month_to_number.items():
        str_date = str_date.replace(month_name, month_number)

    # Both accepted spellings ("<day> <month>" and "d.m[.yy[yy]]") reduce to
    # numeric fields separated by whitespace and/or dots.
    parts = str_date.replace('.', ' ').split()

    # Keep only the trailing three fields if more than three were produced
    # (same choice the previous implementation made for surplus dots).
    if len(parts) > 3:
        parts = parts[-3:]

    if len(parts) == 2:
        # No year in the phrase (this used to be missed for the "d.m" form).
        parts.append(str(default_year))
    elif len(parts) == 3 and len(parts[2]) == 2:
        # Two-digit year: "%Y" needs four digits, so borrow the century.
        parts[2] = str(default_year)[:-2] + parts[2]

    return '.'.join(parts)

def find_discounts(text):
    """Find percentage discounts mentioned in ``text``.

    Args:
        text: Text to scan for one- or two-digit numbers followed by ``%``.

    Returns:
        ``(max_discount, count)`` where ``max_discount`` is the matched token
        (e.g. ``"10%"``) with the largest numeric value and ``count`` is the
        number of distinct matched tokens; ``(0, 0)`` when nothing matches.
    """
    # NOTE: the pattern takes at most two digits, so for a longer number only
    # its last two digits are matched (e.g. "00%" out of "100%").
    discount_regex = r"\d\d?%"
    discount_match = re.findall(discount_regex, text)
    if not discount_match:
        return 0, 0

    # Compare by numeric value; plain max() on the strings is lexicographic
    # and would rank "5%" above "10%".
    max_discount = max(discount_match, key=lambda token: int(token[:-1]))
    count = len(set(discount_match))
    return max_discount, count

def get_letter_data(raw_letter):
    """Build the offer summary for one raw letter.

    Reads ``raw_letter["date"]`` (a timestamp passed to
    ``datetime.fromtimestamp``) and ``raw_letter["body"]["text"]``.

    Returns:
        ``None`` when the body contains no match for ``timestamp_regex``;
        otherwise a dict with keys ``"from"`` (letter date as ``dd.mm.YYYY``),
        ``"to"`` (result of ``extract_date``), ``"text"`` (a formatted line
        embedding the number of distinct discounts) and ``"discount"`` (the
        largest discount reported by ``find_discounts``).
    """
    sent_on = datetime.datetime.fromtimestamp(raw_letter["date"])
    from_date = sent_on.strftime('%d.%m.%Y')

    body_text = raw_letter["body"]["text"]
    found = re.search(timestamp_regex, body_text)
    if not found:
        # No expiry phrase: the letter carries no usable offer.
        return None

    match_start, match_end = found.span()
    to_date = extract_date(body_text, match_start, match_end)
    discount, count = find_discounts(body_text)
    summary = "–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–µ–∫—É—â–∏—Ö –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π: {}".format(count)
    return {"from": from_date, "to": to_date, "text": summary, "discount": discount}


def get_direct_letter_link(raw_letter):
    """Return the message URL formed by appending ``raw_letter["id"]`` to the base URL."""
    base_url = "https://e.mail.ru/message/"
    letter_id = raw_letter["id"]
    return base_url + letter_id

def get_company_info(raw_letter):
    """Return ``(company, picture)`` taken from the letter's ``"from"`` entry.

    ``company`` is ``from.name`` and ``picture`` is the value stored under
    ``from.avatars["50x50"]``.
    """
    sender = raw_letter["from"]
    return sender["name"], sender["avatars"]["50x50"]

def get_labels(raw_letter):
    """Return the labels whose keyword occurs in the lower-cased letter body.

    Each keyword is looked up as a plain substring of
    ``raw_letter["body"]["text"].lower()``; labels come back in the fixed
    order in which the keywords are listed below.
    """
    lowered_body = raw_letter["body"]["text"].lower()
    keyword_labels = (
        ("–ø—Ä–æ–º–æ–∫–æ–¥", "–ï—Å—Ç—å –ø—Ä–æ–º–æ–∫–æ–¥"),
        ("–Ω–∞ –≤—Å–µ", "–ù–∞ –≤—Å–µ —Ç–æ–≤–∞—Ä—ã"),
    )
    return [label for keyword, label in keyword_labels if keyword in lowered_body]

In [3]:
def process_letters(raw_letters):
    """Convert raw letters into summary dicts and keep only the current offers.

    For every raw letter a dict with the keys ``company``, ``picture``,
    ``data``, ``link``, ``subject`` and ``labels`` is assembled from the helper
    functions; the resulting list is then passed through ``filter_dates``.
    """
    letters = []
    for raw_letter in raw_letters:
        company, picture = get_company_info(raw_letter)
        letters.append({
            "company": company,
            "picture": picture,
            "data": get_letter_data(raw_letter),
            "link": get_direct_letter_link(raw_letter),
            "subject": raw_letter["subject"],
            "labels": get_labels(raw_letter),
        })
    return filter_dates(letters)

def filter_dates(letters):
    """Keep only letters that describe an offer which has not expired yet.

    A letter is kept when its ``company``, ``data`` and ``data["discount"]``
    are all truthy and ``data["to"]`` parses as ``dd.mm.YYYY`` to a date that
    is today or later.

    Args:
        letters: Iterable of letter dicts as built by ``process_letters``.

    Returns:
        A new list with the letters that passed, in their original order.
    """
    today = datetime.date.today()

    def is_current(letter):
        data = letter["data"]
        if not (letter["company"] and data and data["discount"]):
            return False
        try:
            expires_on = datetime.datetime.strptime(data["to"], "%d.%m.%Y").date()
        except ValueError:
            # An unparseable end date cannot be shown as a valid offer; skip
            # this letter instead of aborting the whole run.
            return False
        # Compare calendar dates so the offer stays listed through its last
        # day; comparing midnight of that day with "now" dropped it a day early.
        return expires_on >= today

    return [letter for letter in letters if is_current(letter)]

def order_by_companies(letters):
    """Group letters by their ``company`` value.

    Args:
        letters: Iterable of dicts that each have a ``"company"`` key.

    Returns:
        A dict mapping each company to the list of its letters, in input
        order. Companies are inserted in order of first appearance, so the
        result no longer depends on set iteration order, which varies between
        runs for strings.
    """
    ordered = {}
    for letter in letters:
        ordered.setdefault(letter["company"], []).append(letter)
    return ordered

In [4]:
# Run the whole pipeline on the loaded letters and store the result as JSON.
letters = process_letters(raw_letters)
with open("letters.json", mode='w') as output_file:
    json.dump(letters, fp=output_file)

In [5]:
# Show the processed letters as this cell's output.
letters

[{'company': 'E96.ru',
  'data': {'discount': '5%',
   'from': '12.07.2018',
   'text': '–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–µ–∫—É—â–∏—Ö –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π: 1',
   'to': '16.07.2018'},
  'link': 'https://e.mail.ru/message/15313800380000002802',
  'picture': 'https://filin.mail.ru/pic?user=smartmail_team3@mail.ru&email=e96@e.e96.ru&trust=true&sign=f7fb79db0ac9de18f980623f49824dd4bbc4cbc9&width=50&height=50',
  'subject': 'üéÅ –ü–æ–ª—É—á–∏—Ç–µ —Å–∫–∏–¥–∫—É –∫–æ –¥–Ω—é —Ñ–æ—Ç–æ–≥—Ä–∞—Ñ–∞!'},
 {'company': 'MediaMarkt',
  'data': {'discount': '90%',
   'from': '06.07.2018',
   'text': '–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–µ–∫—É—â–∏—Ö –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π: 1',
   'to': '20.07.2018'},
  'link': 'https://e.mail.ru/message/15308729480000002683',
  'picture': 'https://filin.mail.ru/pic?user=smartmail_team3@mail.ru&email=help@mediamarktsupport.ru&trust=true&sign=b709f9f1627625765ce566b314b4f9d201dcf7b9&width=50&height=50',
  'subject': 'üî• –°–ö–ò–î–ö–ò –¥–æ 90%! üîî –õ–ò–ö–í–ò–î–ê–¶–ò–Ø –≤ —Å–≤—è–∑–∏ c –∑–∞–∫—Ä

In [6]:
# Scratch check: parse a sample date string with the "%d.%m.%Y" format.
datetime.datetime.strptime('16.03.2018', "%d.%m.%Y")

datetime.datetime(2018, 3, 16, 0, 0)