In [15]:
import json
import re
import requests
import sys
import os
# import time
# from sqlalchemy.ext.declarative import declarative_base
# from sqlalchemy import Column, BigInteger, String
# from sqlalchemy.orm import sessionmaker, scoped_session
# sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# from etl.simple_etl import load_config, TaskBase


# Base = declarative_base()
# Session = scoped_session(sessionmaker())

s = requests.Session()


HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Accept': 'application/json, text/plain, */*',
    'Cache-Control': 'no-cache',
    'Accept-encoding': 'gzip, deflate, sdch, br',
    'Accept-language': 'en-US,en;q=0.8,lt;q=0.6,ru;q=0.4'}
MAX_CATEGORY_DEPTH = 6
PRODUCT_GROUP_ID_REGEX = r'ProductGroupId\":\"([\da-f-]+)\"'
PRODUCTS_IMPORTED_TIMESTAMP_REGEX = r'ProductsImportedTimestamp\":\"([a-zA-Z0-9]+)\"'
SITECORE_PUBLISHED_STAMP_REGEX = r'SitecorePublishedStamp\":\"([a-zA-Z0-9_]+)\"'
TIMESLOT_UTC_REGEX = r'TimeslotUtc\":\"([0-9-]+)\"'

s = requests.Session()

class Product(Base):
    __tablename__ = 'nemlig_product'
    id = Column(BigInteger, primary_key=True)
    brand = Column(String(length=500))
    name = Column(String(length=500))
    url = Column(String(length=500))
    category1 = Column(String(length=500))
    category2 = Column(String(length=500))
    category3 = Column(String(length=500))
    category4 = Column(String(length=500))


# def initialize_nemlig_on_postgres(engine):
#     Product.__table__.drop(engine, checkfirst=True)
#     Product.__table__.create(engine, checkfirst=True)


class ImportNemligProducts(TaskBase):

    def __init__(self, engine):
        task_name = "ImportNemligProducts"
        config = load_config()
        config = config['NEMLIG']
        table_name = config["NEMLIG_PRODUCTS_TABLE"]
        Session.configure(bind=engine, autoflush=False, expire_on_commit=False)
        self.n_rows = 0
        self.bulk = []
        super().__init__(engine, task_name, table_name)

    def __persist__(self, product, cat1, cat2, cat3, cat4):
        dbproduct = Product()
        dbproduct.name = product['name']
        dbproduct.brand = product.get('brand', None)
        dbproduct.url = product['url']
        dbproduct.category1 = cat1
        dbproduct.category2 = cat2
        dbproduct.category3 = cat3
        dbproduct.category4 = cat4
        self.bulk.append(dbproduct)
        self.n_rows = self.n_rows + 1
        if self.n_rows % 1000 == 0:
            Session.bulk_save_objects(self.bulk)
            self.bulk.clear()
            Session.commit()


    def before(self):
        initialize_nemlig_on_postgres(self.engine)
        return 0

    def run(self):
        status = self.before()
        if status != 0:
            return status

        status = get_all_items(self.__persist__)
        if len(self.bulk) > 0:
            Session.bulk_save_objects(self.bulk)
            Session.commit()

        self.on_success()
        return status


def get_headers(referer):
    headers = HEADERS
    headers['Referer'] = referer
    return headers


def get_categories():
    response = s.get('https://www.nemlig.com/varer', headers=get_headers('https://www.nemlig.com'))
    products_imported_timestamp = get_products_imported_timestamp(response.text)
    sitecore_published_stamp = get_sitecore_published_stamp(response.text)
    timeslot_utc = get_timeslot_utc(response.text)
    categories_response = s.get('https://www.nemlig.com/webapi/{products_imported_timestamp}-{sitecore_published_stamp}/{timeslot_utc}/1/0/Menu/main?navigationDepth=15', headers=get_headers('https://www.nemlig.com/varer'))
    return json.loads(categories_response.text)


# def get_products_imported_timestamp(html: str):
#     products_imported_timestamp_match = re.search(PRODUCTS_IMPORTED_TIMESTAMP_REGEX, html)
#     if products_imported_timestamp_match:
#         return products_imported_timestamp_match.group(1)
#     else:
#         return None


def get_sitecore_published_stamp(html: str):
    sitecore_published_stamp_match = re.search(SITECORE_PUBLISHED_STAMP_REGEX, html)
    if sitecore_published_stamp_match:
        return sitecore_published_stamp_match.group(1)
    else:
        return None

def get_timeslot_utc(html: str):
    timeslot_utc_match = re.search(TIMESLOT_UTC_REGEX, html)
    if timeslot_utc_match:
        return timeslot_utc_match.group(1)
    else:
        return None


def get_items(url, persist_func):
    # Nemlig seems to be throttling requests, so add a little sleep to avoid it
    time.sleep(0.5)
    final_url = f'https://www.nemlig.com{url}'
    response = s.get(final_url, headers=get_headers(final_url))
    match = re.findall(PRODUCT_GROUP_ID_REGEX, response.text, re.DOTALL)
    products_imported_timestamp = get_products_imported_timestamp(response.text)
    sitecore_published_stamp = get_sitecore_published_stamp(response.text)
    timeslot_utc = get_timeslot_utc(response.text)

    if match:
        for product_group_id in match:
            product_group_url = f'https://www.nemlig.com/webapi/{products_imported_timestamp}-{sitecore_published_stamp}/{timeslot_utc}/1/0/Products/GetByProductGroupId?productGroupId={product_group_id}'
            products_response = s.get(product_group_url, headers=get_headers(final_url))
            products = json.loads(products_response.text)['Products']
            for product in products:
                cat1, cat2, cat3, cat4 = get_categories_from_url(final_url)
                p = {'name': product['Name'], 'brand': product.get('Brand', None), 'url': product.get('Url', None)}
                persist_func(p, cat1, cat2, cat3, cat4)

    return 0


def get_categories_from_url(url):
    url_parts = url.split('/')
    url_parts = url_parts[url_parts.index('varer') + 1:]
    cat1 = None if len(url_parts) < 1 else url_parts[0]
    cat2 = None if len(url_parts) < 2 else url_parts[1]
    cat3 = None if len(url_parts) < 3 else url_parts[2]
    cat4 = None if len(url_parts) < 4 else url_parts[3]
    return cat1, cat2, cat3, cat4


def process_category(category, depth, persist_func):
    depth += 1
    if depth >= MAX_CATEGORY_DEPTH:
        return
    else:
        children = category['Children']
        if len(children) > 0:
            for child in children:
                process_category(child, depth, persist_func)
        else:
            get_items(category['Url'], persist_func)


def get_all_items(persist_func):
    categories = get_categories()
    goods_category = [cat for cat in categories if cat['Url'] == '/varer'][0]
    process_category(goods_category, 0, persist_func)
    return 0

SyntaxError: invalid syntax (<ipython-input-15-c7fbb5fac670>, line 122)

In [34]:
import json
import re
import requests
import sys
import os
# import time
# from sqlalchemy.ext.declarative import declarative_base
# from sqlalchemy import Column, BigInteger, String
# from sqlalchemy.orm import sessionmaker, scoped_session
# sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# from etl.simple_etl import load_config, TaskBase


# Base = declarative_base()
# Session = scoped_session(sessionmaker())


HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Accept': 'application/json, text/plain, */*',
    'Cache-Control': 'no-cache',
    'Accept-encoding': 'gzip, deflate, sdch, br',
    'Accept-language': 'en-US,en;q=0.8,lt;q=0.6,ru;q=0.4'}
MAX_CATEGORY_DEPTH = 6
PRODUCT_GROUP_ID_REGEX = r'ProductGroupId\":\"([\da-f-]+)\"'
PRODUCTS_IMPORTED_TIMESTAMP_REGEX = r'ProductsImportedTimestamp\":\"([a-zA-Z0-9]+)\"'
SITECORE_PUBLISHED_STAMP_REGEX = r'SitecorePublishedStamp\":\"([a-zA-Z0-9_]+)\"'
TIMESLOT_UTC_REGEX = r'TimeslotUtc\":\"([0-9-]+)\"'

def get_headers(referer):
    headers = HEADERS
    headers['Referer'] = referer
    return headers


def get_categories():
    response = s.get('https://www.nemlig.com/varer', headers=get_headers('https://www.nemlig.com'))
    products_imported_timestamp = get_products_imported_timestamp(response.text)
    #sitecore_published_stamp = get_sitecore_published_stamp(response.text)
    timeslot_utc = get_timeslot_utc(response.text)
    categories_response = s.get('https://www.nemlig.com/webapi/{products_imported_timestamp}-{sitecore_published_stamp}/{timeslot_utc}/1/0/Menu/main?navigationDepth=15', headers=get_headers('https://www.nemlig.com/varer'))
    return json.loads(categories_response.text)


def get_items(url, persist_func):
    # Nemlig seems to be throttling requests, so add a little sleep to avoid it
    time.sleep(0.5)
    final_url = 'https://www.nemlig.com{url}'
    response = s.get(final_url, headers=get_headers(final_url))
    match = re.findall(PRODUCT_GROUP_ID_REGEX, response.text, re.DOTALL)
    products_imported_timestamp = get_products_imported_timestamp(response.text)
    sitecore_published_stamp = get_sitecore_published_stamp(response.text)
    timeslot_utc = get_timeslot_utc(response.text)

    if match:
        for product_group_id in match:
            product_group_url = 'https://www.nemlig.com/webapi/{products_imported_timestamp}-{sitecore_published_stamp}/{timeslot_utc}/1/0/Products/GetByProductGroupId?productGroupId={product_group_id}'
            products_response = s.get(product_group_url, headers=get_headers(final_url))
            products = json.loads(products_response.text)['Products']
            for product in products:
                cat1, cat2, cat3, cat4 = get_categories_from_url(final_url)
                p = {'name': product['Name'], 'brand': product.get('Brand', None), 'url': product.get('Url', None)}
                persist_func(p, cat1, cat2, cat3, cat4)

    return 0


def get_categories_from_url(url):
    url_parts = url.split('/')
    url_parts = url_parts[url_parts.index('varer') + 1:]
    cat1 = None if len(url_parts) < 1 else url_parts[0]
    cat2 = None if len(url_parts) < 2 else url_parts[1]
    cat3 = None if len(url_parts) < 3 else url_parts[2]
    cat4 = None if len(url_parts) < 4 else url_parts[3]
    return cat1, cat2, cat3, cat4



def get_products_imported_timestamp(html):
    products_imported_timestamp_match = re.search(PRODUCTS_IMPORTED_TIMESTAMP_REGEX, html)
    if products_imported_timestamp_match:
        return products_imported_timestamp_match.group(1)
    else:
        return None


def process_category(category, depth):
    depth += 1
    if depth >= MAX_CATEGORY_DEPTH:
        return
    else:
        children = category['Children']
        if len(children) > 0:
            for child in children:
                process_category(child)
        else:
            get_items(category['Url'])

def get_timeslot_utc(html):
    timeslot_utc_match = re.search(TIMESLOT_UTC_REGEX, html)
    if timeslot_utc_match:
        return timeslot_utc_match.group(1)
    else:
        return None


def get_all_items():
    categories = get_categories()
    goods_category = [cat for cat in categories if cat['Url'] == '/varer'][0]
    process_category(goods_category, 0)
    return 0

get_all_items()

TypeError: process_category() takes exactly 2 arguments (1 given)

In [None]:
get_all_items(persist_func)

In [27]:
test_get_offers(shopgun_fixture)

AttributeError: 'function' object has no attribute 'get_offers'