In [1]:
from bs4 import BeautifulSoup
import requests
import sqlite3

TIKI_URL = "https://tiki.vn"

In [2]:
#create connection
conn = sqlite3.connect('tiki.db', timeout=5)
cur = conn.cursor()

In [144]:
def create_categories_table():
    query = """
        CREATE TABLE IF NOT EXISTS categories (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name VARCHAR(255),
            url TEXT, 
            parent_id INT, 
            create_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """
    try:
        cur.execute(query)
    except Exception as err:
        print('ERROR BY CREATE TABLE', err)
create_categories_table()

In [145]:
class Category:
    def __init__(self, cat_id, name, url, parent_id):
        self.cat_id = cat_id
        self.name = name
        self.url = url
        self.parent_id = parent_id

    def __repr__(self):
        return "ID: {}, Name: {}, URL: {}, Parent_id: {}".format(self.cat_id, self.name, self.url, self.parent_id)

    def save_into_db(self):
        query = """
            INSERT INTO categories (name, url, parent_id)
            VALUES (?, ?, ?);
        """
        val = (self.name, self.url, self.parent_id)
        try:
            cur.execute(query, val)
            self.cat_id = cur.lastrowid
            conn.commit()
        except Exception as err:
            print('ERROR BY INSERT:', err)


In [6]:
import time
def get_url(url):
    time.sleep(1)
    try:
        response = requests.get(url).text
        response = BeautifulSoup(response, 'html.parser')
        return response
    except Exception as err:
            print('ERROR BY REQUEST:', err)

In [147]:
def get_main_categories(save_db=False):
    soup = get_url(TIKI_URL)

    result = []
    for a in soup.findAll('a', {'class':'MenuItem__MenuLink-tii3xq-1 efuIbv'}):
        cat_id = None
        name = a.find('span', {'class':'text'}).text
        url = a['href']
        parent_id = None

        cat = Category(cat_id, name, url, parent_id)
        if save_db:
            cat.save_into_db()
        result.append(cat)
    return result

In [148]:
main_categories = get_main_categories(save_db=True)

In [149]:
def get_sub_categories(category, save_db=False):
    name = category.name
    url = category.url
    result = []

    try:
        soup = get_url(url)
        div_containers = soup.findAll('div', {'class':'list-group-item is-child'})
        for div in div_containers:
            sub_id = None
            sub_name = div.a.text
            sub_url = 'http://tiki.vn' + div.a['href']
            sub_parent_id = category.cat_id

            sub = Category(sub_id, sub_name, sub_url, sub_parent_id)
            if save_db:
                sub.save_into_db()
            result.append(sub)
    except Exception as err:
        print('ERROR BY GET SUB CATEGORIES:', err)

    return result

In [125]:
main_categories

[ID: 1, Name: Điện Thoại - Máy Tính Bảng, URL: https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=c.1789.hamburger_menu_fly_out_banner, Parent_id: None,
 ID: 2, Name: Điện Tử - Điện Lạnh, URL: https://tiki.vn/tivi-thiet-bi-nghe-nhin/c4221?src=c.4221.hamburger_menu_fly_out_banner, Parent_id: None,
 ID: 3, Name: Phụ Kiện - Thiết Bị Số, URL: https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815?src=c.1815.hamburger_menu_fly_out_banner, Parent_id: None,
 ID: 4, Name: Laptop - Thiết bị IT, URL: https://tiki.vn/laptop-may-vi-tinh/c1846?src=c.1846.hamburger_menu_fly_out_banner, Parent_id: None,
 ID: 5, Name: Máy Ảnh - Quay Phim, URL: https://tiki.vn/may-anh/c1801?src=c.1801.hamburger_menu_fly_out_banner, Parent_id: None,
 ID: 6, Name: Điện Gia Dụng, URL: https://tiki.vn/dien-gia-dung/c1882?src=c.1882.hamburger_menu_fly_out_banner, Parent_id: None,
 ID: 7, Name: Nhà Cửa Đời Sống, URL: https://tiki.vn/nha-cua-doi-song/c1883?src=c.1883.hamburger_menu_fly_out_banner, Parent_id: None,
 ID: 8, Name: Hàng 

In [140]:
# cur.execute("DROP TABLE product;")

<sqlite3.Cursor at 0x7ff78d921420>

In [9]:
cat = main_categories[9]
get_sub_categories(cat)

[ID: None, Name: 
                                 Bộ sản phẩm làm đẹp                                                                (363)
 , URL: http://tiki.vn/bo-san-pham-lam-dep/c8161?src=c.1520.hamburger_menu_fly_out_banner, Parent_id: 3210,
 ID: None, Name: 
                                 Chăm sóc cá nhân                                                                (2408)
 , URL: http://tiki.vn/cham-soc-ca-nhan/c1594?src=c.1520.hamburger_menu_fly_out_banner, Parent_id: 3210,
 ID: None, Name: 
                                 Chăm sóc cơ thể                                                                (4639)
 , URL: http://tiki.vn/cham-soc-co-the/c1592?src=c.1520.hamburger_menu_fly_out_banner, Parent_id: 3210,
 ID: None, Name: 
                                 Chăm sóc da mặt                                                                (13992)
 , URL: http://tiki.vn/cham-soc-da-mat/c1582?src=c.1520.hamburger_menu_fly_out_banner, Parent_id: 3210,
 ID: None, Name: 
        

In [150]:
from collections import deque

def get_all_categories(main_categories):
    de = deque(main_categories)
    count = 0

    while de:
        parent_cat = de.popleft()
        sub_cats = get_sub_categories(parent_cat, save_db=True)
        # print(sub_cats)
        de.extend(sub_cats)
        count += 1

        if count % 100 == 0:
            print(count, 'times')


In [151]:
get_all_categories(main_categories)

100 times
200 times
300 times
400 times
500 times
600 times
700 times
800 times
ERROR BY REQUEST: HTTPSConnectionPool(host='tiki.vn', port=443): Max retries exceeded with url: /giay-the-thao-nu-khac/c32874?src=c.1975.hamburger_menu_fly_out_banner (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7ff7735df510>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))
ERROR BY GET SUB CATEGORIES: 'NoneType' object has no attribute 'findAll'
900 times
1000 times
1100 times
1200 times
1300 times
1400 times
1500 times
1600 times
1700 times
1800 times
1900 times
2000 times
2100 times
2200 times
2300 times
2400 times
2500 times
2600 times
2700 times
2800 times
2900 times
3000 times
3100 times


In [161]:
#create a product table
#refer to the sub category with foreign key
def create_product_table():
    query = """
        CREATE TABLE IF NOT EXISTS product (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name VARCHAR(255),
            url TEXT,
            img_url TEXT,
            regular_price TEXT,
            final_price TEXT,            
            sale_tag TEXT,
            comment INT,
            rating TEXT,
            cat_id INT,
            FOREIGN KEY(cat_id) REFERENCES categories(parent_id)
            );
    """
    try:
        cur.execute(query)
    except Exception as err:
        print('ERROR BY CREATE TABLE', err)
create_product_table()

In [160]:
# cur.execute("DROP TABLE product;")

<sqlite3.Cursor at 0x7ff7738b3570>

In [3]:
class Product():
    def __init__(self, product_id, name, url, img_url, regular_price, final_price, sale_tag, comment, rating, cat_id):
        self.product_id = product_id
        self.name = name
        self.url = url
        self.img_url = img_url
        self.regular_price = regular_price
        self.final_price = final_price
        self.sale_tag = sale_tag
        self.comment = comment
        self.rating = rating
        self.cat_id = cat_id

    def __repr__(self):
        return "ID: {}, Name: {}, URL: {}, Img_URL: {}, regular_price: {}, final_price: {}, sale_tag: {}, comment: {}, rating: {}, cat_id: {}".format(self.product_id, self.name, self.url, self.igm_url, self.regular_price, self.final_price, self.sale_tag, self.comment, self.rating, self.cat_id)

    def save_into_db(self):
        query = """
            INSERT INTO product (name, url, img_url, regular_price, final_price, sale_tag, comment, rating, cat_id)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);
        """
        val = (self.name, self.url, self.img_url, self.regular_price, self.final_price, self.sale_tag, self.comment, self.rating, self.cat_id)
        try:
            cur.execute(query, val)
            self.product_id = cur.lastrowid
            conn.commit()
        except Exception as err:
            print('ERROR BY INSERT:', err)


In [97]:
rating = article.find('span', class_='rating-content').span['style'].split(":")[-1]
print(rating)

84%


In [179]:
#get only the categories of makeup which dont have any child category
def get_product(save_db=False):
    query = """
    SELECT id, name, url FROM categories
    WHERE parent_id BETWEEN 100 AND 113;
    """
    try:
        cur.execute(query)
    except Exception as err:
        print('ERROR BY SELECT TABLE', err)

    rows = cur.fetchall()
    
    for i in rows:
        id = i[0]
        name = " ".join(i[1].strip().split()[:-1])
        url = i[2].strip()
        for page in range(1,4):
            page_url = url + "&page=" + str(page)
#             print(page_url)
            page_html = get_url(page_url)
            result = []
            try:
                products_div = page_html.find_all('div', class_='product-item')[:64]
            except Exception as err:
                 print('ERROR BY DIV FINDALL: ', err)
            if products_div:
                if len(products_div) > 0:
                    for product_div in products_div:
                        product_id = None
                        title = product_div.a['title']
                        url = product_div.a['href']
                        img_url = product_div.img['src']
                        regular_price = product_div.find('span', class_='price-regular').text
                        final_price = product_div.find('span', class_='final-price').text.split()[0]
                        sale_tag = product_div.find('span', class_='sale-tag').text
                        comment = product_div.find('p', class_='review').text.split()[0] + ' review(s))'
                        if product_div.find('span', class_='rating-content'):
                            rating = product_div.find('span', class_='rating-content').find('span')['style'].split(":")[-1]
                        else:
                            rating = '0%'
                        product = Product(product_id, name, url, img_url, regular_price, final_price, sale_tag, comment, rating, id)

                        if save_db:
                            product.save_into_db()
                            print(f'SAVE {title} INTO DTB')
                        result.append(product)
                else:
                    break

In [None]:
get_product()

In [7]:
get_product()

http://tiki.vn/bo-cham-soc-co-the/c8226?src=c.1520.hamburger_menu_fly_out_banner&page=1
https://tiki.vn/bo-combo-3-chai-lam-trang-da-da-ryo-mi-cho-mat-toan-than-tay-te-bao-chet-p15067089.html?src=category-page-1520.8161.8226&2hi=0&page=2


IndexError: list index out of range