In [10]:
import csv
import json
import multiprocessing
import random
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from itertools import islice
from urllib.error import URLError, HTTPError
from urllib.request import urlopen

import pandas as pd
import requests
from pymongo import MongoClient
from tqdm import tqdm

class ProductDataCrawling:
    """Crawl Tiki product details by ID and store them in MongoDB.

    Product IDs are read from a CSV file, each ID is fetched from the Tiki
    product API on a small thread pool, and every successful JSON response is
    inserted as one document into a MongoDB collection.  IDs that could not be
    fetched or stored are appended to a separate CSV file so they can be
    retried later.
    """

    # Seconds to wait for the API before giving up on a request.  Without a
    # timeout a single stalled connection would block a worker forever.
    REQUEST_TIMEOUT = 30
    # Pause after a successful fetch and after a failed one, in seconds.
    SUCCESS_DELAY = 2
    ERROR_DELAY = 3

    # get_data() runs on several worker threads and all of them append to the
    # same error file, so writes to that file are serialized with this lock.
    _error_file_lock = threading.Lock()

    def __init__(self):
        """Set up file paths, HTTP request settings and the MongoDB handles."""
        self.product_id_file = './data/final_backup_product_ids_v2.csv'
        self.product_error = './data/API_error_v1.csv'
        self.base_link = 'https://tiki.vn/api/v2/products/{}'
        # NOTE(review): the guest token is hard-coded; consider loading it
        # from configuration instead of keeping it in the source.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'vi-VN,vi;q=0.9,fr-FR;q=0.8,fr;q=0.7,en-US;q=0.6,en;q=0.5',
            'x-guest-token': 'myegxX7sWUZkK2RuvzdFMbQfjw04C8DP',
            'Connection': 'keep-alive',
            'TE': 'Trailers',
        }
        self.params = {
            'platform': 'web'
        }

        self.client = MongoClient('mongodb://localhost:27017')
        self.database_name = 'tiki_product_2000s'

        # Report whether the database is already present on the server.  The
        # handle is obtained the same way in both cases; MongoDB only
        # materializes a database once data is first written to it.
        database_exists = self.database_name in self.client.list_database_names()
        self.db = self.client[self.database_name]
        if database_exists:
            print(f"Database '{self.database_name}' already exists.")
        else:
            print(f"Database '{self.database_name}' created.")

        # The collection shares its name with the database.
        self.collection = self.db[self.database_name]

    def save_product_ids_error(self, product_id) -> None:
        """Append ``product_id`` to the error CSV, writing the header first if
        the file is still empty.

        The whole open/check/write sequence is done under a lock because this
        method is called concurrently from the crawler's worker threads.
        """
        with self._error_file_lock, open(self.product_error, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            # An empty file has no header row yet.
            if csvfile.tell() == 0:
                writer.writerow(['id'])
            writer.writerow([product_id])

    def _record_failure(self, pid) -> None:
        """Log ``pid`` to the error CSV and back off before the next request."""
        self.save_product_ids_error(pid)
        time.sleep(self.ERROR_DELAY)

    def get_data(self, pid) -> None:
        """Fetch one product from the API and insert it into MongoDB.

        Any failure (non-200 status, undecodable JSON, network error, database
        error) is reported on stdout, recorded in the error CSV and followed
        by a short back-off; nothing is raised, so one bad ID never stops the
        crawl.
        """
        url = self.base_link.format(pid)
        try:
            response = requests.get(
                url,
                headers=self.headers,
                params=self.params,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(f"get {pid} error")
                self._record_failure(pid)
                return

            try:
                data = response.json()
            except ValueError as e:
                # JSON decode errors raised by requests/json/simplejson are
                # all ValueError subclasses.
                print(f"Error decoding JSON for product {pid}: {e}")
                self._record_failure(pid)
                return

            # Store the product document in MongoDB.
            self.collection.insert_one(data)
            time.sleep(self.SUCCESS_DELAY)
        except Exception as e:
            # Deliberately broad: a worker must survive any per-product
            # failure.  The cause is printed so it is not lost.
            print(f"Error processing product {pid} ({url}): {e}")
            self._record_failure(pid)

    def read_product_id(self, file_path, max_workers=3) -> None:
        """Read product IDs from ``file_path`` and crawl each of them.

        The first row of the file is treated as a header and skipped.  Every
        remaining non-empty row is submitted to a thread pool of
        ``max_workers`` threads running :meth:`get_data`; the method returns
        once all submitted tasks have finished.
        """
        with open(file_path, mode='r', newline='') as csvfile:
            # Reader settings are unchanged from the original: with a space
            # delimiter, a file holding one ID per line yields each line as a
            # single field.
            reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
            next(reader, None)  # skip the header row

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = []
                for row in reader:
                    product_id = ', '.join(row)
                    if not product_id:
                        # Blank line: there is no ID to request.
                        continue
                    futures.append(executor.submit(self.get_data, product_id))

                # Wait for every task; result() would also surface any
                # exception that escaped get_data().
                for future in futures:
                    future.result()

    def deleteDuplicateID(self) -> None:
        """Remove documents that share the same product ``id`` field.

        Documents are grouped by ``id``; for each group with more than one
        member, the first ``_id`` in the grouped list is kept and all others
        are deleted.  Which document ends up first is decided by the
        aggregation, not by insertion order.
        """
        pipeline = [
            {
                '$group': {
                    '_id': '$id',
                    'duplicates': {'$addToSet': '$_id'},
                    'count': {'$sum': 1}
                }
            },
            {
                '$match': {
                    'count': {'$gt': 1}
                }
            }
        ]

        for group in self.collection.aggregate(pipeline):
            # Keep the first document of the group, delete the rest.
            surplus_ids = group['duplicates'][1:]
            if surplus_ids:
                self.collection.delete_many({'_id': {'$in': surplus_ids}})

    def main(self) -> None:
        """Crawl every product listed in the configured ID file."""
        self.read_product_id(self.product_id_file)
        print("Done")

# Entry point: build the crawler (this connects to the local MongoDB server
# in __init__) and crawl every ID in the configured CSV file.
if __name__ == "__main__":
    crawler = ProductDataCrawling()
    crawler.main()


Database 'tiki_product_2000s' already exists.
Done
