In [1]:
import pandas as pd
import re
import json
from datetime import datetime

# Sample input: a JSON array of GPU listing records, embedded as text so the
# cell runs without any external file. Every record holds the string fields
# section, name, spec, price, image_url and product_url. Note that price is
# either a formatted amount such as "935,250원" or a status text such as
# "판매준비" (see the first record), so it is not always numeric.
sample_json = '''
[
    {
        "section": "gpu",
        "name": "ASUS ROG STRIX 라데온 RX 6700 XT O12G GAMING OC D6 12GB 대원씨티에스",
        "spec": "RX 6700 XT/7nm/스트림 프로세서:2560개/PCIe4.0x16/GDDR6(DDR6)/출력단자:HDMI2.1,DP1.4/지원기능:멀티VGA 지원,8K 지원,4K 지원,HDR 지원,HDCP 2.3/",
        "price": "판매준비",
        "image_url": "https://img.danawa.com/prod_img/500000/392/653/img/13653392_1.jpg?shrink=80:80",
        "product_url": "https://shop.danawa.com/pc/?controller=estimateDeal&methods=productInformation&productSeq=13653392&marketPlaceSeq=16"
    },
    {
        "section": "gpu",
        "name": "GIGABYTE 지포스 RTX 4070 SUPER WINDFORCE OC D6X 12GB 피씨디렉트",
        "spec": "RTX 4070 SUPER/4nm/부스트클럭:2505MHz/스트림 프로세서:7168개/PCIe4.0x16/GDDR6X(DDR6X)",
        "price": "935,250원",
        "image_url": "https://img.danawa.com/prod_img/500000/520/494/img/32494520_1.jpg?shrink=80:80",
        "product_url": "https://shop.danawa.com/pc/?controller=estimateDeal&methods=productInformation&productSeq=32494520&marketPlaceSeq=16"
    },
    {
        "section": "gpu",
        "name": "PowerColor 라데온 RX 7600 Fighter D6 8GB 대원씨티에스",
        "spec": "RX 7600/부스트클럭:2655MHz/스트림 프로세서:2048개/PCIe4.0x16(at x8)/GDDR6(DDR6)/출력단자:HDMI2.1,DP1.4/",
        "price": "346,680원",
        "image_url": "https://img.danawa.com/prod_img/500000/306/231/img/35231306_1.jpg?shrink=80:80",
        "product_url": "https://shop.danawa.com/pc/?controller=estimateDeal&methods=productInformation&productSeq=35231306&marketPlaceSeq=16"
    },
    {
        "section": "gpu",
        "name": "이엠텍 지포스 RTX 4080 SUPER MIRACLE WHITE D6X 16GB",
        "spec": "RTX 4080 SUPER/4nm/베이스클럭:2295MHz/부스트클럭:2550MHz/스트림 프로세서:10240개/PCIe4.0x16/GDDR6X(DDR6X)",
        "price": "1,655,330원",
        "image_url": "https://img.danawa.com/prod_img/500000/535/157/img/36157535_1.jpg?shrink=80:80",
        "product_url": "https://shop.danawa.com/pc/?controller=estimateDeal&methods=productInformation&productSeq=36157535&marketPlaceSeq=16"
    }
]
'''

# Parse the JSON text into a list of dicts (one dict per listing).
data = json.loads(sample_json)

# Build the working DataFrame: one row per listing, one column per JSON key.
shopdanawa_gpu = pd.DataFrame(data)

# Function to remove content inside square brackets
# def remove_square_brackets(text):
#     return re.sub(r'\[.*?\]', '', text).strip()

def remove_square_brackets_and_stars(text):
    """Strip bracketed tags and star-delimited text from a product name.

    Every ``[...]`` span is removed first and the result trimmed; then every
    ``★...★`` span (stars included) is removed and the result trimmed again.
    A lone, unpaired ``★`` is left untouched.
    """
    bracket_free = re.sub(r'\[.*?\]', '', text).strip()
    return re.sub(r'★.*?★', '', bracket_free).strip()

# Clean bracketed tags and star-delimited text out of every product name.
shopdanawa_gpu['name'] = shopdanawa_gpu['name'].map(remove_square_brackets_and_stars)

# Upper-case the text columns so later token comparisons and the ComponentID
# are built from a single, consistent casing.
columns_to_uppercase = ['name', 'section']
for column in columns_to_uppercase:
    original_values = shopdanawa_gpu[column]
    shopdanawa_gpu[column] = original_values.str.upper()
del original_values

# Tokens that are dropped when the model string is assembled: trailing
# vendor/distributor words seen at the end of product names (e.g. '피씨디렉트'),
# GPU brand words in Korean and English, and the 'D6'/'D6X' tags.
words_to_remove = [
    '대원시티에스',
    '피씨디렉트',
    '디앤디컴',
    '제이씨현',
    '에즈윈',
    '대원씨티에스',
    '웨이코스',
    '마이크로닉스',
    '지포스',
    '라데온',
    'D6X',
    'RADEON',
    'GEFORCE',
    'RADEON™',
    'D6',
]

# Korean colour word -> English colour name used in the Color column.
color_translation = {
    '핑크': 'Pink',
    '화이트': 'White',
    '블랙': 'Black',
    '레드': 'Red',
    '블루': 'Blue',
    '그린': 'Green',
    '옐로우': 'Yellow',
    '퍼플': 'Purple',
    '실버': 'Silver',
    '골드': 'Gold',
}

# Add chipset_type: a name that mentions Radeon -- either the Korean '라데온'
# or the English 'RADEON' (compared case-insensitively) -- is labelled AMD;
# every other name is labelled NVIDIA. Checking only the Korean spelling
# would mislabel listings that spell the brand in English.
shopdanawa_gpu['chipset_type'] = shopdanawa_gpu['name'].apply(
    lambda x: 'AMD' if ('라데온' in x or 'RADEON' in x.upper()) else 'NVIDIA'
)
# The section value (e.g. 'GPU') becomes the Type column used in ComponentID.
shopdanawa_gpu.rename(columns={'section': 'Type'}, inplace=True)

def translate_company(company):
    """Return the Korean brand name for a known English company name.

    Only the exact keys ``'EMTEK'``, ``'GALAX'`` and ``'LEADTEK'`` are
    translated (the lookup is case-sensitive); any other value is returned
    unchanged.
    """
    korean_names = dict(EMTEK='이엠텍', GALAX='갤럭시', LEADTEK='리드텍')
    try:
        return korean_names[company]
    except KeyError:
        return company


# Function to extract company, model, memory, color, and RGB information
def extract_company_model_memory_color_rgb(row):
    """Split a cleaned product name into its structured fields.

    The first whitespace-separated token is taken as the company. Each
    remaining token is classified, in this order, as:

    * memory  - the token contains ``<digits>GB``; only the digits are kept,
    * colour  - the token (lower-cased) is a key of ``color_translation`` or
      equals one of its English values, ignoring case,
    * RGB     - the token equals ``RGB`` ignoring case,
    * noise   - the token is listed in ``words_to_remove`` and is dropped,
    * model   - anything else; these tokens are joined and upper-cased.

    If several tokens match memory or colour, the last one wins.

    Args:
        row: The product name string (despite the parameter name, this is a
            single string, not a DataFrame row).

    Returns:
        tuple: ``(company, model, memory, color, has_rgb)`` where the first
        four items are strings (empty when not found) and ``has_rgb`` is a
        bool. A blank name yields ``("", "", "", "", False)``.
    """
    parts = row.split()
    if not parts:
        # Blank name: nothing to parse. Popping the company token from an
        # empty list would raise IndexError and abort the whole apply().
        return "", "", "", "", False

    company, *tokens = parts
    # Hoisted out of the loop: the lower-cased English colour names do not
    # change from token to token.
    english_colors = {name.lower() for name in color_translation.values()}

    memory = ""
    color = ""
    model_parts = []
    has_rgb = False

    for part in tokens:
        lowered = part.lower()
        memory_match = re.search(r'(\d+)GB', part)
        if memory_match:
            # Keep only the digits so stray characters around the size
            # (e.g. '(12GB)') cannot leak into the merge key.
            memory = memory_match.group(1)
        elif lowered in color_translation or lowered in english_colors:
            color = color_translation.get(lowered, part.capitalize())
        elif part.upper() == 'RGB':
            has_rgb = True
        elif part not in words_to_remove:
            model_parts.append(part)

    model = " ".join(model_parts).strip().upper()  # Remaining parts form the model name

    return company, model, memory, color, has_rgb

# Parse every cleaned product name and spread the resulting 5-tuple over the
# Company / Model / Memory / Color / RGB columns.
shopdanawa_gpu[['Company', 'Model', 'Memory', 'Color', 'RGB']] = (
    shopdanawa_gpu['name']
    .apply(lambda product_name: pd.Series(extract_company_model_memory_color_rgb(product_name)))
)

# Replace known English company names with their Korean equivalents.
shopdanawa_gpu['Company'] = shopdanawa_gpu['Company'].map(translate_company)

# Function to reorder words in the Model column
# def reorder_model(row):
#     if row['Company'] == 'ASUS' and 'TUF GAMING' in row['Model']:
#         parts = row['Model'].split(' ')
#         tuf_index = parts.index('TUF')
#         reordered_parts = parts[tuf_index:tuf_index+2] + parts[:tuf_index] + parts[tuf_index+2:]
#         return ' '.join(reordered_parts)
#     return row['Model']

#shopdanawa_gpu.rename(columns={'company': 'Company'}, inplace=True)

# Apply the function to reorder the Model column
#shopdanawa_gpu['Model'] = shopdanawa_gpu.apply(reorder_model, axis=1)

# Remove rows whose name contains '해외'. The explicit .copy() makes the
# filtered result an independent DataFrame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
shopdanawa_gpu = shopdanawa_gpu[~shopdanawa_gpu['name'].str.contains('해외', regex=False)].copy()

# Build Price from the raw price text by stripping the thousands separators
# and the '원' suffix. Both are literal substrings, hence regex=False.
# Non-numeric status text (e.g. '판매준비') passes through unchanged.
shopdanawa_gpu['Price'] = (
    shopdanawa_gpu['price']
    .str.replace(',', '', regex=False)
    .str.replace("원", "", regex=False)
)
shopdanawa_gpu.drop(columns=['price'], inplace=True)


# ComponentID is the lookup key "<Type>#<Company>#<Model>" for each row.
shopdanawa_gpu['ComponentID'] = shopdanawa_gpu.apply(
    lambda record: f"{record['Type']}#{record['Company']}#{record['Model']}",
    axis=1,
)

# Stamp every row with the same processing time (local clock, to the second).
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
shopdanawa_gpu['Date'] = current_time

# Record which shop these listings belong to.
shopdanawa_gpu['Shop'] = 'shopdanawa'

# product_url is exposed as URL.
shopdanawa_gpu.rename(columns={'product_url': 'URL'}, inplace=True)

# Turn protocol-relative image links ("//host/...") into https links; other
# values are kept as they are.
shopdanawa_gpu['image_url'] = shopdanawa_gpu['image_url'].map(
    lambda link: 'https:' + link if link.startswith('//') else link
)

# Make sure the RGB flag column has boolean dtype.
shopdanawa_gpu['RGB'] = shopdanawa_gpu['RGB'].astype(bool)

# Remove duplicate rows (same ComponentID and URL), keeping the first one.
shopdanawa_gpu.drop_duplicates(subset=['ComponentID', 'URL'], inplace=True)

# The raw spec and name columns are no longer needed once parsed.
shopdanawa_gpu.drop(columns=['spec', 'name'], inplace=True)

# Read the reference ("standard") GPU table from disk.
standard_gpu_path = 'gpu_standard.csv'
standard_gpu = pd.read_csv(standard_gpu_path)

# Give the reference rows the same "GPU#<company>#<model>" key format that
# the scraped listings use.
standard_gpu['ComponentID'] = 'GPU#' + standard_gpu['회사'] + '#' + standard_gpu['모델']

# Normalise the memory size to an integer rendered as text so it can be
# compared with the string Memory column of the listings.
memory_as_int = standard_gpu['메모리'].astype(int)
standard_gpu['메모리'] = memory_as_int.astype(str)
del memory_as_int

# Keep only the listings that match a reference row on both key and memory.
final_gpu = shopdanawa_gpu.merge(
    standard_gpu,
    how='inner',
    left_on=['ComponentID', 'Memory'],
    right_on=['ComponentID', '메모리'],
)

# Drop the reference-only columns and blank out any missing values.
final_gpu.drop(columns=['메모리', '회사', '모델'], inplace=True)
final_gpu.fillna('', inplace=True)

# Select the columns that make up the exported record and turn each row into
# a plain dict.
final_data = final_gpu.loc[:, ['ComponentID', 'Type', 'Date', 'Shop', 'Price', 'URL']].copy()
final_json_data = final_data.to_dict(orient='records')

# Write the records as pretty-printed UTF-8 JSON (non-ASCII text is kept
# readable rather than escaped).
final_output_path_v2 = 'processed_shopdanawa_gpu2222.json'
with open(final_output_path_v2, 'w', encoding='utf-8') as f:
    f.write(json.dumps(final_json_data, ensure_ascii=False, indent=4))



In [2]:
shopdanawa_gpu

Unnamed: 0,Type,image_url,URL,chipset_type,Company,Model,Memory,Color,RGB,Price,ComponentID,Date,Shop
0,GPU,https://img.danawa.com/prod_img/500000/392/653...,https://shop.danawa.com/pc/?controller=estimat...,AMD,ASUS,ROG STRIX RX 6700 XT O12G GAMING OC,12,,False,판매준비,GPU#ASUS#ROG STRIX RX 6700 XT O12G GAMING OC,2024-07-02 15:18:52,shopdanawa
1,GPU,https://img.danawa.com/prod_img/500000/520/494...,https://shop.danawa.com/pc/?controller=estimat...,NVIDIA,GIGABYTE,RTX 4070 SUPER WINDFORCE OC,12,,False,935250,GPU#GIGABYTE#RTX 4070 SUPER WINDFORCE OC,2024-07-02 15:18:52,shopdanawa
2,GPU,https://img.danawa.com/prod_img/500000/306/231...,https://shop.danawa.com/pc/?controller=estimat...,AMD,POWERCOLOR,RX 7600 FIGHTER,8,,False,346680,GPU#POWERCOLOR#RX 7600 FIGHTER,2024-07-02 15:18:52,shopdanawa
3,GPU,https://img.danawa.com/prod_img/500000/535/157...,https://shop.danawa.com/pc/?controller=estimat...,NVIDIA,이엠텍,RTX 4080 SUPER MIRACLE,16,White,False,1655330,GPU#이엠텍#RTX 4080 SUPER MIRACLE,2024-07-02 15:18:52,shopdanawa


In [None]:
import pandas as pd
import re
import json
from datetime import datetime

# Sample input: a JSON array of GPU listing records, embedded as text so the
# cell runs without any external file. Every record holds the string fields
# title, name, company, spec, img, price and link. Here price is a plain
# digit string (e.g. "580000"), and a name may carry a leading "[brand]" tag
# and trailing "★ ... ★" text (see the second record).
sample_json = '''
[
    {
        "title": "gpu",
        "name": "[emTek] GeForce RTX 4060 Ti STORM X Dual OC D6 8GB",
        "company": "emTek",
        "spec": "RTX 4060 Ti / 4nm / 베이스클럭: 2310MHz / 부스트클럭: 2685MHz / 스트림 프로세서: 4352개 / PCIe4.0x16(at x8) / GDDR6(DDR6) / 출력단자: HDMI2.1 , DP1.4 / 부가기능: 제로팬(0-dB기술) , 8K 지원 , 4K 지원 , HDR 지원 , HDCP 2.3 / 사용전력: 160W / 정격파워 650W 이상 / 전원 포트: 8핀 x1개 / 2개 팬 / 가로(길이): 249.9mm / 두께: 40.1mm / 백플레이트 / DrMOS / LED 라이트 / ThunderMaster / A/S 3년",
        "img": "https://image3.compuzone.co.kr/img/product_img/2023/0524/1029950/1029950_600.jpg",
        "price": "580000",
        "link": "https://www.compuzone.co.kr/product/product_detail.htm?ProductNo=1029950&BigDivNo=4&MediumDivNo=1016&DivNo=0"
    },
    {
        "title": "gpu",
        "name": "[ASUS] DUAL 지포스 RTX 4070 SUPER O12G EVO OC D6X 12GB ★ 컴퓨존 5만원 다운로드 쿠폰 할인 ★",
        "company": "ASUS",
        "spec": "4nm / 베이스클럭: 1980MHz / 부스트클럭: 2520MHz / OC클럭: 2550MHz / 스트림 프로세서: 7168개 / PCIe4.0x16 / GDDR6X(DDR6X) / 출력단자: HDMI2.1 , DP1.4 / 부가기능: 제로팬(0-dB기술) , 8K 지원 , 4K 지원 , HDR 지원 , HDCP 2.3 / 정격파워 750W 이상 / 전원 포트: 16핀(12VHPWR) x1 / 2개 팬 / 가로(길이): 227.2mm / 두께: 49. / RTX4070 SUPER / PCI Express 4.0 x16 / 12GB / GDDR6X / 192bit / 일반 / 1개 / 3개 / DisplayPortx3개 / HDMIx1개 / 16Pin / VGA231~250mm / AURA-SYNC / 히트파이프+팬 / 팬2개",
        "img": "https://image3.compuzone.co.kr/img/product_img/2024/0229/1120873/1120873_600.jpg",
        "price": "950000",
        "link": "https://www.compuzone.co.kr/product/product_detail.htm?ProductNo=1120873&BigDivNo=&MediumDivNo=1016&DivNo=0"
    }
]
'''

# Parse the JSON text into a list of dicts (one dict per listing).
data = json.loads(sample_json)

# Build the working DataFrame: one row per listing, one column per JSON key.
shopdanawa_gpu = pd.DataFrame(data)

# Function to remove content inside square brackets
def remove_square_brackets(text):
    """Return *text* with every ``[...]`` span removed and the ends trimmed."""
    without_tags = re.sub(r'\[.*?\]', '', text)
    return without_tags.strip()

def remove_square_brackets_and_stars(text):
    """Remove ``[...]`` tags, then ``★...★`` spans, trimming after each pass.

    The bracket pass runs first and the star pass second; surrounding
    whitespace is stripped after each of them.
    """
    for pattern in (r'\[.*?\]', r'★.*?★'):
        text = re.sub(pattern, '', text).strip()
    return text

# Clean bracketed tags and star-delimited text out of every product name.
shopdanawa_gpu['name'] = shopdanawa_gpu['name'].map(remove_square_brackets_and_stars)

# Upper-case the text columns so later comparisons (brand words, company
# translation keys) see a single, consistent casing.
columns_to_uppercase = ['name', 'title', 'company']
for column in columns_to_uppercase:
    original_values = shopdanawa_gpu[column]
    shopdanawa_gpu[column] = original_values.str.upper()
del original_values

# Tokens excluded from the model string. The first group holds Korean
# vendor/distributor words, the second holds GPU brand words and the
# 'D6'/'D6X' tags; the concatenation keeps the original order.
words_to_remove = (
    ['대원시티에스', '피씨디렉트', '디앤디컴', '제이씨현', '에즈윈', '대원씨티에스', '웨이코스', '마이크로닉스']
    + ['지포스', '라데온', 'D6X', 'RADEON', 'GEFORCE', 'RADEON™', 'D6']
)

# Lookup from a Korean colour word to the English name stored in Color.
color_translation = {
    '핑크': 'Pink',
    '화이트': 'White',
    '블랙': 'Black',
    '레드': 'Red',
    '블루': 'Blue',
    '그린': 'Green',
    '옐로우': 'Yellow',
    '퍼플': 'Purple',
    '실버': 'Silver',
    '골드': 'Gold',
}

# Add chipset_type: a name that mentions Radeon -- either the English
# 'RADEON' (compared case-insensitively) or the Korean '라데온' -- is labelled
# AMD; every other name is labelled NVIDIA. Names in this data mix Korean and
# English brand words, so checking only one spelling would mislabel the other.
shopdanawa_gpu['chipset_type'] = shopdanawa_gpu['name'].apply(
    lambda x: 'AMD' if ('RADEON' in x.upper() or '라데온' in x) else 'NVIDIA'
)
# The title value (e.g. 'GPU') becomes the Type column used in ComponentID.
shopdanawa_gpu.rename(columns={'title': 'Type'}, inplace=True)


def translate_company(company):
    """Map a known upper-case English company name to its Korean name.

    ``'EMTEK'``, ``'GALAX'`` and ``'LEADTEK'`` are translated; every other
    value (including differently-cased spellings) is returned as given.
    """
    korean_by_english = {
        'EMTEK': '이엠텍',
        'GALAX': '갤럭시',
        'LEADTEK': '리드텍',
    }
    return korean_by_english[company] if company in korean_by_english else company

# Replace known English company names (already upper-cased) with their
# Korean equivalents, element by element.
shopdanawa_gpu['company'] = shopdanawa_gpu['company'].map(translate_company)

# Function to extract model, memory, color, and RGB information
def extract_company_model_memory_color_rgb(row):
    """Split a cleaned product name into model, memory, colour and RGB flag.

    Despite the function name, no company is extracted or returned here.
    Each whitespace-separated token is classified, in this order, as:

    * memory  - the token contains ``<digits>GB``; only the digits are kept,
    * colour  - the token (lower-cased) is a key of ``color_translation`` or
      equals one of its English values, ignoring case,
    * RGB     - the token equals ``RGB`` ignoring case,
    * noise   - the token is listed in ``words_to_remove`` and is dropped,
    * model   - anything else; these tokens are joined and upper-cased.

    If several tokens match memory or colour, the last one wins.

    Args:
        row: The product name string (despite the parameter name, this is a
            single string, not a DataFrame row).

    Returns:
        tuple: ``(model, memory, color, has_rgb)`` where the first three
        items are strings (empty when not found) and ``has_rgb`` is a bool.
    """
    # Hoisted out of the loop: the lower-cased English colour names do not
    # change from token to token.
    english_colors = {name.lower() for name in color_translation.values()}

    memory = ""
    color = ""
    model_parts = []
    has_rgb = False

    for part in row.split():
        lowered = part.lower()
        memory_match = re.search(r'(\d+)GB', part)
        if memory_match:
            # Keep only the digits so stray characters around the size
            # (e.g. '(12GB)') do not end up in the Memory value.
            memory = memory_match.group(1)
        elif lowered in color_translation or lowered in english_colors:
            color = color_translation.get(lowered, part.capitalize())
        elif part.upper() == 'RGB':
            has_rgb = True
        elif part not in words_to_remove:
            model_parts.append(part)

    model = " ".join(model_parts).strip().upper()  # Remaining parts form the model name

    return model, memory, color, has_rgb

# Parse every cleaned product name and spread the resulting 4-tuple over the
# Model / Memory / Color / RGB columns.
shopdanawa_gpu[['Model', 'Memory', 'Color', 'RGB']] = (
    shopdanawa_gpu['name']
    .apply(lambda product_name: pd.Series(extract_company_model_memory_color_rgb(product_name)))
)

# Function to reorder words in the Model column
def reorder_model(row):
    """Move the 'TUF GAMING' series name to the front of an ASUS model string.

    Args:
        row: A mapping-like record (for example a DataFrame row) providing
            the 'Company' and 'Model' entries.

    Returns:
        str: For ASUS rows whose model contains the adjacent tokens ``TUF``
        and ``GAMING``, the model with that pair moved to the front and the
        remaining tokens kept in order. In every other case the model is
        returned unchanged.
    """
    model = row['Model']
    if row['Company'] != 'ASUS' or 'TUF GAMING' not in model:
        return model

    parts = model.split(' ')
    # Find 'TUF' immediately followed by 'GAMING' as whole tokens. The
    # substring test above also matches text such as 'X-TUF GAMING', where no
    # standalone 'TUF' token exists; a plain list.index('TUF') would raise
    # ValueError there, so such models are left as they are.
    for tuf_index in range(len(parts) - 1):
        if parts[tuf_index] == 'TUF' and parts[tuf_index + 1] == 'GAMING':
            break
    else:
        return model

    reordered_parts = parts[tuf_index:tuf_index + 2] + parts[:tuf_index] + parts[tuf_index + 2:]
    return ' '.join(reordered_parts)

# reorder_model reads row['Company'], so the column must carry that name
# before the function is applied.
shopdanawa_gpu.rename(columns={'company': 'Company'}, inplace=True)

# Apply the function to reorder the Model column
shopdanawa_gpu['Model'] = shopdanawa_gpu.apply(reorder_model, axis=1)

# Remove rows whose name contains '해외'. The explicit .copy() makes the
# filtered result an independent DataFrame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
shopdanawa_gpu = shopdanawa_gpu[~shopdanawa_gpu['name'].str.contains('해외', regex=False)].copy()

# Build Price from the raw price text by stripping thousands separators
# (a literal comma, hence regex=False), then drop the raw column.
shopdanawa_gpu['Price'] = shopdanawa_gpu['price'].str.replace(',', '', regex=False)
shopdanawa_gpu.drop(columns=['price'], inplace=True)


# ComponentID is the lookup key "<Type>#<Company>#<Model>" for each row.
shopdanawa_gpu['ComponentID'] = shopdanawa_gpu.apply(
    lambda record: f"{record['Type']}#{record['Company']}#{record['Model']}",
    axis=1,
)

# Stamp every row with the same processing time (local clock, to the second).
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
shopdanawa_gpu['Date'] = current_time

# Record which shop these listings belong to.
shopdanawa_gpu['Shop'] = 'compuzone'

# link is exposed as URL.
shopdanawa_gpu.rename(columns={'link': 'URL'}, inplace=True)

# img is exposed as image_url; protocol-relative links ("//host/...") are
# turned into https links, other values are kept as they are.
shopdanawa_gpu.rename(columns={'img': 'image_url'}, inplace=True)
shopdanawa_gpu['image_url'] = shopdanawa_gpu['image_url'].map(
    lambda link: 'https:' + link if link.startswith('//') else link
)

# Make sure the RGB flag column has boolean dtype.
shopdanawa_gpu['RGB'] = shopdanawa_gpu['RGB'].astype(bool)

# Remove duplicate rows (same ComponentID and URL), keeping the first one.
shopdanawa_gpu.drop_duplicates(subset=['ComponentID', 'URL'], inplace=True)

# The raw spec and name columns are no longer needed once parsed.
shopdanawa_gpu.drop(columns=['spec', 'name'], inplace=True)

In [None]:
shopdanawa_gpu