In [11]:
import pandas as pd
import re
import json
from datetime import datetime

# Sample JSON data: two GPU product records. Each record carries the keys
# title, name, company, spec, img, price and link, and every value
# (including "price") is a string.
sample_json = '''
[
    {
        "title": "gpu",
        "name": "[ASUS] ROG STRIX GeForce RTX 4090 O24G GAMING OC D6X 24GB WHITE",
        "company": "ASUS",
        "spec": "RTX 4090 / 4nm / 부스트클럭: 2640MHz / 스트림 프로세서: 16384개 / PCIe4.0x16 / GDDR6X(DDR6X) / 출력단자: HDMI2.1 , DP1.4 / 부가기능: 제로팬(0-dB기술) , 8K 해상도 지원 , 4K 해상도 지원 , HDR 지원 , HDCP 2.3 / 정격파워 1000W 이상 / 전원 포트: 16(12+4)핀 x1개 / 3개 팬 / 가로(길이): 357.6mm / 두께: 70.1mm / 백플레이트 / LED 라이트 / AURA SYNC / VGA지지대 포함 / RTX4090 / PCI Express 4.0 x16 / 24GB / GDDR6X / 384bit / 일반 / 1개 / 3개 / DisplayPortx3개 / HDMIx1개 / 16Pin / VGA290mm초과 / AURA-SYNC / 히트파이프+팬 / 팬3개",
        "img": "https://image3.compuzone.co.kr/img/product_img/2023/0519/1028534/1028534_600.jpg",
        "price": "3365000",
        "link": "https://www.compuzone.co.kr/product/product_detail.htm?ProductNo=1028534&BigDivNo=&MediumDivNo=1016&DivNo=0"
    },
    {
        "title": "gpu",
        "name": "[ASUS] GeForce RTX 4070 Ti SUPER TUF GAMING O16G OC D6X 16GB",
        "company": "ASUS",
        "spec": "RTX 4070 Ti SUPER / 4nm / 베이스클럭: 2340MHz / 부스트클럭: 2640MHz / OC클럭: 2670MHz",
        "img": "https://image3.compuzone.co.kr/img/product_img/2024/0125/1111628/1111628_600.jpg",
        "price": "1220000",
        "link": "https://www.compuzone.co.kr/product/product_detail.htm?ProductNo=1111628&BigDivNo=&MediumDivNo=1016&DivNo=0"
    }
]
'''

# Parse the JSON text into a list of dicts (one dict per product)
data = json.loads(sample_json)

# Build a DataFrame with one row per product and one column per JSON key
compuzone_gpu = pd.DataFrame(data)

# Function to remove bracketed and star-delimited segments from a product name
def remove_square_brackets_and_stars(text):
    """Return ``text`` without ``[...]`` and ``★...★`` segments.

    Bracketed segments are removed first, then star-delimited ones; the
    result is stripped of leading/trailing whitespace after each pass.
    Whitespace left in the middle of the string is not collapsed.
    """
    for pattern in (r'\[.*?\]', r'★.*?★'):
        text = re.sub(pattern, '', text).strip()
    return text

# Strip "[...]" and "★...★" segments from the 'name' column
compuzone_gpu['name'] = compuzone_gpu['name'].apply(remove_square_brackets_and_stars)

# Convert specified columns to uppercase so the token comparisons below
# (brand words, chipset markers) see a single casing
columns_to_uppercase = ['name', 'title', 'company']
for column in columns_to_uppercase:
    compuzone_gpu[column] = compuzone_gpu[column].str.upper()

# List of words to be removed from the model
# (presumably distributor names plus brand / memory-type tokens — verify when extending)
words_to_remove = ['대원시티에스', '피씨디렉트', '디앤디컴', '제이씨현', '에즈윈', '대원씨티에스', '웨이코스', '마이크로닉스', '지포스', '라데온', 'D6X', 'RADEON', 'GEFORCE', 'RADEON™', 'D6']

# Color translation dictionary (Korean color word -> English color name)
color_translation = {
    '핑크': 'Pink', '화이트': 'White', '블랙': 'Black', '레드': 'Red',
    '블루': 'Blue', '그린': 'Green', '옐로우': 'Yellow', '퍼플': 'Purple',
    '실버': 'Silver', '골드': 'Gold'
}

# Add chipset_type column: 'AMD' when the product name mentions Radeon in
# either English ('RADEON') or Korean ('라데온'), otherwise 'NVIDIA'.
# Checking only the English spelling would label Korean-named Radeon cards
# as NVIDIA.
AMD_NAME_MARKERS = ('RADEON', '라데온')
compuzone_gpu['chipset_type'] = compuzone_gpu['name'].apply(
    lambda x: 'AMD' if any(marker in x for marker in AMD_NAME_MARKERS) else 'NVIDIA'
)
compuzone_gpu.rename(columns={'title': 'Type'}, inplace=True)

def translate_company(company):
    """Return the Korean spelling for a known company name.

    Only the exact keys 'EMTEK', 'GALAX' and 'LEADTEK' are translated; any
    other value is returned unchanged.
    """
    korean_names = {
        'EMTEK': '이엠텍',
        'GALAX': '갤럭시',
        'LEADTEK': '리드텍'
    }
    if company in korean_names:
        return korean_names[company]
    return company

# Replace known company names in the 'company' column with their Korean spelling
compuzone_gpu['company'] = compuzone_gpu['company'].map(translate_company)

# Function to extract model, memory, color, and RGB information
def extract_company_model_memory_color_rgb(row):
    """Split a product name into model, memory size, color and RGB flag.

    Despite its name, the function does not extract a company.

    Reads the module-level ``color_translation`` dict and ``words_to_remove``
    list.

    Args:
        row: Product name string; it is split on whitespace into tokens.

    Returns:
        tuple: ``(model, memory, color, has_rgb)`` where

        * ``model`` is every token that was not consumed below, joined with
          single spaces and uppercased;
        * ``memory`` is the digits directly preceding ``GB`` in the last
          token that contains such a size (``""`` if there is none);
        * ``color`` is the English color name for the last color token
          (``""`` if there is none);
        * ``has_rgb`` is True when a standalone ``RGB`` token is present.
    """
    # Build the lookups once per call instead of once per token.
    known_colors = {key.lower() for key in color_translation}
    known_colors.update(value.lower() for value in color_translation.values())
    removable = {word.upper() for word in words_to_remove}

    memory = ""
    color = ""
    model_parts = []
    has_rgb = False

    for part in row.split():
        lowered = part.lower()
        memory_match = re.search(r'(\d+)GB', part)
        if memory_match:
            # Keep only the digits so characters around the size, as in
            # "(8GB)", do not leak into the memory value.
            memory = memory_match.group(1)
        elif lowered in known_colors:
            # Korean color words are translated through the dict; English
            # color names fall through to the capitalized token itself.
            color = color_translation.get(lowered, part.capitalize())
        elif part.upper() == 'RGB':
            has_rgb = True
        elif part.upper() not in removable:
            # Compared case-insensitively, like the color and RGB checks.
            model_parts.append(part)

    model = " ".join(model_parts).strip().upper()  # Remaining parts form the model name

    return model, memory, color, has_rgb

# Split each product name into the Model / Memory / Color / RGB columns.
# The result tuples are collected into one DataFrame (rather than building a
# pd.Series per row), which is cheaper and still yields four columns when
# compuzone_gpu has no rows.
extracted_fields = pd.DataFrame(
    [extract_company_model_memory_color_rgb(name) for name in compuzone_gpu['name']],
    columns=['Model', 'Memory', 'Color', 'RGB'],
    index=compuzone_gpu.index,
)
compuzone_gpu[['Model', 'Memory', 'Color', 'RGB']] = extracted_fields

# Function to reorder words in the Model column
def reorder_model(row):
    """Move the "TUF GAMING..." token pair to the front of an ASUS model name.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            'Company' and 'Model' entries.

    Returns:
        str: For rows whose Company is 'ASUS' and whose Model contains the
        token 'TUF' immediately followed by a token starting with 'GAMING',
        the model with that pair moved to the front. Otherwise the model is
        returned unchanged.
    """
    model = row['Model']
    if row['Company'] != 'ASUS':
        return model

    parts = model.split(' ')
    # Match on whole tokens: a plain substring test for 'TUF GAMING' can
    # succeed (e.g. "XTUF GAMING") even though no standalone 'TUF' token
    # exists, which would make a later parts.index('TUF') raise ValueError.
    for index in range(len(parts) - 1):
        if parts[index] == 'TUF' and parts[index + 1].startswith('GAMING'):
            reordered_parts = parts[index:index + 2] + parts[:index] + parts[index + 2:]
            return ' '.join(reordered_parts)
    return model

compuzone_gpu.rename(columns={'company': 'Company'}, inplace=True)

# Apply reorder_model to every row. Iterating over row records instead of
# DataFrame.apply(axis=1) avoids apply's special handling of empty frames,
# so this also works when no rows are present.
compuzone_gpu['Model'] = [reorder_model(record) for record in compuzone_gpu.to_dict('records')]

# Remove rows whose name contains '해외'. The explicit copy makes the
# filtered frame independent of the original, so the column assignments
# below do not trigger SettingWithCopyWarning.
compuzone_gpu = compuzone_gpu[~compuzone_gpu['name'].str.contains('해외', regex=False, na=False)].copy()

# Remove commas from the price column and store it as Price
# (the value remains a string)
compuzone_gpu['Price'] = compuzone_gpu['price'].str.replace(',', '', regex=False)
compuzone_gpu.drop(columns=['price'], inplace=True)


# Add ComponentID column in the form "<Type>#<Company>#<Model>"
compuzone_gpu['ComponentID'] = [
    f"{component_type}#{company}#{model}"
    for component_type, company, model in zip(
        compuzone_gpu['Type'], compuzone_gpu['Company'], compuzone_gpu['Model']
    )
]

# Add Date column with the current local date and time
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
compuzone_gpu['Date'] = current_time

# Add Shop column
compuzone_gpu['Shop'] = 'compuzone'

# Rename link to URL
compuzone_gpu.rename(columns={'link': 'URL'}, inplace=True)

# Rename img to image_url and add https: if it starts with //
compuzone_gpu.rename(columns={'img': 'image_url'}, inplace=True)
compuzone_gpu['image_url'] = compuzone_gpu['image_url'].apply(lambda x: 'https:' + x if x.startswith('//') else x)

# Convert RGB column to boolean
compuzone_gpu['RGB'] = compuzone_gpu['RGB'].astype(bool)

# Remove duplicate rows (same ComponentID and URL)
compuzone_gpu.drop_duplicates(subset=['ComponentID', 'URL'], inplace=True)

# Drop the spec and name columns, which are no longer needed
compuzone_gpu.drop(columns=['spec', 'name'], inplace=True)

# Read the standard GPU reference table from CSV
standard_gpu_path = 'gpu_standard.csv'
standard_gpu = pd.read_csv(standard_gpu_path)

# Prepare the join keys on the reference table: a ComponentID of the form
# "GPU#<회사>#<모델>" and the 메모리 column as an integer-formatted string
standard_gpu = standard_gpu.assign(**{
    'ComponentID': 'GPU#' + standard_gpu['회사'] + '#' + standard_gpu['모델'],
    '메모리': standard_gpu['메모리'].astype(int).astype(str),
})

# Inner-join the shop rows with the reference rows on ComponentID and memory
# size, then discard the reference-only columns and blank out missing values
final_gpu = (
    compuzone_gpu
    .merge(
        standard_gpu,
        how='inner',
        left_on=['ComponentID', 'Memory'],
        right_on=['ComponentID', '메모리'],
    )
    .drop(columns=['메모리', '회사', '모델'])
    .fillna('')
)

# Keep only the exported fields and turn them into a list of per-row dicts
final_data = final_gpu[['ComponentID', 'Type', 'Date', 'Shop', 'Price', 'URL']].copy()
final_json_data = final_data.to_dict(orient='records')

# Write the records to disk as UTF-8 JSON, keeping non-ASCII text readable
final_output_path_v2 = 'processed_compuzone_gpu2222.json'
with open(final_output_path_v2, 'w', encoding='utf-8') as f:
    json.dump(final_json_data, f, ensure_ascii=False, indent=4)



In [14]:
import pandas as pd
import re
import json
from datetime import datetime

# Sample JSON data: two GPU product records. Each record carries the keys
# title, name, company, spec, img, price and link, and every value
# (including "price") is a string. The second name ends with a "★ ... ★"
# segment in addition to the "[...]" prefix.
sample_json = '''
[
    {
        "title": "gpu",
        "name": "[emTek] GeForce RTX 4060 Ti STORM X Dual OC D6 8GB",
        "company": "emTek",
        "spec": "RTX 4060 Ti / 4nm / 베이스클럭: 2310MHz / 부스트클럭: 2685MHz / 스트림 프로세서: 4352개 / PCIe4.0x16(at x8) / GDDR6(DDR6) / 출력단자: HDMI2.1 , DP1.4 / 부가기능: 제로팬(0-dB기술) , 8K 지원 , 4K 지원 , HDR 지원 , HDCP 2.3 / 사용전력: 160W / 정격파워 650W 이상 / 전원 포트: 8핀 x1개 / 2개 팬 / 가로(길이): 249.9mm / 두께: 40.1mm / 백플레이트 / DrMOS / LED 라이트 / ThunderMaster / A/S 3년",
        "img": "https://image3.compuzone.co.kr/img/product_img/2023/0524/1029950/1029950_600.jpg",
        "price": "580000",
        "link": "https://www.compuzone.co.kr/product/product_detail.htm?ProductNo=1029950&BigDivNo=4&MediumDivNo=1016&DivNo=0"
    },
    {
        "title": "gpu",
        "name": "[ASUS] DUAL 지포스 RTX 4070 SUPER O12G EVO OC D6X 12GB ★ 컴퓨존 5만원 다운로드 쿠폰 할인 ★",
        "company": "ASUS",
        "spec": "4nm / 베이스클럭: 1980MHz / 부스트클럭: 2520MHz / OC클럭: 2550MHz / 스트림 프로세서: 7168개 / PCIe4.0x16 / GDDR6X(DDR6X) / 출력단자: HDMI2.1 , DP1.4 / 부가기능: 제로팬(0-dB기술) , 8K 지원 , 4K 지원 , HDR 지원 , HDCP 2.3 / 정격파워 750W 이상 / 전원 포트: 16핀(12VHPWR) x1 / 2개 팬 / 가로(길이): 227.2mm / 두께: 49. / RTX4070 SUPER / PCI Express 4.0 x16 / 12GB / GDDR6X / 192bit / 일반 / 1개 / 3개 / DisplayPortx3개 / HDMIx1개 / 16Pin / VGA231~250mm / AURA-SYNC / 히트파이프+팬 / 팬2개",
        "img": "https://image3.compuzone.co.kr/img/product_img/2024/0229/1120873/1120873_600.jpg",
        "price": "950000",
        "link": "https://www.compuzone.co.kr/product/product_detail.htm?ProductNo=1120873&BigDivNo=&MediumDivNo=1016&DivNo=0"
    }
]
'''

# Parse the JSON text into a list of dicts (one dict per product)
data = json.loads(sample_json)

# Build a DataFrame with one row per product and one column per JSON key
compuzone_gpu = pd.DataFrame(data)

# Function to remove content inside square brackets
def remove_square_brackets(text):
    """Return ``text`` with every ``[...]`` segment removed and the ends stripped."""
    without_brackets = re.sub(r'\[.*?\]', '', text)
    return without_brackets.strip()

def remove_square_brackets_and_stars(text):
    """Return ``text`` without ``[...]`` and ``★...★`` segments.

    Bracketed segments are removed first, then star-delimited ones; the
    result is stripped of leading/trailing whitespace after each pass.
    Whitespace left in the middle of the string is not collapsed.
    """
    for pattern in (r'\[.*?\]', r'★.*?★'):
        text = re.sub(pattern, '', text).strip()
    return text

# Strip "[...]" and "★...★" segments from the 'name' column
compuzone_gpu['name'] = compuzone_gpu['name'].apply(remove_square_brackets_and_stars)

# Convert specified columns to uppercase so the token comparisons below
# (brand words, chipset markers) see a single casing
columns_to_uppercase = ['name', 'title', 'company']
for column in columns_to_uppercase:
    compuzone_gpu[column] = compuzone_gpu[column].str.upper()

# List of words to be removed from the model
# (presumably distributor names plus brand / memory-type tokens — verify when extending)
words_to_remove = ['대원시티에스', '피씨디렉트', '디앤디컴', '제이씨현', '에즈윈', '대원씨티에스', '웨이코스', '마이크로닉스', '지포스', '라데온', 'D6X', 'RADEON', 'GEFORCE', 'RADEON™', 'D6']

# Color translation dictionary (Korean color word -> English color name)
color_translation = {
    '핑크': 'Pink', '화이트': 'White', '블랙': 'Black', '레드': 'Red',
    '블루': 'Blue', '그린': 'Green', '옐로우': 'Yellow', '퍼플': 'Purple',
    '실버': 'Silver', '골드': 'Gold'
}

# Add chipset_type column: 'AMD' when the product name mentions Radeon in
# either English ('RADEON') or Korean ('라데온'), otherwise 'NVIDIA'.
# Checking only the English spelling would label Korean-named Radeon cards
# as NVIDIA.
AMD_NAME_MARKERS = ('RADEON', '라데온')
compuzone_gpu['chipset_type'] = compuzone_gpu['name'].apply(
    lambda x: 'AMD' if any(marker in x for marker in AMD_NAME_MARKERS) else 'NVIDIA'
)
compuzone_gpu.rename(columns={'title': 'Type'}, inplace=True)


def translate_company(company):
    """Return the Korean spelling for a known company name.

    Only the exact keys 'EMTEK', 'GALAX' and 'LEADTEK' are translated; any
    other value is returned unchanged.
    """
    korean_names = {
        'EMTEK': '이엠텍',
        'GALAX': '갤럭시',
        'LEADTEK': '리드텍'
    }
    if company in korean_names:
        return korean_names[company]
    return company

# Replace known company names in the 'company' column with their Korean spelling
compuzone_gpu['company'] = compuzone_gpu['company'].map(translate_company)

# Function to extract model, memory, color, and RGB information
def extract_company_model_memory_color_rgb(row):
    """Split a product name into model, memory size, color and RGB flag.

    Despite its name, the function does not extract a company.

    Reads the module-level ``color_translation`` dict and ``words_to_remove``
    list.

    Args:
        row: Product name string; it is split on whitespace into tokens.

    Returns:
        tuple: ``(model, memory, color, has_rgb)`` where

        * ``model`` is every token that was not consumed below, joined with
          single spaces and uppercased;
        * ``memory`` is the digits directly preceding ``GB`` in the last
          token that contains such a size (``""`` if there is none);
        * ``color`` is the English color name for the last color token
          (``""`` if there is none);
        * ``has_rgb`` is True when a standalone ``RGB`` token is present.
    """
    # Build the lookups once per call instead of once per token.
    known_colors = {key.lower() for key in color_translation}
    known_colors.update(value.lower() for value in color_translation.values())
    removable = {word.upper() for word in words_to_remove}

    memory = ""
    color = ""
    model_parts = []
    has_rgb = False

    for part in row.split():
        lowered = part.lower()
        memory_match = re.search(r'(\d+)GB', part)
        if memory_match:
            # Keep only the digits so characters around the size, as in
            # "(8GB)", do not leak into the memory value.
            memory = memory_match.group(1)
        elif lowered in known_colors:
            # Korean color words are translated through the dict; English
            # color names fall through to the capitalized token itself.
            color = color_translation.get(lowered, part.capitalize())
        elif part.upper() == 'RGB':
            has_rgb = True
        elif part.upper() not in removable:
            # Compared case-insensitively, like the color and RGB checks.
            model_parts.append(part)

    model = " ".join(model_parts).strip().upper()  # Remaining parts form the model name

    return model, memory, color, has_rgb

# Split each product name into the Model / Memory / Color / RGB columns.
# The result tuples are collected into one DataFrame (rather than building a
# pd.Series per row), which is cheaper and still yields four columns when
# compuzone_gpu has no rows.
extracted_fields = pd.DataFrame(
    [extract_company_model_memory_color_rgb(name) for name in compuzone_gpu['name']],
    columns=['Model', 'Memory', 'Color', 'RGB'],
    index=compuzone_gpu.index,
)
compuzone_gpu[['Model', 'Memory', 'Color', 'RGB']] = extracted_fields

# Function to reorder words in the Model column
def reorder_model(row):
    """Move the "TUF GAMING..." token pair to the front of an ASUS model name.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            'Company' and 'Model' entries.

    Returns:
        str: For rows whose Company is 'ASUS' and whose Model contains the
        token 'TUF' immediately followed by a token starting with 'GAMING',
        the model with that pair moved to the front. Otherwise the model is
        returned unchanged.
    """
    model = row['Model']
    if row['Company'] != 'ASUS':
        return model

    parts = model.split(' ')
    # Match on whole tokens: a plain substring test for 'TUF GAMING' can
    # succeed (e.g. "XTUF GAMING") even though no standalone 'TUF' token
    # exists, which would make a later parts.index('TUF') raise ValueError.
    for index in range(len(parts) - 1):
        if parts[index] == 'TUF' and parts[index + 1].startswith('GAMING'):
            reordered_parts = parts[index:index + 2] + parts[:index] + parts[index + 2:]
            return ' '.join(reordered_parts)
    return model

compuzone_gpu.rename(columns={'company': 'Company'}, inplace=True)

# Apply reorder_model to every row. Iterating over row records instead of
# DataFrame.apply(axis=1) avoids apply's special handling of empty frames,
# so this also works when no rows are present.
compuzone_gpu['Model'] = [reorder_model(record) for record in compuzone_gpu.to_dict('records')]

# Remove rows whose name contains '해외'. The explicit copy makes the
# filtered frame independent of the original, so the column assignments
# below do not trigger SettingWithCopyWarning.
compuzone_gpu = compuzone_gpu[~compuzone_gpu['name'].str.contains('해외', regex=False, na=False)].copy()

# Remove commas from the price column and store it as Price
# (the value remains a string)
compuzone_gpu['Price'] = compuzone_gpu['price'].str.replace(',', '', regex=False)
compuzone_gpu.drop(columns=['price'], inplace=True)


# Add ComponentID column in the form "<Type>#<Company>#<Model>"
compuzone_gpu['ComponentID'] = [
    f"{component_type}#{company}#{model}"
    for component_type, company, model in zip(
        compuzone_gpu['Type'], compuzone_gpu['Company'], compuzone_gpu['Model']
    )
]

# Add Date column with the current local date and time
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
compuzone_gpu['Date'] = current_time

# Add Shop column
compuzone_gpu['Shop'] = 'compuzone'

# Rename link to URL
compuzone_gpu.rename(columns={'link': 'URL'}, inplace=True)

# Rename img to image_url and add https: if it starts with //
compuzone_gpu.rename(columns={'img': 'image_url'}, inplace=True)
compuzone_gpu['image_url'] = compuzone_gpu['image_url'].apply(lambda x: 'https:' + x if x.startswith('//') else x)

# Convert RGB column to boolean
compuzone_gpu['RGB'] = compuzone_gpu['RGB'].astype(bool)

# Remove duplicate rows (same ComponentID and URL)
compuzone_gpu.drop_duplicates(subset=['ComponentID', 'URL'], inplace=True)

# Drop the spec and name columns, which are no longer needed
compuzone_gpu.drop(columns=['spec', 'name'], inplace=True)

In [15]:
# Display the cleaned DataFrame produced by the previous cell
compuzone_gpu

Unnamed: 0,Type,Company,image_url,URL,chipset_type,Model,Memory,Color,RGB,Price,ComponentID,Date,Shop
0,GPU,이엠텍,https://image3.compuzone.co.kr/img/product_img...,https://www.compuzone.co.kr/product/product_de...,NVIDIA,RTX 4060 TI STORM X DUAL OC,8,,False,580000,GPU#이엠텍#RTX 4060 TI STORM X DUAL OC,2024-07-02 14:09:43,compuzone
1,GPU,ASUS,https://image3.compuzone.co.kr/img/product_img...,https://www.compuzone.co.kr/product/product_de...,NVIDIA,DUAL RTX 4070 SUPER O12G EVO OC,12,,False,950000,GPU#ASUS#DUAL RTX 4070 SUPER O12G EVO OC,2024-07-02 14:09:43,compuzone
