In [None]:
import pandas as pd
import json

# 读取Excel文件
excel_path = '/data1/dxw_data/llm/taobao/true/cleaned_titles_last4.xlsx'  # 替换为你的Excel文件路径
sheet_name = 'Sheet1'  # 替换为你的Sheet名称

# 读取指定列
df = pd.read_excel(excel_path, sheet_name=sheet_name, usecols=['name4'])

# 将列数据转换为列表，并处理datetime对象
def convert_to_serializable(val):
    if isinstance(val, pd.Timestamp):
        return val.strftime('%Y-%m-%d')
    return str(val)  # 将所有非字符串类型转换为字符串

title_4_list = df['name4'].dropna().apply(convert_to_serializable).tolist()

# 保存为JSON文件
json_path = '/data1/dxw_data/llm/taobao/true/title_all.json'  # 替换为你的输出JSON文件路径
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(title_4_list, f, ensure_ascii=False, indent=4)

print(f"数据已保存到 {json_path}！")


In [None]:
import json
import pandas as pd
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

def load_words_from_json(json_file):
    with open(json_file, 'r', encoding='utf-8') as file:
        words = json.load(file)
    return words

def cluster_words(words, model_name, n_clusters=500):
    # Load the embedding model
    model = SentenceTransformer(model_name)

    # Get embeddings for the words
    embeddings = model.encode(words, show_progress_bar=True)

    # Perform clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    clusters = kmeans.fit_predict(embeddings)

    return clusters

def save_clusters_to_csv(words, clusters, output_csv):
    df = pd.DataFrame({'word': words, 'cluster': clusters})
    df.to_csv(output_csv, index=False, encoding='utf-8')

# Usage example
# json_file = '/data1/dxw_data/llm/taobao/input.json'
# output_csv = '/data1/dxw_data/llm/taobao/output_products_category.csv'
json_file = '/data1/dxw_data/llm/taobao/true/title_all.json'
output_csv = '/data1/dxw_data/llm/taobao/true/all_products_category.csv'
model_name = '/data1/dxw_data/llm/text2vec-large-chinese'
n_clusters = 60

words = load_words_from_json(json_file)
clusters = cluster_words(words, model_name, n_clusters)
save_clusters_to_csv(words, clusters, output_csv)


In [None]:
import pandas as pd
import json

def group_words_by_cluster(csv_file, output_json, threshold=5):
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Group words by their cluster
    cluster_dict = {}
    for _, row in df.iterrows():
        word, cluster = row['word'], row['cluster']
        if cluster not in cluster_dict:
            cluster_dict[cluster] = []
        cluster_dict[cluster].append(word)

    # Filter out clusters that have fewer than the specified threshold number of words
    filtered_cluster_dict = {k: v for k, v in cluster_dict.items() if len(v) >= threshold}

    # Save the filtered grouped data to a JSON file
    with open(output_json, 'w', encoding='utf-8') as json_file:
        json.dump(filtered_cluster_dict, json_file, ensure_ascii=False, indent=4)

# Usage example
csv_file = '/data1/dxw_data/llm/taobao/true/all_products_category.csv'
output_json = '/data1/dxw_data/llm/taobao/true/all_grouped_products_category.json'
threshold = 0

group_words_by_cluster(csv_file, output_json, threshold)


In [None]:
import json
import pandas as pd
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

def load_clusters_from_json(json_file):
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def cluster_subtypes(words, model_name, n_clusters=2):
    # Load the embedding model
    model = SentenceTransformer(model_name)

    # Get embeddings for the words
    embeddings = model.encode(words, show_progress_bar=True)

    # Perform clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    clusters = kmeans.fit_predict(embeddings)

    return clusters

def perform_second_level_clustering(input_json, output_json, model_name):
    # Load the existing clusters
    existing_clusters = load_clusters_from_json(input_json)

    # Initialize a dictionary to hold the second level clusters
    second_level_clusters = {}

    # Perform clustering on each cluster to divide it into sub-clusters
    for cluster_id, words in existing_clusters.items():
        sub_clusters = cluster_subtypes(words, model_name, n_clusters=3)

        # Create sub-cluster dictionaries
        sub_cluster_dict = {f"{cluster_id}_{sub_cluster_id}": [] for sub_cluster_id in range(3)}
        for word, sub_cluster_id in zip(words, sub_clusters):
            sub_cluster_dict[f"{cluster_id}_{sub_cluster_id}"].append(word)
        
        # Update the main dictionary with sub-clusters
        second_level_clusters.update(sub_cluster_dict)

    # Save the resulting sub-clusters into a JSON file
    with open(output_json, 'w', encoding='utf-8') as json_file:
        json.dump(second_level_clusters, json_file, ensure_ascii=False, indent=4)

# Usage example
input_json = '/data1/dxw_data/llm/taobao/true/all_grouped_products_category.json'
output_json = '/data1/dxw_data/llm/taobao/true/all_output_sub_clusters3.json'
model_name = '/data1/dxw_data/llm/text2vec-large-chinese'

perform_second_level_clustering(input_json, output_json, model_name)


In [None]:
import json
import random

# Path to the input JSON file
input_file_path = "/data1/dxw_data/llm/taobao/true/all_output_sub_clusters3.json"
output_file_path = "/data1/dxw_data/llm/taobao/true/all_output_sub_clusters3-litte2.json"

# Read the input JSON data from the file
with open(input_file_path, "r", encoding="utf-8") as file:
    input_data = json.load(file)

# Initialize the output dictionary
output_data = {}

# Iterate over each type in the input data
for key, values in input_data.items():
    if isinstance(values, list):
        # Randomly sample 20 elements for each type
        output_data[key] = random.sample(values, min(20, len(values)))
    else:
        print(f"Skipping key {key} because its value is not a list.")

# Convert the output dictionary to JSON with UTF-8 encoding
output_json = json.dumps(output_data, ensure_ascii=False, indent=4)

# Save the output JSON to a file
with open(output_file_path, "w", encoding="utf-8") as json_file:
    json_file.write(output_json)

# Print the output JSON
print(output_json)


In [None]:
# gpt 生成分类

In [None]:
import json
import pandas as pd

# Load the JSON files
with open('/data1/dxw_data/llm/taobao/true/all_output_sub_clusters3.json', 'r', encoding='utf-8') as file:
    sub_clusters = json.load(file)

with open('/data1/dxw_data/llm/taobao/true/updated_categories.json', 'r', encoding='utf-8') as file:
    official_categories = json.load(file)

# Create a list to hold the output data
output_data = []

# Process each item in sub_clusters and assign categories
for cluster_key, items in sub_clusters.items():
    if cluster_key in official_categories:
        major_category = official_categories[cluster_key][0]
        minor_category = official_categories[cluster_key][1]
    else:
        major_category = "Unknown"
        minor_category = "Unknown"
    
    for item in items:
        output_data.append({
            "item": item,
            "major_category": major_category,
            "minor_category": minor_category
        })

# Convert the output data to a DataFrame
df = pd.DataFrame(output_data)

# Save the DataFrame to an Excel file
output_path = '/data1/dxw_data/llm/taobao/true/assigned_categories.xlsx'
df.to_excel(output_path, index=False)

print(f"Categories assigned and saved to {output_path}")

In [2]:
# 这个是合并的成功代码，两个excel的匹配

import pandas as pd
from tqdm import tqdm

# 读取Excel文件
a_excel = pd.read_excel('/data1/dxw_data/llm/taobao/true/cleaned_titles_last4.xlsx')
b_excel = pd.read_excel('/data1/dxw_data/llm/taobao/true/assigned_categories.xlsx')

# 打印列名以确认
print("a_excel columns:", a_excel.columns)
print("b_excel columns:", b_excel.columns)

# 去重：保留b_excel中name列的第一个出现值
b_excel_unique = b_excel.drop_duplicates(subset='name', keep='first')

# 使用tqdm进度条进行数据合并
with tqdm(total=1, desc="Merging DataFrames") as pbar:
    matched_data = a_excel.merge(b_excel_unique[['name', 'major_category', 'minor_category']], left_on='cleaned_title4', right_on='name', how='left')
    pbar.update(1)

# 打印合并后数据框的列名以确认
print("matched_data columns:", matched_data.columns)

# 检查'name'列是否存在并删除不需要的列
if 'name' in matched_data.columns:
    with tqdm(total=1, desc="Dropping unnecessary columns") as pbar:
        matched_data.drop(columns=['name'], inplace=True)
        pbar.update(1)
else:
    print("'name'列不存在，跳过删除步骤")

# 输出合并后的表格
output_path = '/data1/dxw_data/llm/taobao/true/merged_output.xlsx'
matched_data.to_excel(output_path, index=False)

print(f"合并后的表格已保存到 {output_path}")


a_excel columns: Index(['name', 'cleaned_title4'], dtype='object')
b_excel columns: Index(['name', 'major_category', 'minor_category'], dtype='object')


Merging DataFrames: 100%|██████████| 1/1 [00:00<00:00,  4.08it/s]


matched_data columns: Index(['name_x', 'cleaned_title4', 'name_y', 'major_category',
       'minor_category'],
      dtype='object')
'name'列不存在，跳过删除步骤
合并后的表格已保存到 /data1/dxw_data/llm/taobao/true/merged_output.xlsx


In [None]:
import pandas as pd
from tqdm import tqdm

# 读取Excel文件
df = pd.read_excel('/data1/dxw_data/llm/taobao/refine/huitun_room_goods_sample_xiangwen.xlsx')

# 确保 tqdm 在 groupby 后显示进度条
categories = df['小类别'].unique()

sorted_groups = []
for category in tqdm(categories, desc="Processing categories"):
    group = df[df['小类别'] == category].sort_values(by='livePrice')
    sorted_groups.append(group)

# 将所有分组结果合并
df_sorted = pd.concat(sorted_groups).reset_index(drop=True)

# 输出结果到新的Excel文件
df_sorted.to_excel('/data1/dxw_data/llm/taobao/refine/huitun_room_goods_sample_xiangwen_sorted_output_file.xlsx', index=False)

print("Processing complete. The sorted data has been saved to 'huitun_room_goods_sample_xiangwen_sorted_output_file.xlsx'.")
