In [1]:
import pandas as pd
import os

In [2]:
csv_path = '/Users/caopengbo/Documents/code/clean_item/property_clusters_output_secondary.csv'
df = pd.read_csv(csv_path, encoding='utf-8')

  df = pd.read_csv(csv_path, encoding='utf-8')


In [3]:
filtered_df = df[(df['cluster_id'] != 'Others') & (df['cluster_total_frequency'] >= 10)]
print(f'原始数据行数: {len(df)}')
print(f'过滤后数据行数: {len(filtered_df)}')
#
cluster_count = filtered_df['cluster_id'].nunique()
print(f'cluster_id总共有 {cluster_count} 个类别')

原始数据行数: 339519
过滤后数据行数: 120613
cluster_id总共有 13745 个类别


In [4]:
#只保留每个cluster_id中count最高的行-也就是用最高频词来作为该cluster_id的代表
highest_df = filtered_df.loc[filtered_df.groupby('cluster_id')['cluster_total_frequency'].idxmax()]
print(f'每个cluster_id中count最高的行数: {len(highest_df)}')
#保存代表词文件

output_highest_path = os.path.join(os.path.dirname(csv_path), 'highest_property_clusters_output_secondary.csv')

highest_df.to_csv(output_highest_path, index=False, encoding='utf-8')


每个cluster_id中count最高的行数: 13745


In [11]:
# 统计filtered_df中property列所有关键词的词频
# 简化版本 - 只统计最重要的信息
import re
from collections import Counter
def simple_property_analysis(df, column_name='property'):
    
    # 1. 完整值统计
    print("=== 完整Property值频次（前10个）===")
    print(df[column_name].value_counts().head(10))
    print()
    
    # 2. 主要术语统计（去括号）
    print("=== 主要术语频次（去括号内容）===")
    main_terms = []
    for prop in df[column_name].dropna():
        # 去除括号内容，清理空格
        clean_term = re.sub(r'\([^)]*\)', '', str(prop)).strip()
        clean_term = re.sub(r'\s+', ' ', clean_term)  # 标准化空格
        if clean_term:
            main_terms.append(clean_term)
    
    main_counts = Counter(main_terms)
    for term, count in main_counts.most_common(10):
        print(f"'{term}': {count}")
    print()
    
    # 3. 关键词统计
    print("=== 核心关键词频次 ===")
    keywords = []
    for prop in df[column_name].dropna():
        words = re.findall(r'[A-Za-z]{3,}', str(prop))  # 提取3个字母以上的词
        keywords.extend([w.lower() for w in words])
    
    # 过滤常见词
    stop_words = {'from', 'and', 'the', 'inferred', 'implied'}
    filtered_keywords = [w for w in keywords if w not in stop_words]
    
    keyword_counts = Counter(filtered_keywords)
    for word, count in keyword_counts.most_common(15):
        print(f"{word}: {count}")

    return main_counts, keyword_counts

# 运行简化分析
main_counts, keyword_counts=simple_property_analysis(highest_df)


=== 完整Property值频次（前10个）===
property
Thickness                        1
Transport energy gap value       1
Intensity Ratio D°X/FXA          1
ESR peak-to-peak linewidth       1
Energy level scheme              1
Vibrational Bands                1
Moth-eye structure morphology    1
Conversion depth                 1
Internal loss (αi)               1
Triplet State Lifetime           1
Name: count, dtype: int64

=== 主要术语频次（去括号内容）===
'Composition': 33
'Thickness': 13
'Morphology': 13
'Surface Morphology': 11
'Interlayer Spacing': 10
'Photocurrent': 9
'Crystal Quality': 8
'Crystal Structure': 7
'Stability': 7
'Stoichiometry': 7

=== 核心关键词频次 ===
energy: 679
peak: 588
emission: 425
density: 409
band: 399
current: 375
concentration: 340
ratio: 333
temperature: 332
surface: 304
intensity: 300
presence: 298
efficiency: 295
structure: 250
thickness: 248


In [13]:
# 保存为两个独立的CSV文件
base_path = os.path.dirname(csv_path)

# 保存主要术语
main_terms_path = os.path.join(base_path, 'main_terms_results.csv')
with open(main_terms_path, 'w', encoding='utf-8') as f:
    f.write("Main Term,Count\n")
    for term, count in main_counts.items():
        f.write(f'"{term}",{count}\n')

# 保存关键词
keywords_path = os.path.join(base_path, 'keywords_results.csv')
with open(keywords_path, 'w', encoding='utf-8') as f:
    f.write("Keyword,Count\n")
    for word, count in keyword_counts.items():
        f.write(f"{word},{count}\n")

print(f"主要术语保存到: {main_terms_path}")
print(f"关键词保存到: {keywords_path}")

主要术语保存到: /Users/caopengbo/Documents/code/clean_item/main_terms_results.csv
关键词保存到: /Users/caopengbo/Documents/code/clean_item/keywords_results.csv


上述是对filtered_df的属性列进行的简单分析（统计专业术语即去掉括号的频次，核心关键词的频次），结果已保存为CSV文件

In [15]:
# 对highest_df的高频词汇聚类，相似度较低，旨在为其构建上一级聚类
from sentence_transformers import SentenceTransformer, util
import torch

def perform_property_clustering(df, column_name='property', similarity_threshold=0.3, min_community_size=2):
    """
    对property列进行聚类分析
    
    Args:
        df: 包含property列的DataFrame
        column_name: 要聚类的列名
        similarity_threshold: 相似度阈值
        min_community_size: 最小簇大小
    
    Returns:
        聚类结果字典
    """
    print(f"\n🤖 开始对{column_name}列进行聚类...")
    print(f"  - 相似度阈值: {similarity_threshold}")
    print(f"  - 最小簇大小: {min_community_size}")
    
    # 准备数据
    properties = df[column_name].dropna().tolist()
    print(f"  - 待聚类的属性数量: {len(properties)}")
    
    # 加载模型并计算嵌入
    print("  - 正在加载语言模型...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    print("  - 正在计算句子嵌入...")
    embeddings = model.encode(properties, convert_to_tensor=True)
    
    # 执行社区检测聚类
    print("  - 正在执行聚类...")
    clusters = util.community_detection(
        embeddings,
        min_community_size=min_community_size,
        threshold=similarity_threshold
    )
    
    print(f"✅ 聚类完成！共找到 {len(clusters)} 个簇。")
    
    # 处理聚类结果
    clustered_indices = set()
    clustered_results = []
    
    for i, cluster_indices in enumerate(clusters):
        cluster_members = []
        for idx in cluster_indices:
            property_name = properties[idx]
            # 从原始DataFrame中获取该property的频次信息
            count = len(df[df[column_name] == property_name])
            cluster_members.append({
                'property': property_name,
                'count': count
            })
        
        clustered_results.append({
            'cluster_id': f"cluster_{i+1}",
            'size': len(cluster_members),
            'members': cluster_members
        })
        clustered_indices.update(cluster_indices)
    
    # 识别未聚类的项目
    all_indices = set(range(len(properties)))
    unclustered_indices = all_indices - clustered_indices
    unclustered_items = []
    
    for idx in unclustered_indices:
        property_name = properties[idx]
        count = len(df[df[column_name] == property_name])
        unclustered_items.append({
            'property': property_name,
            'count': count
        })
    
    print(f"  - {len(clustered_indices)} 个属性被聚类")
    print(f"  - {len(unclustered_indices)} 个属性未被聚类")
    
    return {
        'clustered_results': clustered_results,
        'unclustered_items': unclustered_items,
        'total_clusters': len(clusters),
        'clustered_count': len(clustered_indices),
        'unclustered_count': len(unclustered_indices)
    }




In [18]:
# 对highest_df的高频词汇聚类，相似度较低，旨在为其构建上一级聚类
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')
# 计算句子嵌入
embeddings = model.encode(highest_df['property'].tolist(), convert_to_tensor=True)
#使用社区算法聚类
results = perform_property_clustering(highest_df, column_name='property', similarity_threshold=0.5, min_community_size=2)




🤖 开始对property列进行聚类...
  - 相似度阈值: 0.5
  - 最小簇大小: 2
  - 待聚类的属性数量: 13744
  - 正在加载语言模型...
  - 正在计算句子嵌入...
  - 正在执行聚类...
✅ 聚类完成！共找到 1650 个簇。
  - 13438 个属性被聚类
  - 306 个属性未被聚类


0.3的相似度可以聚类到700个，
0.5的相似度可以聚类到1650个，所以暂定一个值（可以为1650也可以是其他值，聚类后用大语言模型统一命名）
然后对最高频的1650个进行聚类，得到的结果是1650个聚类结果

In [19]:
# 保存高频词蒸馏结果

def save_clustering_results(results, base_path):
    """保存聚类结果到CSV文件"""
    
    # 保存聚类结果
    clusters_data = []
    for cluster in results['clustered_results']:
        for member in cluster['members']:
            clusters_data.append({
                'cluster_id': cluster['cluster_id'],
                'property': member['property'],
                'count': member['count'],
                'cluster_size': cluster['size']
            })
    
    clusters_df = pd.DataFrame(clusters_data)
    clusters_path = os.path.join(base_path, 'high_property_clusters_results.csv')
    clusters_df.to_csv(clusters_path, index=False, encoding='utf-8')
    
    # 保存未聚类项目
    if results['unclustered_items']:
        unclustered_df = pd.DataFrame(results['unclustered_items'])
        unclustered_path = os.path.join(base_path, 'high_unclustered_properties.csv')
        unclustered_df.to_csv(unclustered_path, index=False, encoding='utf-8')
        print(f"未聚类项目保存到: {unclustered_path}")
    
    print(f"聚类结果保存到: {clusters_path}")

# 保存结果
base_path = os.path.dirname('data_anlyze')
save_clustering_results(results, base_path)

未聚类项目保存到: high_unclustered_properties.csv
聚类结果保存到: high_property_clusters_results.csv
