In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

# Read the CSV file, decoding it as ISO-8859-1. The nrows=100 limit is currently
# disabled (see the trailing comment on the read_csv call), so every row is loaded.
# NOTE(review): ISO-8859-1 accepts any byte sequence, so a UTF-8 file would load
# without error but with garbled non-ASCII text — confirm the file's real encoding.
file_path = '/data1/dxw_data/llm/MKT_data_mining/Multimodal/image2text/output/all_statistic.csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1') # , nrows=100

# Candidate topic labels. Each description is assigned to exactly one of these.
category_list = [
    "Product Review",
    "Outfit Recommendations",
    "Popular Science",
    "Medical Health",
    "Education and Campus",
    "Workplace/Interpersonal Relationships",
    "Technology",
    "Movie Commentary",
    "Film/Variety Editing",
    "Short Skits",
    "Animation/Anime",
    "Creative Editing/Dubbing",
    "Satirical Parody",
    "Art Creation",
    "Street Interviews",
    "Landscape Photography",
    "Cute Pets",
    "Food",
    "Vlog/Insights Sharing",
    "Travel",
    "Fitness and Beauty",
    "Skill Sharing",
    "Home Life",
    "Dance",
    "Music",
    "Local Culture and Tourism",
    "Public Service Advertising",
    "Celebrity Entertainment",
    "Social and Political News",
    "Games",
    "Sports",
    "Automobiles",
    "Finance",
    "Others"
]

# Load the sentence-embedding model from a local directory.
model = SentenceTransformer('/data1/dxw_data/llm/paraphrase-multilingual-MiniLM-L12-v2')

# Embed every entry of category_list once, as a tensor, so the result can be
# reused for each description instead of being recomputed per row.
category_embeddings = model.encode(category_list, convert_to_tensor=True)

# Assign a topic to a single description by embedding similarity.
def reassign_topic(description, category_list, category_embeddings, model, default="Others"):
    """Return the category whose embedding is most similar to ``description``.

    Args:
        description: Text of one row. Missing values (``None``, NaN, ``pd.NA``)
            and blank/whitespace-only strings are not embedded.
        category_list: Candidate category names.
        category_embeddings: Embeddings of ``category_list``, in the same order
            (the arg-max index over the similarities is used to index
            ``category_list``).
        model: Object exposing ``encode(text, convert_to_tensor=True)``.
        default: Category returned when there is no usable description.

    Returns:
        An element of ``category_list``, or ``default`` for missing/blank input.
    """
    # The scalar check keeps pd.isna from returning an array for list-like input.
    # Unlike an ``isinstance(..., float)`` test it also catches None, pd.NA and
    # non-float NaN scalars, which would otherwise be embedded as the literal
    # strings "None" / "<NA>" / "nan".
    if pd.api.types.is_scalar(description) and pd.isna(description):
        return default

    text = str(description)
    # A blank description carries no signal; its nearest category is arbitrary.
    if not text.strip():
        return default

    description_embedding = model.encode(text, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(description_embedding, category_embeddings)
    most_similar_idx = similarities.argmax().item()
    return category_list[most_similar_idx]

# Per-row callback: binds the shared categories, embeddings and model so that
# only the description has to be supplied by pandas.
def _topic_for(desc):
    """Classify one description with the module-level categories and model."""
    return reassign_topic(desc, category_list, category_embeddings, model)

# Register progress_apply on pandas objects, then overwrite the Topic column
# with the newly assigned category while showing a progress bar.
tqdm.pandas(desc="Reassigning Topics")
df['Topic'] = df['description'].progress_apply(_topic_for)

# Write the re-labelled table to a new CSV file (without the index column).
output_file_path = '/data1/dxw_data/llm/MKT_data_mining/Multimodal/image2text/output/all_statistic_reassigned_top100.csv'
df.to_csv(output_file_path, index=False)

print(f"结果已保存到 {output_file_path}")


Reassigning Topics: 100%|██████████| 11037/11037 [01:42<00:00, 108.01it/s]


结果已保存到 /data1/dxw_data/llm/MKT_data_mining/Multimodal/image2text/output/all_statistic_reassigned_top100.csv


In [5]:
import pandas as pd

# Input: the table with reassigned topics. Output: one row per
# (user, time, Topic) combination with its number of occurrences.
file_path = '/data1/dxw_data/llm/MKT_data_mining/Multimodal/image2text/output/all_statistic_reassigned_top100.csv'
output_file_path = '/data1/dxw_data/llm/MKT_data_mining/Multimodal/image2text/output/user_topic_time_count_top100.csv'

# Load the re-labelled rows.
df = pd.read_csv(file_path)

# Count how many rows share the same user, time and Topic, then flatten the
# grouped result back into ordinary columns with the tally named 'Count'.
group_sizes = df.groupby(['user', 'time', 'Topic']).size()
count_df = group_sizes.rename('Count').reset_index()

# Persist the counts (without the index column) and report where they went.
count_df.to_csv(output_file_path, index=False)
print(f'Results have been saved to {output_file_path}')


Results have been saved to /data1/dxw_data/llm/MKT_data_mining/Multimodal/image2text/output/user_topic_time_count_top100.csv


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

# Load the per-user, per-time topic counts from user_time_topic_distribution.csv.
# NOTE(review): this is not the file written by the earlier counting cell
# (user_topic_time_count_top100.csv, whose key columns are lowercase 'user' and
# 'time'); the code below reads 'User', 'Time', 'Topic' and 'Count' — confirm
# which step produces this file.
df = pd.read_csv('/data1/dxw_data/llm/MKT_data_mining/Multimodal/image2text/output/user_time_topic_distribution.csv')

# Main topic categories mapped to the sub-topics that belong to them.
# The sub-topic strings are the same labels listed in category_list.
categories = {
    "Product Guide": [
        "Product Review",
        "Outfit Recommendations"
    ],
    "Educational/Informational": [
        "Popular Science",
        "Medical Health",
        "Education and Campus",
        "Workplace/Interpersonal Relationships",
        "Technology"
    ],
    "Plot Type": [
        "Movie Commentary",
        "Film/Variety Editing",
        "Short Skits",
        "Animation/Anime"
    ],
    "Leisure and Comedy": [
        "Creative Editing/Dubbing",
        "Satirical Parody",
        "Art Creation",
        "Street Interviews",
        "Landscape Photography",
        "Cute Pets"
    ],
    "Life Sharing": [
        "Food",
        "Vlog/Insights Sharing",
        "Travel",
        "Fitness and Beauty",
        "Skill Sharing",
        "Home Life"
    ],
    "Appearance": [
        "Dance",
        "Music"
    ],
    "Advertising": [
        "Local Culture and Tourism",
        "Public Service Advertising"
    ],
    "Celebrity Entertainment": [
        "Celebrity Entertainment"
    ],
    "News and Current Events": [
        "Social and Political News"
    ],
    "Games": [
        "Games"
    ],
    "Sports": [
        "Sports"
    ],
    "Automobiles": [
        "Automobiles"
    ],
    "Finance": [
        "Finance"
    ],
    "Others": [
        "Others"
    ]
}

# Build a palette with one colour per main category.
# plt.get_cmap(name, lut) is used instead of plt.cm.get_cmap, which was
# deprecated in Matplotlib 3.7 and removed in 3.9; the resampling to
# len(categories) entries is the same.
color_palette = plt.get_cmap('tab20c', len(categories))

# Map every sub-topic to the colour of the main category that contains it, so
# all sub-topics of one category are drawn in the same colour.
color_map = {
    subcategory: color_palette(i)
    for i, subcategories in enumerate(categories.values())
    for subcategory in subcategories
}

# All topics present in the data, in order of first appearance; this fixes the
# order in which the bar segments are stacked.
all_words = df['Topic'].unique()

# All users; one figure is written per user.
users = df['User'].unique()

# Legend order: position of each topic in color_map. Computed once here rather
# than rebuilding list(color_map.keys()) for every legend label of every user.
legend_rank = {topic: rank for rank, topic in enumerate(color_map)}

for user in users:
    user_df = df[df['User'] == user]

    # Rows are time points, columns are topics. aggfunc='sum' is explicit because
    # pivot_table defaults to the mean, which would under-report the count if the
    # same (Time, Topic) pair occurs on more than one row.
    pivot_table = user_df.pivot_table(
        index='Time', columns='Topic', values='Count', aggfunc='sum', fill_value=0
    )

    # Time points along the x axis.
    times = pivot_table.index

    fig, ax = plt.subplots(figsize=(14, 8))  # enlarged figure
    bottom = np.zeros(len(times))  # running top of the stack at each time point

    for word in all_words:
        if word in pivot_table.columns:
            counts = pivot_table[word].values
            # Topics missing from color_map are drawn in black.
            ax.bar(times, counts, bottom=bottom, color=color_map.get(word, '#000000'), label=word)
            bottom += counts

    ax.set_title(f'User: {user}', fontsize=16)
    ax.set_xlabel('Time', fontsize=14)
    ax.set_ylabel('Count', fontsize=14)

    # Draw the legend inside the axes, ordered like color_map; topics that are
    # not in color_map get rank -1 and therefore come first.
    handles, labels = ax.get_legend_handles_labels()
    handle_by_label = dict(zip(labels, handles))
    sorted_labels = sorted(labels, key=lambda x: legend_rank.get(x, -1))
    sorted_handles = [handle_by_label[label] for label in sorted_labels]
    # Wrap long labels every 20 characters so the 4-column legend stays compact.
    new_labels = ['\n'.join(label[i:i+20] for i in range(0, len(label), 20)) for label in sorted_labels]
    ax.legend(sorted_handles, new_labels, loc='upper right', bbox_to_anchor=(0.98, 0.98), fontsize=6, title='Topic', title_fontsize=8, ncol=4)

    plt.tight_layout(rect=[0, 0, 1, 1])
    plt.savefig(f'/data1/dxw_data/llm/tiktok/mv/fig_all/user_{user}_time_distribution_with_topics.png')
    # Close the figure so memory does not grow with the number of users.
    plt.close(fig)
