#### Clean the data

In [None]:
import pandas as pd

# Read the sales export; the encoding is stated explicitly because the file
# contains Chinese text.
file_path = '/home/disk1/red_disk1/zhiyi/zhiyi_data/tb_style_sale_202311_202410.csv'
data = pd.read_csv(file_path, encoding='utf-8')

# Size of the raw dataset (one record per row).
total_rows_count = len(data)
print("Total rows:", total_rows_count)

# Rows left after dropping exact duplicates, i.e. rows that match on every column.
distinct_rows_count = len(data.drop_duplicates())

# Number of different 'union_id' values (missing values are not counted by nunique()).
distinct_union_id_count = data['union_id'].nunique()

print("Distinct rows:", distinct_rows_count)
print("Distinct union_id values:", distinct_union_id_count)

# Number of different 'style_new' values (missing values are not counted by nunique()).
distinct_style_new_count = data['style_new'].nunique()
print("Distinct style_new values:", distinct_style_new_count)

#### Check styles
1. Groups the data by style_new.
2. Counts the unique union_id values and the number of note_id entries for each style.
3. Sorts the styles by the note_id count and selects the top 20 styles.
4. Visualizes the results in a bar chart (note counts) with a line overlay (unique union_id counts).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import matplotlib as mpl

# Set up a CJK-capable font so that the Chinese labels render correctly.
chinese_font_path = '/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc'
chinese_font = FontProperties(fname=chinese_font_path)

# rcParams['font.sans-serif'] is a list of font *family names*, not file paths.
# Register the font file with matplotlib's font manager first, then put its
# family name at the front of the list so it becomes the default sans-serif font.
# (mpl.font_manager is available because matplotlib.font_manager is imported above.)
mpl.font_manager.fontManager.addfont(chinese_font_path)
chinese_font_name = chinese_font.get_name()
fallback_fonts = [
    name for name in mpl.rcParams['font.sans-serif'] if name != chinese_font_name
]  # de-duplicated so re-running this cell does not keep growing the list
mpl.rcParams['font.sans-serif'] = [chinese_font_name, *fallback_fonts]
# Draw negative numbers with an ASCII hyphen instead of the Unicode minus sign,
# which the selected font may not provide.
mpl.rcParams['axes.unicode_minus'] = False

# Per style: number of distinct 'union_id' values and number of non-null
# 'note_id' entries ('count' counts entries, it does not de-duplicate them).
style_counts = data.groupby('style_new').agg(
    unique_union_id_count=('union_id', 'nunique'),
    note_id_count=('note_id', 'count')
).reset_index()

# Number of styles to keep; drives both the selection and the chart title.
top_n = 20

# Styles with the highest note counts.
# NOTE: the variable is still called `top_10_styles` because other cells refer
# to it by that name; it holds the top `top_n` (20) styles.
top_10_styles = style_counts.sort_values(by='note_id_count', ascending=False).head(top_n)

# Bars show the note count; the line overlay shows the unique union_id count.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top_10_styles['style_new'], top_10_styles['note_id_count'], label='Note Count')
ax.plot(top_10_styles['style_new'], top_10_styles['unique_union_id_count'], color='orange', marker='o', label='Union ID Count')

# Add labels and legend with Chinese font support
ax.set_xlabel('风格', fontproperties=chinese_font)  # 'Style' in Chinese
ax.set_ylabel('计数', fontproperties=chinese_font)  # 'Count' in Chinese
ax.set_title(f'按Note ID计数的前{top_n}风格', fontproperties=chinese_font)  # 'Top N Styles by Note ID Count' in Chinese
ax.legend(prop=chinese_font)
plt.xticks(rotation=45, fontproperties=chinese_font)
plt.tight_layout()

# Display the plot
plt.show()


In [None]:
# Relabel the columns with human-readable headers and print the table.
# The headers are assigned by position, so they must stay in the same order as
# the frame's columns (style, unique union_id count, note_id count).
display_headers = ['Style', 'Union ID Count', 'Note ID Count']
top_10_styles.columns = display_headers
print(top_10_styles)