Post trend 2002-2025 & posting times of the day

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')


# Load the data
df = pd.read_csv('ptt_posts_clean(1).csv')

# Basic overview: shape, column names and the first rows
print("数据基本信息：")
print(f"数据形状: {df.shape}")
print("\n列名:")
print(list(df.columns))
print("\n前5行数据:")
print(df.head())

# Inspect the raw post_time column before any conversion is attempted
raw_post_time = df['post_time']
is_missing = raw_post_time.isna()
print(f"\npost_time列的数据类型: {raw_post_time.dtype}")
print(f"post_time列非空值数量: {(~is_missing).sum()}")
print(f"post_time列空值数量: {is_missing.sum()}")
print("\npost_time列的前10个值:")
print(raw_post_time.iloc[:10].tolist())

# Count how many post_time values repeat an earlier one
print(f"\npost_time列重复值数量: {raw_post_time.duplicated().sum()}")

# Convert post_time to a datetime column unless it already is one.
# The check uses pandas' dtype predicate instead of comparing against the
# literal 'datetime64[ns]', so a column stored at another resolution
# (e.g. datetime64[us]) or carrying a timezone is still treated as datetime.
if not pd.api.types.is_datetime64_any_dtype(df['post_time']):
    # Remember how many values were already missing so the report below
    # counts only the values that failed to parse.
    missing_before = int(df['post_time'].isna().sum())
    try:
        # errors='coerce' turns unparseable entries into NaT instead of raising
        converted = pd.to_datetime(df['post_time'], errors='coerce')
    except (ValueError, TypeError, OverflowError) as e:
        print(f"\n时间转换出错: {e}")
    else:
        df['post_time'] = converted
        unparsed = int(converted.isna().sum()) - missing_before
        print(f"\n转换后post_time列的数据类型: {df['post_time'].dtype}")
        print(f"转换后无法识别的时间数量: {unparsed}")

# Report the covered time range (only meaningful for a datetime column)
if pd.api.types.is_datetime64_any_dtype(df['post_time']):
    earliest = df['post_time'].min()
    latest = df['post_time'].max()
    print("\n时间范围:")
    print(f"最早时间: {earliest}")
    print(f"最晚时间: {latest}")
    print(f"时间跨度: {(latest - earliest).days} 天")

# Preprocessing
# Keep only the rows that have a timestamp, on an independent copy
has_time = df['post_time'].notna()
df_clean = df.loc[has_time].copy()
# Parse again (redundant if the column was already converted earlier)
post_times = pd.to_datetime(df_clean['post_time'], errors='coerce')
# Attach the parsed timestamps and the derived time features in one step
df_clean = df_clean.assign(
    post_time=post_times,
    year=post_times.dt.year,
    month=post_times.dt.month,
    hour=post_times.dt.hour,
    weekday=post_times.dt.weekday,  # 0 = Monday, 6 = Sunday
)

# Yearly posting-volume trend chart
posts_per_year = df_clean.groupby('year').size()
yearly_posts = posts_per_year.reset_index(name='post_count')
total_posts = len(df_clean)

# Row of the year with the most posts; used by the annotation and the summary
max_year = yearly_posts.loc[yearly_posts['post_count'].idxmax()]
peak_year = int(max_year['year'])
peak_count = int(max_year['post_count'])

fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(
    yearly_posts['year'],
    yearly_posts['post_count'],
    marker='o',
    linewidth=2,
    markersize=6,
    color='#2E86AB',
)

# Titles, axis labels and grid
ax.set_title(
    f'PTT Forum Annual Posting Trends (topic about scam) (2002-2025)\nTotal Posts: {total_posts:,}',
    fontsize=16,
    fontweight='bold',
    fontfamily='DejaVu Sans',
)
ax.set_xlabel('year', fontsize=12, fontfamily='DejaVu Sans')
ax.set_ylabel('Number of posts', fontsize=12, fontfamily='DejaVu Sans')
ax.grid(True, alpha=0.3, linestyle='--')

# Show a tick for every second year, with rotated labels
ax.set_xticks(yearly_posts['year'][::2])
plt.setp(ax.get_xticklabels(), rotation=45)

# Label only the peak year, with an arrow pointing at the data point
ax.annotate(
    f'peak: {peak_count}posts\n{peak_year}year',
    xy=(peak_year, peak_count),
    xytext=(peak_year + 2, peak_count + 100),
    arrowprops=dict(arrowstyle='->', color='red', alpha=0.7),
    fontsize=10,
    color='red',
    fontweight='bold',
)

fig.tight_layout()
fig.savefig('/Users/apple/Desktop/data/visual_new/yearly_post_trend(1).png', dpi=300, bbox_inches='tight')
plt.close(fig)

print("\n年度发文量趋势图已保存")
print(f"数据概览: 共{len(yearly_posts)}个年份，平均每年{yearly_posts['post_count'].mean():.0f}篇")
print(f"最高年份: {peak_year}年 ({peak_count}篇)")

# Intraday posting-time distribution chart
posts_per_hour = df_clean.groupby('hour').size()
hourly_posts = posts_per_hour.reset_index(name='post_count')

fig, ax = plt.subplots(figsize=(14, 8))
bars = ax.bar(
    hourly_posts['hour'],
    hourly_posts['post_count'],
    color='#A23B72',
    alpha=0.8,
    edgecolor='white',
    linewidth=0.5,
)

# Titles, axis labels, grid and one tick per hour
ax.set_title('PTT Forum Posting Times of the Day(topic about scam)', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('hours', fontsize=12)
ax.set_ylabel('Number of posts', fontsize=12)
ax.grid(True, alpha=0.3, linestyle='--', axis='y')
ax.set_xticks(range(0, 24))

# Label the three busiest hours
peak_hours = hourly_posts.nlargest(3, 'post_count')
ranked_peaks = enumerate(zip(peak_hours['hour'], peak_hours['post_count']), start=1)
for rank, (peak_hour, peak_count) in ranked_peaks:
    # Rank, hour and post count
    caption = f'peak value{rank}: {int(peak_hour)}:00\n{int(peak_count)}posts'
    # Stagger the labels vertically so neighbouring peaks do not overlap
    lift = 80 + rank * 180
    ax.text(
        peak_hour,
        peak_count + lift,
        caption,
        ha='center',
        va='bottom',
        fontweight='bold',
        color='darkred',
        bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7),  # highlighted background box
    )

# Dashed dividers at 6:00, 12:00 and 18:00
for boundary in (6, 12, 18):
    ax.axvline(x=boundary, color='gray', linestyle='--', alpha=0.5, label=f'{boundary}:00')
ax.legend()

# Raise the y-axis upper limit to leave room for the labels
max_post = hourly_posts['post_count'].max()
ax.set_ylim(0, max_post * 1.5)

fig.tight_layout()
fig.savefig('/Users/apple/Desktop/data/visual_new/hourly_post_distribution(1).png', dpi=300, bbox_inches='tight')
plt.close(fig)

print("\n日内发文时段分布图已保存")
print(f"peak hours: {peak_hours['hour'].tolist()}点 (分别{peak_hours['post_count'].tolist()}篇)")

数据基本信息：
数据形状: (25739, 16)

列名:
['post_id', 'platform', 'country', 'title', 'content', 'language', 'translated_content', 'post_time', 'author_id', 'author_name', 'post_url', 'images', 'like_count', 'reply_count', 'updated_time', 'status']

前5行数据:
                     post_id platform country                        title  \
0  PTT_1762393787_b4d03d50a4      PTT      TW     [問卦] 如果中共承諾嚴懲詐騙 會加強統一意願嗎   
1  PTT_1762401174_b3bc827528      PTT      TW  [問卦] 詐騙幹部在大安區有11豪宅+48車位 代表?   
2  PTT_1762401052_e27b1c7926      PTT      TW      [問卦] 為什麼三立報導詐騙首腦好像很勵志呀？   
3  PTT_1762400598_286973d75e      PTT      TW         [問卦] 從車手升級到詐騙集團主理人要多   
4  PTT_1762399220_933bf3c214      PTT      TW        [問卦] 月薪沒3萬不去做詐騙是不是傻了?   

                                             content language  \
0  有的時候看詐騙新聞 看詐欺犯笑嘻嘻幾十萬交保\n\n看受騙家庭流淚甚至抱家人自殺 就覺得台灣...       zh   
1  調查結果出來了  太子集團  詐騙幹部\n\n在台北大安區  擁有11間豪宅+48車位  還...       zh   
2  阿肥我剛剛在看新聞\n剛好看到三立在報導詐騙首腦陳志\nhttps://www.setn.c...       zh   
3  各位八卦版鄉民們午安\n\n詐騙集團吃香喝辣