In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw dataset and clean it in one pipeline:
#   1. drop rows whose Review or Summary is missing
#   2. coerce Rate to a number (unparseable values become NaN)
#   3. keep only rows rated exactly 1, 2, 3, 4 or 5
#   4. renumber the index from 0
data = (
    pd.read_csv('Dataset-SA.csv')
    .dropna(subset=['Review', 'Summary'])
    .assign(Rate=lambda frame: pd.to_numeric(frame['Rate'], errors='coerce'))
    .loc[lambda frame: frame['Rate'].isin([1, 2, 3, 4, 5])]
    .reset_index(drop=True)
)

# Report how many rows survived cleaning and how they spread over the ratings
print("数据清洗后的样本数量:", len(data))
print("\n评分分布:")
print(data['Rate'].value_counts().sort_index())



In [2]:
# Preview the first five rows of the cleaned dataset
print(data.head(n=5))



In [3]:
# List the column labels of the dataset
column_labels = data.columns
print(column_labels)



In [4]:
# Basic dataset information (column dtypes, non-null counts, memory usage).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()



In [5]:
# Descriptive statistics for the numeric columns
summary_stats = data.describe()
print(summary_stats)



In [6]:
# Show each column name together with its data type
column_dtypes = data.dtypes
print(column_dtypes)



In [7]:
# Count the missing values in every column
print(data.isna().sum())




In [8]:
# Coerce ratings to numbers and keep only the rows whose rating lies in [1, 5]
data['Rate'] = pd.to_numeric(data['Rate'], errors='coerce')
rate_in_range = data['Rate'].between(1, 5)
data = data[rate_in_range]

# One figure with two side-by-side pie charts
fig, (rating_ax, sentiment_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: share of each rating, ordered by rating value
rating_counts = data['Rate'].value_counts().sort_index()
rating_ax.pie(
    rating_counts.values,
    labels=rating_counts.index,
    autopct='%1.1f%%',
    colors=sns.color_palette("husl", n_colors=len(rating_counts)),
)
rating_ax.set_title('Rating Distribution (1-5)')

# Right panel: share of each sentiment label
sentiment_counts = data['Sentiment'].value_counts()
sentiment_ax.pie(
    sentiment_counts.values,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    colors=sns.color_palette("Set3", n_colors=len(sentiment_counts)),
)
sentiment_ax.set_title('Sentiment Distribution')

fig.tight_layout()
plt.show()



In [9]:
# Count the words in each Summary and keep only entries with at least 10 words
data['review_word_count'] = data['Summary'].str.split().str.len()
data_filtered = data[data['review_word_count'] >= 10]

# Build a sample that covers every sentiment label: for each label take the
# 4 rows with the highest word count (ratings are not considered here).
sample_data = []
for sentiment in ['neutral', 'negative', 'positive']:
    sentiment_data = data_filtered[data_filtered['Sentiment'] == sentiment]
    sample = sentiment_data.nlargest(4, 'review_word_count')
    sample_data.append(sample)

# Merge the per-sentiment samples and shuffle them. The shuffle is seeded so
# that re-running the notebook displays the same rows in the same order
# (the balanced sampling later in the notebook uses the same seed, 42).
final_sample = (
    pd.concat(sample_data)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Show the first 10 sampled rows, including the product information.
# A wider column limit lets long text be shown with less truncation.
pd.set_option('display.max_colwidth', 200)
display(final_sample[['Sentiment', 'Rate', 'product_name', 'product_price', 'Review', 'Summary','review_word_count']].head(10).style
        .set_properties(**{'text-align': 'left'})
        .set_table_styles([
            {'selector': 'th', 'props': [('text-align', 'left'), ('background-color', '#f2f2f2')]},
            {'selector': 'td', 'props': [('padding', '8px')]}
        ]))



In [10]:
# Summary-length analysis.
# Keep rows that have a Summary and derive its length in characters and in words.
data = (
    data.dropna(subset=['Summary'])
    .assign(
        review_length=lambda frame: frame['Summary'].str.len(),
        review_words=lambda frame: frame['Summary'].str.split().str.len(),
    )
)

# 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(char_ax, word_ax), (rate_ax, sentiment_ax) = axes

# 1. Distribution of the length in characters
sns.histplot(data=data, x='review_length', bins=50, ax=char_ax)
char_ax.set_title('Distribution of Summary Length (Characters)')
char_ax.set_xlabel('Length in Characters')
char_ax.set_ylabel('Count')

# 2. Distribution of the length in words
sns.histplot(data=data, x='review_words', bins=50, ax=word_ax)
word_ax.set_title('Distribution of Summary Length (Words)')
word_ax.set_xlabel('Number of Words')
word_ax.set_ylabel('Count')

# 3. Word count broken down by rating
sns.boxplot(data=data, x='Rate', y='review_words', ax=rate_ax)
rate_ax.set_title('Summary Length by Rating')
rate_ax.set_xlabel('Rating')
rate_ax.set_ylabel('Number of Words')

# 4. Word count broken down by sentiment
sns.boxplot(data=data, x='Sentiment', y='review_words', ax=sentiment_ax)
sentiment_ax.set_title('Summary Length by Sentiment')
sentiment_ax.set_xlabel('Sentiment')
sentiment_ax.set_ylabel('Number of Words')

fig.tight_layout()
plt.show()

# Overall statistics for both length measures
print("\n评论长度统计信息:")
print(data[['review_length', 'review_words']].describe())

# Mean word count per rating, then per sentiment label (rounded to 2 decimals)
for group_column, heading in (
    ('Rate', "\n各评分的平均评论长度:"),
    ('Sentiment', "\n各情感类别的平均评论长度:"),
):
    print(heading)
    print(data.groupby(group_column)['review_words'].mean().round(2))





In [11]:
# Keep summaries whose word count lies between 10 words and the 95th
# percentile of all word counts (both bounds inclusive).
# Note: the next cell repeats these exact steps before sampling.
data['word_count'] = data['Summary'].str.split().str.len()
min_words = 10
max_words = data['word_count'].quantile(0.95)
word_count_in_range = data['word_count'].between(min_words, max_words)
filtered_data = data[word_count_in_range]

# Size of the rarest sentiment class: the per-class quota for balanced sampling
sentiment_sizes = filtered_data['Sentiment'].value_counts()
min_samples = sentiment_sizes.min()

In [12]:
# Keep summaries whose word count lies between 10 words and the 95th
# percentile of all word counts (both bounds inclusive).
data['word_count'] = data['Summary'].str.split().str.len()
min_words = 10
max_words = data['word_count'].quantile(0.95)
filtered_data = data[(data['word_count'] >= min_words) & (data['word_count'] <= max_words)]

# Balance the classes: draw the same number of rows from every sentiment
# label, namely the size of the rarest label.
min_samples = filtered_data['Sentiment'].value_counts().min()

# Sample each group explicitly instead of using groupby(...).apply(...).
# apply() on the grouping column is deprecated in recent pandas, and once the
# grouping column is no longer passed to the applied function the 'Sentiment'
# column would be lost after reset_index(drop=True). Iterating over the groups
# always yields complete sub-frames, and seeding every draw with 42 selects
# the same rows the apply-based version selected.
balanced_data = pd.concat(
    [
        sentiment_rows.sample(min_samples, random_state=42)
        for _, sentiment_rows in filtered_data.groupby('Sentiment')
    ]
).reset_index(drop=True)

# Visualise the balanced dataset in three side-by-side panels
plt.figure(figsize=(15, 5))

# Panel 1: number of rows per sentiment (equal by construction)
plt.subplot(1, 3, 1)
sns.countplot(data=balanced_data, x='Sentiment')
plt.title('Balanced Sentiment Distribution')
plt.xticks(rotation=45)

# Panel 2: word-count box plot per sentiment
plt.subplot(1, 3, 2)
sns.boxplot(data=balanced_data, x='Sentiment', y='word_count')
plt.title('Summary Length by Sentiment (Balanced)')
plt.xticks(rotation=45)

# Panel 3: stacked word-count histogram coloured by sentiment
plt.subplot(1, 3, 3)
sns.histplot(data=balanced_data, x='word_count', hue='Sentiment', multiple='stack')
plt.title('Summary Length Distribution (Balanced)')
plt.xlabel('Number of Words')

plt.tight_layout()
plt.show()

# Print summary statistics of the resampled data
print('\n重采样后的统计信息:')
print(f'总样本数: {len(balanced_data)}')
print('\n情感分布:')
print(balanced_data['Sentiment'].value_counts())
print('\n评论长度统计（按情感分类）:')
print(balanced_data.groupby('Sentiment')['word_count'].describe())







In [14]:
# Columns kept in the exported dataset
final_columns = ['Summary', 'Sentiment','Rate']
df_filtered = balanced_data.loc[:, final_columns]

# Write the processed data to disk without the index column
df_filtered.to_csv('filtered_reviews.csv', index=False)

# Report the exported shape and the per-sentiment row counts
print("数据集大小:", df_filtered.shape)
print("\n情感分布:")
sentiment_totals = df_filtered['Sentiment'].value_counts()
print(sentiment_totals)

