In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import os
from tqdm import tqdm
import jieba
import re

In [2]:
# Seed the random number generators so results are reproducible.
# A single SEED constant keeps the torch and numpy seeds in sync and can be
# reused by later cells (e.g. as a random_state argument).
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [3]:
# 1. Data loading and preprocessing
def load_csv_files(directory):
    """Load every CSV file in ``directory`` and merge them into one DataFrame.

    Files are read in sorted filename order. ``os.listdir`` returns entries in
    arbitrary order, so without sorting the row order of the merged frame could
    differ between runs or machines.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A DataFrame with the rows of all CSV files concatenated and a fresh
        0..n-1 index, or ``None`` when no CSV data could be loaded.
    """
    all_data = []

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(directory, filename)
        # A sub-directory may also be named "*.csv"; read_csv would fail on it.
        if not os.path.isfile(file_path):
            continue
        try:
            df = pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            # A zero-byte file carries no rows; skip it instead of aborting the load.
            print(f"跳过空文件: {filename}")
            continue
        all_data.append(df)
        print(f"加载文件: {filename}, 行数: {len(df)}")

    if not all_data:
        print("没有找到CSV文件")
        return None

    merged_df = pd.concat(all_data, ignore_index=True)
    print(f"合并后总行数: {len(merged_df)}")
    return merged_df

In [4]:
# Data cleaning

# Columns that may hold the numeric rating, in priority order.
_RATING_COLUMNS = ['评分', 'rating', 'score', 'rate', '打分', '星级']

# Small keyword lists used only when no rating column is available.
_POS_WORDS = ['好', '喜欢', '棒', '赞', '精彩', '感动', '推荐', '经典']
_NEG_WORDS = ['差', '失望', '烂', '无聊', '难看', '垃圾', '后悔', '浪费']


def _labels_from_ratings(ratings):
    """Map numeric ratings to classes: <=2 -> 0 (negative), (2, 3] -> 1 (neutral), >3 -> 2 (positive).

    NOTE(review): the thresholds presume a 1-5 rating scale - confirm against the data.
    """
    return np.select([ratings <= 2, ratings <= 3], [0, 1], default=2)


def _infer_sentiment(text):
    """Guess a sentiment class by counting which keyword list has more hits in ``text``."""
    text = str(text)  # tolerate non-string cells such as purely numeric comments
    pos_count = sum(1 for word in _POS_WORDS if word in text)
    neg_count = sum(1 for word in _NEG_WORDS if word in text)
    if pos_count > neg_count:
        return 2  # positive
    if neg_count > pos_count:
        return 0  # negative
    return 1  # neutral


def clean_data(df):
    """Drop rows without comment text and add the sentiment label column '情感标签'.

    Labels are 0 (negative), 1 (neutral) and 2 (positive). They are derived
    from the first rating column found ('评分' first, then a few common
    alternative names). When no rating column exists, labels are inferred from
    the comment text with a simple keyword count.

    Rows whose rating is missing or not numeric are dropped: they would
    otherwise fail both threshold comparisons and be labelled positive.

    Args:
        df: DataFrame containing at least the column '短评内容'.

    Returns:
        A new DataFrame (the input is not modified) with the added
        '情感标签' column.

    Raises:
        KeyError: if the column '短评内容' is missing.
    """
    # Copy so the column assignment below writes to an independent frame and
    # does not trigger SettingWithCopyWarning.
    df = df.dropna(subset=['短评内容']).copy()

    # Print the column names to help with debugging.
    print("DataFrame 列名:", df.columns.tolist())

    rating_col = next((col for col in _RATING_COLUMNS if col in df.columns), None)

    if rating_col is None:
        print("未找到评分列，使用简单文本分析推断情感")
        df['情感标签'] = df['短评内容'].apply(_infer_sentiment)
        return df

    if rating_col != '评分':
        print(f"使用 '{rating_col}' 列作为评分")

    # Coerce so string ratings such as "5" work and unparseable values become NaN.
    ratings = pd.to_numeric(df[rating_col], errors='coerce')
    has_rating = ratings.notna().to_numpy()
    n_missing = int((~has_rating).sum())
    if n_missing:
        print(f"丢弃 {n_missing} 行评分缺失或无法解析的数据")
        df = df.loc[has_rating].copy()
        ratings = ratings.loc[has_rating]

    df['情感标签'] = _labels_from_ratings(ratings)
    return df

In [None]:
# 2. TF-IDF based sentiment analysis
class TFIDFAnalyzer:
    """Sentiment classifier: jieba word segmentation -> TF-IDF features -> logistic regression."""

    def __init__(self, max_features=5000):
        """Create the (unfitted) vectorizer and classifier.

        Args:
            max_features: Maximum vocabulary size kept by the TF-IDF vectorizer.
        """
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            # The default token_pattern only keeps tokens of two or more word
            # characters, which would discard single-character Chinese words
            # such as 好 / 差 / 烂 that jieba emits as separate tokens.
            token_pattern=r"(?u)\b\w+\b",
        )
        self.model = LogisticRegression(max_iter=1000, C=1.0, class_weight='balanced')

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string; None/NaN become the empty string."""
        if isinstance(value, str):
            return value
        return '' if pd.isna(value) else str(value)

    def preprocess_text(self, texts):
        """Segment each text with jieba and join the tokens with single spaces.

        Args:
            texts: Iterable of texts. A single ``str`` is treated as one
                document rather than being iterated character by character.

        Returns:
            A list with one space-separated token string per input text.
        """
        if isinstance(texts, str):
            texts = [texts]
        return [' '.join(jieba.cut(self._as_text(text))) for text in texts]

    def _transform(self, texts):
        """Segment ``texts`` and project them with the already-fitted vectorizer."""
        return self.vectorizer.transform(self.preprocess_text(texts))

    def train(self, train_texts, train_labels, val_texts, val_labels):
        """Fit the vectorizer and classifier on the training set, then predict the validation set.

        Args:
            train_texts: Training texts.
            train_labels: Labels aligned with ``train_texts``.
            val_texts: Validation texts.
            val_labels: Validation labels; returned unchanged for convenience.

        Returns:
            Tuple ``(val_labels, val_preds)``.
        """
        print("预处理训练文本...")
        processed_train_texts = self.preprocess_text(train_texts)

        print("提取TF-IDF特征...")
        X_train = self.vectorizer.fit_transform(processed_train_texts)

        print("训练逻辑回归模型...")
        self.model.fit(X_train, train_labels)

        print("预处理验证文本...")
        processed_val_texts = self.preprocess_text(val_texts)

        print("转换验证集特征...")
        X_val = self.vectorizer.transform(processed_val_texts)

        print("模型预测...")
        val_preds = self.model.predict(X_val)

        return val_labels, val_preds

    def predict(self, texts):
        """Predict sentiment labels for ``texts`` with the trained model.

        Args:
            texts: Iterable of texts, or a single ``str``.

        Returns:
            The classifier's predictions, one per input text.
        """
        return self.model.predict(self._transform(texts))