In [1]:
import torch

# Report whether PyTorch can see a CUDA-capable GPU
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

# Print the index and name of the currently selected GPU, or a notice if there is none
if not cuda_ready:
    print("CUDA is not available.")
else:
    active_gpu = torch.cuda.current_device()
    print(active_gpu)
    print(torch.cuda.get_device_name(active_gpu))


True
0
NVIDIA GeForce RTX 2060


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import Dataset

# Load the Financial PhraseBank dataset.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
file_path = 'E:/Haibo_Fang23-24-Dissertation/report/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt'

# The file has no header row and its two fields are separated by '@',
# so the column names are supplied explicitly.
df = pd.read_csv(file_path, delimiter='@', header=None, names=['sentence', 'sentiment'], encoding='ISO-8859-1')

# Strip surrounding whitespace from both columns
df['sentence'] = df['sentence'].str.strip()
df['sentiment'] = df['sentiment'].str.strip()

# Convert the sentiment labels to integer class ids.
# Series.map would silently turn any unexpected or missing label into NaN,
# which would then leak into training, so fail loudly instead.
label_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}
unmapped = df.loc[~df['sentiment'].isin(list(label_to_id)), 'sentiment']
if not unmapped.empty:
    raise ValueError(
        f"{len(unmapped)} row(s) in {file_path} have an unrecognised sentiment label, "
        f"e.g. {unmapped.unique()[:5].tolist()}"
    )
df['sentiment'] = df['sentiment'].map(label_to_id)

# Split into training and test sets (80/20, fixed seed for reproducibility)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['sentence'].tolist(),
    df['sentiment'].tolist(),
    test_size=0.2,
    random_state=42,
)

# Custom dataset class
class FinancialPhraseBankDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Args:
        texts: Indexable collection of sentences.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Length passed as ``max_length``; every item is padded
            (``padding='max_length'``) or truncated to it.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        sentence, target = self.texts[idx], self.labels[idx]
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sentence, **tokenizer_options)
        # return_tensors='pt' was requested, so flatten each tensor to 1-D for the item.
        item = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

# Choose the compute device: GPU when CUDA is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the pretrained BERT tokenizer and a 3-class sequence-classification model
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
model = model.to(device)

# Build the train/test datasets with a shared tokenizer and maximum length.
# NOTE(review): the dataset pads every item to max_len, so 512 means every
# sentence is padded to 512 tokens; a smaller value may be sufficient.
dataset_kwargs = dict(tokenizer=tokenizer, max_len=512)
train_dataset = FinancialPhraseBankDataset(train_texts, train_labels, **dataset_kwargs)
test_dataset = FinancialPhraseBankDataset(test_texts, test_labels, **dataset_kwargs)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from transformers import Trainer, TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # NOTE(review): the recorded run had 681 optimizer steps in total, so a
    # 500-step warmup spans most of training; consider a smaller value.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    logging_strategy="steps",
    logging_steps=100,  # log every 100 steps
    save_strategy="epoch",  # checkpoint at the end of every epoch
    # Mixed-precision training needs a CUDA device. The notebook falls back to
    # CPU when CUDA is missing, so only enable fp16 when a GPU is present
    # instead of failing on CPU-only machines.
    fp16=torch.cuda.is_available()
)

# Train with the Trainer API, evaluating on the held-out test split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


 15%|█▍        | 100/681 [00:36<03:30,  2.76it/s]

{'loss': 1.0367, 'grad_norm': 4.762389659881592, 'learning_rate': 9.900000000000002e-06, 'epoch': 0.44}


 29%|██▉       | 200/681 [01:13<02:57,  2.71it/s]

{'loss': 0.4255, 'grad_norm': 2.5994019508361816, 'learning_rate': 1.9800000000000004e-05, 'epoch': 0.88}


                                                 
 33%|███▎      | 227/681 [01:29<02:22,  3.19it/s]

{'eval_loss': 0.1462348997592926, 'eval_runtime': 6.2605, 'eval_samples_per_second': 72.358, 'eval_steps_per_second': 9.105, 'epoch': 1.0}


 44%|████▍     | 300/681 [01:57<02:18,  2.75it/s]

{'loss': 0.1899, 'grad_norm': 0.1298755556344986, 'learning_rate': 2.98e-05, 'epoch': 1.32}


 59%|█████▊    | 400/681 [02:33<01:42,  2.75it/s]

{'loss': 0.2064, 'grad_norm': 27.075387954711914, 'learning_rate': 3.9800000000000005e-05, 'epoch': 1.76}


                                                 
 67%|██████▋   | 454/681 [02:59<01:10,  3.21it/s]

{'eval_loss': 0.1873665601015091, 'eval_runtime': 6.2681, 'eval_samples_per_second': 72.27, 'eval_steps_per_second': 9.094, 'epoch': 2.0}


 73%|███████▎  | 500/681 [03:18<01:06,  2.74it/s]

{'loss': 0.1224, 'grad_norm': 33.05534744262695, 'learning_rate': 4.9800000000000004e-05, 'epoch': 2.2}


 88%|████████▊ | 600/681 [03:54<00:29,  2.75it/s]

{'loss': 0.1168, 'grad_norm': 0.17812764644622803, 'learning_rate': 2.3204419889502762e-05, 'epoch': 2.64}


                                                 
100%|██████████| 681/681 [04:30<00:00,  3.19it/s]

{'eval_loss': 0.2292633354663849, 'eval_runtime': 6.2627, 'eval_samples_per_second': 72.333, 'eval_steps_per_second': 9.102, 'epoch': 3.0}


100%|██████████| 681/681 [04:32<00:00,  2.50it/s]

{'train_runtime': 272.142, 'train_samples_per_second': 19.964, 'train_steps_per_second': 2.502, 'train_loss': 0.31303124140712835, 'epoch': 3.0}





TrainOutput(global_step=681, training_loss=0.31303124140712835, metrics={'train_runtime': 272.142, 'train_samples_per_second': 19.964, 'train_steps_per_second': 2.502, 'total_flos': 1429495198516224.0, 'train_loss': 0.31303124140712835, 'epoch': 3.0})

In [5]:
import os

import numpy as np
import praw

# Reddit API authentication.
# Credentials are read from the environment so that secrets never live in the
# notebook. The client id/secret that used to be hardcoded here should be
# treated as leaked and rotated.
_missing_reddit_vars = [
    var_name
    for var_name in ('REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET')
    if not os.environ.get(var_name)
]
if _missing_reddit_vars:
    raise RuntimeError(
        "Missing Reddit API credentials; set environment variable(s): "
        + ", ".join(_missing_reddit_vars)
    )

reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='Haibo Fang')

def get_reddit_data(stock_ticker, subreddit_name='all', limit=100):
    """Search Reddit for a ticker and return the text of the matching posts.

    Args:
        stock_ticker: Search query; converted to ``str`` before searching.
        subreddit_name: Subreddit to search. Defaults to ``'all'``.
        limit: Value forwarded as the search ``limit``. Defaults to 100.

    Returns:
        A list with one string per post: the title, a space, then the self text.
    """
    subreddit = reddit.subreddit(subreddit_name)
    posts = subreddit.search(str(stock_ticker), limit=limit)
    return [post.title + ' ' + post.selftext for post in posts]

# Replace with the stock ticker you are interested in
stock_ticker = 'AAPL'
reddit_data = get_reddit_data(stock_ticker)

# Data preprocessing
def preprocess(texts):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Truncation and padding are enabled, the maximum length is 512, and
    PyTorch tensors are requested (``return_tensors='pt'``).

    Returns:
        Whatever the tokenizer call returns for ``texts``.
    """
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt',
    )

# Sentiment analysis
def analyze_sentiment(texts):
    """Predict a class id for each entry of ``texts``.

    Puts the notebook-level ``model`` into eval mode, tokenizes ``texts`` via
    ``preprocess``, moves the inputs to ``device`` and runs a forward pass
    without gradient tracking.

    Returns:
        NumPy array holding the argmax of the logits along the last dimension.
    """
    model.eval()
    batch = {name: tensor.to(device) for name, tensor in preprocess(texts).items()}
    with torch.no_grad():
        scores = model(**batch).logits
    class_ids = scores.argmax(dim=-1)
    return class_ids.cpu().numpy()

# Run sentiment analysis over the Reddit texts in fixed-size batches
batch_size = 8
sentiments = []
for start in range(0, len(reddit_data), batch_size):
    # Slice out at most batch_size texts and collect their predicted class ids
    sentiments.extend(analyze_sentiment(reddit_data[start:start + batch_size]))

# Summarize the sentiment analysis results
def summarize_sentiments(sentiments):
    """Count how often each distinct value occurs in ``sentiments``.

    Returns:
        Dict mapping each unique value (in sorted order, as returned by
        ``np.unique``) to its number of occurrences.
    """
    values, tallies = np.unique(sentiments, return_counts=True)
    return {value: tally for value, tally in zip(values, tallies)}

# Tally the predicted class ids and print the per-class counts
summary = summarize_sentiments(sentiments)
print("Sentiment Summary:", summary)


Sentiment Summary: {0: 12, 1: 77, 2: 11}
