In [1]:
import praw
import pandas as pd
import re

# 设置Reddit API
reddit = praw.Reddit(client_id='ByGHuaBLiK2AdpNTPWKlCA',
                     client_secret='KfB9LAgGXaJ7PhUzRFvNZr32P3g5lg',
                     user_agent='Haibo Fang')

# 获取Reddit帖子
def get_reddit_posts(subreddit, query, limit=3000):
    subreddit = reddit.subreddit(subreddit)
    posts = []
    for post in subreddit.search(query, limit=limit):
        posts.append([post.title, post.selftext])
    return posts

# 数据清洗
def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    return text

# 获取和清洗数据
posts = get_reddit_posts('stocks', 'AAPL', limit=3000)
df = pd.DataFrame(posts, columns=['title', 'text'])
df['text'] = df['title'] + ' ' + df['text']
df['text'] = df['text'].apply(clean_text)
print(df.head())


                                    title  \
0                    Underestimating AAPL   
1  Are you worried about AAPL long term?    
2                     Sell or Hold $AAPL?   
3                          I SOLD AAPL :(   
4                 AAPL is at 52-week low!   

                                                text  
0  underestimating aapl im not at surprised at th...  
1  are you worried about aapl long term  now im n...  
2  sell or hold aapl im up  already almost k in g...  
3  i sold aapl  i know  you just buy and forget i...  
4  aapl is at week low hi guys\n\ni just notice t...  


In [2]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

# 数据标注 (假设已经有标注的数据，实际情况需要手动标注或使用预标注的数据)
df['label'] = [1 if 'good' in text or 'up' in text or 'rise' in text else 0 for text in df['text']]

# 分词和编码
def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['text'] = df['text'].apply(preprocess_text)
print(df.head())
    

                                    title  \
0                    Underestimating AAPL   
1  Are you worried about AAPL long term?    
2                     Sell or Hold $AAPL?   
3                          I SOLD AAPL :(   
4                 AAPL is at 52-week low!   

                                                text  label  
0  underestimating aapl im surprised kneejerk sma...      1  
1  worried aapl long term im saying sell apple st...      0  
2  sell hold aapl im already almost k gains finan...      1  
3  sold aapl know buy forget yes know yet dumb ho...      1  
4  aapl week low hi guys notice aapl near week lo...      0  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\48869/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\48869/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
from sklearn.model_selection import train_test_split

# 训练集和测试集划分
X = df['text'].values
y = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


Training samples: 181, Testing samples: 46


In [None]:
from sklearn.model_selection import train_test_split

# 训练集和测试集划分
X = df['text'].values
y = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


Training samples: 181, Testing samples: 46


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 文本编码和填充
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

max_len = 150
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)

print(f"Encoded and padded training samples: {X_train.shape}, Testing samples: {X_test.shape}")


Encoded and padded training samples: (181, 150), Testing samples: (46, 150)


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

# 构建LSTM模型
def create_model(embedding_dim, spatial_dropout, lstm_units, dropout, recurrent_dropout):
    model = Sequential()
    model.add(Embedding(input_dim=10000, output_dim=embedding_dim, input_length=max_len))
    model.add(SpatialDropout1D(spatial_dropout))
    model.add(Bidirectional(LSTM(lstm_units, dropout=dropout, recurrent_dropout=recurrent_dropout)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = create_model(128, 0.3, 100, 0.3, 0.3)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, callbacks=[early_stopping])

# 模型评估
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

# 示例预测
def predict_sentiment(text):
    text = clean_text(text)
    text = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, padding='post', maxlen=max_len)
    pred = model.predict(padded)
    return 'Positive' if pred > 0.5 else 'Negative'

example_text = "The stock market is expected to rise"
print(predict_sentiment(example_text))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.6086956262588501
Negative
