In [23]:
import json

def process_tweet_file(filename):
    """Load a JSON-lines tweet file and return one numbered string per tweet.

    Each non-blank line of ``filename`` is parsed as a JSON object whose
    ``'text'`` entry is a list of tokens. The tokens are joined with single
    spaces and prefixed with a 1-based running number, e.g. ``"1. $ aapl up"``.

    Args:
        filename: Path of the JSON-lines file to read.

    Returns:
        A list of strings, one per tweet, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a tweet object has no ``'text'`` entry.
    """
    processed_tweets = []

    # Read as UTF-8 explicitly: the platform default encoding is
    # locale-dependent and may fail on non-ASCII tweet text.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not tweets; without
            # this guard json.loads would raise on them.
            if not line.strip():
                continue
            tweet = json.loads(line)
            # Number by tweet count so the numbering stays contiguous even
            # when blank lines were skipped.
            number = len(processed_tweets) + 1
            # Concatenate all tokens in the text array with spaces
            processed_tweets.append(f"{number}. " + ' '.join(tweet['text']))

    return processed_tweets

# Prompt template sent to the model. The single `{}` placeholder under
# <Tweets> is filled via `template.format(...)` with the newline-joined,
# numbered tweet strings. The text contains no other braces, so str.format
# substitutes only that one slot.
template = """<Task>
Analyze the provided tweets to determine their likely impact on the future stock price of a given company. Your answer must be one of the following: "[Positive]", "[Neutral]", or "[Negative]".

<Solving Process>
1. Identify the Target Stock: Extract the stock symbol from the tweets.
2. Tweet-by-Tweet Analysis:
(a) Separate the Factors: For each relevant tweet, identify Positive and Negative factors.
(b) Assess Sentiment: Evaluate how each relevant tweet might influence investor sentiment.
3. Overall Sentiment Summary: Combine your analyses to conclude the overall sentiment.

<Tweets>
{}

<Output Requirement>
You must do this: Conclude with a single line that states the overall sentiment. Use one of the following tags exactly:
[Positive]
[Neutral]
[Negative]
"""

# Process a single day's tweet file as a smoke test of the prompt format.
# The path is relative to the notebook's working directory.
filename = 'tweet/preprocessed/AAPL/2014-01-02'
tweets = process_tweet_file(filename)

# Join the numbered tweets one per line and print the fully rendered prompt.
# `formatted_tweets` is reused by a later cell to build the model prompt.
formatted_tweets = '\n'.join(tweets)
print(template.format(formatted_tweets)) 

<Task>
Analyze the provided tweets to determine their likely impact on the future stock price of a given company. Your answer must be one of the following: "[Positive]", "[Neutral]", or "[Negative]".

<Solving Process>
1. Identify the Target Stock: Extract the stock symbol from the tweets.
2. Tweet-by-Tweet Analysis:
(a) Separate the Factors: For each relevant tweet, identify Positive and Negative factors.
(b) Assess Sentiment: Evaluate how each relevant tweet might influence investor sentiment.
3. Overall Sentiment Summary: Combine your analyses to conclude the overall sentiment.

<Tweets>
1. $ aapl - wall st . kicks off new year on lower note -> URL stock stocks stockaction
2. rt AT_USER here's how apple could be making a huge push into healthcare --> URL $ aapl
3. rt AT_USER our top 3 trade ideas for 2014 and two $ 1,000 futures wins already in the bank $ gld $ uso $ aapl - - URL
4. AT_USER $ aapl having breathing difficulties
5. emylers : cheeky's board ! : $ aapl up to date compan

In [24]:
import ollama

# Display the models known to the local ollama installation; the cell's
# last expression is rendered as the cell output.
ollama.list()

ListResponse(models=[Model(model='deepseek-r1:1.5b', modified_at=datetime.datetime(2025, 3, 19, 9, 28, 46, 170881, tzinfo=TzInfo(-04:00)), digest='a42b25d8c10a841bd24724309898ae851466696a7d7f3a0a408b895538ccbc96', size=1117322599, details=ModelDetails(parent_model='', format='gguf', family='qwen2', families=['qwen2'], parameter_size='1.8B', quantization_level='Q4_K_M')), Model(model='gemma2:latest', modified_at=datetime.datetime(2024, 11, 21, 15, 8, 38, 852496, tzinfo=TzInfo(-05:00)), digest='ff02c3702f322b9e075e9568332d96c0a7028002f1a5a056e0a6784320a4db0b', size=5443152417, details=ModelDetails(parent_model='', format='gguf', family='gemma2', families=['gemma2'], parameter_size='9.2B', quantization_level='Q4_0'))])

In [25]:
from ollama import chat
from ollama import ChatResponse

def get_ai_analysis(formatted_prompt, model='deepseek-r1:1.5b'):
    """Send a prompt to an ollama chat model and return the reply text.

    Args:
        formatted_prompt: The complete prompt, sent as a single user message.
        model: Name of the ollama model to query. Defaults to the
            ``deepseek-r1:1.5b`` model this notebook was written against.

    Returns:
        The content of the model's reply message, or the sentinel string
        ``"[Error]"`` if the chat call raised. The error is printed, not
        re-raised, so callers must check for the sentinel themselves.
    """
    try:
        response: ChatResponse = chat(
            model=model,
            messages=[{
                'role': 'user',
                'content': formatted_prompt
            }]
        )
        return response.message.content
    except Exception as e:
        # Deliberate best-effort: report the failure and hand back a sentinel
        # that contains no valid sentiment tag.
        print(f"Error using model: {str(e)}")
        return "[Error]"

# Build the prompt from the tweets formatted in the earlier single-file cell
# (`formatted_tweets` must already exist in the kernel).
final_prompt = template.format(formatted_tweets)

# Get AI analysis
# On failure get_ai_analysis returns the string "[Error]" instead of raising,
# so that string may be what gets printed here.
ai_response = get_ai_analysis(final_prompt)
print("AI Analysis Result:")
print(ai_response) 

AI Analysis Result:
<think>
Okay, so I need to figure out the overall impact of these tweets on the stock price of Apple (AAPL) for the future. Let me go through each tweet step by step.

Starting with tweet 1: It mentions that the Wall Street kicked off a new year on a lower note. Hmm, that seems neutral or maybe positive because it's about the start of the year and could indicate good opportunities coming up. But I'm not sure if that's enough to decide the overall sentiment yet.

Tweet 2 talks about how Apple is making a huge push into healthcare. That sounds like a positive sign for their health product line, which can be seen as a selling point. So, this tweet might boost the stock positively.

Tweet 3 mentions China saying they'll double smartphone sales by 2014, but doesn't cry much about investors. Maybe it's neutral because while it's a good news, the investors are just looking at other trends. But perhaps the overall impact is mixed.

Tweet 4 says that Apple has breathing diff

批量处理&保存

In [33]:
import os
import csv
from datetime import datetime, timedelta
import re


def extract_sentiment_and_analysis(ai_response):
    """Pull the final sentiment tag and the reasoning text out of a model reply.

    The sentiment is the LAST ``[Positive]`` / ``[Negative]`` / ``[Neutral]``
    tag found anywhere in the reply. Matching ignores letter case and
    whitespace padding inside the brackets, and the result is normalized to
    capitalized form, so ``[positive]`` and ``[ NEUTRAL ]`` are accepted.
    The analysis is the stripped content of the FIRST ``<think>...</think>``
    block.

    Args:
        ai_response: Full text returned by the model.

    Returns:
        A ``(sentiment, analysis)`` tuple. ``sentiment`` is ``"Positive"``,
        ``"Negative"``, ``"Neutral"``, or ``"Unknown"`` when no tag is found.
        ``analysis`` is the reasoning text, or ``"No analysis provided"``
        when there is no closed ``<think>`` block.
    """
    # Extract the final sentiment; the last tag wins because the prompt asks
    # the model to conclude with the tag.
    sentiment_matches = re.findall(
        r'\[\s*(Positive|Negative|Neutral)\s*\]', ai_response, re.IGNORECASE
    )
    if sentiment_matches:
        # Normalize case so downstream consumers see one spelling per label.
        sentiment = sentiment_matches[-1].capitalize()
    else:
        sentiment = "Unknown"  # default when no tag is present

    # Extract the reasoning; DOTALL lets the block span multiple lines.
    analysis_matches = re.findall(r'<think>(.*?)</think>', ai_response, re.DOTALL)
    if analysis_matches:
        analysis = analysis_matches[0].strip()
    else:
        analysis = "No analysis provided"  # default when no closed block exists

    return sentiment, analysis


def process_date_range(start_date, end_date, symbol, max_attempts=5):
    """Run sentiment analysis for every day in a date range and save two CSVs.

    For each calendar day from ``start_date`` to ``end_date`` (inclusive) the
    file ``tweet/preprocessed/<symbol>/<YYYY-MM-DD>`` is looked up. If it
    exists, its tweets are rendered into the prompt template and sent to the
    model, and the sentiment and analysis are extracted from the reply.

    Two files are written under ``sentiment/``:

    * ``raw_sentiment_analysis_<symbol>_<start>_<end>.csv`` holds every raw
      model reply, one row per attempt.
    * ``processed_sentiment_analysis_<symbol>_<start>_<end>.csv`` holds
      exactly one row per day: the extracted result, ``"Missing"`` when the
      tweet file does not exist, or ``"Error"`` when extraction raised.

    The processed file doubles as the "already done" marker: if it exists
    the symbol is skipped. It is therefore written under a temporary
    ``.partial`` name and only moved into place after the whole range has
    finished, so an interrupted run is redone rather than skipped.

    Args:
        start_date: First day to process (``datetime``).
        end_date: Last day to process, inclusive (``datetime``).
        symbol: Stock symbol; also the sub-directory name of the tweet files.
        max_attempts: Maximum number of model calls per day before giving up
            and recording whatever was extracted last (possibly
            ``"Unknown"``). ``None`` retries without limit.

    Returns:
        None.
    """
    sentiment_dir = os.path.join('sentiment')
    os.makedirs(sentiment_dir, exist_ok=True)

    date_span = f'{symbol}_{start_date.strftime("%Y%m%d")}_{end_date.strftime("%Y%m%d")}'

    # CSV file receiving the raw model replies
    raw_results_file = os.path.join(sentiment_dir, f'raw_sentiment_analysis_{date_span}.csv')

    # CSV file receiving the extracted results, plus its in-progress name
    processed_results_file = os.path.join(sentiment_dir, f'processed_sentiment_analysis_{date_span}.csv')
    partial_results_file = processed_results_file + '.partial'

    # Skip the symbol when a completed processed-results file already exists
    if os.path.exists(processed_results_file):
        print(f"File {processed_results_file} already exist, skip {symbol}")
        return

    with open(raw_results_file, 'w', newline='', encoding='utf-8') as raw_csvfile, \
            open(partial_results_file, 'w', newline='', encoding='utf-8') as processed_csvfile:
        raw_writer = csv.writer(raw_csvfile)
        raw_writer.writerow(['Date', 'Raw_Analysis'])

        processed_writer = csv.writer(processed_csvfile)
        processed_writer.writerow(['Date', 'Sentiment', 'Analysis'])

        current_date = start_date
        while current_date <= end_date:
            date_str = current_date.strftime('%Y-%m-%d')
            filename = f'tweet/preprocessed/{symbol}/{date_str}'

            if os.path.exists(filename):
                # Render the day's tweets into the prompt
                tweets_content = process_tweet_file(filename)
                formatted_tweets = '\n'.join(tweets_content)
                formatted_content = template.format(formatted_tweets)

                # Initialize variables
                sentiment, analysis = "Unknown", "No analysis provided"
                attempts = 0

                # Retry until both values are extracted or the attempt budget
                # is exhausted; without the bound, a model that keeps failing
                # would loop forever.
                while sentiment == "Unknown" or analysis == "No analysis provided":
                    if max_attempts is not None and attempts >= max_attempts:
                        print(f"Giving up on {date_str} after {attempts} attempts.")
                        break
                    attempts += 1

                    ai_analysis = get_ai_analysis(formatted_content)

                    # Save the raw reply first so it is kept even if extraction fails
                    raw_writer.writerow([date_str, ai_analysis])

                    try:
                        # Extract sentiment and analysis
                        sentiment, analysis = extract_sentiment_and_analysis(ai_analysis)
                    except Exception as e:
                        # Record the failure as this day's single result row
                        print(f"Error processing {date_str}: {str(e)}")
                        sentiment, analysis = "Error", f"Error extracting data: {str(e)}"
                        break  # Exit loop on error

                    if sentiment == "Unknown" or analysis == "No analysis provided":
                        print(f"Failed to extract valid sentiment and analysis for {date_str}. Retrying...")

                # Exactly one processed row per day
                processed_writer.writerow([date_str, sentiment, analysis])
                print(f"Finished processing: {date_str}, Sentiment: {sentiment}")
            else:
                print(f"Not exist: {date_str}")
                processed_writer.writerow([date_str, "Missing", "File not found"])

            current_date += timedelta(days=1)

    # Publish the processed file only now that the full range is done, so its
    # existence reliably means "complete".
    os.replace(partial_results_file, processed_results_file)

    print(f"Raw results saved to: {raw_results_file}")
    print(f"Processed results saved to: {processed_results_file}")

# Set the date range (both ends inclusive) used by the batch run below.
start_date = datetime(2014, 1, 1)
end_date = datetime(2015, 12, 31)
# Shorter range for a quick trial run:
# end_date = datetime(2014, 1, 5)

# Single-symbol invocation, left disabled:
# symbol = 'AAPL'
# 
# process_date_range(start_date, end_date, symbol)

In [34]:
def get_all_symbols():
    """List the stock symbols available under ``tweet/preprocessed``.

    Every non-hidden sub-directory of ``tweet/preprocessed`` (relative to the
    current working directory) is treated as one symbol. When that directory
    is missing, a warning is printed, followed by the contents of ``tweet``
    if it exists, as a debugging aid.

    Returns:
        A list of sub-directory names in the order ``os.listdir`` yields
        them; empty when the directory does not exist.
    """
    preprocessed_path = os.path.join('tweet', 'preprocessed')

    # Guard clause: nothing to scan, so report and return an empty list.
    if not (os.path.exists(preprocessed_path) and os.path.isdir(preprocessed_path)):
        print(f"警告: 路径 {preprocessed_path} 不存在或不是文件夹")
        # Show what the parent folder holds to help diagnose a wrong path.
        tweet_path = 'tweet'
        if os.path.exists(tweet_path) and os.path.isdir(tweet_path):
            print(f"tweet文件夹内容: {os.listdir(tweet_path)}")
        return []

    # Keep only real directories whose names do not start with a dot.
    found = [
        entry
        for entry in os.listdir(preprocessed_path)
        if os.path.isdir(os.path.join(preprocessed_path, entry))
        and not entry.startswith('.')
    ]
    print(f"在 {preprocessed_path} 中找到 {len(found)} 个股票代码目录")
    return found

# Discover every symbol directory, then run the full date range for each.
symbols = get_all_symbols()

for symbol in symbols:
    print(f"正在处理 {symbol}...")
    try:
        process_date_range(start_date, end_date, symbol)
        print(f"{symbol} 处理完成")
    except Exception as e:
        # One failing symbol must not stop the batch: report it and move on.
        # KeyboardInterrupt is not an Exception subclass, so interrupting
        # the kernel still stops the whole loop.
        print(f"处理 {symbol} 时出错: {str(e)}")

print("所有股票代码处理完成")

在 tweet\preprocessed 中找到 87 个股票代码目录
正在处理 AAPL...
File sentiment\processed_sentiment_analysis_AAPL_20140101_20151231.csv already exist, skip AAPL
AAPL 处理完成
正在处理 ABB...
File sentiment\processed_sentiment_analysis_ABB_20140101_20151231.csv already exist, skip ABB
ABB 处理完成
正在处理 ABBV...
File sentiment\processed_sentiment_analysis_ABBV_20140101_20151231.csv already exist, skip ABBV
ABBV 处理完成
正在处理 AEP...
File sentiment\processed_sentiment_analysis_AEP_20140101_20151231.csv already exist, skip AEP
AEP 处理完成
正在处理 AGFS...
File sentiment\processed_sentiment_analysis_AGFS_20140101_20151231.csv already exist, skip AGFS
AGFS 处理完成
正在处理 AMGN...
File sentiment\processed_sentiment_analysis_AMGN_20140101_20151231.csv already exist, skip AMGN
AMGN 处理完成
正在处理 AMZN...
File sentiment\processed_sentiment_analysis_AMZN_20140101_20151231.csv already exist, skip AMZN
AMZN 处理完成
正在处理 BA...
File sentiment\processed_sentiment_analysis_BA_20140101_20151231.csv already exist, skip BA
BA 处理完成
正在处理 BABA...
File sentiment\

KeyboardInterrupt: 