In [63]:
import pandas as pd
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [28]:
# Read the pre-processed evaluation tweets for both tickers (AAPL first, then TSLA).
aapl_tweet_df, tsla_tweet_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/preprocessed-evaluation-tweets/aapl.csv',
        'datasets/preprocessed-evaluation-tweets/tsla.csv',
    )
)

                       created_at  favorite_count  retweet_count  view_count  \
0  Mon Jan 01 20:48:37 +0000 2024              50             13        7669   
1  Mon Jan 01 12:48:10 +0000 2024               8              2        1790   
2  Mon Jan 01 23:24:59 +0000 2024             100             22        6916   
3  Mon Jan 01 20:13:15 +0000 2024               1              0         252   
4  Mon Jan 01 16:12:30 +0000 2024              27              7        1772   

                                                text  
1  Happy New Year, Traders! Best of luck 2024 $AA...  
2  More than 42 million transactions were complet...  
3          LETS GET LITTLE BIT ROMANTIC! $AAPL [URL]  
4   :police_car_light: THE END, IS THE BEGINNING ...  
                       created_at  favorite_count  retweet_count  view_count  \
0  Mon Jan 01 22:11:21 +0000 2024             208             38       91878   
1  Mon Jan 01 22:16:20 +0000 2024              15              1         555   
2  M

In [29]:
# Read the daily price history for both tickers (AAPL first, then TSLA).
aapl_price_df, tsla_price_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'stock_prices/AAPL_prices_2022-01-01_to_2024-04-11.csv',
        'stock_prices/TSLA_prices_2022-01-01_to_2024-04-11.csv',
    )
)

                        Date        Open        High         Low       Close  \
0  2022-01-03 00:00:00-05:00  175.597058  180.583650  175.478569  179.724564   
1  2022-01-04 00:00:00-05:00  180.336774  180.642879  176.870839  177.443558   
2  2022-01-05 00:00:00-05:00  177.354699  177.907665  172.447104  172.723587   
3  2022-01-06 00:00:00-05:00  170.531463  173.098822  169.484776  169.840256   
4  2022-01-07 00:00:00-05:00  170.719078  171.953382  168.882433  170.008118   

      Volume  Dividends  Stock Splits  
0  104487900        0.0           0.0  
1   99310400        0.0           0.0  
2   94537600        0.0           0.0  
3   96904000        0.0           0.0  
4   86709100        0.0           0.0  
                        Date        Open        High         Low       Close  \
0  2022-01-03 00:00:00-05:00  382.583344  400.356659  378.679993  399.926666   
1  2022-01-04 00:00:00-05:00  396.516663  402.666656  374.350006  383.196655   
2  2022-01-05 00:00:00-05:00  382.21667

In [32]:
# Example of date pre-processing for tweet:
# turn the first TSLA tweet's 'created_at' string into a 'YYYY-MM-DD' string.

tweet_date_string = tsla_tweet_df['created_at'][0]

# The input looks like "Mon Jan 01 22:11:21 +0000 2024": parse it into a
# timezone-aware datetime, then keep only the calendar date.
date_obj = datetime.strptime(tweet_date_string, "%a %b %d %H:%M:%S %z %Y")
formatted_date = date_obj.strftime("%Y-%m-%d")

# Show the value and its type, one per line.
print(formatted_date, type(formatted_date), sep='\n')

2024-01-01
<class 'str'>


In [38]:
# Example of date pre-processing for stock price:
# the first 10 characters of the 'Date' string are the 'YYYY-MM-DD' part.

price_date_string = tsla_price_df['Date'][0][:10]

# Show the value and its type, one per line.
print(price_date_string, type(price_date_string), sep='\n')

2022-01-03
<class 'str'>


In [95]:
# Evaluation for Tesla
#
# For each calendar day in the window that also appears in the price data, the
# sign of the mean tweet sentiment (prediction) is compared with whether the
# next price row closes higher than that day's close (label).
# NOTE(review): range(1, 31) stops at 2024-01-30, so 31 January is never
# evaluated -- confirm that this is intended.
# NOTE(review): tweet timestamps carry a +0000 offset and price dates a -05:00
# offset; each is matched on its own calendar date without conversion.
y_true = []
y_pred = []

# Index the price rows once: calendar day -> row position, plus the closes.
tsla_closes = tsla_price_df['Close'].tolist()
tsla_row_of_day = {}
for position, Date in enumerate(tsla_price_df['Date']):
    tsla_row_of_day.setdefault(Date[:10], position)

# Parse every tweet timestamp once (instead of once per evaluated day) and
# group the tweet texts by calendar day, keeping their original order.
tsla_texts_by_day = {}
for created_at, text in zip(tsla_tweet_df['created_at'], tsla_tweet_df['text']):
    tweet_day = datetime.strptime(created_at, "%a %b %d %H:%M:%S %z %Y").strftime("%Y-%m-%d")
    tsla_texts_by_day.setdefault(tweet_day, []).append(text)

for i in range(1, 31):
    date_string = f'2024-01-{i:02d}'

    # check if prices fall or rise
    # Days missing from the price data are skipped, and so is the last price
    # row, which has no following close to compare against.
    position = tsla_row_of_day.get(date_string)
    if position is None or position + 1 >= len(tsla_closes):
        continue
    price_direction = 1 if tsla_closes[position + 1] > tsla_closes[position] else -1

    # ---------------------------------------- Need to change the part below to utilise your model
    # calculate sentiment on the day
    tweet_sentiments = []
    for text in tsla_texts_by_day.get(date_string, []):
        compound, sentiment, scores = analyze_sentiment(text) # need to produce output from your model
        tweet_sentiments.append(compound) # adds output to a list to find the average sentiment of the day

    # A day without tweets yields no prediction; skipping it avoids a division
    # by zero and keeps y_true / y_pred aligned.
    if not tweet_sentiments:
        continue

    # find average sentiment of the day, can change sentiment threshold
    average_sentiment = round(sum(tweet_sentiments) / len(tweet_sentiments), 3)
    if average_sentiment > 0.05:
        predicted_direction = 1
    elif average_sentiment < -0.05:
        predicted_direction = -1
    else:
        # Neutral day: excluded from the evaluation altogether.
        continue

    # Label and prediction are recorded together so the lists cannot drift apart.
    y_true.append(price_direction)
    y_pred.append(predicted_direction)

print('Tesla')
print(f'y_true: {y_true}')
print(f'y_pred: {y_pred}')

tsla_f1 = f1_score(y_true, y_pred, average='macro')
tsla_accuracy = accuracy_score(y_true, y_pred)


print(f'f1: {tsla_f1}')
print(f'accuracy: {tsla_accuracy}')



Tesla
y_true: [-1, -1, -1, -1, -1, 1, 1, -1, -1, 1, 1, -1]
y_pred: [1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1]
f1: 0.48571428571428577
accuracy: 0.5


In [96]:
# Evaluation for Apple
#
# For each calendar day in the window that also appears in the price data, the
# sign of the mean tweet sentiment (prediction) is compared with whether the
# next price row closes higher than that day's close (label).
# NOTE(review): range(1, 31) stops at 2024-01-30, so 31 January is never
# evaluated -- confirm that this is intended.
# NOTE(review): tweet timestamps carry a +0000 offset and price dates a -05:00
# offset; each is matched on its own calendar date without conversion.
y_true = []
y_pred = []

# Index the price rows once: calendar day -> row position, plus the closes.
aapl_closes = aapl_price_df['Close'].tolist()
aapl_row_of_day = {}
for position, Date in enumerate(aapl_price_df['Date']):
    aapl_row_of_day.setdefault(Date[:10], position)

# Parse every tweet timestamp once (instead of once per evaluated day) and
# group the tweet texts by calendar day, keeping their original order.
aapl_texts_by_day = {}
for created_at, text in zip(aapl_tweet_df['created_at'], aapl_tweet_df['text']):
    tweet_day = datetime.strptime(created_at, "%a %b %d %H:%M:%S %z %Y").strftime("%Y-%m-%d")
    aapl_texts_by_day.setdefault(tweet_day, []).append(text)

for i in range(1, 31):
    date_string = f'2024-01-{i:02d}'

    # check if prices fall or rise
    # Days missing from the price data are skipped, and so is the last price
    # row, which has no following close to compare against.
    position = aapl_row_of_day.get(date_string)
    if position is None or position + 1 >= len(aapl_closes):
        continue
    price_direction = 1 if aapl_closes[position + 1] > aapl_closes[position] else -1

    # ---------------------------------------- Need to change the part below to utilise your model
    # calculate sentiment on the day
    tweet_sentiments = []
    for text in aapl_texts_by_day.get(date_string, []):
        compound, sentiment, scores = analyze_sentiment(text) # need to produce output from your model
        tweet_sentiments.append(compound) # adds output to a list to find the average sentiment of the day

    # A day without tweets yields no prediction; skipping it avoids a division
    # by zero and keeps y_true / y_pred aligned.
    if not tweet_sentiments:
        continue

    # find average sentiment of the day, can change sentiment threshold
    average_sentiment = round(sum(tweet_sentiments) / len(tweet_sentiments), 3)
    if average_sentiment > 0.05:
        predicted_direction = 1
    elif average_sentiment < -0.05:
        predicted_direction = -1
    else:
        # Neutral day: excluded from the evaluation altogether.
        continue

    # Label and prediction are recorded together so the lists cannot drift apart.
    y_true.append(price_direction)
    y_pred.append(predicted_direction)

print('Apple')
print(f'y_true: {y_true}')
print(f'y_pred: {y_pred}')

aapl_f1 = f1_score(y_true, y_pred, average='macro')
aapl_accuracy = accuracy_score(y_true, y_pred)


print(f'f1: {aapl_f1}')
print(f'accuracy: {aapl_accuracy}')

Apple
y_true: [-1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1]
y_pred: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
f1: 0.2727272727272727
accuracy: 0.375


In [98]:
# Unweighted mean of the per-ticker metrics (Apple and Tesla count equally,
# regardless of how many days each contributed).
average_f1 = (aapl_f1 + tsla_f1) / 2
average_accuracy = (aapl_accuracy + tsla_accuracy) / 2

print(f'Average f1: {average_f1}')
print(f'Average accuracy: {average_accuracy}')

Average f1: 0.37922077922077924
Average accuracy: 0.4375
