In [None]:
import yfinance as yf
import pandas as pd
import numpy as np

In [18]:
df = pd.read_csv("/Users/francis/Desktop/Github/elon-musk-sentiment-stock/data/musk_quote_with_sentiment.csv")
df['musk_quote_created_at'] = pd.to_datetime(df['musk_quote_created_at'])
start_date = df["musk_quote_created_at"].min().date().isoformat()
end_date = df["musk_quote_created_at"].max().date().isoformat()
ticker = "TSLA"


In [21]:
tsla_df = yf.download(ticker, start=start_date, end=end_date)
tsla_df = tsla_df.reset_index()
tsla_df["Date"] = pd.to_datetime(tsla_df["Date"])
print(tsla_df["Date"])

[*********************100%***********************]  1 of 1 completed

0      2015-06-29
1      2015-06-30
2      2015-07-01
3      2015-07-02
4      2015-07-06
          ...    
2458   2025-04-07
2459   2025-04-08
2460   2025-04-09
2461   2025-04-10
2462   2025-04-11
Name: Date, Length: 2463, dtype: datetime64[ns]





In [22]:
df = df.reset_index(drop=True)
tsla_df = tsla_df.reset_index(drop=True)

# Convert multi-level to single-level columns
if tsla_df.columns.nlevels > 1:
    tsla_df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in tsla_df.columns]

print(tsla_df.columns)

Index(['Date_', 'Close_TSLA', 'High_TSLA', 'Low_TSLA', 'Open_TSLA',
       'Volume_TSLA'],
      dtype='object')


In [None]:
tsla_df["intraday_volatility"] = (tsla_df["High_TSLA"] - tsla_df["Low_TSLA"]) / tsla_df["Open_TSLA"]
tsla_df["Date_"] = pd.to_datetime(tsla_df["Date_"])
tsla_df["volatility_label"] = (tsla_df["intraday_volatility"] > 0.03).astype(int)

tsla_df[['Date_', 'Open_TSLA', 'High_TSLA','Close_TSLA', 'Low_TSLA', 'intraday_volatility', 'volatility_label']].to_csv("/Users/francis/Desktop/Github/elon-musk-sentiment-stock/data/tsla_intraday_volatility.csv", index=False)

In [26]:
# Load tweet + sentiment dataset
tweet_df = pd.read_csv("/Users/francis/Desktop/Github/elon-musk-sentiment-stock/data/musk_tweets_with_tsla_return.csv")
tweet_df['musk_quote_created_at'] = pd.to_datetime(tweet_df['musk_quote_created_at'])

# Add tweet date for merge
tweet_df['tweet_date'] = tweet_df['musk_quote_created_at'].dt.date
tweet_df['tweet_date'] = pd.to_datetime(tweet_df['tweet_date'])
# Load TSLA volatility data (from above)
tsla_vol = pd.read_csv("/Users/francis/Desktop/Github/elon-musk-sentiment-stock/data/tsla_intraday_volatility.csv")
tsla_vol['Date_'] = pd.to_datetime(tsla_vol['Date_'])

# Merge on date
df = pd.merge(tweet_df, tsla_vol[['Date_', 'intraday_volatility', 'volatility_label']],
              left_on='tweet_date', right_on='Date_', how='left')


In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline


In [32]:
# Load Twitter-RoBERTa model and tokenizer
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create a pipeline for sentiment
twitter_sentiment = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

def classify_roberta_sentiment(text):
    try:
        result = twitter_sentiment(text[:512])[0]  # BERT limit
        label_map = {'LABEL_0': 'negative', 'LABEL_1': 'neutral', 'LABEL_2': 'positive'}
        score_map = {'LABEL_0': -1, 'LABEL_1': 0, 'LABEL_2': 1}
        label = label_map[result['label']]
        score = score_map[result['label']] * result['score']  # Score weighted by direction
        return pd.Series([label, score])
    except:
        return pd.Series([None, None])

# Apply to your dataset
df[['roberta_label', 'roberta_score']] = df['musk_quote_tweet'].apply(classify_roberta_sentiment)



Device set to use mps:0


In [37]:
print(f"Number of NaN values in target: {df['volatility_label'].isna().sum()}")
print(f"Percentage of NaN values: {df['volatility_label'].isna().mean() * 100:.2f}%")

Number of NaN values in target: 2048
Percentage of NaN values: 28.16%


In [43]:
df = df.dropna(subset=['volatility_label'])
features = [
    'roberta_score',
    'vader_compound',
    'musk_quote_like_count',
    'musk_quote_retweet_count',
    'musk_quote_quote_count',
    'musk_quote_view_count'
]
X = df[features]
y = df['volatility_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Volatility Model Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Volatility Model Accuracy: 0.8535885167464115
Classification Report:
               precision    recall  f1-score   support

         0.0       0.29      0.08      0.13       137
         1.0       0.87      0.97      0.92       908

    accuracy                           0.85      1045
   macro avg       0.58      0.53      0.52      1045
weighted avg       0.80      0.85      0.82      1045

Confusion Matrix:
 [[ 11 126]
 [ 27 881]]


In [46]:
from sklearn.utils import resample

df_major = df[df['volatility_label'] == 1]
df_minor = df[df['volatility_label'] == 0]

df_major_downsampled = resample(df_major, replace=False, n_samples=len(df_minor), random_state=42)
df_balanced = pd.concat([df_major_downsampled, df_minor])

X = df_balanced[features]
y = df_balanced['volatility_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Balanced Volatility Model Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

print(df_balanced['volatility_label'].value_counts(normalize=True))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Balanced Volatility Model Accuracy: 0.5407166123778502
Classification Report:
               precision    recall  f1-score   support

         0.0       0.55      0.56      0.56       156
         1.0       0.53      0.52      0.53       151

    accuracy                           0.54       307
   macro avg       0.54      0.54      0.54       307
weighted avg       0.54      0.54      0.54       307

Confusion Matrix:
 [[88 68]
 [73 78]]
volatility_label
1.0    0.5
0.0    0.5
Name: proportion, dtype: float64
