# Tesla Stock Market prediction based on Elon Musk Tweet Interaction

In [None]:
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

import numpy as np
import seaborn as sns

In [None]:
# Load both raw inputs from CSV: the Tesla stock file and the Elon Musk tweets file.
tesla_stock_data, elon_tweets_data = (
    pd.read_csv(csv_path) for csv_path in ("TSLA.csv", "TweetsElonMusk.csv")
)

## Describing Datasets

### Tesla Data

In [None]:
# Summary statistics of the Tesla stock data (shown as the cell output).
tesla_stock_data.describe()

### Elon Musk Data

In [None]:
# Summary statistics of the tweet data (shown as the cell output).
elon_tweets_data.describe()

## Combine Datasets and Refine

In [None]:
# Keep only the columns used later: engagement counts plus the date from the
# tweets, and the daily High/Low plus the date from the stock data.
tweet_columns = ['replies_count', 'retweets_count', 'likes_count', 'date']
stock_columns = ['High', 'Low', 'Date']

refined_elon = DataFrame(elon_tweets_data, columns=tweet_columns)
# Rename 'Date' -> 'date' so both frames share the same join key name.
refined_tesla_stock = DataFrame(tesla_stock_data, columns=stock_columns).rename(
    columns={'Date': 'date'}
)

In [None]:
# Date-ordered copies of both frames (ascending is the sort_values default).
# NOTE(review): these sorted frames are not referenced again in the visible
# notebook; the merge uses refined_elon / refined_tesla_stock directly.
sorted_elon = refined_elon.sort_values('date')
sorted_tesla = refined_tesla_stock.sort_values('date')

#### Merged dataframes

In [None]:
# Inner-join tweets with stock rows on 'date': only dates present in both remain.
# NOTE(review): the join matches 'date' values exactly, so it assumes both
# files use the same date format — confirm.
data_merged = refined_elon.merge(refined_tesla_stock, on='date', how='inner')
# Add the average-price column: the midpoint of High and Low.
data_merged['average_stock'] = data_merged['High'].add(data_merged['Low']).div(2)

data_merged.columns

## Data clean and Analysis

### Null values

In [None]:
# Write the merged table to data.csv (without the index), then report for each
# column whether it contains any missing value.
data_merged.to_csv('data.csv', index=False)
data_merged.isnull().any()

Data has no null values

### Min values each column

In [None]:
# Smallest value found in each column of the merged table.
data_merged.min()

### Max values each column

In [None]:
# Largest value found in each column of the merged table.
data_merged.max()

### Mean(average) values each column

In [None]:
# Column means; numeric_only=True restricts the result to numeric columns.
data_merged.mean(numeric_only=True)

## Data Info

In [None]:
# Print a concise summary of the merged DataFrame.
data_merged.info()

## Dependent and Independent variables

In [None]:
# Independent variables: the three tweet-engagement counts.
feature_columns = ['replies_count', 'retweets_count', 'likes_count']
x = DataFrame(data_merged, columns=feature_columns)

# Dependent variable: the average stock price, kept as a one-column DataFrame.
# It is used on its original scale; no log transform is applied at this point.
y = DataFrame(data_merged, columns=['average_stock'])

### Test Splits and Model Training

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
split = train_test_split(x, y, test_size=0.2, random_state=24)
x_train, x_test, y_train, y_test = split

# Fit a plain linear regression on the training portion.
regr = LinearRegression()
regr.fit(x_train, y_train)

In [None]:
# Print the fitted intercept, then display one coefficient per feature,
# labelled with the training column names.
print("Intercept", regr.intercept_)
coef_data = regr.coef_.reshape(-1, 1)
pd.DataFrame({'coef': coef_data[:, 0]}, index=x_train.columns)

In [None]:
# Evaluates e raised to the power 0.00004.
# NOTE(review): presumably meant to undo a log transform of a coefficient, but
# the target is not log-transformed before the regression is fit — confirm.
np.e**0.000040

From the coefficients above, the reply count has the largest effect on the stock price, although that effect is still very small.

### Model Accuracy with R-Squared

In [None]:
# R-squared (coefficient of determination) on the data the model was fit on
# and on the held-out split; a large gap between the two would point to
# overfitting.
print('Training data r-squared:', regr.score(x_train, y_train))
print('Test data r-squared:', regr.score(x_test, y_test))

In [None]:
# From the R-squared results we can see that our model is not very accurate.

## Test Non Accurate Model

In [None]:
# Engagement numbers for one hypothetical tweet; the keys match the feature
# columns the model was trained on.
input_tweet = {
    'replies_count': 107,
    'retweets_count': 60,
    'likes_count': 2000,
}

# Single-row DataFrame with the same column names as the training features.
input_df = pd.DataFrame(input_tweet, index=[0])

# Use the model to make a prediction.
predicted_stock = regr.predict(input_df)

# regr was fit on a one-column DataFrame target, so predict() returns a 2-D
# array; flatten it so a plain number is printed rather than a one-element array.
print(f'Predicted Average Stock: {np.ravel(predicted_stock)[0]}')

## Refine Model and Data

### Test Data For Correlations


In [None]:
# Distribution of the average stock price across the merged rows.
# The axis labels describe this dataset (the previous ones referred to house
# prices and were left over from another analysis).
plt.figure(figsize=(10, 6))
plt.hist(data_merged['average_stock'], bins=50, ec='black', color='gray')
plt.xlabel('Average stock price')
plt.ylabel('Number of observations')
plt.show()

In [None]:


# Scatter of likes per tweet against the average stock price; the low alpha
# makes overlapping points easier to see.
_, likes_axes = plt.subplots(figsize=(10, 6))
likes_axes.scatter(x["likes_count"], y, alpha=0.3)
likes_axes.set_title('Likes vs Stocks')
plt.show()


In [None]:
# Natural log of the average stock price, used below to compare skewness and
# to plot against the engagement counts.
y_log = np.log(data_merged["average_stock"])

In [None]:
# Skewness of the log-transformed target.
y_log.skew()

In [None]:
# Skewness of the untransformed target, for comparison with the log version.
y.skew()

In [None]:
# Distribution of the log-transformed average stock price.
# The axis labels describe this dataset (the previous ones referred to house
# prices and were left over from another analysis).
plt.figure(figsize=(10, 6))
plt.hist(y_log, bins=50, ec='black', color='gray')
plt.xlabel('Log of average stock price')
plt.ylabel('Number of observations')
plt.show()

In [None]:
# Scatter plot with a fitted regression line: likes per tweet vs. average stock price.
sns.lmplot(
    data=data_merged,
    x="likes_count",
    y="average_stock",
    scatter_kws={'alpha': 0.6},
    line_kws={'color': 'darkred'},
)
plt.show()

In [None]:
# Work on a copy: a plain `transformed_data = x` only creates a second name for
# the same DataFrame, so adding the log column would also add it to the feature
# frame `x` and leak the target into the features if the split/fit cells are
# re-run.
transformed_data = x.copy()
transformed_data['LOG_Average_Stock'] = y_log

# Scatter plot with a fitted regression line: replies per tweet vs. log of the
# average stock price.
sns.lmplot(x="replies_count", y="LOG_Average_Stock", data=transformed_data, scatter_kws={'alpha': 0.6}, line_kws={'color': 'darkred'})
plt.show()

## Hyperparameters RandomizedSearchCV

 Random Search randomly samples combinations of hyperparameters, which can save time and computational resources. However, whether Random Search is better than Grid Search depends on your specific problem and dataset.

In [None]:
# Candidate hyperparameters for Ridge: 100 log-spaced alpha values between
# 1e-3 and 1e3, each with and without an intercept (100 x 2 = 200 combinations).
param_dist = {
    'alpha': np.logspace(-3, 3, 100),
    'fit_intercept': [True, False],
}

ridge = Ridge()
# 5-fold cross-validated search scored by negative mean squared error;
# n_jobs=-1 uses all CPU cores.
# NOTE(review): n_iter equals the number of combinations above, so the search
# presumably ends up evaluating every candidate — confirm this is intended.
random_search = RandomizedSearchCV(
    ridge,
    param_distributions=param_dist,
    n_iter=200,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

random_search.fit(x_train, y_train)


Alpha (Regularization Strength):

Analogy: Think of alpha as a knob on a stereo system to control the volume. If you turn it to the left (low values of alpha), the volume (the influence of regularization) is low, meaning the model can capture more details, potentially overfitting. If you turn it to the right (high values of alpha), the volume (the influence of regularization) is high, meaning the model simplifies and smooths out its predictions to avoid overfitting.

Fit Intercept:

Analogy: Imagine you are fitting a line through data points. If 'fit_intercept' is 'True,' it's like allowing the line to move up or down to best fit the data, similar to adjusting the line's intercept. If 'fit_intercept' is 'False,' it's like forcing the line to go through the origin (0,0) and not allowing it to shift up or down.

Normalize:

Analogy: Consider a recipe that requires you to add salt to a dish. If 'normalize' is 'True,' it's like ensuring the amount of salt you add is proportional to the other ingredients' quantities, maintaining a balance. If 'normalize' is 'False,' it means you add a fixed amount of salt, regardless of the other ingredient quantities. Normalizing helps maintain consistent scaling of the features, which can be important for some algorithms. Note that 'normalize' is not part of the search above: only 'alpha' and 'fit_intercept' are tuned, and recent scikit-learn releases (1.2 and later) no longer accept a 'normalize' argument for Ridge, so features should be scaled beforehand (for example with StandardScaler) if scaling is needed.

In [None]:
# Pull the winning estimator and its hyperparameters out of the finished search.
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Predict on the held-out test features with the best model.
y_pred = best_model.predict(x_test)

In [None]:
# Print the chosen hyperparameters, the fitted estimator and its test-set predictions.
for label, value in (
    ("best params: ", best_params),
    ("best model: ", best_model),
    ("y prediction: ", y_pred),
):
    print(label, value)


### Model Evaluation

In [None]:
# Score the tuned model's test-set predictions with three standard metrics.
mae, mse, r2_test = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Print evaluation metrics
for label, value in (('MAE', mae), ('MSE', mse), ('R-squared_Test', r2_test)):
    print(f'{label}: {value}')

# Model interpretation: print the raw coefficients, then display them as a
# one-column table labelled with the feature names.
coefficients = best_model.coef_
print('Coefficients:', coefficients)
new = coefficients.reshape(-1, 1)
pd.DataFrame(data=new, index=x_train.columns, columns=['coef'])