<a href="https://colab.research.google.com/github/danielbehargithub/MuskTweets-Impact-on-TeslaStock/blob/main/tweets_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence_transformers

In [None]:
import pandas as pd
from textblob import TextBlob
from sentence_transformers import SentenceTransformer, util
import numpy as np
import time

In [None]:
import pandas as pd
from textblob import TextBlob
from sentence_transformers import SentenceTransformer, util
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import time

# Load the pre-trained sentence-embedding model used for the similarity scores
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load your tweets data
tweets_df = pd.read_csv('/content/merged_df.csv')  # Adjust the path to your CSV file

# Select the first 10000 tweets for testing
# tweets_df = tweets_df.head(10000)

# Retain only relevant columns
tweets_df = tweets_df[['index', 'text']]

# Empty CSV cells are read back as NaN (a float). Normalise them to empty
# strings so the string operations below do not fail on such rows.
tweets_df['text'] = tweets_df['text'].fillna('').astype(str)

# Define keyword categories
primary_keywords = ['Tesla', 'TSLA', 'Elon Musk', 'Model S', 'Model 3', 'Model X',
                    'Model Y', 'Cybertruck', 'Roadster', 'Semi', 'Autopilot',
                    'Full Self-Driving', 'Gigafactory', 'Solar Roof', 'Powerwall', 'Starlink']

secondary_keywords = ['@Tesla', 'Electric Vehicle', 'Battery', 'Lithium-ion', 'Renewable Energy',
                      'Autonomous Driving', 'Clean Energy', 'Sustainable Energy',
                      'Energy Storage', 'Solar Energy', 'Charging Infrastructure',
                      'Supercharger', 'AI', 'Machine Learning', 'Autonomous Vehicles',
                      'Green Technology', 'Smart Grid']

financial_keywords = ['Stock', 'NASDAQ', 'Earnings', 'Share', 'Dividend', 'Market Cap',
                      'Valuation', 'Quarterly Report', 'Profit', 'Revenue', 'EPS',
                      'Buyback', 'Stock Split', 'Shareholders', 'Financial Results',
                      'Forecast', 'Guidance', 'Analyst Rating', 'Price Target',
                      'Upgrade', 'Downgrade', 'SEC Filing', 'IPO', 'Mergers and Acquisitions',
                      'Investment', 'Contract']


def _count_keyword_hits(text, lowered_keywords):
    """Return how many of *lowered_keywords* occur in *text*, ignoring case.

    Args:
        text: The tweet text to search.
        lowered_keywords: Keywords that have already been lower-cased.

    Returns:
        The number of keywords found as substrings of the lower-cased text.
    """
    # Lower-case the tweet once instead of once per keyword.
    lowered_text = text.lower()
    return sum(kw in lowered_text for kw in lowered_keywords)


# Lower-case each keyword list once, outside the per-tweet work.
# NOTE(review): matching is plain substring containment, so short keywords such
# as 'AI' also match inside longer words (e.g. "said"). Confirm whether
# whole-word matching is wanted before relying on these counts.
primary_lowered = [kw.lower() for kw in primary_keywords]
secondary_lowered = [kw.lower() for kw in secondary_keywords]
financial_lowered = [kw.lower() for kw in financial_keywords]

# Keyword matching scores
tweets_df['primary_keyword_score'] = tweets_df['text'].apply(_count_keyword_hits, args=(primary_lowered,))
tweets_df['secondary_keyword_score'] = tweets_df['text'].apply(_count_keyword_hits, args=(secondary_lowered,))

# Financial keywords only count for tweets that matched at least one primary or
# secondary keyword; every other tweet gets a financial score of 0.
has_tesla_context = (tweets_df['primary_keyword_score'] > 0) | (tweets_df['secondary_keyword_score'] > 0)
financial_hits = tweets_df['text'].apply(_count_keyword_hits, args=(financial_lowered,))
tweets_df['financial_keyword_score'] = financial_hits.where(has_tesla_context, 0)

# Total keyword score with different weights for each category:
# primary x5, secondary x2, financial x1.
tweets_df['keyword_score'] = (
    tweets_df['primary_keyword_score'] * 5
    + tweets_df['secondary_keyword_score'] * 2
    + tweets_df['financial_keyword_score']
)

# Define a reference text for Tesla-related topics
reference_text = """
Tesla Inc. is a company that designs, manufactures, and sells electric vehicles and energy storage products.
Tesla's stock price is influenced by various factors, including company performance, innovations in electric vehicle technology,
statements and actions by CEO Elon Musk, market trends, and economic conditions.
"""

# Compute the reference embedding once; every tweet is compared against it.
reference_embedding = model.encode(reference_text, convert_to_tensor=True)

def analyze_sentiment(tweet):
    """Return a sentiment score for *tweet* derived from TextBlob polarity.

    The polarity (-1 to 1) is shifted and scaled linearly, so the result lies
    between 0 (most negative) and 10 (most positive); neutral text scores 5.

    NOTE(review): the previous comment described the output range as 1 to 10,
    but ``(polarity + 1) * 5`` evaluates to 0 at polarity -1.
    """
    polarity = TextBlob(tweet).sentiment.polarity
    return (polarity + 1) * 5

def compute_similarity(tweet):
    """Return the cosine similarity between *tweet* and the reference text.

    The tweet is encoded with the module-level ``model`` and compared against
    the module-level ``reference_embedding``; the single resulting value is
    returned as a plain Python number.
    """
    embedded_tweet = model.encode(tweet, convert_to_tensor=True)
    return util.cos_sim(embedded_tweet, reference_embedding).item()

def normalize_similarity(similarity, mean_similarity, std_similarity):
    """Return the z-score of *similarity* for the given mean and standard deviation.

    Args:
        similarity: The value to standardise.
        mean_similarity: Mean of the population the value belongs to.
        std_similarity: Standard deviation of that population.

    Returns:
        ``(similarity - mean_similarity) / std_similarity``, or ``0.0`` when
        the standard deviation is zero or NaN. In that case there is no spread
        to measure against, so the value is treated as sitting on the mean
        instead of raising or propagating NaN/inf.
    """
    # ``x != x`` is True only for NaN, which avoids needing an extra import.
    if std_similarity == 0 or std_similarity != std_similarity:
        return 0.0
    return (similarity - mean_similarity) / std_similarity

def scale_to_range_1_10(z_score, min_z, max_z):
    """Linearly map the absolute value of *z_score* from [min_z, max_z] onto [1, 10].

    Args:
        z_score: The z-score to scale; only its absolute value is used.
        min_z: Smallest absolute z-score in the data set (maps to 1).
        max_z: Largest absolute z-score in the data set (maps to 10).

    Returns:
        The scaled score. When ``max_z - min_z`` is zero or NaN (all values
        identical, or no usable data) the lower bound ``1.0`` is returned
        instead of dividing by zero.
    """
    span = max_z - min_z
    # ``span != span`` is True only for NaN.
    if span == 0 or span != span:
        return 1.0
    return ((abs(z_score) - min_z) / span) * 9 + 1


# Disabled: the volume scorer below is wrapped in a bare string literal, so it
# is never defined or executed. It reads the likes_count / retweets_count /
# replies_count columns, which are not available once tweets_df has been
# reduced to the 'index' and 'text' columns.
"""
# Function to analyze volume with weighted sum
def analyze_volume(likes, retweets, replies):
    weight_likes = 0.2
    weight_retweets = 0.4
    weight_replies = 0.4

    volume_score = weight_likes * likes + weight_retweets * retweets + weight_replies * replies
    volume_score = np.log1p(volume_score)
    max_volume = np.log1p(tweets_df[['likes_count', 'retweets_count', 'replies_count']].max().sum())
    volume_score = (volume_score / max_volume) * 10
    return volume_score
"""
# Start the clock for the per-stage timing report
start_time = time.time()

# Sentiment stage: one score per tweet
tweets_df['sentiment_score'] = tweets_df['text'].map(analyze_sentiment)
sentiment_end_time = time.time()

# Analyze volume (disabled: kept as an inert string literal)
"""
tweets_df['volume_score'] = tweets_df.apply(lambda row: analyze_volume(row['likes_count'], row['retweets_count'], row['replies_count']), axis=1)
volume_end_time = time.time()
"""
# Relevance stage: cosine similarity to the reference text, standardised as a
# z-score of its absolute value, then rescaled to [1, 10].
# NOTE(review): the rescaling uses the absolute z-score, so tweets far below the
# mean similarity score as high as tweets far above it - confirm this is intended.
tweets_df['similarity'] = tweets_df['text'].map(compute_similarity)
tweets_df['abs_similarity'] = tweets_df['similarity'].abs()
abs_similarity = tweets_df['abs_similarity']
mean_similarity = abs_similarity.mean()
std_similarity = abs_similarity.std()

tweets_df['z_score'] = abs_similarity.apply(normalize_similarity, args=(mean_similarity, std_similarity))
abs_z = tweets_df['z_score'].abs()
min_z = abs_z.min()
max_z = abs_z.max()
tweets_df['relevance_score'] = tweets_df['z_score'].apply(scale_to_range_1_10, args=(min_z, max_z))
relevance_end_time = time.time()

def adjusted_similarity_weight(keyword_score):
    """Return the similarity weight for a tweet with the given keyword score.

    The weight starts at 0.1, grows by 0.1 per keyword-score point, and is
    capped at 1.0.
    """
    weight = 0.1 + (keyword_score * 0.1)
    if weight > 1.0:
        return 1.0
    return weight

# Per-tweet blend weights: the similarity weight comes from the keyword score,
# and the keyword weight is whatever remains so the two sum to 1.
tweets_df['similarity_weight'] = tweets_df['keyword_score'].map(adjusted_similarity_weight)
tweets_df['keyword_weight'] = 1 - tweets_df['similarity_weight']

# Blend the keyword score and the similarity-based relevance score
keyword_part = tweets_df['keyword_weight'] * tweets_df['keyword_score']
similarity_part = tweets_df['similarity_weight'] * tweets_df['relevance_score']
tweets_df['final_relevance'] = keyword_part + similarity_part

# Min-max rescale the blended score into [1, 10]
final_scaler = MinMaxScaler(feature_range=(1, 10))
tweets_df['final_relevance'] = final_scaler.fit_transform(tweets_df[['final_relevance']])

"""
# Calculate the final rating using sentiment, relevance, and volume
tweets_df['final_rating'] = (tweets_df['sentiment_score'].astype(float) * 0.4 +
                             tweets_df['final_relevance'].astype(float) * 0.4 +
                             tweets_df['volume_score'] * 0.2)
"""
# Render the published scores as fixed two-decimal strings
for score_column in ('sentiment_score', 'final_relevance'):
    tweets_df[score_column] = tweets_df[score_column].apply(lambda x: f"{x:.2f}")
# tweets_df['volume_score'] = tweets_df['volume_score'].apply(lambda x: f"{x:.2f}")
# tweets_df['final_rating'] = tweets_df['final_rating'].apply(lambda x: f"{x:.2f}")

# Dropping the intermediate columns is currently disabled (inert string literal)
"""
tweets_df = tweets_df.drop(columns=['primary_keyword_score', 'secondary_keyword_score', 'financial_keyword_score',
                                    'similarity', 'abs_similarity', 'z_score', 'similarity_weight', 'keyword_weight',
                                    'relevance_score'])
"""

# Move the two score columns so they sit directly after 'text', in the order
# sentiment_score, final_relevance; all other columns keep their relative order.
promoted = ['sentiment_score', 'final_relevance']
cols = [name for name in tweets_df.columns if name not in promoted]
insert_at = cols.index('text') + 1
cols[insert_at:insert_at] = promoted
#cols.insert(cols.index('text') + 3, cols.pop(cols.index('volume_score')))
# cols.insert(cols.index('text') + 4, cols.pop(cols.index('final_rating')))
tweets_df = tweets_df[cols]

# Print timing results
print(f"Sentiment Analysis Time: {sentiment_end_time - start_time:.2f} seconds")
# The volume step is disabled, so volume_end_time is never set. The relevance
# step therefore starts right after the sentiment step and is timed from
# sentiment_end_time. If the volume step is re-enabled, restore the volume
# print and time relevance from volume_end_time instead.
#print(f"Volume Analysis Time: {volume_end_time - sentiment_end_time:.2f} seconds")
print(f"Relevance Analysis Time: {relevance_end_time - sentiment_end_time:.2f} seconds")

# Save the scored tweets; the DataFrame's row index is not written to the file
tweets_df.to_csv('tweets_with_analysis_scores.csv', index=False)


Sentiment Analysis Time: 1.22 seconds
