In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Load MSFT price history and derive a forward-looking 3-row return.
msft = pd.read_csv('/kaggle/input/msft-prices/msft_price-history-04-08-2025.csv')

# Parse `Time` once. Values that cannot be parsed become NaT and those rows
# are discarded.
parsed_time = pd.to_datetime(msft['Time'], errors='coerce')
has_valid_time = parsed_time.notna()

# Take an explicit copy of the filtered rows so the column assignments below
# write to an independent frame (no SettingWithCopyWarning).
msft = msft.loc[has_valid_time].copy()
msft['date'] = parsed_time[has_valid_time].dt.date

msft = msft.sort_values('date')

# Non-numeric prices become NaN instead of raising.
msft['Last'] = pd.to_numeric(msft['Last'], errors='coerce')

# pct_change(3) compares each row with the row three positions earlier;
# shift(-3) then moves that value back three rows, so row t holds
# Last[t+3] / Last[t] - 1. The final three rows are therefore NaN.
# The horizon is counted in rows of the price file, not calendar days.
msft['3_day_return'] = msft['Last'].pct_change(periods=3).shift(-3)

# 2. Load the sentiment-scored news items and keep only the columns used below.
news = pd.read_csv('/kaggle/input/stock-prices/sentiment_analyzed_news.csv')

# Reduce timestamps to plain dates so they can be joined with the price dates.
news['date'] = pd.to_datetime(news['date']).dt.date

# .copy() makes `news` an independent frame; later cells assign into it
# (e.g. news['sentiment'] = ...), which would otherwise raise
# SettingWithCopyWarning on a column-subset slice.
news = news[['date', 'source', 'sentiment', 'sentiment_strength', 'weighted_sentiment']].copy()


In [2]:

# 3. Map sentiment labels to numeric scores (+1 / 0 / -1).
sentiment_map = {'positive': 1, 'neutral': 0, 'negative': -1}

# Only map while the column still holds text labels. Without this guard,
# re-running the cell would look up the already-numeric values in
# sentiment_map, turn every row into NaN, and the later dropna would
# discard the whole dataset.
if not pd.api.types.is_numeric_dtype(news['sentiment']):
    # Normalise case and surrounding whitespace so that e.g. 'Positive ' is
    # not silently mapped to NaN. Labels outside sentiment_map still become NaN.
    news['sentiment'] = (
        news['sentiment'].str.strip().str.lower().map(sentiment_map)
    )

# 4. Pair every news item with the forward return recorded for its date.
#    An inner join keeps only dates present in both tables.
forward_returns = msft[['date', '3_day_return']]
merged = news.merge(forward_returns, how='inner', on='date')

# Shape is reported before incomplete rows are removed.
print(f"Merged shape: {merged.shape}")

# 5. Discard rows lacking any field needed for scoring.
required_columns = ['sentiment', '3_day_return', 'source']
merged = merged.dropna(subset=required_columns)

# 6. Score each news item: 1 when the sentiment call agrees with the sign of
#    the forward return, 0 otherwise.
# NOTE(review): sentiment_map produces -1/0/1, so the 0.5 cut-off places
# neutral (0) in the same group as negative - confirm this is intended.
called_up = merged['sentiment'] > 0.5
called_not_up = merged['sentiment'] <= 0.5
moved_up = merged['3_day_return'] > 0
moved_not_up = merged['3_day_return'] <= 0

is_hit = (called_up & moved_up) | (called_not_up & moved_not_up)
merged['correct'] = is_hit.astype(int)

# 7. Per-source accuracy = share of that source's items scored as correct.
accuracy_df = (
    merged.groupby('source')['correct']
    .mean()
    .rename('accuracy')
    .reset_index()
)


Merged shape: (2185, 6)


In [3]:

# 8. Hand-assigned authority score per source (edit these values as needed).
authority_dict = {
    'Economic Times': 0.85,
    'Google News': 0.80,
    'News API': 0.75,
    'StockTwits': 0.60,
    'Yahoo Finance': 0.90,
}

# Sources absent from authority_dict receive NaN here.
merged = merged.assign(authority=merged['source'].map(authority_dict))


In [4]:

# 9. Attach each source's accuracy to every news row.
# Drop any 'accuracy' column left over from a previous run of this cell first:
# merging onto a frame that already has one would produce
# accuracy_x / accuracy_y, and the SCI line below would raise KeyError.
merged = (
    merged.drop(columns=['accuracy'], errors='ignore')
    .merge(accuracy_df, on='source', how='left')
)

# 10. Source Credibility Index (SCI): weighted blend of measured accuracy
#     (weight alpha) and assigned authority (weight beta).
alpha, beta = 0.7, 0.3
merged['SCI'] = alpha * merged['accuracy'] + beta * merged['authority']


In [5]:

# 11. Scale each item's sentiment by its source's SCI.
#     This replaces the 'weighted_sentiment' column carried over from the
#     news file.
merged['weighted_sentiment'] = merged['SCI'] * merged['sentiment']

# 12. One row per date: mean raw sentiment and mean credibility-weighted
#     sentiment across that day's items.
daily_columns = ['sentiment', 'weighted_sentiment']
daily_sentiment = merged.groupby('date', as_index=False)[daily_columns].mean()


In [6]:
# 15. Build an SCI table covering every source listed in authority_dict,
#     including any that have no measured accuracy.

# Fallback for sources without an observed accuracy: the mean of the
# per-source accuracies.
mean_accuracy = accuracy_df['accuracy'].mean()

all_sources = (
    pd.DataFrame({'source': list(authority_dict)})
    # Left join keeps every listed source; unmatched ones get NaN accuracy.
    .merge(accuracy_df, on='source', how='left')
    .assign(
        accuracy=lambda frame: frame['accuracy'].fillna(mean_accuracy),
        authority=lambda frame: frame['source'].map(authority_dict),
        # Same alpha/beta blend used for the per-row SCI.
        SCI=lambda frame: alpha * frame['accuracy'] + beta * frame['authority'],
    )
)

# Report sources from highest to lowest SCI.
print("\nFinal SCI (including sources with no matched returns):")
print(all_sources.sort_values('SCI', ascending=False))



Final SCI (including sources with no matched returns):
           source  accuracy  authority       SCI
4   Yahoo Finance  0.824561       0.90  0.847193
2        News API  0.676950       0.75  0.698865
3      StockTwits  0.731810       0.60  0.692267
0  Economic Times  0.571429       0.85  0.655000
1     Google News  0.580000       0.80  0.646000
