In [21]:
import pandas as pd
import feedparser
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime
import requests
import time

# RSS Feeds from trusted financial and news sources
feeds = {
    'Reuters': 'http://feeds.reuters.com/reuters/businessNews',
    'Bloomberg': 'https://www.bloomberg.com/feed/podcast/etfreport.xml',
    'Economic Times': 'https://economictimes.indiatimes.com/rssfeedsdefault.cms',
    'Financial Times': 'https://www.ft.com/?format=rss',
    'Wall Street Journal': 'https://feeds.a.dj.com/rss/RSSMarketsMain.xml',
    'BBC News': 'http://feeds.bbci.co.uk/news/business/rss.xml'
}

# Fetch articles from RSS feeds
def fetch_rss_feed(feed_url):
    feed = feedparser.parse(feed_url)
    articles = []
    for entry in feed.entries:
        # Handle the summary field safely, default to an empty string if not present
        summary = entry.get('summary', '')

        # Handle the published date safely
        published = None
        if 'published_parsed' in entry:
            published = datetime(*entry.published_parsed[:6])

        article = {
            'title': entry.title,
            'link': entry.link,
            'published': published,
            'summary': summary
        }
        articles.append(article)
    return articles

# Fetch articles from GDELT (for historical news)
def fetch_gdelt_articles(company, start_date, end_date):
    url = f"https://api.gdeltproject.org/api/v2/doc/doc?query={company}&mode=artlist&startdatetime={start_date}&enddatetime={end_date}&format=json"
    
    try:
        response = requests.get(url)
        
        # Check if the response is valid
        if response.status_code == 200 and response.content:
            try:
                json_data = response.json()
                if json_data and 'articles' in json_data:
                    return json_data['articles']
                else:
                    print(f"No articles found for {company} in GDELT.")
                    return []
            except ValueError:
                print(f"Error parsing GDELT data for {company}: Invalid JSON")
                return []
        else:
            print(f"GDELT API returned non-200 status or empty response for {company}. Status Code: {response.status_code}")
            return []
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching GDELT data for {company}: {e}")
        return []

# Keywords to identify fresh events
fresh_event_keywords = ["today", "just now", "announced", "introduced", "launched", "reported", "unveiled", "breaking news"]

# Filter fresh events
def filter_fresh_events(articles):
    fresh_events = []
    for article in articles:
        title_lower = article['title'].lower()
        summary_lower = article['summary'].lower()
        if any(keyword in title_lower or keyword in summary_lower for keyword in fresh_event_keywords):
            fresh_events.append(article)
    return fresh_events

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Perform sentiment analysis
def calculate_sentiment(article):
    sentiment = analyzer.polarity_scores(article['summary'])
    article['sentiment_score'] = sentiment['compound']  # Store compound score
    return article

def analyze_sentiment_for_articles(articles):
    for article in articles:
        article = calculate_sentiment(article)
    return articles

# Aggregate sentiment scores by date
def aggregate_sentiment_by_day(articles, company_name):
    df = pd.DataFrame(articles)
    if not df.empty and 'published' in df.columns:
        df['date'] = df['published'].dt.date  # Extract the date from published datetime
        daily_sentiment = df.groupby('date')['sentiment_score'].mean().reset_index()
        daily_sentiment['company'] = company_name
        return daily_sentiment
    return pd.DataFrame()  # Return empty DataFrame if no valid data

# Process each company to fetch news, calculate sentiment, and aggregate scores
def process_company(ticker, company_name):
    print(f"Processing company: {company_name} (Ticker: {ticker})...")  # Print the company name and ticker

    # Define start and end dates
    start_date = "19900101"
    end_date = "20240930"

    # Fetch articles from all RSS feeds
    rss_articles = []
    for source, feed_url in feeds.items():
        rss_articles.extend(fetch_rss_feed(feed_url))
    
    # Fetch historical articles from GDELT
    gdelt_articles = fetch_gdelt_articles(company_name, start_date, end_date)

    # Combine RSS and GDELT articles
    all_articles = rss_articles + gdelt_articles

    # Filter fresh events
    fresh_events = filter_fresh_events(all_articles)

    # Perform sentiment analysis
    fresh_events_with_sentiment = analyze_sentiment_for_articles(fresh_events)

    # Aggregate sentiment scores by date
    aggregated_sentiment = aggregate_sentiment_by_day(fresh_events_with_sentiment, company_name)

    return aggregated_sentiment

# Load your ticker file (NSE_tickers.csv) and namechange file (namechange.csv)
ticker_file = 'NSE_tickers.csv'
namechange_file = 'namechange.csv'

# Load the ticker file
tickers_df = pd.read_csv(ticker_file)

# Try loading with ISO-8859-1 encoding
namechange_df = pd.read_csv(namechange_file, encoding='ISO-8859-1')

# Merge the tickers with the company names using 'SYMBOL' as the key
tickers_df = tickers_df.merge(namechange_df[['SYMBOL', 'COMPANY']], left_on='SYMBOL', right_on='SYMBOL', how='left')

# Drop any rows where we couldn't find the company name
tickers_df = tickers_df.dropna(subset=['COMPANY'])

# Process companies in batches
all_sentiment_data = []
company_counter = 0  # Counter to track the number of companies processed

for index, row in tickers_df.iterrows():
    # Print the company being processed
    print(f"Starting processing for company: {row['COMPANY']} (Ticker: {row['SYMBOL']})")
    
    sentiment_data = process_company(row['SYMBOL'], row['COMPANY'])
    
    if not sentiment_data.empty:
        all_sentiment_data.append(sentiment_data)
        company_counter += 1  # Increment the company counter
    
    # Save intermediate results after processing 2 companies
    if company_counter == 2:
        # Combine sentiment data for the first two companies
        intermediate_df = pd.concat(all_sentiment_data)
        
        # Save intermediate results
        intermediate_df.to_csv('intermediate_sentiment_dataset.csv', index=False)
        print("Intermediate results saved to 'intermediate_sentiment_dataset.csv'")
        # Optionally, break here to verify the data before continuing
        # break
    
    # Add delay between requests to avoid rate limiting
    time.sleep(1)  # Adjust the delay if necessary

# Continue processing the rest of the companies and save final results
if all_sentiment_data:
    final_df = pd.concat(all_sentiment_data)

    # Save the final dataset
    final_df.to_csv('company_sentiment_dataset.csv', index=False)
    print("Final data saved to 'company_sentiment_dataset.csv'")
else:
    print("No sentiment data found for the companies processed.")


Starting processing for company: 360 ONE WAM LIMITED (Ticker: 360ONE)
Processing company: 360 ONE WAM LIMITED (Ticker: 360ONE)...
Error parsing GDELT data for 360 ONE WAM LIMITED: Invalid JSON
Starting processing for company: 3M India Limited (Ticker: 3MINDIA)
Processing company: 3M India Limited (Ticker: 3MINDIA)...
Error parsing GDELT data for 3M India Limited: Invalid JSON
Intermediate results saved to 'intermediate_sentiment_dataset.csv'
Starting processing for company: 3P Land Holdings Limited (Ticker: 3PLAND)
Processing company: 3P Land Holdings Limited (Ticker: 3PLAND)...
Error parsing GDELT data for 3P Land Holdings Limited: Invalid JSON
Starting processing for company: 63 moons technologies limited (Ticker: 63MOONS)
Processing company: 63 moons technologies limited (Ticker: 63MOONS)...
Error parsing GDELT data for 63 moons technologies limited: Invalid JSON
Starting processing for company: A2Z INFRA ENGINEERING LIMITED (Ticker: A2ZINFRA)
Processing company: A2Z INFRA ENGINEERI

Error parsing GDELT data for Amara Raja Energy & Mobility Limited: Invalid JSON
Starting processing for company: Rajdarshan Industries Limited (Ticker: ARENTERP)
Processing company: Rajdarshan Industries Limited (Ticker: ARENTERP)...
Error parsing GDELT data for Rajdarshan Industries Limited: Invalid JSON
Starting processing for company: Arrow Greentech Limited (Ticker: ARROWGREEN)
Processing company: Arrow Greentech Limited (Ticker: ARROWGREEN)...
Error parsing GDELT data for Arrow Greentech Limited: Invalid JSON
Starting processing for company: Arvind Limited (Ticker: ARVIND)
Processing company: Arvind Limited (Ticker: ARVIND)...
Error parsing GDELT data for Arvind Limited: Invalid JSON
Starting processing for company: Arvind SmartSpaces Limited (Ticker: ARVSMART)
Processing company: Arvind SmartSpaces Limited (Ticker: ARVSMART)...
Error parsing GDELT data for Arvind SmartSpaces Limited: Invalid JSON
Starting processing for company: Asahi India Glass Limited (Ticker: ASAHIINDIA)
Proc

Starting processing for company: Career Point Limited (Ticker: CAREERP)
Processing company: Career Point Limited (Ticker: CAREERP)...
Error parsing GDELT data for Career Point Limited: Invalid JSON
Starting processing for company: CARE Ratings Limited (Ticker: CARERATING)
Processing company: CARE Ratings Limited (Ticker: CARERATING)...
Error parsing GDELT data for CARE Ratings Limited: Invalid JSON
Starting processing for company: CARYSIL LIMITED (Ticker: CARYSIL)
Processing company: CARYSIL LIMITED (Ticker: CARYSIL)...
Error parsing GDELT data for CARYSIL LIMITED: Invalid JSON
Starting processing for company: Centum Electronics Limited (Ticker: CENTUM)
Processing company: Centum Electronics Limited (Ticker: CENTUM)...
Error parsing GDELT data for Centum Electronics Limited: Invalid JSON
Starting processing for company: Capri Global Capital Limited (Ticker: CGCL)
Processing company: Capri Global Capital Limited (Ticker: CGCL)...
Error parsing GDELT data for Capri Global Capital Limited

Starting processing for company: DIC India Limited (Ticker: DICIND)
Processing company: DIC India Limited (Ticker: DICIND)...
Error parsing GDELT data for DIC India Limited: Invalid JSON
Starting processing for company: DiGiSPICE Technologies Limited (Ticker: DIGISPICE)
Processing company: DiGiSPICE Technologies Limited (Ticker: DIGISPICE)...
Error parsing GDELT data for DiGiSPICE Technologies Limited: Invalid JSON
Starting processing for company: Debock Industries Limited (Ticker: DIL)
Processing company: Debock Industries Limited (Ticker: DIL)...
Error parsing GDELT data for Debock Industries Limited: Invalid JSON
Starting processing for company: DMCC SPECIALITY CHEMICALS LIMITED (Ticker: DMCC)
Processing company: DMCC SPECIALITY CHEMICALS LIMITED (Ticker: DMCC)...
Error parsing GDELT data for DMCC SPECIALITY CHEMICALS LIMITED: Invalid JSON
Starting processing for company: Dolat Algotech Limited (Ticker: DOLATALGO)
Processing company: Dolat Algotech Limited (Ticker: DOLATALGO)...
Err

Starting processing for company: Genus Power Infrastructures Limited (Ticker: GENUSPOWER)
Processing company: Genus Power Infrastructures Limited (Ticker: GENUSPOWER)...
Error parsing GDELT data for Genus Power Infrastructures Limited: Invalid JSON
Starting processing for company: Geojit Financial Services Limited (Ticker: GEOJITFSL)
Processing company: Geojit Financial Services Limited (Ticker: GEOJITFSL)...
Error parsing GDELT data for Geojit Financial Services Limited: Invalid JSON
Starting processing for company: GE Power India Limited (Ticker: GEPIL)
Processing company: GE Power India Limited (Ticker: GEPIL)...
Error parsing GDELT data for GE Power India Limited: Invalid JSON
Starting processing for company: GE T&D India Limited (Ticker: GET&D)
Processing company: GE T&D India Limited (Ticker: GET&D)...
Error parsing GDELT data for GE T&D India Limited: Invalid JSON
Starting processing for company: GFL Limited (Ticker: GFLLIMITED)
Processing company: GFL Limited (Ticker: GFLLIMITE

Error parsing GDELT data for Hitech Corporation Limited: Invalid JSON
Starting processing for company: The Hi-Tech Gears Limited (Ticker: HITECHGEAR)
Processing company: The Hi-Tech Gears Limited (Ticker: HITECHGEAR)...
Error parsing GDELT data for The Hi-Tech Gears Limited: Invalid JSON
Starting processing for company: HLV LIMITED (Ticker: HLVLTD)
Processing company: HLV LIMITED (Ticker: HLVLTD)...
Error parsing GDELT data for HLV LIMITED: Invalid JSON
Starting processing for company: Honeywell Automation India Limited (Ticker: HONAUT)
Processing company: Honeywell Automation India Limited (Ticker: HONAUT)...
Error parsing GDELT data for Honeywell Automation India Limited: Invalid JSON
Starting processing for company: Honda India Power Products Limited (Ticker: HONDAPOWER)
Processing company: Honda India Power Products Limited (Ticker: HONDAPOWER)...
Error parsing GDELT data for Honda India Power Products Limited: Invalid JSON
Starting processing for company: Himadri Speciality Chemic

Error parsing GDELT data for JTEKT India Limited: Invalid JSON
Starting processing for company: JTL INDUSTRIES LIMITED (Ticker: JTLIND)
Processing company: JTL INDUSTRIES LIMITED (Ticker: JTLIND)...
Error parsing GDELT data for JTL INDUSTRIES LIMITED: Invalid JSON
Starting processing for company: Jubilant Pharmova Limited (Ticker: JUBLPHARMA)
Processing company: Jubilant Pharmova Limited (Ticker: JUBLPHARMA)...
Error parsing GDELT data for Jubilant Pharmova Limited: Invalid JSON
Starting processing for company: Jupiter Wagons Limited (Ticker: JWL)
Processing company: Jupiter Wagons Limited (Ticker: JWL)...
Error parsing GDELT data for Jupiter Wagons Limited: Invalid JSON
Starting processing for company: Jyothy Labs Limited (Ticker: JYOTHYLAB)
Processing company: Jyothy Labs Limited (Ticker: JYOTHYLAB)...
Error parsing GDELT data for Jyothy Labs Limited: Invalid JSON
Starting processing for company: Kakatiya Cement Sugar & Industries Limited (Ticker: KAKATCEM)
Processing company: Kakati

Starting processing for company: Mirza International Limited (Ticker: MIRZAINT)
Processing company: Mirza International Limited (Ticker: MIRZAINT)...
Error parsing GDELT data for Mirza International Limited: Invalid JSON
Starting processing for company: MODISON LIMITED (Ticker: MODISONLTD)
Processing company: MODISON LIMITED (Ticker: MODISONLTD)...
Error parsing GDELT data for MODISON LIMITED: Invalid JSON
Starting processing for company: Modern Threads (India) Limited (Ticker: MODTHREAD)
Processing company: Modern Threads (India) Limited (Ticker: MODTHREAD)...
Error parsing GDELT data for Modern Threads (India) Limited: Invalid JSON
Starting processing for company: Samvardhana Motherson International Limited (Ticker: MOTHERSON)
Processing company: Samvardhana Motherson International Limited (Ticker: MOTHERSON)...
Error parsing GDELT data for Samvardhana Motherson International Limited: Invalid JSON
Starting processing for company: MphasiS Limited (Ticker: MPHASIS)
Processing company: 

Error parsing GDELT data for Paisalo Digital Limited: Invalid JSON
Starting processing for company: Patanjali Foods Limited (Ticker: PATANJALI)
Processing company: Patanjali Foods Limited (Ticker: PATANJALI)...
Error parsing GDELT data for Patanjali Foods Limited: Invalid JSON
Starting processing for company: PCBL LIMITED (Ticker: PCBL)
Processing company: PCBL LIMITED (Ticker: PCBL)...
Error parsing GDELT data for PCBL LIMITED: Invalid JSON
Starting processing for company: PDS Limited (Ticker: PDSL)
Processing company: PDS Limited (Ticker: PDSL)...
Error parsing GDELT data for PDS Limited: Invalid JSON
Starting processing for company: Piramal Enterprises Limited (Ticker: PEL)
Processing company: Piramal Enterprises Limited (Ticker: PEL)...
Error parsing GDELT data for Piramal Enterprises Limited: Invalid JSON
Starting processing for company: Peninsula Land Limited (Ticker: PENINLAND)
Processing company: Peninsula Land Limited (Ticker: PENINLAND)...
Error parsing GDELT data for Peninsu

Error parsing GDELT data for Puravankara Limited: Invalid JSON
Starting processing for company: PVP Ventures Limited (Ticker: PVP)
Processing company: PVP Ventures Limited (Ticker: PVP)...
Error parsing GDELT data for PVP Ventures Limited: Invalid JSON
Starting processing for company: PVR INOX Limited (Ticker: PVRINOX)
Processing company: PVR INOX Limited (Ticker: PVRINOX)...
Error parsing GDELT data for PVR INOX Limited: Invalid JSON
Starting processing for company: Rain Industries Limited (Ticker: RAIN)
Processing company: Rain Industries Limited (Ticker: RAIN)...
Error parsing GDELT data for Rain Industries Limited: Invalid JSON
Starting processing for company: Shree Rama Newsprint Limited (Ticker: RAMANEWS)
Processing company: Shree Rama Newsprint Limited (Ticker: RAMANEWS)...
Error parsing GDELT data for Shree Rama Newsprint Limited: Invalid JSON
Starting processing for company: The Ramco Cements Limited (Ticker: RAMCOCEM)
Processing company: The Ramco Cements Limited (Ticker: RAM

Error parsing GDELT data for Shoppers Stop Limited: Invalid JSON
Starting processing for company: Shradha Infraprojects Limited (Ticker: SHRADHA)
Processing company: Shradha Infraprojects Limited (Ticker: SHRADHA)...
Error parsing GDELT data for Shradha Infraprojects Limited: Invalid JSON
Starting processing for company: SHREE CEMENT LIMITED (Ticker: SHREECEM)
Processing company: SHREE CEMENT LIMITED (Ticker: SHREECEM)...
Error parsing GDELT data for SHREE CEMENT LIMITED: Invalid JSON
Starting processing for company: Shreyas Shipping & Logistics Limited (Ticker: SHREYAS)
Processing company: Shreyas Shipping & Logistics Limited (Ticker: SHREYAS)...
Error parsing GDELT data for Shreyas Shipping & Logistics Limited: Invalid JSON
Starting processing for company: Shriram Finance Limited (Ticker: SHRIRAMFIN)
Processing company: Shriram Finance Limited (Ticker: SHRIRAMFIN)...
Error parsing GDELT data for Shriram Finance Limited: Invalid JSON
Starting processing for company: Shriram Finance Li

Error parsing GDELT data for Shriram Finance Limited: Invalid JSON
Starting processing for company: Shriram Finance Limited (Ticker: SHRIRAMFIN)
Processing company: Shriram Finance Limited (Ticker: SHRIRAMFIN)...
Error parsing GDELT data for Shriram Finance Limited: Invalid JSON
Starting processing for company: Shriram Finance Limited (Ticker: SHRIRAMFIN)
Processing company: Shriram Finance Limited (Ticker: SHRIRAMFIN)...
Error parsing GDELT data for Shriram Finance Limited: Invalid JSON
Starting processing for company: Shriram Finance Limited (Ticker: SHRIRAMFIN)
Processing company: Shriram Finance Limited (Ticker: SHRIRAMFIN)...
Error parsing GDELT data for Shriram Finance Limited: Invalid JSON
Starting processing for company: Shriram Finance Limited (Ticker: SHRIRAMFIN)
Processing company: Shriram Finance Limited (Ticker: SHRIRAMFIN)...
Error parsing GDELT data for Shriram Finance Limited: Invalid JSON
Starting processing for company: Shriram Finance Limited (Ticker: SHRIRAMFIN)
Pro

Error parsing GDELT data for Shriram Finance Limited: Invalid JSON
Starting processing for company: Shriram Finance Limited (Ticker: SHRIRAMFIN)
Processing company: Shriram Finance Limited (Ticker: SHRIRAMFIN)...
Error parsing GDELT data for Shriram Finance Limited: Invalid JSON
Starting processing for company: Shriram Finance Limited (Ticker: SHRIRAMFIN)
Processing company: Shriram Finance Limited (Ticker: SHRIRAMFIN)...
Error parsing GDELT data for Shriram Finance Limited: Invalid JSON
Starting processing for company: Shriram Finance Limited (Ticker: SHRIRAMFIN)
Processing company: Shriram Finance Limited (Ticker: SHRIRAMFIN)...
Error parsing GDELT data for Shriram Finance Limited: Invalid JSON
Starting processing for company: Shriram Finance Limited (Ticker: SHRIRAMFIN)
Processing company: Shriram Finance Limited (Ticker: SHRIRAMFIN)...
Error parsing GDELT data for Shriram Finance Limited: Invalid JSON
Starting processing for company: Shriram Finance Limited (Ticker: SHRIRAMFIN)
Pro

Error parsing GDELT data for Styrenix Performance Materials Limited: Invalid JSON
Starting processing for company: Sunflag Iron And Steel Company Limited (Ticker: SUNFLAG)
Processing company: Sunflag Iron And Steel Company Limited (Ticker: SUNFLAG)...
Error parsing GDELT data for Sunflag Iron And Steel Company Limited: Invalid JSON
Starting processing for company: Sun Pharmaceutical Industries Limited (Ticker: SUNPHARMA)
Processing company: Sun Pharmaceutical Industries Limited (Ticker: SUNPHARMA)...
Error parsing GDELT data for Sun Pharmaceutical Industries Limited: Invalid JSON
Starting processing for company: Sun TV Network Limited (Ticker: SUNTV)
Processing company: Sun TV Network Limited (Ticker: SUNTV)...
Error parsing GDELT data for Sun TV Network Limited: Invalid JSON
Starting processing for company: SURANA SOLAR LIMITED (Ticker: SURANASOL)
Processing company: SURANA SOLAR LIMITED (Ticker: SURANASOL)...
Error parsing GDELT data for SURANA SOLAR LIMITED: Invalid JSON
Starting pr

Error parsing GDELT data for UFLEX Limited: Invalid JSON
Starting processing for company: UltraTech Cement Limited (Ticker: ULTRACEMCO)
Processing company: UltraTech Cement Limited (Ticker: ULTRACEMCO)...
Error parsing GDELT data for UltraTech Cement Limited: Invalid JSON
Starting processing for company: UNO Minda Limited (Ticker: UNOMINDA)
Processing company: UNO Minda Limited (Ticker: UNOMINDA)...
Error parsing GDELT data for UNO Minda Limited: Invalid JSON
Starting processing for company: UPL Limited (Ticker: UPL)
Processing company: UPL Limited (Ticker: UPL)...
Error parsing GDELT data for UPL Limited: Invalid JSON
Starting processing for company: Usha Martin Limited (Ticker: USHAMART)
Processing company: Usha Martin Limited (Ticker: USHAMART)...
Error parsing GDELT data for Usha Martin Limited: Invalid JSON
Starting processing for company: V2 Retail Limited (Ticker: V2RETAIL)
Processing company: V2 Retail Limited (Ticker: V2RETAIL)...
Error parsing GDELT data for V2 Retail Limited

In [29]:
# Load the sentiment dataset
sentiment_df = pd.read_csv('company_sentiment_dataset.csv')  # Assuming this is the filename

# Load the ticker-to-company mapping file
namechange_df = pd.read_csv('namechange.csv', encoding='ISO-8859-1')

# Merge the sentiment dataset with the ticker file to get the SYMBOL for each company
sentiment_df = sentiment_df.merge(namechange_df[['SYMBOL', 'COMPANY']], left_on='company', right_on='COMPANY', how='left')

# Drop the 'COMPANY' column as it's no longer needed after mapping to SYMBOL
sentiment_df = sentiment_df.drop(columns=['company'])


In [30]:
len(sentiment_df)

69475

In [32]:
# Load the main stock dataset (with all stock features)
stock_df = pd.read_parquet('intermediate_self_sufficient_2.parquet')  # Replace with the correct file path

# Ensure the 'Date' column in the stock data is in datetime format for accurate merging
stock_df['Date'] = pd.to_datetime(stock_df['Date'])

# Ensure the 'date' column in the sentiment data is also in datetime format
sentiment_df['date'] = pd.to_datetime(sentiment_df['date'])

# Merge the stock dataset with the sentiment dataset on 'Date' and 'SYMBOL'
merged_df = stock_df.merge(sentiment_df, left_on=['Date', 'Company_ID'], right_on=['date', 'SYMBOL'], how='left')

# Drop unnecessary columns like 'date' and 'SYMBOL' after merging
merged_df = merged_df.drop(columns=['date', 'SYMBOL'])

# Fill missing sentiment scores with 0 (neutral sentiment)
merged_df['sentiment_score'].fillna(0, inplace=True)

# Save the merged dataset to inspect and verify
merged_df.to_parquet('merged_with_sentiment_interim1.parquet')
print("Merged dataset saved as 'merged_with_sentiment_interim1.parquet'.")


Merged dataset saved as 'merged_with_sentiment_interim1.parquet'.


In [4]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from tqdm import tqdm  # Progress bar
import numpy as np

# Load the merged dataset
print("Loading the merged dataset...")
merged_df = pd.read_parquet('merged_with_sentiment_interim1.parquet')
print("Merged dataset loaded.")

# Create lagged sentiment features (1-day, 5-day, 10-day lags)
print("Creating lagged sentiment features...")
lags = [1, 5, 10]
for lag in lags:
    print(f"Processing {lag}-day lag sentiment...")
    # Create lagged sentiment with a group-by and shift
    merged_df[f'Lagged_Sentiment_{lag}'] = merged_df.groupby('Company_ID')['sentiment_score'].transform(lambda x: x.shift(lag))
print("Lagged sentiment features created.")

# Define the features related to market sentiment (including lagged sentiment)
sentiment_features = [
    'Price_Change', 'Price_Change_Percent', 'Volume_Change', 'Volume_Change_Percent',
    'Rolling_Std_Close', 'Rolling_Mean_Close', 'Rolling_Max_Close', 'Rolling_Min_Close',
    'Z_Score_Close', 'Momentum', 'OBV_Change', 'Volume_Ratio',
    'Lagged_Sentiment_1', 'Lagged_Sentiment_5', 'Lagged_Sentiment_10'
]

# Drop rows with missing values in sentiment-related features
print("Dropping rows with missing sentiment-related features...")
merged_df = merged_df.dropna(subset=sentiment_features)
print(f"{len(merged_df)} rows remaining after dropping missing values.")

# Ensure all features are finite (remove rows with inf or very large values)
print("Ensuring all feature values are finite...")
X = merged_df[sentiment_features]
X = X.replace([np.inf, -np.inf], np.nan)  # Replace infinities with NaN
X = X.dropna()  # Drop rows with any NaN values
y = merged_df.loc[X.index, 'sentiment_score']  # Make sure to align y with X after dropping NaNs

print("Feature matrix cleaned. Proceeding to model fitting...")

# Fit a linear regression model to find optimal weights
print("Fitting the linear regression model...")
model = LinearRegression()
model.fit(X, y)
print("Model fitting completed.")

# Get the feature weights (coefficients)
weights = model.coef_

# Print the feature weights
print("Feature Weights:")
for feature, weight in zip(sentiment_features, weights):
    print(f"{feature}: {weight}")

# Save feature weights to a CSV for reference
print("Saving feature weights to 'feature_weights_final.csv'...")
weights_df = pd.DataFrame({'Feature': sentiment_features, 'Weight': weights})
weights_df.to_csv('feature_weights_final.csv', index=False)
print("Feature weights saved.")

# Calculate the new sentiment metric (weighted sum of features)
print("Calculating the new weighted sentiment metric...")
weighted_sentiment = (X * weights).sum(axis=1)
merged_df.loc[X.index, 'Weighted_Sentiment_Metric'] = weighted_sentiment
print("Weighted sentiment metric calculated.")

# Save the merged dataset with the new weighted sentiment metric for future use
print("Saving the final merged dataset with the new weighted sentiment metric...")
merged_df.to_parquet('merged_with_weighted_sentiment_final.parquet')
print("Final merged dataset saved as 'merged_with_weighted_sentiment_final.parquet'.")


Loading the merged dataset...
Merged dataset loaded.
Creating lagged sentiment features...
Processing 1-day lag sentiment...
Processing 5-day lag sentiment...
Processing 10-day lag sentiment...
Lagged sentiment features created.
Dropping rows with missing sentiment-related features...
6409090 rows remaining after dropping missing values.
Ensuring all feature values are finite...
Feature matrix cleaned. Proceeding to model fitting...
Fitting the linear regression model...
Model fitting completed.
Feature Weights:
Price_Change: -4.11574402906927e-09
Price_Change_Percent: 2.5666542822369083e-07
Volume_Change: -1.7827567442081687e-12
Volume_Change_Percent: 1.327662738994281e-11
Rolling_Std_Close: -1.4815757236669486e-08
Rolling_Mean_Close: 2.7945749905191854e-08
Rolling_Max_Close: -2.3446638641760268e-08
Rolling_Min_Close: 6.3043831318904855e-09
Z_Score_Close: 2.3494891078401505e-05
Momentum: 3.099515856255062e-09
OBV_Change: 4.1784918060281585e-13
Volume_Ratio: -2.5643216849827193e-05
Lag

In [1]:
import pandas as pd
check = pd.read_parquet('merged_with_weighted_sentiment_final.parquet')
check.columns

Index(['Date', 'Open_x', 'High_x', 'Low_x', 'Close_x', 'Adj Close_x',
       'Volume_x', 'Company_ID', 'VWAP_x', 'MACD_x', 'MACD_Signal_x',
       'MACD_Hist_x', 'SMA_20', 'EMA_20', 'WMA_20', 'Upper_Band',
       'Middle_Band', 'Lower_Band', 'ATR_14', 'RSI_14', 'Stoch_K', 'Stoch_D',
       'OBV', 'SAR', 'CCI_20', 'ROC', 'Price_Change', 'Price_Change_Percent',
       'Volume_Change', 'Volume_Change_Percent', 'High_Low_Spread',
       'Close_Open_Ratio', 'High_Low_Ratio', 'Price_Range',
       'Price_Volume_Product', 'Lag_Close_x_1', 'Lag_Volume_x_1',
       'Lag_Close_x_5', 'Lag_Volume_x_5', 'Lag_Close_x_10', 'Lag_Volume_x_10',
       'Day_of_Week', 'Month', 'Quarter', 'Week_of_Year', 'Rolling_Std_Close',
       'Rolling_Std_Volume', 'Rolling_Mean_Close', 'Rolling_Max_Close',
       'Rolling_Min_Close', 'Rolling_Price_Range', 'Z_Score_Close', 'Momentum',
       'Price_Acceleration', 'OBV_Change', 'Volume_Ratio', 'Rolling_Return',
       'Kurtosis', 'Skewness', 'Sharpe_Ratio', 'Fib_Level