## Importing Libraries and dataset

In [3]:
# Importing libraries 
import pandas as pd
import talib
import yfinance as yf
import spacy
import matplotlib.pyplot as plt
import plotly.express as px
import nltk
import re


from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from scipy.stats import pearsonr, spearmanr
# from nltk.tokenize import word_tokenize
# from nltk.tag import pos_tag
# from nltk.corpus import stopwords
# from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
# import nltk
from textblob import TextBlob

In [None]:
# import importlib
# importlib.reload(nltk)

In [None]:

# Download the NLTK resources used in this notebook, then load spaCy's small
# English pipeline.
for nltk_resource in ('punkt', 'wordnet', 'vader_lexicon', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

nlp = spacy.load('en_core_web_sm')

In [4]:
# Read the raw analyst-ratings CSV into the working DataFrame.
dataset = pd.read_csv(filepath_or_buffer='../data/raw_analyst_ratings.csv')

## Task-1

### 1. Descriptive Statistics

In [None]:
# Character count of every headline, plus the number of headlines.
headlines = dataset['headline']
headline_lengths = list(map(len, headlines))
total_headlines = len(headlines)
print(f"The total headlines are {total_headlines}.")

In [None]:
# Shortest, longest and mean headline length, in characters.
min_length, max_length = min(headline_lengths), max(headline_lengths)
average_length = sum(headline_lengths) / total_headlines
rounded_average = round(average_length, 2)
print(f"The minimum, maximum and avarage length of a headlines are {min_length}, {max_length} and {rounded_average} respectively.")

In [None]:
# Number of articles contributed by each publisher.
publishers = dataset['publisher']
article_count = publishers.value_counts()
print(f"The number of articles published are {article_count}.")

In [None]:
# Preview the first rows of the loaded dataset.
dataset.head()

In [None]:
# Parse the raw 'date' strings into timestamps; unparseable values become NaT.
#
# utc=True converts every value to one timezone (UTC). Without it, values
# carrying different UTC offsets do not produce a datetime64 column, and the
# `.dt` accessor used by the following cells fails. Hour-of-day figures
# derived from this column are therefore expressed in UTC.
#
# The earlier pd.DataFrame(dataset["date"]) re-assignment was a no-op and has
# been dropped.
#
# NOTE(review): if the CSV mixes several timestamp layouts, pandas >= 2.0 may
# additionally need format='mixed' to avoid coercing rows to NaT - confirm
# against the data.
dataset['date'] = pd.to_datetime(dataset['date'], errors='coerce', utc=True)


In [None]:
# Derive weekday name, month number and year from the publication timestamp.
published = dataset['date'].dt
dataset['day_of_week'] = published.day_name()
dataset['month'] = published.month
dataset['year'] = published.year

In [None]:
# Count articles per weekday and plot them in calendar order.
#
# sort_index() would order the day names alphabetically (Friday, Monday,
# Saturday, ...), which scrambles the x-axis of a line plot. The counts are
# reindexed Monday -> Sunday instead; weekdays with no articles show as 0.
WEEKDAY_ORDER = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
day_of_week_counts = (
    dataset['day_of_week']
    .value_counts()
    .reindex(WEEKDAY_ORDER, fill_value=0)
)
plt.plot(day_of_week_counts.index, day_of_week_counts.values)
plt.xlabel('Day of the Week')
plt.ylabel('Publication Frequency')
plt.title('Publication Frequency by Day of the Week')
plt.show()

In [None]:
# Articles per month number, plotted in ascending month order.
month_counts = dataset['month'].value_counts().sort_index()
months, month_totals = month_counts.index, month_counts.values
axes = plt.gca()
axes.plot(months, month_totals)
axes.set_xlabel('Month')
axes.set_ylabel('Publication Frequency')
axes.set_title('Publication Frequency by Month')
plt.show()

### 2. Text Analysis (Sentiment analysis & Topic Modeling)

In [None]:
# Label the first ten headlines as positive / negative / neutral using the
# sign of VADER's compound score.
#
# The analyzer is built once up front: it does not depend on the headline, so
# constructing it inside the loop only repeated the same setup work.
analyzer = SentimentIntensityAnalyzer()

for headline in dataset['headline'].head(10):
    sentiment = analyzer.polarity_scores(headline)
    compound_score = sentiment['compound']

    if compound_score > 0:
        print(f"Positive: {headline}")
    elif compound_score < 0:
        print(f"Negative: {headline}")
    else:
        print(f"Neutral: {headline}")

In [None]:
# Load spaCy's small English pipeline and take the tokenizer of a blank
# English language object.
nlp = spacy.load('en_core_web_sm')
blank_english = English()
tokenizer = blank_english.tokenizer

In [None]:
# Multi-word topics that are always reported when they occur in a headline.
TOPIC_PHRASES = ("FDA approval", "price target")

# Holds the (vocab, matcher) pair built for the currently loaded pipeline so
# the matcher is not rebuilt for every headline.
_topic_matcher_cache = {}


def _get_topic_matcher():
    """Return a PhraseMatcher for TOPIC_PHRASES bound to the current `nlp` vocab."""
    cached = _topic_matcher_cache.get("matcher")
    # Rebuild if the notebook re-ran spacy.load(): a matcher must share the
    # vocab of the docs it is applied to.
    if cached is None or cached[0] is not nlp.vocab:
        matcher = PhraseMatcher(nlp.vocab)
        # make_doc only tokenizes; the patterns do not need the tagger/parser.
        patterns = [nlp.make_doc(phrase) for phrase in TOPIC_PHRASES]
        # List-of-patterns form of add(); the older add(key, None, *patterns)
        # signature is the legacy spaCy v2 call style.
        matcher.add("TopicPhrases", patterns)
        cached = (nlp.vocab, matcher)
        _topic_matcher_cache["matcher"] = cached
    return cached[1]


def extract_keywords(text):
    """Return the noun chunks and known topic phrases found in *text*.

    Args:
        text: The headline (a string) to analyse with the module-level `nlp`
            pipeline.

    Returns:
        A list of strings: every noun chunk of the parsed text, in document
        order, followed by every occurrence of a phrase from TOPIC_PHRASES.
        A topic phrase that is also a noun chunk therefore appears twice.
    """
    doc = nlp(text)

    keywords = [chunk.text for chunk in doc.noun_chunks]

    topic_matcher = _get_topic_matcher()
    keywords.extend(doc[start:end].text for _, start, end in topic_matcher(doc))

    return keywords

In [None]:
# Extract keywords for the rows returned by head() only; index alignment
# leaves the 'keywords' column empty (NaN) for every other row.
sample_headlines = dataset["headline"].head()
dataset['keywords'] = sample_headlines.apply(extract_keywords)
print(dataset['keywords'])

### 3. Time Series Analysis

In [None]:
publication_counts = dataset.groupby(dataset['date'].dt.date).size()

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(publication_counts.index, publication_counts.values)
plt.xlabel('Date')
plt.ylabel('Publication Frequency')
plt.title('Publication Frequency Over Time')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Count articles in consecutive one-hour bins of the publication timestamp.
indexed_by_time = dataset.set_index('date')
hourly_counts = indexed_by_time.resample('H').size()
print(hourly_counts)

In [None]:
# Histogram of the hour of day at which articles were published.
plt.figure(figsize=(10, 6))
publish_hours = dataset['date'].dt.hour
publish_hours.hist(bins=24, edgecolor='black')
axes = plt.gca()
axes.set_xlabel('Hour')
axes.set_ylabel('Number of News Articles')
axes.set_title('Distribution of News Publishing Times')
plt.xticks(range(0, 24))
plt.show()

### 4. Publisher Analysis

In [None]:
publisher_counts = dataset['publisher'].value_counts().head(5).sort_values(ascending=False)

In [None]:
plt.figure(figsize=(12, 6))
ax = publisher_counts.plot(kind='bar', color='steelblue')
ax.set_xlabel('Publisher')
ax.set_ylabel('Number of Articles')
ax.set_title('Publisher Contribution to News Feed')
ax.set_xticklabels(publisher_counts.index, rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
top_publishers = publisher_counts.head(5).index
subset_data = dataset[dataset['publisher'].isin(top_publishers)]

In [None]:
for publisher in top_publishers:
    publisher_data = subset_data[subset_data['publisher'] == publisher]
    print(publisher_data)

In [None]:
def extract_domain(email):
    """Return the domain part of an e-mail address, or None if there is none.

    Args:
        email: The value to inspect. Anything that is not a string (for
            example a missing value such as NaN or None) yields None instead
            of raising.

    Returns:
        The text following the first ``@`` that is immediately followed by
        domain characters (letters, digits, underscore, dot and hyphen), or
        None when no such ``@`` is present.
    """
    if not isinstance(email, str):
        return None

    # Raw string so that \w is a regex escape rather than an invalid string
    # escape; the hyphen keeps domains such as "my-site.com" intact.
    match = re.search(r"@([\w.-]+)", email)
    if match:
        return match.group(1)
    return None

In [None]:
dataset['domain'] = dataset['publisher'].apply(extract_domain)
domain_counts = dataset['domain'].value_counts()

In [None]:
plt.figure(figsize=(12, 6))
domain_counts.plot(kind='bar')
plt.xlabel('Domain')
plt.ylabel('Number of Contributions')
plt.title('Domain Contribution to News Feed')
plt.xticks(rotation=45)
plt.show()

## Task-2

### 5. Stock price fetching 

In [None]:
# Two-column view (publication date, ticker) for inspecting what the price
# lookups are based on.
#
# The view gets its own name instead of overwriting `dataset`: the sentiment
# cells further down still read dataset["headline"], and rebinding `dataset`
# to these two columns made that lookup fail with a KeyError. All later cells
# that only need 'date' and 'stock' keep working on the full frame.
stock_dates = dataset[['date', 'stock']]
stock_dates.head()

In [None]:
unique_values = dataset['stock'].unique()
unique_values

In [None]:
grouped_data = dataset.groupby('stock')

In [None]:
earliest_dates = grouped_data['date'].min()
latest_dates = grouped_data['date'].max()

print(earliest_dates, latest_dates)

In [None]:
stock_dataset = pd.DataFrame()

## Task-3

### 6. Quantitative analysis

In [None]:
# Download daily prices for every ticker over the span of its news coverage
# and stack them into one long table with a 'stock' label per row.
#
# Frames are collected in a list and concatenated once; concatenating inside
# the loop re-copies the accumulated table on every iteration.
price_frames = []

for stock, earliest_date in earliest_dates.items():
    latest_date = latest_dates[stock]

    # A ticker whose dates could not be parsed has no usable download window.
    if pd.isna(earliest_date) or pd.isna(latest_date):
        continue

    # yfinance treats `end` as exclusive, so extend it by one day to include
    # the day of the last article.
    stock_data = yf.download(
        stock,
        start=earliest_date,
        end=latest_date + pd.Timedelta(days=1),
    )
    if stock_data.empty:
        continue

    # Recent yfinance releases can return (field, ticker) MultiIndex columns
    # even for a single ticker; keep only the field level so that
    # stock_dataset["Close"] stays a plain column.
    if isinstance(stock_data.columns, pd.MultiIndex):
        stock_data.columns = stock_data.columns.get_level_values(0)

    # Turn the date index into a column: the final concat uses
    # ignore_index=True, which would otherwise throw the trading dates away.
    stock_data = stock_data.reset_index()
    stock_data['stock'] = stock
    price_frames.append(stock_data)

stock_dataset = (
    pd.concat(price_frames, ignore_index=True) if price_frames else pd.DataFrame()
)

print(stock_dataset)

In [None]:
# Relative Strength Index of the closing price, computed per ticker.
#
# stock_dataset stacks the rows of many tickers; running RSI over the whole
# 'Close' column would let the last prices of one ticker feed into the first
# values of the next. Grouping by 'stock' restarts the indicator for each
# ticker while keeping the result aligned with stock_dataset's rows.
reason = stock_dataset.groupby('stock')['Close'].transform(talib.RSI)
print(reason)

In [None]:
# MACD (12/26/9) of the closing price, computed per ticker.
#
# As stock_dataset stacks several tickers, the indicator is evaluated on each
# ticker's prices separately so that one ticker's history does not leak into
# the next; the pieces are then put back in stock_dataset's row order.
macd_frames = []
for _, closes in stock_dataset.groupby('stock')['Close']:
    macd_line, signal_line, histogram = talib.MACD(
        closes, fastperiod=12, slowperiod=26, signalperiod=9
    )
    macd_frames.append(
        pd.DataFrame(
            {'macd': macd_line, 'signal': signal_line, 'hist': histogram},
            index=closes.index,
        )
    )

# reindex restores the original row order (rows without a ticker become NaN).
macd_table = pd.concat(macd_frames).reindex(stock_dataset.index)
macd, signal, hist = macd_table['macd'], macd_table['signal'], macd_table['hist']
macd, signal, hist

In [None]:
# Per-ticker price metrics.
#
# stock_dataset stacks several tickers, so every running calculation is
# grouped by 'stock'; otherwise the first row of a ticker would be compared
# with (or accumulated onto) the last row of the previous ticker.
ticker = stock_dataset['stock']
close_by_stock = stock_dataset['Close'].groupby(ticker)

# Day-over-day fractional change of the close.
stock_dataset['returns'] = close_by_stock.pct_change()

# 3-row rolling mean of the close.
stock_dataset['moving_average'] = close_by_stock.transform(
    lambda prices: prices.rolling(window=3).mean()
)

# Running product of (1 + return).
growth_factor = 1 + stock_dataset['returns']
stock_dataset['cumulative_returns'] = growth_factor.groupby(ticker).cumprod()

# Running volume-weighted average price: cumulative traded value divided by
# cumulative volume.
traded_value = stock_dataset['Close'] * stock_dataset['Volume']
stock_dataset['vwap'] = (
    traded_value.groupby(ticker).cumsum()
    / stock_dataset['Volume'].groupby(ticker).cumsum()
)

In [None]:
# Histogram of the daily returns; missing values are left out.
plt.figure(figsize=(8, 6))
daily_return_values = stock_dataset['returns'].dropna()
axes = plt.gca()
axes.hist(daily_return_values, bins=30, edgecolor='black')
axes.set_xlabel('Daily Returns')
axes.set_ylabel('Frequency')
axes.set_title('Distribution of Daily Returns')
plt.show()

## Task-4

### 7. Correlation between news and stock movement

In [None]:
unique_values = dataset['stock'].unique()
stock_dataset = pd.DataFrame()

In [None]:
for stock in unique_values:
    stock_data = yf.Ticker(stock)
    
    stock_dataset = pd.concat([stock_dataset, stock_data.history(period="max")])

stock_dataset.reset_index(drop=False, inplace=True)
stock_dataset.rename(columns={'Date': 'date'}, inplace=True)
stock_dataset

In [None]:
merged_dataset = pd.merge(dataset, stock_dataset, on='date', how='inner')
merged_dataset

In [None]:
# Score every headline with TextBlob and store the polarity per row.
#
# The loop used to assign the current scalar to the whole 'sentiment' column
# on every iteration, so all rows ended up with the polarity of the last
# headline visited. Scoring all rows may take a while on a large dataset,
# but the correlation cells need a value for each article.
# Requires `dataset` to still contain the 'headline' column.
dataset["sentiment"] = dataset["headline"].apply(
    lambda text: TextBlob(text).sentiment.polarity
)

# Show the label and score of the first 20 headlines.
preview = dataset[["headline", "sentiment"]].head(20)
for headline, sentiment in preview.itertuples(index=False, name=None):
    if sentiment > 0:
        sentiment_label = "Positive"
    elif sentiment < 0:
        sentiment_label = "Negative"
    else:
        sentiment_label = "Neutral"

    print(f"Headline: {headline}")
    print(f"Sentiment: {sentiment_label}")
    # Previously printed the entire column under a "Headline" label.
    print(f"Polarity: {sentiment}")

In [None]:
dataset.sort_values('date', inplace=True)
dataset['return'] = stock_df['Close'].pct_change() * 100
print(dataset)

In [None]:
daily_sentiments = dataset.groupby('date')['sentiment'].mean()

In [None]:
daily_returns = dataset.groupby('date')['return'].mean()

In [None]:
dataset_output  = pd.merge(daily_sentiments, daily_returns, on='date', how='inner')
dataset_output

In [None]:
spearman_corr, spearman_pvalue = spearmanr(df['sentiment'], df['return'])
print(f"Spearman correlation coefficient: {spearman_corr}")
print(f"P-value: {spearman_pvalue}")