In [27]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import io
from urllib.request import urlopen
import zipfile
import os

import duckdb as ddb
import dask.dataframe as dd

import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

from itertools import combinations    
from scipy.stats import pearsonr

In [None]:
%load_ext sql

In [None]:
%sql duckdb:///eda-ddb/eda-gdelt.ddb

In [None]:
%sql duckdb:///eda-ddb/eda-yfinance.ddb

<h1> Basic EDA </h1>

In [6]:
# Load the event data from 'output2.parquet' into an in-memory pandas DataFrame.
# NOTE(review): the original comment suggests this parquet file was exported from the
# global DuckDB database with the command-line client — confirm.
df2 = dd.read_parquet('output2.parquet').compute()

In [7]:
# Turn the YYYYMMDD `Day` values into real datetimes and keep 2018 onwards.
df2['Date'] = pd.to_datetime(df2['Day'], format='%Y%m%d')
df2 = df2[df2['Date'].dt.year >= 2018]
df2 = df2.drop('Day', axis=1)
# Daily average of every numeric column. numeric_only=True keeps this working when the
# frame also carries text columns, which recent pandas refuses to average.
mean_df = df2.groupby('Date').mean(numeric_only=True)



In [None]:
# Daily mean Goldstein score over time, drawn on an explicit Axes with labelled axes.
fig, ax = plt.subplots()
ax.plot(mean_df['GoldsteinScale'])
ax.set_title('Goldstein Scale average over time')
ax.set_xlabel('Date')
ax.set_ylabel('Mean GoldsteinScale')
plt.show()

In [None]:
# Daily mean number of mentions over time.
fig, ax = plt.subplots()
ax.plot(mean_df['NumMentions'], label='NumMentions_avg')
ax.set_title('NumMentions average over time')
ax.set_xlabel('Date')
ax.set_ylabel('Mean NumMentions')
# The line label is only shown once a legend is drawn.
ax.legend()
plt.show()

In [None]:
# Distribution of the Goldstein score across all rows of df2.
fig, ax = plt.subplots()
ax.hist(df2['GoldsteinScale'])
ax.set_title('Goldstein Scale Distribution')
ax.set_xlabel('GoldsteinScale')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Scatter of mention count against Goldstein score, one marker per row of df2.
fig, ax = plt.subplots()
ax.plot(df2['NumMentions'], df2['GoldsteinScale'], 'o')
ax.set_title('NumMentions vs Goldstein Scale')
ax.set_xlabel('NumMentions')
ax.set_ylabel('GoldsteinScale')
plt.show()

In [None]:
# Lag analysis for the stock data

In [None]:
%%sql
select *
from INFORMATION_SCHEMA.COLUMNS
where TABLE_NAME='yfinance'

In [None]:
stock_df = %sql SELECT * FROM yfinance
stock_df
stock_df = stock_df.DataFrame()


In [None]:
stock_df


In [19]:
from pandas.plotting import lag_plot
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import pacf, acf


def make_pacf_plot(df, stockname, lag):
    """Plot the partial autocorrelation function of a stock's closing prices.

    Args:
        df: Frame with a ``Close`` column holding the price series.
        stockname: Name used in the plot title.
        lag: Value forwarded to ``plot_pacf`` as ``lags``.
    """
    # plot_pacf opens a new figure of its own unless it is handed an Axes, which left
    # the sized figure created here empty. Draw onto an explicit Axes instead.
    fig, ax = plt.subplots(figsize=(10, 6))
    plot_pacf(df['Close'], lags=lag, ax=ax)
    ax.set_title(f'Partial Autocorrelation Function (PACF) of {stockname} Stock Close Prices')
    ax.set_xlabel('Lags')
    ax.set_ylabel('Partial Autocorrelation')
    ax.grid(True)
    plt.show()

def make_acf_plot(df,stockname, lag):
    """Plot the autocorrelation function of a stock's closing prices.

    Args:
        df: Frame with a ``Close`` column holding the price series.
        stockname: Name used in the plot title.
        lag: Value forwarded to ``plot_acf`` as ``lags``.
    """
    # plot_acf opens a new figure of its own unless it is handed an Axes, which left
    # the sized figure created here empty. Draw onto an explicit Axes instead.
    fig, ax = plt.subplots(figsize=(10, 6))
    plot_acf(df['Close'], lags=lag, ax=ax)
    ax.set_title(f'Autocorrelation Function (ACF) of {stockname} Stock Close Prices')
    ax.set_xlabel('Lags')
    ax.set_ylabel('Autocorrelation')
    ax.grid(True)
    plt.show()

def make_lag_plot(df, stockname, t):
    """Draw a lag plot of a stock's closing prices.

    Args:
        df: Frame with a ``Close`` column holding the price series.
        stockname: Name used in the plot title.
        t: Lag passed to ``lag_plot`` and shown in the y-axis label.
    """
    fig, axes = plt.subplots(figsize=(10, 6))
    lag_plot(df['Close'], lag=t, ax=axes)
    # Replace the default labels set by lag_plot with price-specific ones.
    axes.set_title(f'Lag Plot of {stockname} Stock Close Prices')
    axes.set_xlabel('Close Price (t)')
    axes.set_ylabel(f'Close Price (t + {t})')
    axes.grid(True)
    plt.show()

def get_all_stock_names(df):
    """Return the distinct values found in the ``Stock`` column of ``df``."""
    tickers = df['Stock']
    return tickers.unique()



def first_non_significant_acf_lag(series, alpha=0.05, nlags=100):
    """Return the first lag >= 1 whose autocorrelation lies inside its confidence band.

    Args:
        series: Values passed to ``acf``.
        alpha: Significance level forwarded to ``acf`` for the confidence intervals.
        nlags: Number of lags forwarded to ``acf``.

    Returns:
        The lag index, or ``None`` when every inspected lag falls outside its band.
    """
    values, bounds = acf(series, alpha=alpha, nlags=nlags)
    # Re-centre each interval on zero so it can be compared with the value itself.
    centred = bounds - values[:, None]
    for lag, value in enumerate(values):
        if lag == 0:
            continue  # lag 0 is skipped, exactly as range(1, ...) did
        below = value < centred[lag, 0]
        above = value > centred[lag, 1]
        if not below and not above:
            return lag
    return None

def first_non_significant_pacf_lag(series, alpha=0.05, nlags=100):
    """Return the first lag >= 1 whose partial autocorrelation lies inside its confidence band.

    Args:
        series: Values passed to ``pacf``.
        alpha: Significance level forwarded to ``pacf`` for the confidence intervals.
        nlags: Number of lags forwarded to ``pacf``.

    Returns:
        The lag index, or ``None`` when every inspected lag falls outside its band.
    """
    pacf_values, bounds = pacf(series, alpha=alpha, nlags=nlags)
    # Re-centre each interval on zero so it can be compared with the value itself.
    centred = bounds - pacf_values[:, None]
    for lag, value in enumerate(pacf_values):
        if lag == 0:
            continue  # lag 0 is skipped, exactly as range(1, ...) did
        below = value < centred[lag, 0]
        above = value > centred[lag, 1]
        if not below and not above:
            return lag
    return None



def avg_non_sig_acf_each_sector():
    """Print, for each sector, the mean first non-significant ACF lag of its companies.

    Reads the module-level ``stock_df`` (columns ``Sector``, ``Stock``, ``Close``).
    Companies for which ``first_non_significant_acf_lag`` returns ``None`` are left
    out of the mean. If no company in a sector yields a lag, ``None`` is printed.
    """
    for sector in stock_df['Sector'].unique():
        sector_df = stock_df[stock_df['Sector'] == sector]

        # Collect one lag per company. Previously the value was overwritten on every
        # iteration, so the printed "average" was just the last company's lag.
        company_lags = []
        for comp in sector_df['Stock'].unique():
            comp_df = sector_df[sector_df['Stock'] == comp]
            lag = first_non_significant_acf_lag(comp_df['Close'])
            if lag is not None:
                company_lags.append(lag)

        avg_lag = sum(company_lags) / len(company_lags) if company_lags else None
        print(f'Sector: {sector}, Average Non-Significant ACF Lag: {avg_lag}')

def avg_non_sig_acf_company_in_sector(sector):
    """Print the first non-significant ACF lag of every company in ``sector``.

    Reads the module-level ``stock_df`` and prints one line per company. The printed
    label says "Average", but the value is that company's single lag.
    """
    in_sector = stock_df.loc[stock_df['Sector'] == sector]
    for ticker in in_sector['Stock'].unique():
        closes = in_sector.loc[in_sector['Stock'] == ticker, 'Close']
        lag_found = first_non_significant_acf_lag(closes)
        print(f'Sector: {sector}, Company: {ticker}, Average Non-Significant ACF Lag: {lag_found}')




In [None]:
# Report, per company, the first lag whose partial autocorrelation is not significant.
for sector in stock_df['Sector'].unique():
    sector_rows = stock_df[stock_df['Sector'] == sector]
    for company in sector_rows['Stock'].unique():
        companyseries = sector_rows.loc[sector_rows['Stock'] == company, 'Close']
        pacf_lag = first_non_significant_pacf_lag(companyseries)
        print(f'Sector: {sector}, Company: {company}, First Non-Significant PACF Lag: {pacf_lag}')
        

In [None]:
# Stock decomposition 

def stock_decomposition_plot(stock_name, month_start, month_end, year_start, year_end, period):
    """Plot an additive seasonal decomposition of one stock's closing prices.

    Reads the module-level ``stock_df`` (columns ``Stock``, ``Date``, ``Close``).

    Args:
        stock_name: Value of the ``Stock`` column to select.
        month_start, month_end: Inclusive month-of-year bounds (applied within every year).
        year_start, year_end: Inclusive year bounds.
        period: Seasonal period forwarded to ``seasonal_decompose``.

    NOTE(review): ``seasonal_decompose`` needs enough observations for the chosen
    period; short windows may raise — confirm against the statsmodels docs.
    """
    # Work on a date-indexed copy. Sorting applies to every stock (it used to be done
    # for AAPL only) because the decomposition assumes chronological order.
    stock_plot = stock_df[stock_df['Stock'] == stock_name].copy()
    stock_plot = stock_plot.set_index('Date').sort_index()

    months = stock_plot.index.month
    years = stock_plot.index.year
    in_window = (
        (months >= month_start) & (months <= month_end)
        & (years >= year_start) & (years <= year_end)
    )
    df_plot = stock_plot[in_window]

    result = seasonal_decompose(df_plot['Close'], model='additive', period=period)
    result.plot()
    plt.gcf().set_size_inches(12, 8)
    plt.suptitle(f'Decomposition of {stock_name} Stock Close Prices', fontsize=10, y = 0)
    plt.tick_params(axis='x', rotation = 45)
    plt.show()



# Decompose every stock over all months of 2019-2023 with a 365-observation period.
for stock_name in stock_df['Stock'].unique():
    stock_decomposition_plot(
        stock_name,
        month_start=1,
        month_end=12,
        year_start=2019,
        year_end=2023,
        period=365,
    )


In [None]:
def correlation_in_each_sector():
    """Print the Pearson correlation of closing prices for every company pair in each sector.

    Reads the module-level ``stock_df`` (columns ``Sector``, ``Stock``, ``Date``,
    ``Close``). Each pair is aligned on the dates both companies have, rows with
    missing values are dropped, and the correlation and p-value are printed.
    Pairs with fewer than two shared dates are reported as not computed.
    """
    for sector in stock_df['Sector'].unique():
        print(f'Sector: {sector}')
        all_comp = stock_df[stock_df['Sector'] == sector]['Stock'].unique()

        # Build each company's Date-indexed Close series once rather than once per pair.
        close_by_comp = {
            comp: stock_df[stock_df['Stock'] == comp].set_index('Date')['Close']
            for comp in all_comp
        }

        # combinations() over unique tickers never pairs a company with itself, so no
        # extra filtering is needed.
        for comp1, comp2 in combinations(all_comp, 2):
            concatenated_df = pd.merge(close_by_comp[comp1], close_by_comp[comp2], on='Date')
            concatenated_df = concatenated_df.dropna()
            if len(concatenated_df) < 2:
                # pearsonr needs at least two observations; report instead of raising.
                print(f'Company: {comp1}, Company:{comp2}, Correlation: not computed (fewer than 2 shared dates)')
                continue
            ret = pearsonr(concatenated_df['Close_x'], concatenated_df['Close_y'])
            print(f'Company: {comp1}, Company:{comp2}, Correlation: {ret[0]}, P-Value: {ret[1]}')

# Run the pairwise report for every sector in stock_df (prints one line per company pair).
correlation_in_each_sector()

<h1> AI filter </h1>

In [None]:
# Setup the ai

In [None]:
import json
import requests
import nltk
from nltk.tokenize import word_tokenize


# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
model = 'llama3.2' # TODO: update this for whatever model you wish to use


def generate(prompt, context):
    """Send a prompt to the local Ollama server and return the complete generated text.

    Args:
        prompt: Prompt text to send.
        context: Value sent as the request's ``context`` field.

    Returns:
        The concatenation of every streamed ``response`` fragment. If the stream ends
        without a ``done`` message, the text received so far is returned.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Exception: If a streamed message carries an ``error`` field.
    """
    # Using the response as a context manager releases the streamed connection on
    # normal return and on error.
    with requests.post('http://localhost:11434/api/generate',
                       json={
                           'model': model,
                           'prompt': prompt,
                           'context': context,
                       },
                       stream=True) as r:
        r.raise_for_status()
        response = ""
        for line in r.iter_lines():
            # iter_lines can yield empty keep-alive lines, which are not valid JSON.
            if not line:
                continue
            body = json.loads(line)

            # Check for an error before using anything else from this message.
            if 'error' in body:
                raise Exception(body['error'])

            # The reply streams in fragments; accumulate them into one string.
            response += body.get('response', '')

            if body.get('done', False):
                break
    return response

# This gonna take 15000 days to run
def ai_filter(url, sector):
    """Ask the model whether the page at ``url`` is likely to move the stock price of ``sector``.

    Args:
        url: URL of the article, inserted into the prompt.
        sector: Sector name, inserted into the prompt.

    Returns:
        ``True`` only if the last word of the model's reply is "yes" (case-insensitive).
        Any other reply, including an empty one, yields ``False``.

    NOTE(review): only the URL string is sent to the model; whether the model can
    read the page content is not established here — confirm.
    """
    prompt_parts = [
        f"Given the URL {url} and sector name {sector}, please follow these steps to assess wheter the headlines on the webpage have a measureable effect on the stock price of the specified sector.",
        "1.Headline Extraction: Extract and summarize the key informatoin from the URL.",
        f"2.Relevance to sector Performance:Based on the content of the URL, determine if they contain information that could influence stock price of {sector}., For example, look for news related to company performance, earnings reports, product launches, regulatory decisions, market trends, or significant events (mergers, acquisitions, etc.). ",
        "3.Conclusion: State whether or not the content are likely to have a substantial direct impact on the stock. give rational connection between the headlines and stock performance. Provide specific reasons to support your conclusion",
        "4. Finally: from the previous conclusion, answer this question in one word: 'yes' or 'no' without punctuation in a newline: 'Do the headlines have a measureable effect on the stock price of the specified company?'",
    ]
    # Put each numbered step on its own line; they used to be concatenated with no
    # separator, so one step ran straight into the next.
    prompt = "\n".join(prompt_parts)

    context = []
    response = generate(prompt, context) or ""
    tokens = word_tokenize(response)

    # Use the last alphabetic token so that a trailing "." or quote after the answer
    # does not hide it, and so an empty reply does not raise IndexError.
    last_word = next((tok.lower() for tok in reversed(tokens) if tok.isalpha()), "")

    # Anything other than an explicit "yes" is treated as "no".
    return last_word == "yes"
   
# Single trial call; as the last expression of the cell, its boolean result is displayed.
# Requires the Ollama server used by generate() to be running.
ai_filter("indiantimes.com/bakivsjackhanmaTMR", "Tech")



In [None]:
# Pull the gdelt rows whose Day lies between 20241002 and 20241003 (inclusive).
# NOTE(review): two `%sql duckdb:///...` connections are opened earlier in this notebook
# (gdelt first, then yfinance). Confirm that this query runs against the gdelt
# database and not whichever connection was opened last.
global_data_ai_test= %sql SELECT * FROM gdelt WHERE Day BETWEEN 20241002 AND 20241003

In [None]:
global_data_ai_test = global_data_ai_test.DataFrame()

In [None]:
# Filter out some data to test and speed up the ai 
# Threshold = mean + one standard deviation of NumMentions; keep only rows above it.
mention_counts = global_data_ai_test['NumMentions']
mean_std1 = mention_counts.std() + mention_counts.mean()
filtered_event = global_data_ai_test.loc[mention_counts > mean_std1]

In [None]:
# Classifying every row took about two days in real life, which is too long.
# Rows may share the same SOURCEURL, so remember each URL's verdict and ask the model
# only once per distinct URL.
_url_verdicts = {}


def _cached_ai_filter(url):
    """Return ai_filter's verdict for ``url``, querying the model only on first sight."""
    if url not in _url_verdicts:
        _url_verdicts[url] = ai_filter(url, sector='tech')
    return _url_verdicts[url]


fitlered2_ret = filtered_event[filtered_event['SOURCEURL'].apply(_cached_ai_filter)]


In [None]:
fitlered2_ret.to_parquet('fitlered2_ret.parquet', index=False)
# Save the data to a parquet file

In [17]:
filtered_data = dd.read_parquet('fitlered2_ret.parquet').compute()


In [None]:
filtered_data

In [None]:
# Question 5
import scipy.stats  as stats
import numpy as np


# Apple-related news articles, restricted to the years 2018-2024 (inclusive).
appl_news = pd.read_csv("stock_related_articles/stocks_to_keywords_broad/AAPL_apple.csv")
appl_news['Date'] = pd.to_datetime(appl_news['Date'])
appl_news = appl_news[appl_news['Date'].dt.year.between(2018, 2024)]

# AAPL price rows, restricted to the years 2018-2025 (inclusive).
aapl_stock = stock_df[stock_df['Stock'] == 'AAPL']
aapl_stock = aapl_stock[aapl_stock['Date'].dt.year.between(2018, 2025)]

# print(nvidia_stock.head())

def flat_find_mean_correlation(df1, df2, lag):
    """Correlate the lag-shifted closing price with the daily mean Goldstein score.

    Args:
        df1: Stock frame with ``Date`` and ``Close`` columns.
        df2: Event frame with ``Date``, ``GoldsteinScale`` and ``NumMentions`` columns.
        lag: Number of rows by which ``Close`` is shifted backwards (``shift(-lag)``),
            pairing each date with the close ``lag`` rows later.

    Returns:
        ``(correlation, p_value)`` from ``stats.pearsonr``.
    """
    daily_events = df2[['Date', 'GoldsteinScale', 'NumMentions']].groupby('Date').mean()
    shifted_close = df1.set_index('Date')['Close'].shift(-lag)
    # Inner join on the date index, then drop rows left incomplete by the shift.
    aligned = pd.merge(shifted_close, daily_events, left_index=True, right_index=True).dropna()
    correlation, p_value = stats.pearsonr(aligned['Close'], aligned['GoldsteinScale'])
    return correlation, p_value

def flat_find_weightedmean_correlation(df1, df2, lag):
    """Correlate the lag-shifted closing price with the mention-weighted daily Goldstein score.

    The weighted score is the daily mean ``GoldsteinScale`` multiplied by the daily
    mean ``NumMentions``.

    Args:
        df1: Stock frame with ``Date`` and ``Close`` columns.
        df2: Event frame with ``Date``, ``GoldsteinScale`` and ``NumMentions`` columns.
        lag: Number of rows by which ``Close`` is shifted backwards (``shift(-lag)``).

    Returns:
        ``(correlation, p_value)`` from ``stats.pearsonr``.
    """
    daily_events = df2[['Date', 'GoldsteinScale', 'NumMentions']].groupby('Date').mean()
    shifted_close = df1.set_index('Date')['Close'].shift(-lag)
    aligned = pd.merge(shifted_close, daily_events, left_index=True, right_index=True)
    aligned = aligned.assign(
        GoldenWeighted=aligned['GoldsteinScale'] * aligned['NumMentions']
    ).dropna()
    correlation, p_value = stats.pearsonr(aligned['Close'], aligned['GoldenWeighted'])
    return correlation, p_value

def flat_find_sqrt_correl(df1, df2, lag, year):
    """Correlate the lag-shifted closing price with Goldstein score weighted by sqrt(mentions).

    The weighted score is the daily mean ``GoldsteinScale`` multiplied by the square
    root of the daily mean ``NumMentions``.

    Args:
        df1: Stock frame with ``Date`` and ``Close`` columns.
        df2: Event frame with ``Date``, ``GoldsteinScale`` and ``NumMentions`` columns.
        lag: Number of rows by which ``Close`` is shifted backwards (``shift(-lag)``).
        year: Accepted but not used by this function.

    Returns:
        ``(correlation, p_value)`` from ``stats.pearsonr``.
    """
    daily_events = df2[['Date', 'GoldsteinScale', 'NumMentions']].groupby('Date').mean()
    shifted_close = df1.set_index('Date')['Close'].shift(-lag)
    aligned = pd.merge(shifted_close, daily_events, left_index=True, right_index=True)
    aligned = aligned.assign(
        GoldenWeighted=aligned['GoldsteinScale'] * np.sqrt(aligned['NumMentions'])
    ).dropna()
    correlation, p_value = stats.pearsonr(aligned['Close'], aligned['GoldenWeighted'])
    return correlation, p_value
    
def percentage_find_mean_correlation(df1, df2):
    """Correlate daily Close percentage change with the mention-weighted mean Goldstein score.

    Despite the name, the score used is the daily mean ``GoldsteinScale`` multiplied
    by the daily mean ``NumMentions``.

    Args:
        df1: Stock frame with ``Date`` and ``Close`` columns. ``pct_change`` compares
            each row with the previous one, so rows should be in chronological order.
        df2: Event frame with ``Date``, ``GoldsteinScale`` and ``NumMentions`` columns.

    Returns:
        ``(correlation, p_value)`` from ``stats.pearsonr``.
    """
    df2 = df2[['Date', 'GoldsteinScale', 'NumMentions']]
    df2 = df2.groupby('Date').mean()
    # Merge only the Close returns. Merging the whole stock frame made the dropna()
    # below discard days because of NaNs in unrelated stock columns.
    close_returns = df1.set_index('Date')['Close'].pct_change()
    concatenated_df = pd.merge(close_returns, df2, left_index=True, right_index=True)
    concatenated_df['GoldenWeighted'] = concatenated_df['GoldsteinScale'] * (concatenated_df['NumMentions'])
    concatenated_df = concatenated_df.dropna()
    ret = stats.pearsonr(concatenated_df['Close'], concatenated_df['GoldenWeighted'])
    return ret[0], ret[1]


def percentage_find_sqrt_mean_correlation(df1, df2):
    """Correlate daily Close percentage change with Goldstein score weighted by sqrt(mentions).

    The score is the daily mean ``GoldsteinScale`` multiplied by the square root of
    the daily mean ``NumMentions``.

    Args:
        df1: Stock frame with ``Date`` and ``Close`` columns. ``pct_change`` compares
            each row with the previous one, so rows should be in chronological order.
        df2: Event frame with ``Date``, ``GoldsteinScale`` and ``NumMentions`` columns.

    Returns:
        ``(correlation, p_value)`` from ``stats.pearsonr``.
    """
    df2 = df2[['Date', 'GoldsteinScale', 'NumMentions']]
    df2 = df2.groupby('Date').mean()
    # Merge only the Close returns. Merging the whole stock frame made the dropna()
    # below discard days because of NaNs in unrelated stock columns.
    close_returns = df1.set_index('Date')['Close'].pct_change()
    concatenated_df = pd.merge(close_returns, df2, left_index=True, right_index=True)
    concatenated_df['GoldenWeighted'] = concatenated_df['GoldsteinScale'] * np.sqrt(concatenated_df['NumMentions'])
    concatenated_df = concatenated_df.dropna()
    ret = stats.pearsonr(concatenated_df['Close'], concatenated_df['GoldenWeighted'])
    return ret[0], ret[1]

def percentage_find_log_mean_correlation(df1, df2):
    """Correlate daily Close percentage change with Goldstein score weighted by log(mentions).

    The score is the daily mean ``GoldsteinScale`` multiplied by the natural log of
    the daily mean ``NumMentions``.

    Args:
        df1: Stock frame with ``Date`` and ``Close`` columns. ``pct_change`` compares
            each row with the previous one, so rows should be in chronological order.
        df2: Event frame with ``Date``, ``GoldsteinScale`` and ``NumMentions`` columns.
            NOTE(review): a daily mean ``NumMentions`` of 0 would make the log
            non-finite — confirm the data never contains it.

    Returns:
        ``(correlation, p_value)`` from ``stats.pearsonr``.
    """
    df2 = df2[['Date', 'GoldsteinScale', 'NumMentions']]
    df2 = df2.groupby('Date').mean()
    # Merge only the Close returns. Merging the whole stock frame made the dropna()
    # below discard days because of NaNs in unrelated stock columns.
    close_returns = df1.set_index('Date')['Close'].pct_change()
    concatenated_df = pd.merge(close_returns, df2, left_index=True, right_index=True)
    concatenated_df['GoldenWeighted'] = concatenated_df['GoldsteinScale'] * np.log(concatenated_df['NumMentions'])
    concatenated_df = concatenated_df.dropna()
    ret = stats.pearsonr(concatenated_df['Close'], concatenated_df['GoldenWeighted'])
    return ret[0], ret[1]

def percentage_find_sum_correlation(df1, df2):
    """Correlate daily Close percentage change with the daily summed Goldstein score.

    Args:
        df1: Stock frame with ``Date`` and ``Close`` columns. ``pct_change`` compares
            each row with the previous one, so rows should be in chronological order.
        df2: Event frame with ``Date``, ``GoldsteinScale`` and ``NumMentions`` columns.

    Returns:
        ``(correlation, p_value)`` from ``stats.pearsonr``.
    """
    df2 = df2[['Date', 'GoldsteinScale', 'NumMentions']]
    df2 = df2.groupby('Date').sum()
    # Merge only the Close returns. Merging the whole stock frame made the dropna()
    # below discard days because of NaNs in unrelated stock columns.
    close_returns = df1.set_index('Date')['Close'].pct_change()
    concatenated_df = pd.merge(close_returns, df2, left_index=True, right_index=True)
    concatenated_df['GoldenWeighted'] = concatenated_df['GoldsteinScale']
    concatenated_df = concatenated_df.dropna()
    ret = stats.pearsonr(concatenated_df['Close'], concatenated_df['GoldenWeighted'])
    return ret[0], ret[1]
def percentage_find_sum_log_correlation(df1, df2):
    """Correlate daily Close percentage change with summed Goldstein score times log(summed mentions).

    Args:
        df1: Stock frame with ``Date`` and ``Close`` columns. ``pct_change`` compares
            each row with the previous one, so rows should be in chronological order.
        df2: Event frame with ``Date``, ``GoldsteinScale`` and ``NumMentions`` columns.
            NOTE(review): a daily ``NumMentions`` total of 0 would make the log
            non-finite — confirm the data never contains it.

    Returns:
        ``(correlation, p_value)`` from ``stats.pearsonr``.
    """
    df2 = df2[['Date', 'GoldsteinScale', 'NumMentions']]
    df2 = df2.groupby('Date').sum()
    # Merge only the Close returns. Merging the whole stock frame made the dropna()
    # below discard days because of NaNs in unrelated stock columns.
    close_returns = df1.set_index('Date')['Close'].pct_change()
    concatenated_df = pd.merge(close_returns, df2, left_index=True, right_index=True)
    concatenated_df['GoldenWeighted'] = concatenated_df['GoldsteinScale'] * np.log(concatenated_df['NumMentions'])
    concatenated_df = concatenated_df.dropna()
    ret = stats.pearsonr(concatenated_df['Close'], concatenated_df['GoldenWeighted'])
    return ret[0], ret[1]


# percentage_find_mean_correlation(aapl_df, aapl_stock, 1, 2024)

# print(percentage_find_sqrt_mean_correlation(nvidia_stock, gd, 1, 2024))
# print(flat_find_mean_correlation(aapl_stock, gd,1, 2024))
# print(percentage_find_sqrt_mean_correlation(aapl_stock, gd, 2024) ) 
# print(percentage_find_mean_correlation(aapl_stock, gd, 2024))

# print(flat_find_mean_correlation(aapl_stock, appl_news, 1))
# print(flat_find_weightedmean_correlation(aapl_stock, appl_news, 1))
# print(flat_find_sqrt_correl(aapl_stock, appl_news, 1, 2024))
# print(percentage_find_mean_correlation(aapl_stock, appl_news))
# print(percentage_find_sqrt_mean_correlation(aapl_stock, appl_news))
# print(percentage_find_log_mean_correlation(aapl_stock, appl_news))
# Print (correlation, p-value) for the summed and the summed-log Goldstein variants.
for correlation_fn in (percentage_find_sum_correlation, percentage_find_sum_log_correlation):
    print(correlation_fn(aapl_stock, appl_news))

