In [2]:
# Import necessary libraries
import pandas as pd
import os
import sys

# Adjust the path to import from the src directory
src_path = os.path.abspath('src')
sys.path.insert(0, src_path)

# Import the DescriptiveStats class
from descriptive_stats import DescriptiveStats

# Define the paths to your datasets
analyst_data_path = 'data/raw_analyst_ratings.csv/raw_analyst_ratings.csv'
yfinance_data_folder = 'data/yfinance_data'

# Load the raw analyst ratings dataset
df_analyst = pd.read_csv(analyst_data_path)

# Display the first few rows and columns of the DataFrame to check column names
print("Columns in the Analyst DataFrame:")
print(df_analyst.columns)
print("\nFirst few rows of the Analyst DataFrame:")
display(df_analyst.head())

# Initialize the DescriptiveStats object for the analyst data
stats_analyst = DescriptiveStats(df_analyst)

# Display summary statistics for the analyst dataset
print("Summary Statistics for Analyst Data:")
display(stats_analyst.get_summary_statistics())

# Check and analyze textual lengths (update column name if necessary)
headline_column = 'headline'  # Update this if the actual column name is different
if headline_column in df_analyst.columns:
    print("\nTextual Length Analysis for Analyst Data:")
    display(stats_analyst.analyze_textual_lengths(headline_column))
else:
    print(f"Column '{headline_column}' not found in Analyst Data.")

# Count articles per publisher (update column name if necessary)
publisher_column = 'publisher'  # Update this if the actual column name is different
if publisher_column in df_analyst.columns:
    print("\nArticle Count per Publisher for Analyst Data:")
    display(stats_analyst.count_articles_per_publisher(publisher_column))
else:
    print(f"Column '{publisher_column}' not found in Analyst Data.")

# Analyze publication dates (update column name if necessary)
publication_date_column = 'publication_date'  # Update this if the actual column name is different
if publication_date_column in df_analyst.columns:
    print("\nPublication Date Analysis for Analyst Data:")
    display(stats_analyst.analyze_publication_dates(publication_date_column))
else:
    print(f"Column '{publication_date_column}' not found in Analyst Data.")

# Load multiple CSV files from the yfinance_data folder
def load_multiple_csv(folder_path):
    all_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]
    dfs = [pd.read_csv(file) for file in all_files]
    return pd.concat(dfs, ignore_index=True)

df_yfinance = load_multiple_csv(yfinance_data_folder)

# Display the first few rows and columns of the DataFrame to check column names
print("\nColumns in the yfinance DataFrame:")
print(df_yfinance.columns)
print("\nFirst few rows of the yfinance DataFrame:")
display(df_yfinance.head())

# Initialize the DescriptiveStats object for the yfinance data
stats_yfinance = DescriptiveStats(df_yfinance)

# Display summary statistics for the yfinance dataset
print("\nSummary Statistics for yfinance Data:")
display(stats_yfinance.get_summary_statistics())

# Check and analyze textual lengths (update column name if necessary)
if headline_column in df_yfinance.columns:
    print("\nTextual Length Analysis for yfinance Data:")
    display(stats_yfinance.analyze_textual_lengths(headline_column))
else:
    print(f"Column '{headline_column}' not found in yfinance Data.")

# Count articles per publisher (update column name if necessary)
if publisher_column in df_yfinance.columns:
    print("\nArticle Count per Publisher for yfinance Data:")
    display(stats_yfinance.count_articles_per_publisher(publisher_column))
else:
    print(f"Column '{publisher_column}' not found in yfinance Data.")

# Analyze publication dates (update column name if necessary)
if publication_date_column in df_yfinance.columns:
    print("\nPublication Date Analysis for yfinance Data:")
    display(stats_yfinance.analyze_publication_dates(publication_date_column))
else:
    print(f"Column '{publication_date_column}' not found in yfinance Data.")


Columns in the Analyst DataFrame:
Index(['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock'], dtype='object')

First few rows of the Analyst DataFrame:


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


Summary Statistics for Analyst Data:


Unnamed: 0.1,Unnamed: 0
count,1407328.0
mean,707245.4
std,408100.9
min,0.0
25%,353812.8
50%,707239.5
75%,1060710.0
max,1413848.0



Textual Length Analysis for Analyst Data:


Unnamed: 0,headline_length
count,1407328.0
mean,73.12051
std,40.73531
min,3.0
25%,47.0
50%,64.0
75%,87.0
max,512.0



Article Count per Publisher for Analyst Data:


publisher
Paul Quintaro                      228373
Lisa Levin                         186979
Benzinga Newsdesk                  150484
Charles Gross                       96732
Monica Gerson                       82380
                                    ...  
Shazir Mucklai - Imperium Group         1
Laura Jennings                          1
Eric Martin                             1
Jose Rodrigo                            1
Jeremie Capron                          1
Name: count, Length: 1034, dtype: int64

Column 'publication_date' not found in Analyst Data.

Columns in the yfinance DataFrame:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Dividends', 'Stock Splits'],
      dtype='object')

First few rows of the yfinance DataFrame:


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.098943,469033600,0.0,0.0
1,1980-12-15,0.12221,0.12221,0.121652,0.121652,0.093781,175884800,0.0,0.0
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.086898,105728000,0.0,0.0
3,1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089049,86441600,0.0,0.0
4,1980-12-18,0.118862,0.11942,0.118862,0.118862,0.09163,73449600,0.0,0.0



Summary Statistics for yfinance Data:


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
count,45428.0,45428.0,45428.0,45428.0,45428.0,45428.0,45428.0,45428.0
mean,46.79837,47.397792,46.190101,46.81059,45.367891,217778500.0,0.000847,0.002499
std,78.259474,79.27762,77.218532,78.274673,78.049554,307664500.0,0.022527,0.155291
min,0.034896,0.035547,0.033333,0.034115,0.031291,0.0,0.0,0.0
25%,0.6905,0.708,0.673541,0.693604,0.585204,47464550.0,0.0,0.0
50%,13.646101,13.82771,13.46325,13.64475,13.076301,99212000.0,0.0,0.0
75%,47.242501,47.720626,46.628125,47.193626,44.768961,261059500.0,0.0,0.0
max,542.349976,542.809998,528.359985,539.909973,539.909973,9230856000.0,3.08,20.0


Column 'headline' not found in yfinance Data.
Column 'publisher' not found in yfinance Data.
Column 'publication_date' not found in yfinance Data.


In [3]:
# Import necessary libraries
import pandas as pd
import os
import sys

# Adjust the path to import from the src directory
src_path = os.path.abspath('src')
sys.path.insert(0, src_path)

# Import the DescriptiveStats class
from descriptive_stats import DescriptiveStats

# Define the paths to your datasets
analyst_data_path = 'data/raw_analyst_ratings.csv/raw_analyst_ratings.csv'
yfinance_data_folder = 'data/yfinance_data'

# Load the raw analyst ratings dataset
df_analyst = pd.read_csv(analyst_data_path)

# Display the first few rows and columns of the DataFrame to check column names
print("Columns in the Analyst DataFrame:")
print(df_analyst.columns)
print("\nFirst few rows of the Analyst DataFrame:")
display(df_analyst.head())

# Initialize the DescriptiveStats object for the analyst data
stats_analyst = DescriptiveStats(df_analyst)

# Display summary statistics for the analyst dataset
print("Summary Statistics for Analyst Data:")
display(stats_analyst.get_summary_statistics())

# Check and analyze textual lengths (update column name if necessary)
headline_column = 'headline'  # Update this if the actual column name is different
if headline_column in df_analyst.columns:
    print("\nTextual Length Analysis for Analyst Data:")
    display(stats_analyst.analyze_textual_lengths(headline_column))
else:
    print(f"Column '{headline_column}' not found in Analyst Data.")

# Count articles per publisher (update column name if necessary)
publisher_column = 'publisher'  # Update this if the actual column name is different
if publisher_column in df_analyst.columns:
    print("\nArticle Count per Publisher for Analyst Data:")
    display(stats_analyst.count_articles_per_publisher(publisher_column))
else:
    print(f"Column '{publisher_column}' not found in Analyst Data.")

# Analyze publication dates (update column name if necessary)
publication_date_column = 'publication_date'  # Update this if the actual column name is different
if publication_date_column in df_analyst.columns:
    print("\nPublication Date Analysis for Analyst Data:")
    display(stats_analyst.analyze_publication_dates(publication_date_column))
else:
    print(f"Column '{publication_date_column}' not found in Analyst Data.")

# Load multiple CSV files from the yfinance_data folder
def load_multiple_csv(folder_path):
    all_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]
    dfs = [pd.read_csv(file) for file in all_files]
    return pd.concat(dfs, ignore_index=True)

df_yfinance = load_multiple_csv(yfinance_data_folder)

# Display the first few rows and columns of the DataFrame to check column names
print("\nColumns in the yfinance DataFrame:")
print(df_yfinance.columns)
print("\nFirst few rows of the yfinance DataFrame:")
display(df_yfinance.head())

# Initialize the DescriptiveStats object for the yfinance data
stats_yfinance = DescriptiveStats(df_yfinance)

# Display summary statistics for the yfinance dataset
print("\nSummary Statistics for yfinance Data:")
display(stats_yfinance.get_summary_statistics())

# Adjust analysis based on available columns

# Example placeholder columns (update these based on actual columns)
headline_column_yf = 'text'  # Example column name for textual analysis
publisher_column_yf = 'source'  # Example column name for publisher count
publication_date_column_yf = 'date'  # Example column name for publication date

# Check and analyze textual lengths (update column name if necessary)
if headline_column_yf in df_yfinance.columns:
    print("\nTextual Length Analysis for yfinance Data:")
    display(stats_yfinance.analyze_textual_lengths(headline_column_yf))
else:
    print(f"Column '{headline_column_yf}' not found in yfinance Data.")

# Count articles per publisher (update column name if necessary)
if publisher_column_yf in df_yfinance.columns:
    print("\nArticle Count per Publisher for yfinance Data:")
    display(stats_yfinance.count_articles_per_publisher(publisher_column_yf))
else:
    print(f"Column '{publisher_column_yf}' not found in yfinance Data.")

# Analyze publication dates (update column name if necessary)
if publication_date_column_yf in df_yfinance.columns:
    print("\nPublication Date Analysis for yfinance Data:")
    display(stats_yfinance.analyze_publication_dates(publication_date_column_yf))
else:
    print(f"Column '{publication_date_column_yf}' not found in yfinance Data.")


Columns in the Analyst DataFrame:
Index(['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock'], dtype='object')

First few rows of the Analyst DataFrame:


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


Summary Statistics for Analyst Data:


Unnamed: 0.1,Unnamed: 0
count,1407328.0
mean,707245.4
std,408100.9
min,0.0
25%,353812.8
50%,707239.5
75%,1060710.0
max,1413848.0



Textual Length Analysis for Analyst Data:


Unnamed: 0,headline_length
count,1407328.0
mean,73.12051
std,40.73531
min,3.0
25%,47.0
50%,64.0
75%,87.0
max,512.0



Article Count per Publisher for Analyst Data:


publisher
Paul Quintaro                      228373
Lisa Levin                         186979
Benzinga Newsdesk                  150484
Charles Gross                       96732
Monica Gerson                       82380
                                    ...  
Shazir Mucklai - Imperium Group         1
Laura Jennings                          1
Eric Martin                             1
Jose Rodrigo                            1
Jeremie Capron                          1
Name: count, Length: 1034, dtype: int64

Column 'publication_date' not found in Analyst Data.

Columns in the yfinance DataFrame:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Dividends', 'Stock Splits'],
      dtype='object')

First few rows of the yfinance DataFrame:


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.098943,469033600,0.0,0.0
1,1980-12-15,0.12221,0.12221,0.121652,0.121652,0.093781,175884800,0.0,0.0
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.086898,105728000,0.0,0.0
3,1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089049,86441600,0.0,0.0
4,1980-12-18,0.118862,0.11942,0.118862,0.118862,0.09163,73449600,0.0,0.0



Summary Statistics for yfinance Data:


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
count,45428.0,45428.0,45428.0,45428.0,45428.0,45428.0,45428.0,45428.0
mean,46.79837,47.397792,46.190101,46.81059,45.367891,217778500.0,0.000847,0.002499
std,78.259474,79.27762,77.218532,78.274673,78.049554,307664500.0,0.022527,0.155291
min,0.034896,0.035547,0.033333,0.034115,0.031291,0.0,0.0,0.0
25%,0.6905,0.708,0.673541,0.693604,0.585204,47464550.0,0.0,0.0
50%,13.646101,13.82771,13.46325,13.64475,13.076301,99212000.0,0.0,0.0
75%,47.242501,47.720626,46.628125,47.193626,44.768961,261059500.0,0.0,0.0
max,542.349976,542.809998,528.359985,539.909973,539.909973,9230856000.0,3.08,20.0


Column 'text' not found in yfinance Data.
Column 'source' not found in yfinance Data.
Column 'date' not found in yfinance Data.


In [4]:
# Notebook Code

# Import necessary libraries
import os
import pandas as pd
from src.data_loader import DataLoader
from src.descriptive_stats import DescriptiveStats

# Set up base path for data loading
base_path = os.path.abspath(os.path.join(os.path.dirname('__file__'), 'data'))

# Initialize the DataLoader with the base path
loader = DataLoader(base_path)

# Load the data
data = loader.load_data()

# Display the first few rows of the Yahoo Finance data
if 'yfinance_data' in data and not data['yfinance_data'].empty:
    print("YFinance Data (First 5 rows):")
    display(data['yfinance_data'].head())
else:
    print("No Yahoo Finance data loaded or data is empty.")

# Display the first few rows of the Analyst Ratings data
if 'analyst_ratings' in data and not data['analyst_ratings'].empty:
    print("\nAnalyst Ratings Data (First 5 rows):")
    display(data['analyst_ratings'].head())
else:
    print("No Analyst Ratings data loaded or data is empty.")

# Initialize the DescriptiveStats object with the Yahoo Finance data
if 'yfinance_data' in data and not data['yfinance_data'].empty:
    yfinance_stats = DescriptiveStats(data['yfinance_data'])
    
    # Get and display summary statistics
    print("\nSummary Statistics:")
    display(yfinance_stats.get_summary_statistics())
    
    # Get and display missing values
    print("\nMissing Values:")
    display(yfinance_stats.get_missing_values())
    
    # Get and display data types
    print("\nData Types:")
    display(yfinance_stats.get_data_types())
    
    # Get and display correlation matrix
    print("\nCorrelation Matrix:")
    display(yfinance_stats.get_correlations())
else:
    print("No valid Yahoo Finance data to perform statistics.")

# Initialize the DescriptiveStats object with the Analyst Ratings data
if 'analyst_ratings' in data and not data['analyst_ratings'].empty:
    analyst_stats = DescriptiveStats(data['analyst_ratings'])
    
    # Get and display summary statistics
    print("\nAnalyst Ratings Summary Statistics:")
    display(analyst_stats.get_summary_statistics())
    
    # Get and display missing values
    print("\nAnalyst Ratings Missing Values:")
    display(analyst_stats.get_missing_values())
    
    # Get and display data types
    print("\nAnalyst Ratings Data Types:")
    display(analyst_stats.get_data_types())
    
    # Get and display correlation matrix (if applicable)
    print("\nAnalyst Ratings Correlation Matrix:")
    display(analyst_stats.get_correlations())
else:
    print("No valid Analyst Ratings data to perform statistics.")


Loaded Yahoo Finance data with 45428 rows and 10 columns.
Loaded Analyst Ratings data with 1407328 rows and 6 columns.
YFinance Data (First 5 rows):


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits,source_file
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.098943,469033600,0.0,0.0,AAPL_historical_data.csv
1,1980-12-15,0.12221,0.12221,0.121652,0.121652,0.093781,175884800,0.0,0.0,AAPL_historical_data.csv
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.086898,105728000,0.0,0.0,AAPL_historical_data.csv
3,1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089049,86441600,0.0,0.0,AAPL_historical_data.csv
4,1980-12-18,0.118862,0.11942,0.118862,0.118862,0.09163,73449600,0.0,0.0,AAPL_historical_data.csv



Analyst Ratings Data (First 5 rows):


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A



Summary Statistics:


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
count,45428.0,45428.0,45428.0,45428.0,45428.0,45428.0,45428.0,45428.0
mean,46.79837,47.397792,46.190101,46.81059,45.367891,217778500.0,0.000847,0.002499
std,78.259474,79.27762,77.218532,78.274673,78.049554,307664500.0,0.022527,0.155291
min,0.034896,0.035547,0.033333,0.034115,0.031291,0.0,0.0,0.0
25%,0.6905,0.708,0.673541,0.693604,0.585204,47464550.0,0.0,0.0
50%,13.646101,13.82771,13.46325,13.64475,13.076301,99212000.0,0.0,0.0
75%,47.242501,47.720626,46.628125,47.193626,44.768961,261059500.0,0.0,0.0
max,542.349976,542.809998,528.359985,539.909973,539.909973,9230856000.0,3.08,20.0



Missing Values:


Date            0
Open            0
High            0
Low             0
Close           0
Adj Close       0
Volume          0
Dividends       0
Stock Splits    0
source_file     0
dtype: int64


Data Types:


Date             object
Open            float64
High            float64
Low             float64
Close           float64
Adj Close       float64
Volume            int64
Dividends       float64
Stock Splits    float64
source_file      object
dtype: object


Correlation Matrix:


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
Open,1.0,0.999894,0.99987,0.999762,0.999043,-0.289218,0.046169,0.006129
High,0.999894,1.0,0.999822,0.999877,0.999182,-0.288655,0.045824,0.006371
Low,0.99987,0.999822,1.0,0.999887,0.999141,-0.289927,0.046469,0.005969
Close,0.999762,0.999877,0.999887,1.0,0.99928,-0.289275,0.046165,0.006156
Adj Close,0.999043,0.999182,0.999141,0.99928,1.0,-0.282462,0.044679,0.006256
Volume,-0.289218,-0.288655,-0.289927,-0.289275,-0.282462,1.0,-0.019026,0.002062
Dividends,0.046169,0.045824,0.046469,0.046165,0.044679,-0.019026,1.0,-0.000605
Stock Splits,0.006129,0.006371,0.005969,0.006156,0.006256,0.002062,-0.000605,1.0



Analyst Ratings Summary Statistics:


Unnamed: 0.1,Unnamed: 0
count,1407328.0
mean,707245.4
std,408100.9
min,0.0
25%,353812.8
50%,707239.5
75%,1060710.0
max,1413848.0



Analyst Ratings Missing Values:


Unnamed: 0    0
headline      0
url           0
publisher     0
date          0
stock         0
dtype: int64


Analyst Ratings Data Types:


Unnamed: 0     int64
headline      object
url           object
publisher     object
date          object
stock         object
dtype: object


Analyst Ratings Correlation Matrix:


Unnamed: 0.1,Unnamed: 0
Unnamed: 0,1.0
