In [1]:
!pip install pyspark -q
!pip install yfinance -q
!pip install yahoo_fin -q

In [2]:
from pyspark.sql import SparkSession

# Notebook-wide Spark session running locally on all available cores; it is
# reused if one already exists (getOrCreate).
spark = (
    SparkSession.builder
    .appName("finance")
    .master("local[*]")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/05 08:16:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from datetime import datetime, timedelta

from pyspark.sql.window import Window 
import pyspark.sql.functions as F
import pyspark.sql.types as T

import yfinance as yf
import pandas as pd

class FinancialFeaturesUtility():
    """Flatten raw per-ticker financial payloads into a single feature row.

    Four feature groups are supported (ratios, cash flow, balance sheet and
    income-statement style "financials"). Each group is described by a dict
    mapping the source field / line-item name to the snake_case column name
    used in the output.
    """

    def __init__(self):
        # Source key in the info mapping -> output column name.
        self.financial_ratio = {
            'marketCap': 'market_cap',
            'shortRatio': 'short_ratio',
            'trailingPE': 'trailing_pe',
            'trailingEps': 'trailing_eps',
            'priceToBook': 'price_to_book',
            'trailingPegRatio': 'trailing_peg',
        }

        # Cash-flow statement line item -> output column name.
        self.cashflow_features = {
            'Free Cash Flow': 'fcf',
            'Repayment Of Debt': 'repay_debt',
            'Issuance Of Debt': 'issue_debt',
            'Capital Expenditure': 'capex',
            'Effect Of Exchange Rate Changes': 'fx_rate_changes',
            'Changes In Cash': 'cash_changes',
            'Financing Cash Flow': 'fin_cash_flow',
            'Investing Cash Flow': 'inv_cash_flow',
            'Operating Cash Flow': 'op_cash_flow',
            'Depreciation And Amortization': 'depr_amort',
            'Stock Based Compensation': 'stock_comp',
            'Asset Impairment Charge': 'asset_impair',
            'Net Income From Continuing Operations': 'net_income_cont_ops',
        }

        # Balance-sheet line item -> output column name.
        self.balancesheet_features = {
            'Net Debt': 'net_debt',
            'Total Debt': 'tot_debt',
            'Tangible Book Value': 'tangible_bv',
            'Common Stock Equity': 'common_eq',
            'Working Capital': 'work_cap',
            'Total Equity Gross Minority Interest': 'tot_eq_minority',
            'Retained Earnings': 'ret_earn',
            'Long Term Debt': 'lt_debt',
            'Current Liabilities': 'curr_liab',
            'Current Debt': 'curr_debt',
            'Total Assets': 'tot_assets',
            'Total Non Current Assets': 'tot_non_curr_assets',
            'Goodwill And Other Intangible Assets': 'goodwill_intangible',
            'Net PPE': 'net_ppe',
            'Current Assets': 'curr_assets',
            'Restricted Cash': 'rest_cash',
            'Prepaid Assets': 'prepaid_assets',
            'Inventory': 'inventory',
            'Receivables': 'receivables',
            'Cash And Cash Equivalents': 'cash_equiv',
        }

        # "Financials" statement line item -> output column name.
        self.finance_financial_features = {
            'Normalized EBITDA': 'norm_ebitda',
            'EBIT': 'ebit',
            'Net Income From Continuing Operation Net Minority Interest': 'net_inc_minority',
            'Reconciled Depreciation': 'recon_depr',
            'Net Interest Income': 'net_interest',
            'Normalized Income': 'norm_income',
            'Diluted EPS': 'diluted_eps',
            'Net Income Common Stockholders': 'net_inc_common_stock',
            'Net Income': 'net_income',
            'Net Income Continuous Operations': 'net_inc_cont_ops',
            'Pretax Income': 'pretax_income',
            'Operating Income': 'op_income',
            'Gross Profit': 'gross_profit',
            'Total Revenue': 'tot_revenue',
        }

        # Numeric id -> feature group, in the same order the groups are joined.
        self.indicator_map = {
            1: self.financial_ratio,
            2: self.cashflow_features,
            3: self.balancesheet_features,
            4: self.finance_financial_features,
        }

    def schema_generator(self):
        """Build a Spark schema with `ticker` followed by every feature column.

        Returns:
            A `StructType` whose first field is the non-nullable string
            `ticker`, followed by one non-nullable double field per feature
            (ratios, cash flow, balance sheet, financials, in that order).
            The `error_flag`, `created_dt` and `dt` columns added by
            `transformer` are not part of this schema.
        """
        feature_groups = (
            self.financial_ratio,
            self.cashflow_features,
            self.balancesheet_features,
            self.finance_financial_features,
        )
        feature_names = [
            name
            for group in feature_groups
            for name in group.values()
        ]
        struct_fields = [T.StructField('ticker', T.StringType(), nullable=False)]
        struct_fields += [
            T.StructField(name, T.DoubleType(), nullable=False)
            for name in feature_names
        ]
        return T.StructType(struct_fields)

    @staticmethod
    def _first_column_values(statement, feature_map):
        """Read each mapped line item from the first column of `statement`.

        Line items absent from the statement default to 0.0.
        """
        # Transposing the first column gives a one-row frame whose columns are
        # the statement's index labels (the line items).
        first_period = statement.iloc[:, :1].T
        return [
            first_period[feature].values[0] if feature in first_period else 0.0
            for feature in feature_map
        ]

    def _build_feature_frame(self, ticker, feature_map, read_values, flag_column, label):
        """Build the one-row pandas frame shared by all four transformers.

        `read_values` is called inside the try block so that any failure while
        reading the payload (including a `None` payload) degrades to an
        all-NaN row with `flag_column` set to 1 instead of raising.
        """
        # Row is added by label-enlargement on an empty frame (rather than
        # constructing the frame from records) to keep the resulting column
        # dtypes, and hence the Spark types inferred downstream, unchanged.
        df = pd.DataFrame(columns=list(feature_map.values()))

        try:
            df.loc[ticker] = read_values()
            df[flag_column] = 0
        except Exception as error:  # best-effort: flag the row, keep going
            print(f"FAILED: {label} for {ticker} ({error!r})")
            df.loc[ticker] = [float('nan')] * len(df.columns)
            df[flag_column] = 1

        # The ticker was used as the row label; expose it as a regular column.
        return df.reset_index().rename(columns={'index': 'ticker'})

    def financial_ratio_transformer(
        self,
        ticker,
        stock_info
    ):
        """Extract the ratio features from an info mapping.

        Args:
            ticker: Ticker symbol, written to the `ticker` column.
            stock_info: Mapping supporting `in` and item lookup by the keys of
                `self.financial_ratio`. Missing keys become 0.0.

        Returns:
            One-row pandas DataFrame: `ticker`, the ratio columns and
            `fr_flag` (0 on success, 1 with NaN features on failure).
        """
        def read_values():
            return [
                stock_info[feature] if feature in stock_info else 0.0
                for feature in self.financial_ratio
            ]

        return self._build_feature_frame(
            ticker,
            self.financial_ratio,
            read_values,
            'fr_flag',
            'financial ratio featurs',
        )

    def cashflow_transformer(
        self,
        ticker,
        stock_cashflow
    ):
        """Extract the cash-flow features from the statement's first column.

        Args:
            ticker: Ticker symbol, written to the `ticker` column.
            stock_cashflow: DataFrame with line items on the index; only the
                first column is read (presumably the latest period - confirm
                against the data source). Missing line items become 0.0.

        Returns:
            One-row pandas DataFrame: `ticker`, the cash-flow columns and
            `cf_flag` (0 on success, 1 with NaN features on failure).
        """
        return self._build_feature_frame(
            ticker,
            self.cashflow_features,
            lambda: self._first_column_values(stock_cashflow, self.cashflow_features),
            'cf_flag',
            'cashflow',
        )

    def balancesheet_transformer(
        self,
        ticker,
        stock_balancesheet
    ):
        """Extract the balance-sheet features from the statement's first column.

        Args:
            ticker: Ticker symbol, written to the `ticker` column.
            stock_balancesheet: DataFrame with line items on the index; only
                the first column is read. Missing line items become 0.0.

        Returns:
            One-row pandas DataFrame: `ticker`, the balance-sheet columns and
            `bs_flag` (0 on success, 1 with NaN features on failure).
        """
        return self._build_feature_frame(
            ticker,
            self.balancesheet_features,
            lambda: self._first_column_values(stock_balancesheet, self.balancesheet_features),
            'bs_flag',
            'balancesheet',
        )

    def financial_transformer(
        self,
        ticker,
        stock_financial
    ):
        """Extract the "financials" features from the statement's first column.

        Args:
            ticker: Ticker symbol, written to the `ticker` column.
            stock_financial: DataFrame with line items on the index; only the
                first column is read. Missing line items become 0.0.

        Returns:
            One-row pandas DataFrame: `ticker`, the financials columns and
            `fin_flag` (0 on success, 1 with NaN features on failure).
        """
        return self._build_feature_frame(
            ticker,
            self.finance_financial_features,
            lambda: self._first_column_values(stock_financial, self.finance_financial_features),
            'fin_flag',
            'financial',
        )

    def transformer(
        self,
        ticker,
        stock_info,
        stock_cashflow,
        stock_balancesheet,
        stock_financial
    ):
        """Combine all four feature groups into one Spark row for `ticker`.

        Relies on the notebook-level `spark` session. The per-group flags are
        collapsed into a single `error_flag` (1 if any group failed), and
        `created_dt` / `dt` are stamped from one shared timestamp.

        Returns:
            Spark DataFrame with `ticker`, every feature column, `error_flag`,
            `created_dt` (timestamp) and `dt` (`YYYY-MM-DD` string).
        """
        financial_ratio_sdf = spark.createDataFrame(self.financial_ratio_transformer(ticker, stock_info))

        cashflow_sdf = spark.createDataFrame(self.cashflow_transformer(ticker, stock_cashflow))

        balance_sdf = spark.createDataFrame(self.balancesheet_transformer(ticker, stock_balancesheet))

        financial_sdf = spark.createDataFrame(self.financial_transformer(ticker, stock_financial))

        sdf = (
            financial_ratio_sdf
            .join(cashflow_sdf, on=['ticker'], how='left')
            .join(balance_sdf, on=['ticker'], how='left')
            .join(financial_sdf, on=['ticker'], how='left')
        ).fillna(0.0)

        sdf = (
            sdf
            .withColumn(
                'error_flag',
                F.when(
                    (F.col('fr_flag') == 1) |
                    (F.col('cf_flag') == 1) |
                    (F.col('bs_flag') == 1) |
                    (F.col('fin_flag') == 1),
                    1
                ).otherwise(0)
            )
            .drop(*['fr_flag', 'cf_flag', 'bs_flag', 'fin_flag'])
        )

        # Take the clock reading once so created_dt and dt can never disagree
        # (two separate now() calls could straddle midnight).
        created = datetime.now()
        sdf = sdf.withColumn('created_dt', F.lit(created))
        sdf = sdf.withColumn('dt', F.lit(created.strftime("%Y-%m-%d")))

        return sdf

class FinancialFeaturesETL():
    """Extract / transform / load pipeline for one ticker's financial features.

    `extract` pulls the raw payloads through `yfinance`, `transform` delegates
    to `FinancialFeaturesUtility.transformer`, and `load` merges a new result
    into an existing Spark DataFrame keeping the newest row per ticker.
    """

    def __init__(
        self,
        ticker
    ):
        self.ticker = ticker
        self.utility = FinancialFeaturesUtility()

        # Data will be filled during extract process
        self.info = None
        self.stock_info = None
        self.financials = None
        self.balancesheet = None
        self.cashflow = None

    def extract(self):
        """Fetch the ticker's info, financials, balance sheet and cash flow.

        Best-effort: any failure is reported on stdout and the attributes not
        yet assigned keep their previous value (`None` after construction).
        """
        try:
            self.info = yf.Ticker(self.ticker)

            self.stock_info = self.info.info
            self.financials = self.info.financials
            self.balancesheet = self.info.balancesheet
            self.cashflow = self.info.cashflow
        except Exception as error:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts,
            # and surface the cause instead of discarding it.
            print(f'ERROR: extract information ({error!r})')

    def transform(self):
        """Turn the extracted payloads into the flat feature DataFrame.

        Returns:
            Whatever `self.utility.transformer` returns for this ticker and
            the payloads stored by `extract`.
        """
        financial_features_df = self.utility.transformer(
            self.ticker,
            self.stock_info,
            self.cashflow,
            self.balancesheet,
            self.financials
        )

        return financial_features_df

    def load(
        self,
        data_sdf,
        sdf,
        timestamp_col='created_dt'
    ):
        """
        Update stock information, keeping only the most recent row per ticker.

        Args:
            data_sdf: Existing Spark DataFrame of feature rows.
            sdf: New rows to merge in; unioned by column position, so it must
                have the same column layout as `data_sdf`.
            timestamp_col: Column used to decide which row is the latest.
                Defaults to `created_dt`, the creation-timestamp column
                produced by `FinancialFeaturesUtility.transformer`.

        Returns:
            Spark DataFrame with one row per ticker: the one with the greatest
            `timestamp_col`.
        """
        # Partition by ticker only. Including the timestamp in the partition
        # key would make every (ticker, timestamp) pair its own group, so the
        # row_number() == 1 filter would keep every historical row.
        window_latest_info = (
            Window
            .partitionBy(F.col('ticker'))
            .orderBy(F.col(timestamp_col).desc())
        )
        updated_sdf = (
            data_sdf.union(sdf)
            .withColumn('latest_ranking', F.row_number().over(window_latest_info))
            .filter(F.col('latest_ranking') == 1)
        ).drop(*['latest_ranking'])

        return updated_sdf

    def etl_execute(self):
        """Placeholder for the end-to-end run; not implemented, returns None."""
        pass
        # return financial_features_sdf, log_sdf

In [4]:
# Smoke test: run extract + transform for a single ticker and preview the row.
aapl = FinancialFeaturesETL('AAPL')
aapl.extract()  # best-effort; on failure the payloads stay None
sdf = aapl.transform()
# Convert to pandas for rich display; the last expression is the cell output.
sdf.toPandas().head(n=2)
# sdf.printSchema()

23/08/05 08:16:22 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
  series = series.astype(t, copy=False)


Unnamed: 0,ticker,market_cap,short_ratio,trailing_pe,trailing_eps,price_to_book,trailing_peg,fcf,repay_debt,issue_debt,...,net_inc_common_stock,net_income,net_inc_cont_ops,pretax_income,op_income,gross_profit,tot_revenue,error_flag,created_dt,dt
0,AAPL,2862466000000.0,2.35,30.898134,5.89,46.038452,2.641,111443000000.0,-9543000000.0,9420000000.0,...,99803000000.0,99803000000.0,99803000000.0,119103000000.0,119437000000.0,170782000000.0,394328000000.0,0,2023-08-05 08:16:20.919822,2023-08-05
