## <span style="color:orange">Data Processor</span> for Deferred Revenue Strategy

This sheet starts the process of coding a backtest for a strategy picking stocks based on the ratio of the deferred revenue of equity to the liability value of equity. The first step is to load data on both accounting numbers and stock returns, and organize it so we can access it as we backtest.

### Data processor required functions

According to the pseudo-code above, the <span style="color:green">Data Processor</span> needs to do the following tasks:
1. Load all the necessary raw data (in constructor)
1. Return an array of unique dates in the raw data (`unique_dates()`)
1. For a given date, return a signal DataFrame containing all the latest signals for the appropriate universe of securities (`signal_df_for_date(date)`)
1. For a given date, return a price DataFrame containing the latest prices for all securities potentially in the portfolio, including those not in the current investable universe (`price_df_for_date(date)`)

In [1]:
import pandas as pd
import datetime as dt
import numpy as np
from pathlib import Path

In [2]:
# x = pd.read_csv('monthly_returns.csv')

In [9]:
# y = pd.read_csv('spy.csv')
# y['date'] = pd.to_datetime(x['date'])
# z = pd.read_stata('gvkey_permno_conversion.dta')
# z


Unnamed: 0,index,gvkey,permno,datadate,fyearq,fqtr
0,0,001000,25881.0,1970-12-31,1970.0,4.0
1,1,001000,25881.0,1971-03-31,1971.0,1.0
2,2,001000,25881.0,1971-06-30,1971.0,2.0
3,3,001000,25881.0,1971-09-30,1971.0,3.0
4,4,001000,25881.0,1971-12-31,1971.0,4.0
...,...,...,...,...,...,...
1173792,1173792,351038,16161.0,2021-12-31,2021.0,4.0
1173793,1173793,351038,16161.0,2022-03-31,2022.0,1.0
1173794,1173794,351038,16161.0,2022-06-30,2022.0,2.0
1173795,1173795,351038,16161.0,2022-09-30,2022.0,3.0


In [4]:
# pd.read_stata('gvkey_permno_conversion.dta')

In [5]:
# x = pd.read_csv('monthly_returns.csv')
# x['date'] = pd.to_datetime(x['date'])
# y = pd.read_csv('spy.csv')
# y['date'] = pd.to_datetime(x['date'], format="%Y-%m-%d")
# merged_df = pd.merge(x, y, on='date', how='inner')
# print(merged_df)

In [6]:
# pd.read_csv( 'spy.csv')

In [10]:
# Define class here
class DRDataProcessor:
    """Load and serve price and accounting data for the deferred-revenue backtest.

    The constructor reads three files from ``data_folder_path``:

    - ``monthly_returns.csv``: price/return rows keyed by ``permno`` and ``date``
    - ``comp_dr.dta``: accounting signal rows keyed by ``gvkey`` and ``datadate``
    - ``gvkey_permno_conversion.dta``: maps (``gvkey``, ``datadate``) to ``permno``

    The backtest then asks for the usable dates (``unique_dates``) and, per
    date, a price frame (``price_df_for_date``) and a signal frame
    (``signal_df_for_date``).
    """

    # Folder holding the input files (the empty path is the working directory)
    data_folder_path = Path('')

    # An accounting row is used only once its datadate is strictly more than
    # min_accounting_lag days before the backtest date.
    # max_accounting_lag only bounds the last date returned by unique_dates();
    # NOTE(review): it is not applied as a staleness filter in
    # signal_df_for_date -- confirm whether that is intended.
    min_accounting_lag = 30
    max_accounting_lag = 365

    # Minimum share price to open a new position
    min_share_price = 10

    def __init__(self):
        """Load, clean and merge the raw data files."""
        # Monthly price data (per the original notes: CRSP, all public US equities)
        self.price_df = pd.read_csv(self.data_folder_path / 'monthly_returns.csv')

        # Parse the yyyyMMdd dates into datetime64
        # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        self.price_df['date'] = pd.to_datetime(self.price_df['date'], format="%Y%m%d")

        # Prices are sometimes negative to indicate no volume at the closing
        # auction (price = -0.5*(bid+ask)); we only ever want the magnitude.
        # See http://www.crsp.org/products/documentation/data-definitions-p
        self.price_df['prc'] = self.price_df['prc'].abs()

        # The lead return below is "the next row of the same permno", so rows
        # must be in date order within each permno. Sort once here instead of
        # trusting the order of the CSV. The index labels are kept.
        self.price_df = self.price_df.sort_values(['permno', 'date'])

        # Add next month's return as 'ret_next' (NaN on each permno's last row)
        # TODO(review): an earlier note here said the lead should be 6 months
        # into the future; the code leads by a single row (one month).
        self.price_df['ret_next'] = safe_lead_lag(
            self.price_df['ret'], self.price_df['permno'], 1
        )

        # Accounting data used for the DR signal, stored in `comp_dr.dta`.
        # `.dta` files are Stata data and remember data types better than `.csv`.
        self.signal_df = pd.read_stata(self.data_folder_path / 'comp_dr.dta')

        # The accounting data identifies stocks by gvkey rather than permno,
        # so load the conversion table that maps (gvkey, datadate) to permno.
        self.gvkey_permno_conversion = pd.read_stata(
            self.data_folder_path / 'gvkey_permno_conversion.dta'
        )

        # Inner merge: accounting rows without a permno mapping are dropped
        self.signal_df = self.signal_df.merge(
            self.gvkey_permno_conversion, on=['gvkey', 'datadate']
        )

        # signal_df_for_date takes the *last* row per permno as the latest
        # observation, which is only right when rows are in datadate order.
        # A stable sort keeps the existing order among equal datadates.
        self.signal_df = self.signal_df.sort_values('datadate', kind='stable')

    def unique_dates(self):
        """Return the sorted unique price dates for which signals can exist.

        Dates come from ``price_df`` because that is the frequency at which the
        portfolio value can be updated. They are limited to the inclusive
        window [earliest datadate + min_accounting_lag days,
        latest datadate + max_accounting_lag days].
        """
        price_dates = pd.Series(np.sort(self.price_df['date'].unique()))
        signal_dates = self.signal_df['datadate']
        first_usable = signal_dates.min() + np.timedelta64(self.min_accounting_lag, 'D')
        last_usable = signal_dates.max() + np.timedelta64(self.max_accounting_lag, 'D')
        in_window = (price_dates >= first_usable) & (price_dates <= last_usable)
        return price_dates[in_window].array

    def price_df_for_date(self, date):
        """Return every ``price_df`` row observed on ``date``.

        The result keeps all ``price_df`` columns (including 'date', 'prc',
        'ret' and 'ret_next'), with 'permno' renamed to 'security_id'.

        No liquidity or future-return filter is applied, so some returned
        stocks cannot be traded. Use this only for closing and execution
        decisions.
        """
        on_date = self.price_df.loc[self.price_df['date'] == date, :]
        return on_date.rename(columns={'permno': 'security_id'})

    def signal_df_for_date(self, date):
        """Return the investable universe on ``date`` with its latest signals.

        Each row is a security that
        - has a price row on ``date`` with a finite next-period return
          (no point in backtesting if we don't know what happens next),
        - has at least one accounting row whose datadate is strictly more than
          ``min_accounting_lag`` days before ``date``, and
        - passes ``liquidity_filter``.

        Columns are the ``price_df`` columns without 'ret_next' (so backtests
        cannot cheat by using it), with 'permno' renamed to 'security_id',
        plus whatever signal columns the merged accounting data provides.
        """
        # Price rows on this date whose next return is known
        on_date = self.price_df.loc[self.price_df['date'] == date, :]
        # Build the mask from the rows already selected, not the whole frame
        has_next_return = np.isfinite(on_date['ret_next'])
        tradeable = on_date.loc[has_next_return, :].drop(columns=['ret_next'])

        # Accounting rows old enough to be known as of this date
        cutoff = date - np.timedelta64(self.min_accounting_lag, 'D')
        past_signals = self.signal_df.loc[self.signal_df['datadate'] < cutoff, :]

        # Latest observation per permno. last() takes the last non-null value
        # of each column within a group, in row order (datadate order, as set
        # up by the constructor).
        latest_signals = past_signals.groupby('permno').last()

        # Keep only securities present in both the price and the signal data
        merged = tradeable.merge(latest_signals, on='permno', how='inner')
        merged = merged.rename(columns={'permno': 'security_id'})

        return self.liquidity_filter(merged)

    def liquidity_filter(self, df):
        """Return the rows of ``df`` whose 'prc' is at least ``min_share_price``.

        Rows with a missing price fail the comparison and are dropped.
        """
        return df.loc[df['prc'] >= self.min_share_price, :]
        
###################################################################
# Helper methods, do not modify
###################################################################

# Function safe_lead_lag returns a new Series with the lead/lagged values
#  but only when a group is the same for the lead/lag
# Inputs:
# - data_series: data we want to lead/lag
# - group_series: grouping we want to be the same for the lead/lag to be valid
# requires data_series and group_series to already be sorted by group_series
# so that all alike values of group_series are adjacent,
# meaning group_series should look like:
#    g_0
#    g_0
#    g_0
#    g_0
#    g_1
#    g_1
#    g_2
#    g_2 
#    ...
# where g_i indicates the observation is in group i,
# and once the first g_{i+1} appears no more g_i values appear
# 
# lead_lag > 0 returns a data_series with values of data_series lead_lag rows ahead
# as long as group_series remains the same, NaN if group different
# lead_lag < 0 returns a data_series with values of data_series -lead_lag rows behind 
# (same as lead_lag rows ahead) as long as group_series remains the same, NaN if group different
def safe_lead_lag(data_series, group_series, lead_lag):
    """Shift ``data_series`` by ``lead_lag`` rows without crossing groups.

    The two inputs are paired into one frame and the shift is applied within
    each value of ``group_series``. A position whose source row would fall
    outside its own group gets NaN.

    - ``lead_lag > 0``: each row receives the value ``lead_lag`` rows ahead.
    - ``lead_lag < 0``: each row receives the value ``-lead_lag`` rows behind.

    Returns a Series named 'data'.
    """
    paired = pd.DataFrame({'data': data_series, 'group': group_series})
    # A lead of k rows is a shift of -k periods
    periods = -lead_lag
    values_by_group = paired.groupby(['group'])['data']
    return values_by_group.shift(periods)