## <span style="color:orange">Data Processor</span> for Deferred Revenue Strategy

This notebook starts the process of coding a backtest for a strategy that picks stocks based on the ratio of a firm's deferred revenue (`drc`) to its liabilities (`lt`). The first step is to load data on both accounting numbers and stock returns, and organize it so we can access it as we backtest.

### Data processor required functions

According to the pseudo-code above, the <span style="color:green">Data Processor</span> needs to do the following tasks:
1. Load all the necessary raw data (in constructor)
1. Return an array of unique dates in the raw data (`unique_dates()`)
1. For a given date, return a signal DataFrame containing all the latest signals for the appropriate universe of securities (`signal_df_for_date(date)`)
1. For a given date, return a price DataFrame containing the latest prices for all securities potentially in the portfolio, including those not in the current investable universe (`price_df_for_date(date)`)

In [1]:
import pandas as pd
import datetime as dt
import numpy as np
from pathlib import Path

In [2]:
# x = pd.read_csv('monthly_returns.csv')

In [45]:
# spy = pd.read_csv('spy.csv')

In [46]:
# spy

Unnamed: 0,PERMNO,date,SICCD,TICKER,ACPERM,NWPERM,RET
0,84398,2000-01-31,6726,SPY,,,-0.049362
1,84398,2000-02-29,6726,SPY,,,-0.015667
2,84398,2000-03-31,6726,SPY,,,0.096832
3,84398,2000-04-28,6726,SPY,,,-0.035121
4,84398,2000-05-31,6726,SPY,,,-0.015723
...,...,...,...,...,...,...,...
283,84398,2023-08-31,6726,SPY,,,-0.016252
284,84398,2023-09-29,6726,SPY,,,-0.047267
285,84398,2023-10-31,6726,SPY,,,-0.021709
286,84398,2023-11-30,6726,SPY,,,0.091344


In [3]:
# y = pd.read_csv('spy.csv')
# y['date'] = pd.to_datetime(x['date'])
# z = pd.read_stata('gvkey_permno_conversion.dta')
# z


Unnamed: 0,index,gvkey,permno,datadate,fyearq,fqtr
0,0,001000,25881.0,1970-12-31,1970.0,4.0
1,1,001000,25881.0,1971-03-31,1971.0,1.0
2,2,001000,25881.0,1971-06-30,1971.0,2.0
3,3,001000,25881.0,1971-09-30,1971.0,3.0
4,4,001000,25881.0,1971-12-31,1971.0,4.0
...,...,...,...,...,...,...
1173792,1173792,351038,16161.0,2021-12-31,2021.0,4.0
1173793,1173793,351038,16161.0,2022-03-31,2022.0,1.0
1173794,1173794,351038,16161.0,2022-06-30,2022.0,2.0
1173795,1173795,351038,16161.0,2022-09-30,2022.0,3.0


In [4]:
# pd.read_stata('gvkey_permno_conversion.dta')

Unnamed: 0,index,gvkey,permno,datadate,fyearq,fqtr
0,0,001000,25881.0,1970-12-31,1970.0,4.0
1,1,001000,25881.0,1971-03-31,1971.0,1.0
2,2,001000,25881.0,1971-06-30,1971.0,2.0
3,3,001000,25881.0,1971-09-30,1971.0,3.0
4,4,001000,25881.0,1971-12-31,1971.0,4.0
...,...,...,...,...,...,...
1173792,1173792,351038,16161.0,2021-12-31,2021.0,4.0
1173793,1173793,351038,16161.0,2022-03-31,2022.0,1.0
1173794,1173794,351038,16161.0,2022-06-30,2022.0,2.0
1173795,1173795,351038,16161.0,2022-09-30,2022.0,3.0


In [26]:
# x = pd.read_csv('monthly_returns.csv')
# x['date'] = pd.to_datetime(x['date'])
# y = pd.read_csv('spy.csv')
# y['date'] = pd.to_datetime(x['date'])
# # merged_df = pd.merge(x, y, on='date', how='inner')
# y.rename(columns = {'PERMNO': 'permno'}, inplace = True)
# concat = pd.concat([x,y])
# print(concat)
# print(y)

     permno                          date     prc     vol       ret  shrout  \
0     10000 1970-01-01 00:00:00.019851231     NaN     NaN       NaN     NaN   
1     10000 1970-01-01 00:00:00.019860131  4.3750  1771.0       NaN  3680.0   
2     10000 1970-01-01 00:00:00.019860228  3.2500   828.0 -0.257143  3680.0   
3     10000 1970-01-01 00:00:00.019860331  4.4375  1078.0  0.365385  3680.0   
4     10000 1970-01-01 00:00:00.019860430  4.0000   957.0 -0.098592  3793.0   
..      ...                           ...     ...     ...       ...     ...   
283   84398 1970-01-01 00:00:00.020071231     NaN     NaN       NaN     NaN   
284   84398 1970-01-01 00:00:00.020080131     NaN     NaN       NaN     NaN   
285   84398 1970-01-01 00:00:00.020080229     NaN     NaN       NaN     NaN   
286   84398 1970-01-01 00:00:00.020080331     NaN     NaN       NaN     NaN   
287   84398 1970-01-01 00:00:00.020080430     NaN     NaN       NaN     NaN   

         retx   SICCD TICKER  ACPERM  NWPERM       

In [10]:
# gv = pd.read_stata( 'gvkey_permno_conversion.dta')
# fake_df = pd.DataFrame()
# fake_df['datadate'] = x['date']
# fake_df['gvkey'] = '123456789'
# fake_df['datadate'] = pd.to_datetime(fake_df['datadate'])
# gv['datadate'] = pd.to_datetime(gv['datadate'])
# gv = pd.concat([gv, fake_df])
# print(gv)
# print(gv[gv['gvkey'] == '123456789'])

         index      gvkey   permno                      datadate  fyearq  fqtr
0          0.0     001000  25881.0 1970-12-31 00:00:00.000000000  1970.0   4.0
1          1.0     001000  25881.0 1971-03-31 00:00:00.000000000  1971.0   1.0
2          2.0     001000  25881.0 1971-06-30 00:00:00.000000000  1971.0   2.0
3          3.0     001000  25881.0 1971-09-30 00:00:00.000000000  1971.0   3.0
4          4.0     001000  25881.0 1971-12-31 00:00:00.000000000  1971.0   4.0
...        ...        ...      ...                           ...     ...   ...
3474759    NaN  123456789      NaN 1970-01-01 00:00:00.020220831     NaN   NaN
3474760    NaN  123456789      NaN 1970-01-01 00:00:00.020220930     NaN   NaN
3474761    NaN  123456789      NaN 1970-01-01 00:00:00.020221031     NaN   NaN
3474762    NaN  123456789      NaN 1970-01-01 00:00:00.020221130     NaN   NaN
3474763    NaN  123456789      NaN 1970-01-01 00:00:00.020221230     NaN   NaN

[4648561 rows x 6 columns]
         index      gvke

In [16]:
# raw = pd.read_stata('comp_dr.dta')
# raw.dropna(subset=['drc', 'lt'], inplace = True)
# raw = raw[(raw['drc'] != 0) & (raw['lt'] != 0)]
# raw['dr'] = raw['drc'] / raw['lt']
# raw

Unnamed: 0,gvkey,datadate,indfmt,consol,popsrc,datafmt,tic,conm,curcd,fyr,dlc,drc,drlt,lt,costat,LINKTYPE,LPERMNO,MthCalDt,dr
12,001004,2012-05-31,INDL,C,D,STD,AIR,AAR CORP,USD,5.0,122.865,1.160,12.834,1329.631,A,LU,54594.0,2012-05-31,0.000872
13,001004,2013-05-31,INDL,C,D,STD,AIR,AAR CORP,USD,5.0,86.400,1.200,11.700,1217.400,A,LU,54594.0,2013-05-31,0.000986
14,001004,2014-05-31,INDL,C,D,STD,AIR,AAR CORP,USD,5.0,69.700,1.300,10.200,1198.800,A,LU,54594.0,2014-05-30,0.001084
15,001004,2015-05-31,INDL,C,D,STD,AIR,AAR CORP,USD,5.0,69.000,1.300,8.900,669.900,A,LU,54594.0,2015-05-29,0.001941
16,001004,2016-05-31,INDL,C,D,STD,AIR,AAR CORP,USD,5.0,12.000,1.300,7.600,576.300,A,LU,54594.0,2016-05-31,0.002256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135036,345980,2022-12-31,INDL,C,D,STD,WISH,CONTEXTLOGIC INC,USD,12.0,7.000,18.000,0.000,322.000,A,LC,20333.0,2022-12-30,0.055901
135037,345980,2023-12-31,INDL,C,D,STD,WISH,CONTEXTLOGIC INC,USD,12.0,7.000,12.000,0.000,206.000,A,LC,20333.0,2023-12-29,0.058252
135039,347007,2023-12-31,INDL,C,D,STD,IBRX,IMMUNITYBIO INC,USD,12.0,6.783,0.100,0.000,1090.389,A,LC,15533.0,2023-12-29,0.000092
135040,347085,2023-02-28,INDL,C,D,STD,KARO,KAROOOOO LTD,USD,2.0,4.060,15.456,6.113,57.738,A,LC,21069.0,2023-02-28,0.267692


In [30]:
# spy = pd.read_csv('spy.csv')
# spy['gvkey'] = '123456789'
# spy['drc'] = -1
# spy['lt'] = 1
#         # spy['dr'] = -1
# price_df['date'] =  pd.to_datetime( price_df['date'])
# spy['date'] =  pd.to_datetime( spy['date'])
# spy.rename(columns = {'PERMNO': 'permno'}, inplace = True)
# price_df = pd.concat([price_df, spy])
# price_df[price_df['gvkey'] == '123456789']

Unnamed: 0,permno,date,prc,vol,ret,shrout,retx,SICCD,TICKER,ACPERM,NWPERM,RET,gvkey,drc,lt
0,84398,2000-01-31,,,,,,6726.0,SPY,,,-0.049362,123456789,-1.0,1.0
1,84398,2000-02-29,,,,,,6726.0,SPY,,,-0.015667,123456789,-1.0,1.0
2,84398,2000-03-31,,,,,,6726.0,SPY,,,0.096832,123456789,-1.0,1.0
3,84398,2000-04-28,,,,,,6726.0,SPY,,,-0.035121,123456789,-1.0,1.0
4,84398,2000-05-31,,,,,,6726.0,SPY,,,-0.015723,123456789,-1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,84398,2023-08-31,,,,,,6726.0,SPY,,,-0.016252,123456789,-1.0,1.0
284,84398,2023-09-29,,,,,,6726.0,SPY,,,-0.047267,123456789,-1.0,1.0
285,84398,2023-10-31,,,,,,6726.0,SPY,,,-0.021709,123456789,-1.0,1.0
286,84398,2023-11-30,,,,,,6726.0,SPY,,,0.091344,123456789,-1.0,1.0


In [44]:
# price_df = pd.read_csv( 'monthly_returns.csv')
        
# spy = pd.read_csv('spy.csv')
# spy['gvkey'] = '123456789'
# spy['drc'] = -1
# spy['lt'] = 1
#         # spy['dr'] = -1
# price_df['date'] =  pd.to_datetime( price_df['date'])
# spy['date'] =  pd.to_datetime( spy['date'])
# spy.rename(columns = {'PERMNO': 'permno'}, inplace = True)
# price_df = pd.concat([price_df, spy])


# signal_df = pd.read_stata('comp_dr.dta')
# signal_df = signal_df.dropna(subset=['drc', 'lt'])
# signal_df = signal_df[(signal_df['drc'] != 0) & (signal_df['lt'] != 0)]
# signal_df['dr'] = signal_df['drc'] / signal_df['lt']

# # print(signal_df['permno'], 'peromno')


# gvkey_permno_conversion = pd.read_stata('gvkey_permno_conversion.dta')

# fake_df = pd.DataFrame()
# fake_df['datadate'] = spy['date']
# fake_df['gvkey'] = '123456789'
# fake_df['permno'] = 84398.0
# fake_df['datadate'] = pd.to_datetime(fake_df['datadate'])
# # gvkey_permno_conversion['datadate'] = pd.to_datetime(gvkey_permno_conversion['datadate'])
# # gvkey_permno_df = pd.DataFrame({'gvkey': ['123456789'], 'permno': [84398]})
# signal_df = pd.concat([signal_df, gvkey_permno_df])
#         # self.gvkey_permno_conversion = pd.merge(self.gvkey_permno_conversion,fake_df, on ='datadate', how = 'inner')
# gvkey_permno_conversion = pd.concat([gvkey_permno_conversion, fake_df])
# # print(gvkey_permno_conversion)
# # print(gvkey_permno_conversion[gvkey_permno_conversion['gvkey'] == '123456789'])
# signal_df = signal_df.merge(gvkey_permno_conversion,on=['gvkey','datadate']) 
# signal_df['permno'] = signal_df['permno_x'].combine_first(signal_df['permno_y'])


# signal_df.drop(columns=['permno_x', 'permno_y'], inplace=True)

# # print(signal_df['gvkey'].dtype)
# print(signal_df[signal_df['permno'] == 84398.0])
# print(signal_df)

# # print(signal_df[signal_df['gvkey'] == '123456789'], 'spygvkey')

object
Empty DataFrame
Columns: [gvkey, datadate, indfmt, consol, popsrc, datafmt, tic, conm, curcd, fyr, dlc, drc, drlt, lt, costat, LINKTYPE, LPERMNO, MthCalDt, dr, index, fyearq, fqtr, permno]
Index: []

[0 rows x 23 columns]
        gvkey   datadate indfmt consol popsrc datafmt   tic  \
0      001004 2012-05-31   INDL      C      D     STD   AIR   
1      001004 2013-05-31   INDL      C      D     STD   AIR   
2      001004 2014-05-31   INDL      C      D     STD   AIR   
3      001004 2015-05-31   INDL      C      D     STD   AIR   
4      001004 2016-05-31   INDL      C      D     STD   AIR   
...       ...        ...    ...    ...    ...     ...   ...   
38388  329141 2022-06-30   INDL      C      D     STD  RNLX   
38389  330227 2020-12-31   INDL      C      D     STD  CTRM   
38390  330227 2021-12-31   INDL      C      D     STD  CTRM   
38391  339965 2022-01-31   INDL      C      D     STD  SNOW   
38392  345980 2021-12-31   INDL      C      D     STD  WISH   

              

In [10]:
# Define class here
class DRDataProcessor():
    """Load price and accounting data and serve per-date slices to a backtest.

    Two data sets are supported, selected by `country`:
      - 1 (USA):   prices from `monthly_returns.csv`, signals from `comp_dr.dta`,
                   linked through `gvkey_permno_conversion.dta`; securities are
                   identified by `permno` and dated by `date`.
      - 0 (Japan): a single file `japan_data.dta` that holds both prices and
                   signals; securities are identified by `security_id`
                   (renamed from `gvkey`) and dated by `datadate`.

    After construction the instance exposes `price_df` (with an extra
    `ret_next` column holding the next row's return for the same security)
    and `signal_df`.
    """

    # Which data set to load: 1 = USA, 0 = Japan
    country = 0

    # Path to where we store the data
    data_folder_path = Path('')

    # Signals are only used once their datadate is more than min_accounting_lag
    # days old (see signal_df_for_date).
    min_accounting_lag = 30
    # Only used by unique_dates() to cap the last backtest date.
    # NOTE(review): signals are never expired after max_accounting_lag days in
    # signal_df_for_date -- confirm whether that is intended.
    max_accounting_lag = 365

    # Minimum share price to open a new position
    min_share_price = 10

    def __init__(self, country=None, data_folder_path=None):
        """Load, clean and merge the raw data for the selected country.

        Parameters
        ----------
        country : int, optional
            1 for USA, 0 for Japan. Defaults to the class attribute `country`.
        data_folder_path : str or Path, optional
            Folder holding the data files. Defaults to the class attribute
            `data_folder_path`.

        Raises
        ------
        ValueError
            If `country` is neither 0 nor 1 (previously nothing was loaded and
            later calls failed with an unrelated error).
        """
        if country is not None:
            self.country = country
        if data_folder_path is not None:
            self.data_folder_path = Path(data_folder_path)

        if self.country == 0:
            self._load_japan_data()
        elif self.country == 1:
            self._load_usa_data()
        else:
            raise ValueError(f"country must be 0 (Japan) or 1 (USA), got {self.country!r}")

    def _key_columns(self):
        """Return (security id column, price date column) for the active country."""
        if self.country == 1:
            return 'permno', 'date'
        if self.country == 0:
            return 'security_id', 'datadate'
        raise ValueError(f"country must be 0 (Japan) or 1 (USA), got {self.country!r}")

    def _load_japan_data(self):
        """Populate price_df and signal_df from the combined Japanese file."""
        japan_data = pd.read_stata(self.data_folder_path / 'japan_data.dta')
        japan_data = japan_data.rename(columns={'accdq': 'drc', 'lltq': 'lt', 'gvkey': 'security_id', 'prccm': 'prc'})
        japan_data = japan_data.dropna(subset=['ret'])
        japan_data = japan_data.assign(date=japan_data['datadate'])

        # safe_lead_lag shifts by row position within each security, so this
        # presumably relies on the file being in chronological order per
        # security -- TODO confirm.
        ret_next = safe_lead_lag(japan_data['ret'], japan_data['security_id'], 1)

        # Keep ret_next out of signal_df: price_df gets its own copy of the
        # frame, so the look-ahead column never travels with the signals.
        self.price_df = japan_data.assign(ret_next=ret_next)
        # Date-sorted (stable) so "last row per security" is the latest signal.
        self.signal_df = japan_data.sort_values('datadate', kind='mergesort')

    def _load_usa_data(self):
        """Populate price_df and signal_df from the US price, accounting and link files."""
        # Load price data from monthly_returns.csv
        price_df = pd.read_csv(self.data_folder_path / 'monthly_returns.csv')

        # Parse the yyyyMMdd int dates into DateTime64
        # Based on formatting strings here
        # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        price_df['date'] = pd.to_datetime(price_df.loc[:, 'date'], format="%Y%m%d")

        # Prices sometimes negative to indicate no volume at closing auction
        # In these cases, price = -0.5*(bid+ask)
        # But we don't use that information and so want prices to always be positive
        # See http://www.crsp.org/products/documentation/data-definitions-p
        price_df['prc'] = price_df['prc'].abs()

        # Add the next row's return for the same permno as 'ret_next'.
        # NOTE(review): an earlier comment here said this "needs to be 6 months
        # into the future", but the lead has always been 1 row -- confirm intent.
        # Presumably relies on the file being chronological per permno -- TODO confirm.
        price_df['ret_next'] = safe_lead_lag(price_df['ret'], price_df['permno'], 1)
        self.price_df = price_df

        # Load accounting data used for the DR signal from comp_dr.dta and keep
        # only rows where both drc and lt are present and non-zero.
        signal_df = pd.read_stata(self.data_folder_path / 'comp_dr.dta')
        signal_df = signal_df.dropna(subset=['drc', 'lt'])
        signal_df = signal_df[(signal_df['drc'] != 0) & (signal_df['lt'] != 0)]

        # The accounting data identifies stocks using gvkey instead of permno.
        # gvkey_permno_conversion.dta maps (gvkey, datadate) to permno; an inner
        # merge adds the permno column and drops rows without a match.
        self.gvkey_permno_conversion = pd.read_stata(self.data_folder_path / 'gvkey_permno_conversion.dta')
        signal_df = signal_df.merge(self.gvkey_permno_conversion, on=['gvkey', 'datadate'])

        # Date-sorted (stable) so "last row per permno" is the latest signal.
        self.signal_df = signal_df.sort_values('datadate', kind='mergesort')

    def unique_dates(self):
        """Return the sorted unique price dates for which signals can exist.

        Dates come from price_df and are kept only when they fall between the
        earliest signal datadate plus min_accounting_lag days and the latest
        signal datadate plus max_accounting_lag days (both inclusive).
        """
        _, date_col = self._key_columns()
        price_dates = pd.Series(np.sort(self.price_df.loc[:, date_col].unique()))
        signal_dates = self.signal_df.loc[:, 'datadate']
        first_usable = signal_dates.min() + np.timedelta64(self.min_accounting_lag, 'D')
        last_usable = signal_dates.max() + np.timedelta64(self.max_accounting_lag, 'D')
        in_range = (price_dates >= first_usable) & (price_dates <= last_usable)
        return price_dates[in_range].array

    def price_df_for_date(self, date):
        """Return all price_df rows observed on `date`.

        The result carries every price_df column (including 'ret_next') with the
        security identifier exposed as 'security_id'. No liquidity or
        future-return filter is applied, so some returned securities cannot be
        traded; intended for closing and execution decisions only.
        """
        id_col, date_col = self._key_columns()
        price_date_df = self.price_df.loc[self.price_df.loc[:, date_col] == date, :]
        if id_col != 'security_id':
            price_date_df = price_date_df.rename(columns={id_col: 'security_id'})
        return price_date_df

    def signal_df_for_date(self, date):
        """Return the tradeable universe on `date` together with its latest signals.

        Steps:
          1. Take price rows on `date` whose 'ret_next' is finite, then drop
             'ret_next' so a backtest cannot use it.
          2. Take signal rows whose datadate is more than min_accounting_lag
             days before `date` and keep the latest one per security.
          3. Inner-merge the two on the security identifier, expose the
             identifier as 'security_id', and apply liquidity_filter.

        For the USA data the signal columns (e.g. 'drc', 'lt') come from the
        lagged accounting row. For the Japan data both sides share column
        names, so the merge suffixes them; the '_x' (price-date) versions are
        renamed back to their plain names and the lagged signal values remain
        available under the '_y' suffix.
        NOTE(review): this means Japan's plain 'drc'/'lt' are the same-date
        values, not the lagged ones -- confirm this is intended.
        """
        id_col, date_col = self._key_columns()

        # Price side: mask the date slice with its own ret_next column rather
        # than a mask built from the whole price_df.
        date_price_df = self.price_df.loc[self.price_df.loc[:, date_col] == date, :]
        date_price_df = date_price_df.loc[np.isfinite(date_price_df.loc[:, 'ret_next']), :]
        date_price_df = date_price_df.drop(columns=['ret_next'])

        # Signal side: only data dated more than min_accounting_lag days before date.
        cutoff = date - np.timedelta64(self.min_accounting_lag, 'D')
        all_past_signal_df = self.signal_df.loc[self.signal_df.loc[:, 'datadate'] < cutoff, :]
        # Never let a future-return column ride along with the signals.
        all_past_signal_df = all_past_signal_df.drop(columns=['ret_next'], errors='ignore')
        # groupby().last() picks by row order, so make sure rows are date-ordered.
        if not all_past_signal_df['datadate'].is_monotonic_increasing:
            all_past_signal_df = all_past_signal_df.sort_values('datadate', kind='mergesort')
        latest_signal_df = all_past_signal_df.groupby(id_col).last()

        merged_df = date_price_df.merge(latest_signal_df, on=id_col, how='inner')
        if self.country == 1:
            merged_df = merged_df.rename(columns={'permno': 'security_id'})  # use permno as our security_id
        else:
            merged_df = merged_df.rename(columns={'prc_x': 'prc', 'datadate_x': 'datadate', 'date_x': "date", 'ret_x': 'ret', 'index_x': 'index', 'drc_x': 'drc', 'lt_x': 'lt'})

        return self.liquidity_filter(merged_df)

    def liquidity_filter(self, df):
        """Return the rows of `df` whose 'prc' is at least min_share_price."""
        return df.loc[df.loc[:, 'prc'] >= self.min_share_price, :]
        
###################################################################
# Helper methods, do not modify
###################################################################

# Function safe_lead_lag returns a new Series with the lead/lagged values
#  but only when a group is the same for the lead/lag
# Inputs:
# - data_series: data we want to lead/lag
# - group_series: grouping we want to be the same for the lead/lag to be valid
# requires data_series and group_series already be sorted by group_series
# so that all alike values of group_series are adjacent,
# meaning group_series should look like:
#    g_0
#    g_0
#    g_0
#    g_0
#    g_1
#    g_1
#    g_2
#    g_2 
#    ...
# where g_i indicates the observation is in group i,
# and once the first g_{i+1} appears no more g_i values appear
# 
# lead_lag > 0 returns a data_series with values of data_series lead_lag rows ahead
# as long as group_series remains the same, NaN if group different
# lead_lag < 0 returns a data_series with values of data_series -lead_lag rows behind 
# (same as lead_lag rows ahead) as long as group_series remains the same, NaN if group different
def safe_lead_lag(data_series, group_series, lead_lag):
    """Lead or lag `data_series` without crossing group boundaries.

    Parameters
    ----------
    data_series : values to shift
    group_series : group label for each value; a shifted value is kept only
        when it comes from the same group
    lead_lag : number of rows to look ahead (positive) or behind (negative)

    Returns
    -------
    A Series named 'data' holding, for each row, the value `lead_lag` rows
    away within the same group, or NaN when no such row exists.
    """
    paired = pd.DataFrame({'data': data_series, 'group': group_series})
    # shift(-k) pulls the value k rows ahead, so a positive lead_lag is a lead
    within_group = paired.groupby('group')['data']
    return within_group.shift(periods=-lead_lag)