In [None]:
import re
import pandas as pd
import numpy as np

In [None]:
def prepareForModel(insiderDat):
    '''
    Return a cleaned copy of the insider-trading frame, ready for modelling.

    Steps performed:
      * if a 'Title' column exists, collapse each free-text title into one of
        'Chair', 'Officer', 'Director' or 'Other' (see fixTitle);
      * convert 'FilingDate' to datetime.date objects;
      * cast the numeric columns to float / int.

    The caller's frame is NOT modified; all changes are made on a copy, so the
    function can safely be re-run on the same raw frame.

    Raises whatever pandas raises when 'FilingDate' or one of the numeric
    columns is missing or cannot be converted (e.g. NaN in an 'int' column).
    '''

    def acronymPattern(letters):
        '''
        Build a regex for an acronym such as 'CEO' that
          * accepts an optional literal dot after every letter ('C.E.O', 'C.E.O.'), and
          * must not be glued to other letters, so 'CTO' no longer matches inside 'Director'.
        '''
        return r'(?<![A-Za-z])' + r'\.?'.join(letters) + r'\.?(?![A-Za-z])'

    # Compiled once per call instead of once per title.
    chairRegex = re.compile('|'.join([acronymPattern('COB'), 'Chair']), re.IGNORECASE)
    officerRegex = re.compile(
        '|'.join([acronymPattern(a) for a in ('CEO', 'COO', 'CHRO', 'CFO', 'CTO')] + ['Chief']),
        re.IGNORECASE,
    )
    # 'VP' stays a substring match on purpose so that 'SVP', 'EVP', ... still count.
    directorRegex = re.compile(r'Dir|V\.?P|Vice|Pres', re.IGNORECASE)

    def fixTitle(title):
        '''
        I figure that the Chair of the Board is the most fiscally powerful person in a company, so to break ties for
        people who hold multiple titles, I'll prioritize COB, then C-suite, then other directors, then anyone else.

        Non-string values (NaN / None) are treated as 'Other'.
        '''
        if not isinstance(title, str):
            return 'Other'

        if chairRegex.search(title):
            return 'Chair'
        if officerRegex.search(title):
            return 'Officer'
        if directorRegex.search(title):
            return 'Director'
        return 'Other'

    # Work on a copy so the raw frame held by the caller keeps its original values.
    insiderDat = insiderDat.copy()

    if 'Title' in insiderDat.columns:
        insiderDat['Title'] = [fixTitle(t) for t in insiderDat['Title']]

    insiderDat['FilingDate'] = pd.to_datetime(insiderDat['FilingDate']).dt.date
    insiderDat = insiderDat.astype({'Price': 'float',
                                    'Qty': 'float',
                                    'Owned': 'float',
                                    'DeltaOwn': 'float',
                                    'Value': 'float',
                                    'NumTrades': 'int',
                                    'TradeToFileTime': 'int',
                                    '%VolumeChange': 'float',
                                    '%FuturePriceChange': 'float'
                                   })

    return insiderDat

In [2]:
def returnXandY(insiderDat, startDate, endDate):
    '''
    Build the modelling matrices for filings dated between startDate and endDate (both inclusive).

    Returns a tuple (data_XY, data_X, data_Y):
      data_XY -- one-hot encoded rows inside the date window with no missing values
      data_X  -- data_XY without 'FilingDate', '%FuturePriceChange' and 'Ticker'
      data_Y  -- the '%FuturePriceChange' column of data_XY

    An AssertionError is raised if NaN or non-finite values remain in data_X or data_Y.
    '''
    # Every calendar day in the window, as datetime.date objects.
    windowDays = pd.date_range(start=startDate, end=endDate).date

    # Identifier columns that carry no signal; the CSV index column is dropped only when present.
    unusedCols = ['CompanyName', 'TradeDate', 'InsiderName']
    if 'Unnamed: 0' in insiderDat.columns:
        unusedCols.append('Unnamed: 0')
    trimmed = insiderDat.drop(columns=unusedCols)

    # Title dummies get a 'Title' prefix; TradeType dummies keep the bare category name.
    encoded = pd.get_dummies(trimmed, columns=['Title', 'TradeType'], prefix=['Title', None])

    inWindow = encoded['FilingDate'].isin(windowDays)
    data_XY = encoded[inWindow].dropna()

    data_X = data_XY.drop(columns=['FilingDate', '%FuturePriceChange', 'Ticker'])
    data_Y = data_XY['%FuturePriceChange']

    # Sanity checks: features first, then target -- no NaN and nothing infinite.
    for part in (data_X, data_Y):
        assert not np.any(np.isnan(part))
        assert np.all(np.isfinite(part))

    return data_XY, data_X, data_Y