In [1]:
import pandas as pd
import csv
import re

# --- Input files -----------------------------------------------------------
# Raw extracts kept under ../archive
bea_location = '../archive/BEA_database.csv'
compustat_location = '../archive/compustat_feb.csv'
# Prepared inputs kept under ../data
breach_location = '../data/data_breaches_final.csv'
trends_conm_location = '../data/trends_conm.csv'
trends_tic_location = '../data/trends_tic.csv'

# --- Output file -----------------------------------------------------------
# Destination of the merged, cleaned dataset written at the end of the notebook.
out_location = '../data/COMPUSTAT_merged_cleaned_trends_feb.csv'

In [2]:
# Whole-word tokens dropped from company names by clean_conm (exact, case-sensitive match).
remove = ['CORP', 'INC', 'LTD', 'CORPORATION', 'INCORPORATED', 'LLC', 'LTD', 'GROUP', 'NV', 'GRP', 'PLC']

def clean_conm(name):
    """Normalise a company name into a merge key.

    The value is converted with ``str``, every character that is neither
    whitespace nor a word character is deleted, the text is split on
    whitespace, tokens listed in ``remove`` are discarded, and the remaining
    tokens are joined with single spaces.

    Returns the cleaned name (possibly an empty string).
    """
    without_punctuation = re.sub(r'[^\s\w]+', '', str(name))
    kept_tokens = []
    for token in without_punctuation.split():
        if token in remove:
            continue
        kept_tokens.append(token)
    # Joining whitespace-split tokens cannot leave leading/trailing spaces.
    return ' '.join(kept_tokens)

def clean_tic(name):
    """Normalise a ticker symbol into a merge key.

    The value is converted with ``str``, every character that is neither
    whitespace nor a word character is deleted, and leading/trailing space
    characters (only ' ', not other whitespace) are stripped.
    """
    return re.sub(r'[^\s\w]+', '', str(name)).strip(' ')

In [3]:
compustat = pd.read_csv(compustat_location)
# datadate is stored as YYYYMMDD; parse it into real timestamps.
compustat['datadate'] = pd.to_datetime(compustat['datadate'], format='%Y%m%d')
# Keep only rows whose curcdq is 'USD' (the notebook's filter for US companies).
# .copy() detaches the filtered rows so the column assignments below write to
# an independent DataFrame instead of a slice of the original
# (avoids pandas' SettingWithCopyWarning).
compustat = compustat[compustat['curcdq'] == 'USD'].copy()
# First day of the month of datadate: join key for the monthly trends data.
compustat['datamonth'] = compustat['datadate'].apply(lambda x: x.replace(day = 1))
# Normalised name / ticker: join keys for the company- and ticker-level trends.
compustat['clean_name'] = compustat['conm'].apply(clean_conm)
compustat['clean_tic'] = compustat['tic'].apply(clean_tic)

In [4]:
breach = pd.read_csv(breach_location)
# Expose the merge keys under the names used when joining with compustat
# (['gvkey', 'datacqtr']); the original columns are kept as well.
breach['datacqtr'] = breach['yearquarter']
breach['gvkey'] = breach['GVKEY']
# Keep only rows flagged match == 1. .copy() detaches the filtered rows so the
# assignment below writes to an independent DataFrame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
breach = breach[breach['match'] == 1].copy()
# Dates are written like "January 5, 2015".
breach['Date Made Public'] = pd.to_datetime(breach['Date Made Public'], format='%B %d, %Y')

In [5]:
# Load the BEA table and expose its quarter column under the name 'datacqtr',
# which is the key it is later merged on. The original 'yearquarter' column is kept.
bea = pd.read_csv(bea_location).assign(datacqtr=lambda frame: frame['yearquarter'])

In [6]:
def _load_trends(path, key_column, value_column):
    """Read a wide trends CSV and return it in long format.

    The CSV is read with its 'date' column as the index; every other column is
    stacked so that each (date, column label) pair becomes one row.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    key_column : str
        Name given to the column holding the original column labels.
    value_column : str
        Name given to the column holding the stacked values.

    Returns
    -------
    pandas.DataFrame
        Columns ``['datamonth', key_column, value_column]`` with ``datamonth``
        parsed to datetimes.
    """
    long_form = pd.read_csv(path, index_col='date').stack().reset_index()
    long_form.columns = ['datamonth', key_column, value_column]
    long_form['datamonth'] = pd.to_datetime(long_form['datamonth'])
    return long_form


# Long-format trend indices keyed by cleaned company name and by cleaned ticker.
trends_company = _load_trends(trends_conm_location, 'clean_name', 'trend_index_company')
trends_tic = _load_trends(trends_tic_location, 'clean_tic', 'trend_index_tic')

In [7]:
# Outer join of the quarterly company records with the breach records on
# company id and calendar quarter; rows present on only one side are kept.
out = compustat.merge(breach, how='outer', on=['gvkey', 'datacqtr'])

In [8]:
# Attach the BEA columns to every row by calendar quarter (left join keeps all rows of out).
out = out.merge(bea, how='left', on='datacqtr')

In [9]:
# Attach the monthly trend indices: first by cleaned company name, then by
# cleaned ticker. Both are left joins, so rows without trend data are kept.
out = (
    out
    .merge(trends_company, how='left', on=['datamonth', 'clean_name'])
    .merge(trends_tic, how='left', on=['datamonth', 'clean_tic'])
)

In [10]:
# Remove columns that are not wanted in the merged output.
for unwanted in ('Unnamed: 0', 'orig_name'):
    del out[unwanted]

# The merges left these columns duplicated with pandas' '_x' / '_y' suffixes.
# Keep the '_x' version under the plain name (appended as the last column)
# and discard both suffixed columns.
for base in ('tic', 'conm', 'sic', 'yearquarter'):
    out[base] = out.pop(base + '_x')
    del out[base + '_y']


In [11]:
# Keep only rows that have a datadate. .copy() makes the result an independent
# DataFrame so that later cells can add columns to it without writing to a
# slice (avoids pandas' SettingWithCopyWarning).
out = out[out['datadate'].notnull()].copy()

In [12]:
# Number of distinct company ids in the breach data, then in the merged data.
# dropna=False counts a missing id as a value, matching len(unique()).
for frame in (breach, out):
    print(frame['gvkey'].nunique(dropna=False))

436
431


In [13]:
# Summary of the merged frame (row count, column count, dtypes, memory use).
out.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19473 entries, 0 to 19472
Columns: 443 entries, gvkey to yearquarter
dtypes: datetime64[ns](3), float64(415), int64(1), object(24)
memory usage: 66.0+ MB


In [14]:
# Revenue (revtq) taken from each company's first record. The quartile cut
# points are computed across companies, not across rows.
first_revenue = out.groupby(by='gvkey')['revtq'].apply(lambda s: s.iloc[0])
quantiles = first_revenue.to_frame().quantile([0.25, 0.5, 0.75, 1])
quantiles.index.names = ['quantile']
quantiles.columns = ['revtq']


def classify_rev(comp):
    """Return the revenue quartile (1-4) of one company.

    Parameters
    ----------
    comp : pandas.DataFrame
        The records of a single company; only ``comp['revtq'].iloc[0]`` (the
        revenue in the first record) is read.

    Returns
    -------
    int
        4 if the first revenue is above the 75% cut point in ``quantiles``,
        3 if above the 50% cut point, 2 if above the 25% cut point, else 1.

    NOTE(review): a missing (NaN) first revenue fails every comparison and is
    therefore reported as quartile 1 - confirm this is intended.
    """
    fr = comp['revtq'].iloc[0]
    q1, q2, q3 = (quantiles.loc[q, 'revtq'] for q in (0.25, 0.5, 0.75))
    if fr > q3:
        return 4
    if fr > q2:
        return 3
    if fr > q1:
        return 2
    return 1


# One quartile per company, broadcast to that company's rows with a single map
# instead of rescanning the whole frame once per company.
company_quartile = out.groupby(by='gvkey')[['revtq']].apply(classify_rev)
row_quartile = out['gvkey'].map(company_quartile)

# Dummy columns rev_quart_1 .. rev_quart_4: 1 on the rows of companies in that
# quartile, 0 elsewhere (rows whose gvkey has no quartile stay 0 everywhere).
for quartile in (1, 2, 3, 4):
    out['rev_quart_%d' % quartile] = (row_quartile == quartile).astype('int64')


In [15]:
# out.loc[out['gvkey'] == 1013, 'rev_quart_1'] = 0
#out.loc[out['gvkey'] == 1013, 'rev_quart_4']
# Number of rows flagged in each revenue-quartile dummy, quartile 1 first.
for quartile in range(1, 5):
    print((out['rev_quart_' + str(quartile)] == 1).sum())

6617
4120
4263
4473


In [16]:
def closest_breach(company_record, max_steps=150, breach_columns=None):
    """Spread breach values to neighbouring rows of one company's records.

    For every column of ``company_record`` that is also a breach column,
    missing values are filled from the nearest non-missing value in that
    column: each step copies values one row forward (``ffill(limit=1)``) and
    then one row backward (``bfill(limit=1)``), so after ``k`` steps a value
    has reached rows up to ``k`` positions away. When a row is equally far
    from two values, the earlier one wins because the forward fill runs first.
    Columns are filled independently of each other.

    Parameters
    ----------
    company_record : pandas.DataFrame
        Rows of a single company in the order the fill should follow.
        The frame is filled in place.
    max_steps : int, default 150
        Maximum number of forward/backward passes, i.e. the furthest distance
        (in rows) a value may travel.
    breach_columns : iterable of str, optional
        Column names to fill. Defaults to the columns of the module-level
        ``breach`` frame.

    Returns
    -------
    pandas.DataFrame
        ``company_record`` itself, after filling.
    """
    if breach_columns is None:
        breach_columns = breach.columns
    wanted = set(breach_columns)
    # Follow the record's own column order so the iteration is deterministic.
    shared = [col for col in company_record.columns if col in wanted]

    for col in shared:
        values = company_record[col]
        for _ in range(max_steps):
            if not values.isnull().any():
                break  # nothing left to fill
            spread = values.ffill(limit=1).bfill(limit=1)
            if spread.equals(values):
                break  # fixed point reached: further passes cannot change anything
            values = spread
        company_record[col] = values
    return company_record

# Index the rows by datadate, run closest_breach on each company's rows
# (grouped by gvkey), then move the index back into ordinary columns.
# NOTE(review): the index returned by groupby(...).apply(...) depends on how the
# installed pandas version handles group keys - confirm that reset_index() still
# yields a single 'datadate' column (and does not collide with 'gvkey').
out = out.set_index('datadate').groupby('gvkey').apply(closest_breach).reset_index()

In [20]:
# Keep only rows whose 'match' value is present. Filtering with the boolean
# mask selects rows directly, whereas dropping by index label would also remove
# any other row that happens to share a label with an unmatched row.
out = out[out['match'].notnull()]

In [30]:
# Write the final dataset to out_location without the index column;
# csv.QUOTE_ALL wraps every field in quotes.
out.to_csv(out_location, index=False, quoting=csv.QUOTE_ALL)