In [1]:
import pandas as pd
import csv
import re

# Input files read by the cells below.
compustat_location = "../archive/COMPUSTAT_database.csv"
bea_location = "../archive/BEA_database.csv"
breach_location = "../data/data_breaches_final.csv"
trends_conm_location = "../data/trends_conm.csv"
trends_tic_location = "../data/trends_tic.csv"

# Destination of the merged table written by the final cell.
out_location = "../data/COMPUSTAT_merged_trends.csv"

In [2]:
# Tokens dropped from names by clean_name(); matching is exact and case-sensitive.
remove = [
    'CORP', 'INC', 'LTD', 'CORPORATION', 'INCORPORATED', 'LLC',
    'LTD', 'GROUP', 'NV', 'GRP', 'PLC',
]


def clean_name(name):
    """Normalise a company name or ticker into a join key.

    The value is coerced to ``str``, every run of characters that is neither
    whitespace nor a word character is deleted, and whitespace-separated
    tokens listed in ``remove`` are discarded. The surviving tokens are
    joined with single spaces.

    Parameters
    ----------
    name : object
        Raw value; non-strings are converted with ``str()``.

    Returns
    -------
    str
        The cleaned name (possibly empty).
    """
    text = re.sub(r'[^\s\w]+', '', str(name))
    kept = (token for token in text.split() if token not in remove)
    return ' '.join(kept).strip(' ')

In [3]:
# Load the COMPUSTAT extract. low_memory=False makes pandas infer each column's
# dtype from the whole file rather than chunk by chunk, so columns holding a mix
# of numeric-looking and text values get one consistent dtype (this is the
# remedy pandas suggests in its mixed-type DtypeWarning).
compustat = pd.read_csv(compustat_location, low_memory=False)
# datadate is stored as YYYYMMDD; convert it to a real datetime.
compustat['datadate'] = pd.to_datetime(compustat['datadate'], format='%Y%m%d')
# Keep only rows whose currency code (curcdq) is USD.
# NOTE(review): this filters by reporting currency, not by country of the company.
compustat = compustat[compustat['curcdq'] == 'USD']
# First day of datadate's month; join key for the monthly trends tables.
compustat['datamonth'] = compustat['datadate'].apply(lambda x: x.replace(day = 1))
# Normalised company name and ticker, used as join keys against the trends tables.
compustat['clean_name'] = compustat['conm'].apply(clean_name)
compustat['clean_tic'] = compustat['tic'].apply(clean_name)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Load the breach records, add join-key aliases (datacqtr, gvkey) alongside the
# original columns, keep only rows flagged match == 1, and then parse the
# publication date on the surviving rows.
breach = (
    pd.read_csv(breach_location)
    .assign(
        datacqtr=lambda df: df['yearquarter'],
        gvkey=lambda df: df['GVKEY'],
    )
    .loc[lambda df: df['match'] == 1]
    .assign(**{
        'Date Made Public': lambda df: pd.to_datetime(
            df['Date Made Public'], format='%B %d, %Y'
        ),
    })
)

In [5]:
# Load the BEA table and expose its yearquarter column under the name
# datacqtr as well, so it can be joined on that key.
bea = pd.read_csv(bea_location)
bea = bea.assign(datacqtr=bea['yearquarter'])

In [6]:
def _load_trends(path, key_col, value_col):
    """Read a wide trends CSV and return it in long form.

    The file is read with its ``date`` column as the index; ``stack()`` then
    produces one row per (date, column label) pair.
    NOTE(review): the column labels are presumably the cleaned company names /
    tickers produced by clean_name() -- confirm against how the CSVs were built.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    key_col : str
        Name given to the column holding the original column labels.
    value_col : str
        Name given to the column holding the stacked values.

    Returns
    -------
    pandas.DataFrame
        Three columns: ``datamonth`` (datetime), ``key_col`` and ``value_col``.
    """
    long_form = pd.read_csv(path, index_col='date').stack().reset_index()
    long_form['date'] = pd.to_datetime(long_form['date'])
    # reset_index() yields exactly (date, column label, value), in that order.
    long_form.columns = ['datamonth', key_col, value_col]
    return long_form


trends_company = _load_trends(trends_conm_location, 'clean_name', 'trend_index_company')
trends_tic = _load_trends(trends_tic_location, 'clean_tic', 'trend_index_tic')

In [7]:
# Outer join on company key and quarter: rows from either table that have no
# partner in the other are kept.
out = compustat.merge(breach, how='outer', on=['gvkey', 'datacqtr'])

In [8]:
# Attach the BEA columns by quarter; a left join keeps every existing row of out.
out = out.merge(bea, how='left', on='datacqtr')

In [9]:
# Attach the two trends tables by month, first on the cleaned company name and
# then on the cleaned ticker. Both are outer joins, so trends rows without a
# matching row in out are kept as well.
out = (
    out
    .merge(trends_company, how='outer', on=['datamonth', 'clean_name'])
    .merge(trends_tic, how='outer', on=['datamonth', 'clean_tic'])
)

In [10]:
# Discard two columns that are not wanted in the output.
out = out.drop(['Unnamed: 0', 'orig_name'], axis=1)

# Each of these columns came out of the merges as a *_x / *_y pair. Keep the
# *_x values under the unsuffixed name (appended as the last column) and
# remove both suffixed columns.
for base in ('tic', 'conm', 'sic', 'yearquarter'):
    out[base] = out.pop(base + '_x')
    del out[base + '_y']


In [13]:
# Keep only rows that have a datadate value.
out = out[out['datadate'].notnull()]

In [14]:
# Write the merged table without the index, quoting every field.
out.to_csv(out_location, quoting=csv.QUOTE_ALL, index=False)