In [75]:
import pandas as pd
import csv
import datetime
import re

crsp_location = '../archive/CRSP_small_database.csv'
breach_location = '../data/data_breaches_final.csv'
ff_location = '../data/FF_factors.csv'
trends_tic_location = '../data/trends_tic.csv'
trends_conm_location = '../data/trends_conm.csv'
out_location = '../data/CRSP_merged_mar.csv'

results_10days_location = '../data/stock_study_10day.csv'
results_5days_location = '../data/stock_study_5day.csv'
results_2days_location = '../data/stock_study_2day.csv'

In [3]:
remove = ['CORP', 'INC', 'LTD', 'CORPORATION', 'INCORPORATED', 'LLC', 'LTD', 'GROUP', 'NV', 'GRP', 'PLC']

def clean_name(name):
    name = str(name)
    name = re.sub(r'[^\s\w]+', '', name)
    namewords = name.split()
    resultwords = [word for word in namewords if word not in remove]
    result = ' '.join(resultwords)
    result = result.strip(' ')
    return result

In [4]:
crsp = pd.read_csv(crsp_location)
crsp['date'] = pd.to_datetime(crsp['date'], format='%Y%m%d')
crsp.columns
crsp['datamonth'] = crsp['date'].apply(lambda x: x.replace(day = 1))
crsp['clean_name'] = crsp['COMNAM'].apply(clean_name)
#crsp['clean_tic'] = crsp['TICKER'].apply(clean_tic)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
breach = pd.read_csv(breach_location)
breach['date'] = pd.to_datetime(breach['Date Made Public'], format='%B %d, %Y')
breach['TICKER'] = breach['tic']
breach = breach[breach['match'] == 1]

In [6]:
ff = pd.read_csv(ff_location)
ff['date'] = pd.to_datetime(ff['date'], format='%Y%m%d')

In [8]:
# By doing left we explicitly exclude firms that have the breach not on a business day
out = pd.merge(crsp, breach, how='left', on=['date', 'TICKER'])


In [9]:
out = pd.merge(out, ff, how='left', on='date')


In [10]:
out = out.drop(out[out['TICKER'].isnull() & out['match'].isnull()].index)

In [11]:
matched = out.groupby(by='TICKER').apply(lambda x: x[x['match'] == 1])['TICKER'].unique()
out = out[out['TICKER'].isin(matched)]

In [98]:
breach_dates = out[out['match'].notnull()][['date', 'TICKER']]

def index_occurances(record):
    record = record.reset_index().reset_index()
    del record['index']
    record.columns = ['index', 'date', 'TICKER']
    record = record.sort_values(by='date')
    record['index'] = record['index'] + 1
    record['ticker_occurance'] = record['TICKER'] + '_' + record['index'].map(str)
    
    return record

matched = breach_dates.groupby(by='TICKER').apply(index_occurances)
del matched['index']
matched = matched.reset_index(drop=True)
matched = pd.merge(matched, breach, on=['date', 'TICKER'], how='left')
matched

Unnamed: 0,date,TICKER,ticker_occurance,Company,City,State,Type of breach,Type of organization,Total Records,Year of Breach,...,local,customer,employee,credit_card,cvv,social_security,name,address,personal_information,yearquarter
0,2011-11-02,AAN,AAN_1,Aaron's,Fresno,California,STAT,BSR,1008,2011,...,0,1,0,0,0,1,1,0,0,2011Q4
1,2013-10-22,AAN,AAN_2,Aaron's,Atlanta,Georgia,DISC,BSR,0,2013,...,0,1,0,0,0,0,0,0,0,2013Q4
2,2008-03-31,AAP,AAP_1,Advance Auto Parts,Roanoke,Virginia,HACK,BSR,56000,2008,...,0,0,0,0,0,0,0,0,0,2008Q1
3,2016-03-16,AAP,AAP_2,Advanced Auto Parts,Roanoke,Virginia,HACK,BSR,0,2016,...,0,0,1,0,0,1,1,0,0,2016Q1
4,2011-04-01,AAPL,AAPL_1,iTunes (Apple),Cupertino,California,HACK,BSR,0,2011,...,0,1,0,1,0,0,0,1,0,2011Q2
5,2012-09-04,AAPL,AAPL_2,Apple,Cupertino,California,HACK,BSR,1000000,2012,...,0,0,0,0,0,0,0,0,0,2012Q3
6,2013-02-19,AAPL,AAPL_3,Apple,Cupertino,California,HACK,BSR,0,2013,...,0,0,1,0,0,0,0,0,0,2013Q1
7,2013-07-22,AAPL,AAPL_4,Apple Inc.,Cupertino,California,HACK,BSR,0,2013,...,0,1,0,0,0,0,1,1,0,2013Q3
8,2014-02-26,AAPL,AAPL_5,Apple,Cupertino,California,HACK,BSO,0,2014,...,0,0,0,0,0,0,0,0,0,2014Q1
9,2017-09-11,ABB,ABB_1,ABB Inc.,,,HACK,BSO,0,2017,...,0,0,1,0,0,1,1,1,1,2017Q3


In [101]:
r10 = pd.read_csv(results_10days_location)
r5 = pd.read_csv(results_5days_location)
r2 = pd.read_csv(results_2days_location)

del r10['Unnamed: 0']
del r5['Unnamed: 0']
del r2['Unnamed: 0']

r10.columns = [col + '_10d' for col in r10.columns]
r10['ticker_occurance'] = r10['ticker_occurance_10d']
del r10['ticker_occurance_10d']
r5.columns = [col + '_5d' for col in r5.columns]
r5['ticker_occurance'] = r5['ticker_occurance_5d']
del r5['ticker_occurance_5d']
r2.columns = [col + '_2d' for col in r2.columns]
r2['ticker_occurance'] = r2['ticker_occurance_2d']
del r2['ticker_occurance_2d']

es = pd.merge(r10, r5, on='ticker_occurance', how='outer')
es = pd.merge(es, r2, on='ticker_occurance', how='outer')
es = pd.merge(es, matched, on='ticker_occurance', how='left')
es

Unnamed: 0,car_mmodel_10d,car_ffmodel_10d,aar_mmodel_10d,aar_ffmodel_10d,est_size_10d,pred_size_10d,car_mmodel_percent_10d,car_ffmodel_percent_10d,aar_mmodel_percent_10d,aar_ffmodel_percent_10d,...,local,customer,employee,credit_card,cvv,social_security,name,address,personal_information,yearquarter
0,0.050739,0.022279,0.002146,0.000343,104.0,8.0,5.073870,2.227927,0.214634,0.034327,...,0,0,1,0,0,1,1,0,0,2013Q3
1,0.034746,0.028969,0.002624,0.001805,101.0,7.0,3.474637,2.896936,0.262352,0.180514,...,0,1,0,0,0,0,0,0,0,2013Q1
2,0.061919,0.058915,0.003404,0.002372,101.0,9.0,6.191900,5.891547,0.340441,0.237209,...,0,0,1,0,0,1,0,0,0,2006Q1
3,0.088987,0.087714,0.014963,0.014827,101.0,7.0,8.898700,8.771395,1.496251,1.482739,...,0,0,1,0,0,1,1,0,0,2007Q2
4,-0.006995,0.000124,0.000415,0.000251,103.0,9.0,-0.699499,0.012369,0.041518,0.025076,...,0,1,0,0,0,0,0,0,0,2013Q4
5,-0.008832,-0.013880,0.003819,0.002756,103.0,6.0,-0.883187,-1.387965,0.381911,0.275618,...,0,0,1,0,0,1,1,1,0,2007Q1
6,0.027689,0.014115,0.005062,0.004502,104.0,7.0,2.768893,1.411530,0.506244,0.450239,...,0,0,1,0,0,1,1,1,0,2016Q2
7,-0.072815,-0.062898,-0.003999,-0.003542,105.0,6.0,-7.281524,-6.289827,-0.399926,-0.354176,...,0,0,0,0,0,1,1,1,1,2015Q4
8,-0.075453,-0.045441,-0.003975,-0.001598,100.0,9.0,-7.545254,-4.544071,-0.397509,-0.159805,...,0,0,1,0,0,0,0,0,0,2007Q2
9,-0.075453,-0.045441,-0.003975,-0.001598,100.0,9.0,-7.545254,-4.544071,-0.397509,-0.159805,...,0,1,0,1,0,0,1,1,0,2007Q2


In [102]:
es.to_csv('../data/event_study_controls.csv')

In [9]:
def closest_breach(company_record):
    cols = list(set(breach.columns.to_list()) & set(company_record.columns.to_list()))
    for i in range(4400):
        for col in cols:
            company_record[col] = company_record[col].ffill(limit=1).bfill(limit=1) 
    return company_record

out = out.set_index('date').groupby('TICKER').apply(closest_breach).reset_index()

KeyboardInterrupt: 

In [23]:
out.to_csv(out_location, index=False, quoting=csv.QUOTE_ALL)