In [1]:
import pandas as pd
import csv
import re

compustat_location = '../archive/compustat_feb.csv'
breach_location = '../data/data_breaches_final.csv'
bea_location = '../archive/BEA_database.csv'
trends_tic_location = '../data/trends_tic.csv'
trends_conm_location = '../data/trends_conm.csv'
employees_location = '../data/employees.csv'
out_location = '../data/COMPUSTAT_merged_cleaned_trends_feb.csv'

In [2]:
remove = ['CORP', 'INC', 'LTD', 'CORPORATION', 'INCORPORATED', 'LLC', 'LTD', 'GROUP', 'NV', 'GRP', 'PLC']

def clean_conm(name):
    name = str(name)
    name = re.sub(r'[^\s\w]+', '', name)
    namewords = name.split()
    resultwords = [word for word in namewords if word not in remove]
    result = ' '.join(resultwords)
    result = result.strip(' ')
    return result

def clean_tic(name):
    name = str(name)
    name = re.sub(r'[^\s\w]+', '', name)
    result = name.strip(' ')
    return result

In [3]:
compustat = pd.read_csv(compustat_location)
compustat['datadate'] = pd.to_datetime(compustat['datadate'], format='%Y%m%d')
# Remove any data from outside of US companies
compustat = compustat[compustat['curcdq'] == 'USD']
compustat['datamonth'] = compustat['datadate'].apply(lambda x: x.replace(day = 1))
compustat['clean_name'] = compustat['conm'].apply(clean_conm)
compustat['clean_tic'] = compustat['tic'].apply(clean_tic)
compustat['fyear'] = compustat['datafqtr'].apply(lambda x: int(str(x)[0:4]))

In [4]:
breach = pd.read_csv(breach_location)
breach['datacqtr'] = breach['yearquarter']
breach['gvkey'] = breach['GVKEY']
breach = breach[breach['match'] == 1]
breach['Date Made Public'] = pd.to_datetime(breach['Date Made Public'], format='%B %d, %Y')

In [5]:
bea = pd.read_csv(bea_location)
bea['datacqtr'] = bea['yearquarter']

In [6]:
# trends = pd.read_csv(trends_location).set_index('date')
# trends_tic = trends.iloc[:, ::2]
# trends_company = trends.iloc[:, 1::2]

trends_company = pd.read_csv(trends_conm_location, index_col='date').stack()
trends_company = trends_company.reset_index()
trends_company['date'] = pd.to_datetime(trends_company['date'])
trends_company.columns = ['datamonth', 'clean_name', 'trend_index_company']

trends_tic = pd.read_csv(trends_tic_location, index_col='date').stack()
trends_tic = trends_tic.reset_index()
trends_tic['date'] = pd.to_datetime(trends_tic['date'])
trends_tic.columns = ['datamonth', 'clean_tic', 'trend_index_tic']

In [7]:
employees = pd.read_csv(employees_location)
employees['fyear'].fillna(employees['datadate'].apply(lambda x: int(str(x)[0:4]) + 1), inplace = True)
employees = employees[['gvkey', 'fyear', 'emp']]

In [8]:
out = pd.merge(compustat, breach, how='outer', on=['gvkey', 'datacqtr'])

In [9]:
out = pd.merge(out, bea, how='left', on='datacqtr')

In [10]:
out = pd.merge(out, trends_company, how='left', on=['datamonth','clean_name'])
out = pd.merge(out, trends_tic, how='left', on=['datamonth','clean_tic'])

In [11]:
out = pd.merge(out, employees, how='left', on=['fyear', 'gvkey'])

In [12]:
del out['Unnamed: 0']
del out['orig_name']

out['tic'] = out['tic_x']
del out['tic_x']
del out['tic_y']

out['conm'] = out['conm_x']
del out['conm_x']
del out['conm_y']

out['sic'] = out['sic_x']
del out['sic_x']
del out['sic_y']

out['yearquarter'] = out['yearquarter_x']
del out['yearquarter_x']
del out['yearquarter_y']


In [13]:
out = out[~out['datadate'].isnull()]

In [14]:
print(len(breach['gvkey'].unique()))
print(len(out['gvkey'].unique()))

436
431


In [23]:
quantiles = out.groupby(by='gvkey').apply(lambda x: x['revtq'].iloc[0]).to_frame().quantile([0.25, 0.5, 0.75, 1])
quantiles.index.names = ['quantile']
quantiles.columns = ['revtq']

out['rev_quart_1'] = 0
out['rev_quart_2'] = 0
out['rev_quart_3'] = 0
out['rev_quart_4'] = 0
out['first_rev'] = 0

def classify_rev(comp):
   
    g = comp['gvkey'].iloc[0]
    fr = comp['revtq'].iloc[0]
    q1 = quantiles.loc[0.25].values[0]
    q2 = quantiles.loc[0.5].values[0]
    q3 = quantiles.loc[0.75].values[0]
    q4 = quantiles.loc[1].values[0]
    
    out.loc[out['gvkey'] == g, 'first_rev'] = fr
    
    if fr <= q4 and fr > q3:
        quartile = 4
    elif fr <= q3 and fr > q2:
        quartile = 3
    elif fr <= q2 and fr > q1:
        quartile = 2
    else:
        quartile = 1
        
    if quartile == 4:
        out.loc[out['gvkey'] == g, 'rev_quart_4'] = 1
        
    if quartile == 3:
        out.loc[out['gvkey'] == g, 'rev_quart_3'] = 1

    if quartile == 2:
        out.loc[out['gvkey'] == g, 'rev_quart_2'] = 1

    if quartile == 1:
        out.loc[out['gvkey'] == g, 'rev_quart_1'] = 1
 

out.groupby(by='gvkey').apply(classify_rev)


In [25]:
out['first_rev']

0         305.600
1         305.600
2         305.600
3         305.600
4         305.600
5         305.600
6         305.600
7         305.600
8         305.600
9         305.600
10        305.600
11        305.600
12        305.600
13        305.600
14        305.600
15        305.600
16        305.600
17        305.600
18        305.600
19        305.600
20        305.600
21        305.600
22       5309.000
23       5309.000
24       5309.000
25       5309.000
26       5309.000
27       5309.000
28       5309.000
29       5309.000
           ...   
22278      42.808
22279      42.808
22280      42.808
22281      42.808
22282      42.808
22283      42.808
22284      42.808
22285       6.107
22286       6.107
22287       6.107
22288       6.107
22289       6.107
22290       6.107
22291       6.107
22292       6.107
22293       6.107
22294       6.107
22295       6.107
22296       6.107
22297       6.107
22298       6.107
22299       6.107
22300       6.107
22301       6.107
22302     

In [26]:
# out.loc[out['gvkey'] == 1013, 'rev_quart_1'] = 0
#out.loc[out['gvkey'] == 1013, 'rev_quart_4']
print(len(out[out['rev_quart_1'] == 1]))
print(len(out[out['rev_quart_2'] == 1]))
print(len(out[out['rev_quart_3'] == 1]))
print(len(out[out['rev_quart_4'] == 1]))

7942
4596
4737
5033


In [27]:
def closest_breach(company_record):
    cols = list(set(breach.columns.to_list()) & set(company_record.columns.to_list()))
    for i in range(150):
        for col in cols:
            company_record[col] = company_record[col].ffill(limit=1).bfill(limit=1) 
    return company_record

out = out.set_index('datadate').groupby('gvkey').apply(closest_breach).reset_index()

In [28]:
out = out.drop(out[out['match'].isnull()].index)

In [29]:
na_rev_companies = out[out['revtq'].isnull()]['gvkey'].unique()
na_rev_comp = out[out['gvkey'].isin(na_rev_companies)]

In [30]:
na_rev_companies

array([  1013,   2005,   2019,   2269,   2968,   3082,   3238,   3643,
         4640,   4674,   4678,   4685,   4699,   4739,   5568,   5643,
         5786,   7647,   7982,   8007,   8245,   9783,  10035,  10187,
        10726,  11856,  11861,  12976,  14275,  15199,  15509,  15545,
        15617,  15782,  15855,  16116,  16245,  16384,  16784,  17934,
        18498,  18683,  19873,  20067,  22306,  23107,  23821,  24481,
        24905,  26156,  26272,  30188,  60923,  64336, 110250, 111537,
       111662, 111819, 114524, 125054, 139662, 141913, 153769, 161989,
       165993, 175607, 176928, 176939, 178012, 179027, 179534, 180405,
       184167, 184180, 184498, 185419, 186342, 187462, 187969, 188856,
       189860, 197559])

In [31]:
out.to_csv(out_location, index=False, quoting=csv.QUOTE_ALL)