In [16]:
import pandas as pd
import csv
import re

compustat_location = '../archive/compustat_feb.csv'
breach_location = '../data/data_breaches_final.csv'
bea_location = '../archive/BEA_database.csv'
trends_tic_location = '../data/trends_tic.csv'
trends_conm_location = '../data/trends_conm.csv'
out_location = '../data/COMPUSTAT_merged_trends_feb.csv'

In [17]:
remove = ['CORP', 'INC', 'LTD', 'CORPORATION', 'INCORPORATED', 'LLC', 'LTD', 'GROUP', 'NV', 'GRP', 'PLC']

def clean_conm(name):
    name = str(name)
    name = re.sub(r'[^\s\w]+', '', name)
    namewords = name.split()
    resultwords = [word for word in namewords if word not in remove]
    result = ' '.join(resultwords)
    result = result.strip(' ')
    return result

def clean_tic(name):
    name = str(name)
    name = re.sub(r'[^\s\w]+', '', name)
    result = name.strip(' ')
    return result

In [18]:
compustat = pd.read_csv(compustat_location)
compustat['datadate'] = pd.to_datetime(compustat['datadate'], format='%Y%m%d')
# Remove any data from outside of US companies
compustat = compustat[compustat['curcdq'] == 'USD']
compustat['datamonth'] = compustat['datadate'].apply(lambda x: x.replace(day = 1))
compustat['clean_name'] = compustat['conm'].apply(clean_conm)
compustat['clean_tic'] = compustat['tic'].apply(clean_tic)

In [19]:
breach = pd.read_csv(breach_location)
breach['datacqtr'] = breach['yearquarter']
breach['gvkey'] = breach['GVKEY']
breach = breach[breach['match'] == 1]
breach['Date Made Public'] = pd.to_datetime(breach['Date Made Public'], format='%B %d, %Y')

In [20]:
bea = pd.read_csv(bea_location)
bea['datacqtr'] = bea['yearquarter']

In [21]:
# trends = pd.read_csv(trends_location).set_index('date')
# trends_tic = trends.iloc[:, ::2]
# trends_company = trends.iloc[:, 1::2]

trends_company = pd.read_csv(trends_conm_location, index_col='date').stack()
trends_company = trends_company.reset_index()
trends_company['date'] = pd.to_datetime(trends_company['date'])
trends_company.columns = ['datamonth', 'clean_name', 'trend_index_company']

trends_tic = pd.read_csv(trends_tic_location, index_col='date').stack()
trends_tic = trends_tic.reset_index()
trends_tic['date'] = pd.to_datetime(trends_tic['date'])
trends_tic.columns = ['datamonth', 'clean_tic', 'trend_index_tic']

In [22]:
out = pd.merge(compustat, breach, how='outer', on=['gvkey', 'datacqtr'])

In [23]:
out = pd.merge(out, bea, how='left', on='datacqtr')

In [24]:
out = pd.merge(out, trends_company, how='left', on=['datamonth','clean_name'])
out = pd.merge(out, trends_tic, how='left', on=['datamonth','clean_tic'])

In [25]:
del out['Unnamed: 0']
del out['orig_name']

out['tic'] = out['tic_x']
del out['tic_x']
del out['tic_y']

out['conm'] = out['conm_x']
del out['conm_x']
del out['conm_y']

out['sic'] = out['sic_x']
del out['sic_x']
del out['sic_y']

out['yearquarter'] = out['yearquarter_x']
del out['yearquarter_x']
del out['yearquarter_y']


In [39]:
out = out[~out['datadate'].isnull()]

In [44]:
print(len(breach['gvkey'].unique()))
print(len(out['gvkey'].unique()))

436
431


In [66]:
out.head()

Unnamed: 0,gvkey,datadate,fyearq,fqtr,fyr,indfmt,consol,popsrc,datafmt,cusip,...,real_gd_investment,real_exports,real_imports,real_government_spending,trend_index_company,trend_index_tic,tic,conm,sic,yearquarter
0,1013,2005-07-31,2005.0,3.0,10.0,INDL,C,D,STD,886309,...,100.04,69.685,87.021,95.934,47.0,95.0,ADCT,ADC TELECOMMUNICATIONS INC,3661.0,
1,1013,2005-10-31,2005.0,4.0,10.0,INDL,C,D,STD,886309,...,101.362,69.853,87.593,96.251,38.0,52.0,ADCT,ADC TELECOMMUNICATIONS INC,3661.0,
2,1013,2006-01-31,2006.0,1.0,10.0,INDL,C,D,STD,886309,...,104.657,71.871,90.392,96.326,29.0,49.0,ADCT,ADC TELECOMMUNICATIONS INC,3661.0,
3,1013,2006-04-30,2006.0,2.0,10.0,INDL,C,D,STD,886309,...,106.211,74.602,92.313,97.549,24.0,40.0,ADCT,ADC TELECOMMUNICATIONS INC,3661.0,
4,1013,2006-07-31,2006.0,3.0,10.0,INDL,C,D,STD,886309,...,105.526,76.06,93.32,97.52,27.0,20.0,ADCT,ADC TELECOMMUNICATIONS INC,3661.0,


In [104]:
quantiles = out.groupby(by='gvkey').apply(lambda x: x['revtq'].iloc[0]).to_frame().quantile([0.25, 0.5, 0.75, 1])
quantiles.index.names = ['quantile']
quantiles.columns = ['revtq']

out['rev_quart_1'] = 0
out['rev_quart_2'] = 0
out['rev_quart_3'] = 0
out['rev_quart_4'] = 0

def classify_rev(comp):
   
    g = comp['gvkey'].iloc[0]
    fr = comp['revtq'].iloc[0]
    q1 = quantiles.loc[0.25].values[0]
    q2 = quantiles.loc[0.5].values[0]
    q3 = quantiles.loc[0.75].values[0]
    q4 = quantiles.loc[1].values[0]
    
    if fr <= q4 and fr > q3:
        quartile = 4
    elif fr <= q3 and fr > q2:
        quartile = 3
    elif fr <= q2 and fr > q1:
        quartile = 2
    else:
        quartile = 1
        
    if quartile == 4:
        out.loc[out['gvkey'] == g, 'rev_quart_4'] = 1
        
    if quartile == 3:
        out.loc[out['gvkey'] == g, 'rev_quart_3'] = 1

    if quartile == 2:
        out.loc[out['gvkey'] == g, 'rev_quart_2'] = 1

    if quartile == 1:
        out.loc[out['gvkey'] == g, 'rev_quart_1'] = 1
 

out.groupby(by='gvkey').apply(classify_rev)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus

In [105]:
# out.loc[out['gvkey'] == 1013, 'rev_quart_1'] = 0
#out.loc[out['gvkey'] == 1013, 'rev_quart_4']
print(len(out[out['rev_quart_1'] == 1]))
print(len(out[out['rev_quart_2'] == 1]))
print(len(out[out['rev_quart_3'] == 1]))
print(len(out[out['rev_quart_4'] == 1]))

6617
4120
4263
4473


In [107]:
out.to_csv(out_location, index=False, quoting=csv.QUOTE_ALL)