In [4]:
import pandas as pd
import csv
import datetime
import re

# File locations used by the cells below (all relative to the notebook's directory).
# Inputs:
crsp_location = '../archive/CRSP_small_database.csv'  # stock file; its date, TICKER and COMNAM columns are used
breach_location = '../data/data_breaches_final.csv'  # breach list; merged onto the stock file by date and TICKER
ff_location = '../data/FF_factors.csv'  # factor file; merged on date
trends_tic_location = '../data/trends_tic.csv'  # wide trends table, one column per cleaned ticker
trends_conm_location = '../data/trends_conm.csv'  # wide trends table, one column per cleaned company name
# Output:
out_location = '../data/CRSP_merged_trends.csv'  # merged frame written by the final cell

In [2]:
# Corporate-form tokens dropped from names by clean_name (exact, case-sensitive match).
remove = [
    'CORP', 'INC', 'LTD', 'CORPORATION', 'INCORPORATED', 'LLC',
    'LTD', 'GROUP', 'NV', 'GRP', 'PLC',
]


def clean_name(name):
    """Normalise a company name or ticker into a merge key.

    The value is converted with ``str()``, every character that is neither
    whitespace nor a word character is deleted, the text is split on
    whitespace, tokens listed in ``remove`` are dropped, and the remaining
    tokens are joined with single spaces.

    Parameters
    ----------
    name : any
        Raw value; non-strings are stringified first, so a float NaN
        becomes the literal text ``'nan'``.

    Returns
    -------
    str
        The cleaned name; empty when every token was removed.
    """
    # Delete punctuation outright (no space is inserted), so 'AT&T' -> 'ATT'.
    stripped = re.sub(r'[^\s\w]+', '', str(name))
    # Token comparison is case-sensitive: 'Inc' is kept, 'INC' is dropped.
    kept_tokens = filter(lambda token: token not in remove, stripped.split())
    return ' '.join(kept_tokens).strip(' ')

In [6]:
# Load the stock file and derive the keys used by the later merges.
crsp = pd.read_csv(crsp_location)
# Dates are stored as YYYYMMDD; parse them into real timestamps.
crsp['date'] = pd.to_datetime(crsp['date'], format='%Y%m%d')
# First day of each observation's month -- the key shared with the trends tables.
# Vectorised via the .dt accessor instead of a per-row Python lambda.
crsp['datamonth'] = crsp['date'].dt.to_period('M').dt.to_timestamp()
# Normalised company name and ticker, matched against the trends column labels.
crsp['clean_name'] = crsp['COMNAM'].apply(clean_name)
crsp['clean_tic'] = crsp['TICKER'].apply(clean_name)

In [7]:
# Read the breach list, add the two merge keys, then keep only rows with match == 1.
breach = (
    pd.read_csv(breach_location)
    .assign(
        # 'Date Made Public' is text in '<month name> <day>, <year>' form.
        date=lambda frame: pd.to_datetime(frame['Date Made Public'], format='%B %d, %Y'),
        # Duplicate the ticker under the column name used by the stock file.
        TICKER=lambda frame: frame['tic'],
    )
    .loc[lambda frame: frame['match'] == 1]
)

In [8]:
# Read the factor file and parse its YYYYMMDD date column so it can be merged on date.
ff = pd.read_csv(ff_location).assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%d')
)

In [9]:
def _load_trends(csv_path, key_label, value_label):
    """Read a wide trends CSV and return it in long form.

    The file is read with its 'date' column as the index, one column per
    entity. Stacking turns it into one row per (date, entity) pair; the
    three resulting columns are renamed positionally to 'datamonth',
    ``key_label`` and ``value_label``.
    """
    long_form = pd.read_csv(csv_path, index_col='date').stack().reset_index()
    long_form['date'] = pd.to_datetime(long_form['date'])
    long_form.columns = ['datamonth', key_label, value_label]
    return long_form


# Trends series keyed by cleaned company name and by cleaned ticker.
trends_company = _load_trends(trends_conm_location, 'clean_name', 'trend_index_company')
trends_tic = _load_trends(trends_tic_location, 'clean_tic', 'trend_index_tic')

In [10]:
# Attach breach records to the stock rows sharing the same date and ticker.
# NOTE(review): how='outer' also keeps breach rows with no stock row on that
# date -- confirm this is intended.
out = crsp.merge(breach, how='outer', on=['date', 'TICKER'])

In [11]:
# Add the factor columns for each date; a left join keeps every existing row.
# This cell rebinds `out`, so re-running it without re-running the previous
# merge cell would merge the factor columns a second time.
out = out.merge(ff, how='left', on='date')

In [12]:
# Attach the monthly trends series, first by cleaned company name, then by cleaned ticker.
# NOTE(review): both joins are how='outer', so trends rows with no matching
# stock row are kept as well -- confirm this is intended.
out = (
    out
    .merge(trends_company, how='outer', on=['datamonth', 'clean_name'])
    .merge(trends_tic, how='outer', on=['datamonth', 'clean_tic'])
)

In [13]:
# Display the merged frame for a visual check before it is written to disk.
out

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,SICCD,NCUSIP,TICKER,COMNAM,SHRCLS,TSYMBOL,...,personal_information,yearquarter,Mkt-RF,SMB,HML,RMW,CMA,RF,trend_index_company,trend_index_tic
0,10104.0,2005-01-03,11.0,3.0,7370.0,68389X10,ORCL,ORACLE CORP,,ORCL,...,,,-0.97,-0.65,-0.04,0.37,-0.02,0.008,97.0,1.0
1,10104.0,2005-01-04,11.0,3.0,7370.0,68389X10,ORCL,ORACLE CORP,,ORCL,...,,,-1.30,-0.50,0.44,0.90,-0.49,0.008,97.0,1.0
2,10104.0,2005-01-05,11.0,3.0,7370.0,68389X10,ORCL,ORACLE CORP,,ORCL,...,,,-0.51,-1.13,0.00,0.08,-0.14,0.008,97.0,1.0
3,10104.0,2005-01-06,11.0,3.0,7370.0,68389X10,ORCL,ORACLE CORP,,ORCL,...,,,0.34,-0.03,0.13,0.46,-0.13,0.008,97.0,1.0
4,10104.0,2005-01-07,11.0,3.0,7370.0,68389X10,ORCL,ORACLE CORP,,ORCL,...,,,-0.22,-0.82,-0.02,-0.17,-0.02,0.008,97.0,1.0
5,10104.0,2005-01-10,11.0,3.0,7370.0,68389X10,ORCL,ORACLE CORP,,ORCL,...,,,0.42,0.30,0.04,0.05,-0.23,0.008,97.0,1.0
6,10104.0,2005-01-11,11.0,3.0,7370.0,68389X10,ORCL,ORACLE CORP,,ORCL,...,,,-0.68,-0.30,0.30,0.93,-0.17,0.008,97.0,1.0
7,10104.0,2005-01-12,11.0,3.0,7370.0,68389X10,ORCL,ORACLE CORP,,ORCL,...,,,0.38,-0.10,-0.16,-0.22,0.13,0.008,97.0,1.0
8,10104.0,2005-01-13,11.0,3.0,7370.0,68389X10,ORCL,ORACLE CORP,,ORCL,...,,,-0.77,0.25,0.38,0.05,0.04,0.008,97.0,1.0
9,10104.0,2005-01-14,11.0,3.0,7370.0,68389X10,ORCL,ORACLE CORP,,ORCL,...,,,0.66,0.51,0.07,-0.25,0.16,0.008,97.0,1.0


In [14]:
# Write the merged frame to disk: every field quoted, row index omitted.
out.to_csv(path_or_buf=out_location, quoting=csv.QUOTE_ALL, index=False)