In [5]:
from datetime import datetime

import numpy as np
import pandas as pd

# File name of the cleaned ratios CSV: split_industries reads it, and
# write_csv produces a file with this same name.
FAMA_49CRSP = 'FAMA_49CRSP.csv'

In [6]:
def read_csv(filename):
    """
    Load the financial-ratio CSV at `filename` into a cleaned DataFrame.

    Four median columns (two pe_op, two PEG forward) are removed, every row
    that still contains a missing value is discarded, and the remaining rows
    are ordered by ascending 'public_date'. The original row labels are kept:
    the index is NOT reset, so it is neither contiguous nor monotonic.
    """
    unused_columns = [
        'pe_op_basic_Median',
        'pe_op_dil_Median',
        'PEG_1yrforward_Median',
        'PEG_ltgforward_Median',
    ]
    ratios = pd.read_csv(filename).drop(columns=unused_columns)

    # NOTE(review): the sort uses whatever dtype pandas inferred for
    # 'public_date'; if the column is a non-ISO date string this ordering is
    # lexicographic rather than chronological -- confirm the export format.
    return ratios.dropna().sort_values(by='public_date', ascending=True)

def pct_format(col):
    """
    Convert a column of percentage strings (e.g. '1.5%') into floats.

    Parameters
    ----------
    col : pandas.Series or any iterable
        Values to convert. Strings have surrounding whitespace and a trailing
        '%' removed before conversion; missing values (None / NaN) become 0.0;
        values that are already numeric are passed through float().

    Returns
    -------
    pandas.Series of float
        Same length and order as `col`. When `col` is a Series its index is
        preserved, so the result can be assigned back to the originating
        DataFrame without label re-alignment scrambling the rows.

    Raises
    ------
    ValueError
        If a string value cannot be parsed as a number.
    """
    def _to_float(value):
        # pd.isna covers None and every NaN, not only the np.nan singleton
        # that an identity check would recognise.
        if pd.isna(value):
            return 0.0
        if isinstance(value, str):
            return float(value.strip().rstrip('%'))
        return float(value)

    # Keep the caller's labels: a fresh RangeIndex would misalign with a
    # frame whose index has been filtered or re-ordered.
    index = col.index if isinstance(col, pd.Series) else None
    return pd.Series([_to_float(v) for v in col], index=index, dtype=float)

def write_csv(filename, out_filename=None):
    """
    Clean the raw financial-ratio CSV and save the result to disk.

    Loads `filename` through read_csv, converts the 'divyield_Median' column
    from percentage strings to floats with pct_format, and writes the frame
    as CSV.

    Parameters
    ----------
    filename : str
        Path of the raw CSV to clean.
    out_filename : str, optional
        Destination path. Defaults to the module-level FAMA_49CRSP constant.

    Returns
    -------
    None
    """
    if out_filename is None:
        out_filename = FAMA_49CRSP

    df = read_csv(filename)

    # read_csv drops and re-orders rows without resetting the index, so the
    # converted values are assigned by position (.values) rather than by
    # label; label alignment would shuffle them and introduce NaN.
    df['divyield_Median'] = pct_format(df['divyield_Median']).values

    # The (non-contiguous) index is still written as the first column, as
    # before, so the file layout is unchanged for existing readers.
    df.to_csv(out_filename)
    
# Build the cleaned ratios CSV from the raw input file.
# NOTE(review): the hash-like file name looks like a data-vendor export --
# confirm its provenance and record it here.
write_csv('ee6d2f60cdafb550.csv')

In [8]:
def split_industries(filename=None, out_dir='industries'):
    """
    Split the cleaned ratios CSV into one CSV file per industry.

    Rows are grouped by the 'FFI49_desc' column; each group is written to
    '<out_dir>/<industry>.csv' with the 'FFI49_desc' column removed. Rows
    keep their original relative order and index labels.

    Parameters
    ----------
    filename : str, optional
        CSV to split. Defaults to the module-level FAMA_49CRSP constant.
    out_dir : str, optional
        Directory receiving the per-industry files; created if missing.

    Returns
    -------
    set
        The distinct 'FFI49_desc' values for which a file was written.
    """
    import os

    if filename is None:
        filename = FAMA_49CRSP

    df = pd.read_csv(filename)

    # to_csv does not create parent directories; without this a fresh
    # checkout fails on the first write.
    os.makedirs(out_dir, exist_ok=True)

    industries = set()
    # One pass over the frame instead of a full boolean scan per industry.
    for ind, df_ind in df.groupby('FFI49_desc', sort=False):
        df_ind = df_ind.drop(columns='FFI49_desc')
        df_ind.to_csv(os.path.join(out_dir, '{}.csv'.format(ind)))
        industries.add(ind)

    return industries

# Write the per-industry CSV files and show the set of industry codes found.
print(split_industries())

{'OIL', 'FABPR', 'MEDEQ', 'TRANS', 'BUSSV', 'TOYS', 'RTAIL', 'SOFTW', 'CNSTR', 'OTHER', 'BANKS', 'GUNS', 'WHLSL', 'DRUGS', 'RLEST', 'ELCEQ', 'CHIPS', 'AUTOS', 'GOLD', 'PAPER', 'FIN', 'LABEQ', 'MACH', 'SODA', 'INSUR', 'COAL', 'RUBBR', 'AERO', 'FOOD', 'BOXES', 'HARDW', 'BOOKS', 'MEALS', 'UTIL', 'SHIPS', 'AGRIC', 'CHEMS', 'BLDMT', 'BEER', 'CLTHS', 'STEEL', 'SMOKE', 'MINES', 'HSHLD', 'PERSV', 'TXTLS', 'FUN', 'HLTH', 'TELCM'}
