# Workflow to transfer CSV data to SQL database

In [1]:
import os

In [2]:
old_path = os.getcwd()

print(f"Current working directory:\n\n\t{old_path}")

new_path = old_path[:-len('Notebooks')-1]
os.chdir(new_path)

print(f"\n\nNew working directory:\n\n\t{new_path}")

Current working directory:

	/Users/glangetasq/Library/Mobile Documents/com~apple~CloudDocs/Columbia/Classes/Fall_20/DeepLearning/FundClusteringProject/Repo/Notebooks


New working directory:

	/Users/glangetasq/Library/Mobile Documents/com~apple~CloudDocs/Columbia/Classes/Fall_20/DeepLearning/FundClusteringProject/Repo


# Imports

In [3]:
import pandas as pd

# Local imports
import Config
from Config.SQL.Structure.fund_clustering.morning_star import FORMATTING as MORNINGSTAR_FORMATTING
import DataHelper as dh
from Tools.latest_date_in_dataframe import latest_date_in_dataframe

# SQL Login Config

In [4]:
username = 'fx_admin'
password = '#Flexstone2020'
schema = ''
secrets_dir = '/Users/glangetasq'

# Read the CSV data

In [5]:
reader = dh.get_data_reader('csv')

## Ticker data

In [6]:
db_name = 'fund_clustering'
table_name = 'ticker'
path = Config.DATA_PATHS['ticker']

print(f"Loading from {path}...")
reader.load_table(db_name, table_name, path)

Loading from /Users/glangetasq/Library/Mobile Documents/com~apple~CloudDocs/Columbia/Classes/Fall_20/DeepLearning/FundClusteringProject/DataSummer/Tickers.csv...


In [7]:
ticker = reader.get_dataframe(db_name, table_name)


ticker = ticker.rename(columns={'crsp_fundno':'fundNo'})
ticker['caldt'] = pd.to_datetime(ticker['caldt'], format='%Y%m%d')

# Extract the latest tickers from the dataframe
ticker = ticker.groupby('fundNo').apply(latest_date_in_dataframe('caldt'))

# Only the fundNo and ticker are in the SQL table
ticker = ticker[['fundNo', 'ticker']]

# Drop Nas
ticker = ticker.dropna()

ticker.head()

Unnamed: 0_level_0,fundNo,ticker
fundNo,Unnamed: 1_level_1,Unnamed: 2_level_1
105,105,APITX
2704,2704,GFIZX
2706,2706,GGIZX
2708,2708,GCOZX
2724,2724,GGBZX


## Returns data

In [9]:
db_name = 'fund_clustering'
table_name = 'returns'
path = Config.DATA_PATHS['returns']

print(f"Loading from {path}...")
reader.load_table(db_name, table_name, path)

Loading from /Users/glangetasq/Library/Mobile Documents/com~apple~CloudDocs/Columbia/Classes/Fall_20/DeepLearning/FundClusteringProject/DataSummer/data_trimmed.csv...


In [10]:
returns = reader.get_dataframe(db_name, table_name)

In [12]:
returns['date'] = pd.to_datetime(returns['date'])
returns.drop('date', axis=1).astype(float)
returns.head()

Unnamed: 0,date,105,2704,2706,2708,2724,2725,2727,2729,2731,...,87961,89331,89332,91557,91558,91559,93598,93941,94443,94457
0,2010-01-04,0.024129,0.005268,0.010772,0.014401,0.018217,0.023312,0.021108,0.015113,0.017544,...,0.017737,0.001309,0.001908,0.009282,0.003001,0.015945,0.005342,-0.177083,0.0,0.0
1,2010-01-05,0.003927,0.00262,0.002664,0.002662,0.001883,-0.000786,0.0,0.003102,0.003918,...,0.003768,0.001961,0.004444,0.002299,0.004005,0.0,0.007439,-0.05425,0.0009,0.0009
2,2010-01-06,0.003911,0.0,0.000886,0.00177,0.00188,0.003931,0.000861,0.0,0.002342,...,0.002346,0.0,-0.000632,0.002294,0.000997,0.002846,0.003165,-0.034417,0.0,0.0
3,2010-01-07,-0.001299,0.000871,0.000885,0.0,0.0,-0.008614,0.003442,0.001237,0.006231,...,0.002341,0.0,-0.001265,0.001144,0.000996,0.001428,0.006309,-0.063366,-0.000899,-0.000899
4,2010-01-08,0.006502,0.002611,0.004421,0.0053,0.005629,0.008689,0.004288,0.005559,0.003096,...,0.000934,0.000652,0.002533,0.003415,0.00199,0.004259,0.003135,-0.078224,0.0009,0.0009


In [13]:
# Transform it to long format: primary keys (date, fundNo), value is the daily return
returns = pd.wide_to_long(returns, '', i='date', j='fundNo')
returns.columns = ['r']
returns = returns.reset_index()
returns.head()

Unnamed: 0,date,fundNo,r
0,2010-01-04,105,0.024129
1,2010-01-05,105,0.003927
2,2010-01-06,105,0.003911
3,2010-01-07,105,-0.001299
4,2010-01-08,105,0.006502


# Morningstar data

In [10]:
db_name = 'fund_clustering'
table_name = 'morning_star'
path = Config.DATA_PATHS['morningstar']

print(f"Loading from {path}...")
reader.load_table(db_name, table_name, path)

Loading from /Users/glangetasq/Library/Mobile Documents/com~apple~CloudDocs/Columbia/Classes/Fall_20/DeepLearning/FundClusteringProject/DataSummer/Summary_Updated.csv...


  if (await self.run_code(code, result,  async_=asy)):


In [11]:
mrnstar = reader.get_dataframe(db_name, table_name)

# Formatting using the dict in config
cols_to_drop = []
for colname in mrnstar:

    formatting_function = MORNINGSTAR_FORMATTING.get(colname, None)

    if formatting_function:
        mrnstar[colname] = formatting_function(mrnstar[colname])
    else:
        cols_to_drop.append(colname)
        
        
mrnstar = mrnstar.drop(cols_to_drop, axis=1)

In [12]:
mrnstar = mrnstar.rename(columns={'crsp_fundno':'fundNo', 'caldt':'date'})
mrnstar.head()

Unnamed: 0,fundNo,date,per_com,per_pref,per_conv,per_corp,per_muni,per_govt,per_oth,per_cash,per_bond,per_abs,per_mbs,per_eq_oth,per_fi_oth,lipper_class_name
0,105,2010-03-31,77.44,0.06,0.0,0.0,0.0,0.0,0.54,1.49,0.0,0.0,0.0,20.46,0.0,Global Multi-Cap Core
1,105,2010-06-30,78.48,0.06,0.0,0.0,0.0,0.0,0.03,2.12,0.0,0.0,0.0,19.3,0.0,Global Multi-Cap Core
2,105,2010-09-30,77.52,0.05,0.0,0.0,0.0,0.0,0.04,2.69,0.0,0.0,0.0,19.7,0.0,Global Multi-Cap Core
3,105,2010-12-31,77.47,0.08,0.0,0.0,0.0,0.0,0.04,1.19,0.0,0.0,0.0,21.22,0.0,Global Multi-Cap Core
4,105,2011-03-31,77.99,0.05,0.0,0.0,0.0,0.0,0.09,1.89,0.0,0.0,0.0,19.97,0.0,Global Multi-Cap Core


# Write dataframes to SQL

In [14]:
writer = dh.get_data_writer(username=username, password=password, secrets_dir=secrets_dir)

## Ticker table

In [15]:
db_name = 'fund_clustering'
table_name = 'ticker'

writer.update_raw_data(db_name, table_name, ticker)

## Returns table

This takes some time to run.

In [None]:
db_name = 'fund_clustering'
table_name = 'returns'

writer.update_raw_data(db_name, table_name, returns, chunk_size=100000)

## Morningstar table

This takes even longer to run.

In [None]:
db_name = 'fund_clustering'
table_name = 'morning_star'

writer.update_raw_data(db_name, table_name, mrnstar, chunk_size=50000)