# How to make new data for unit testing

(Almost) everything is contained in the DataMaker class

In [1]:
import DataHelper
import pandas as pd
import numpy as np

There's three variables to create a new row:

```python
holding_asset_row = {'caldt':year, 'cash':100*pct, 'equity':100*pct, 'bond':100*pct, 'security':100*pct}
daily_returns_ts = pd.Series(daily_returns_series, index=datetime_axis)
fund_mrnstar_row = {'caldt':year, 'lipser_class_name':string}
```

Example:

In [2]:
holding_asset_row = {'caldt':2020, 'cash':50, 'equity':25, 'bond':12.5, 'security':12.5}

In [3]:
daily_returns_ts = pd.Series(np.random.rand(9), index=pd.to_datetime([f"2020-10-0{i}" for i in range(1, 10)]))
daily_returns_ts

2020-10-01    0.586803
2020-10-02    0.120362
2020-10-03    0.831593
2020-10-04    0.296304
2020-10-05    0.109187
2020-10-06    0.636765
2020-10-07    0.149235
2020-10-08    0.566731
2020-10-09    0.969903
dtype: float64

In [4]:
fund_mrnstar_row = {'caldt':2020, 'lipser_class_name':'No one cares'}

In [5]:
holding_asset_row_2 = {'caldt':2020, 'cash':100, 'equity':0, 'bond':0, 'security':0}
daily_returns_ts_2 = pd.Series(np.random.rand(9), index=pd.to_datetime([f"2020-10-0{i}" for i in range(1, 10)]))
fund_mrnstar_row_2 = {'caldt':2020, 'lipser_class_name':'No one cares'}

# There's 3 way to add fake data

## 1. Add one at a time

In [6]:
maker = DataHelper.get_data_maker()

maker.add_fake_fund(holding_asset_row, daily_returns_ts, fund_mrnstar_row)
maker.add_fake_fund(holding_asset_row_2, daily_returns_ts_2, fund_mrnstar_row_2);

## 2. Bulk add by list of 3-tuples

In [7]:
maker = DataHelper.get_data_maker()

maker.bulkadd_fake_fund([
    (holding_asset_row, daily_returns_ts, fund_mrnstar_row),
    (holding_asset_row_2, daily_returns_ts_2, fund_mrnstar_row_2),
])

## 3. (Not important) Builder Design Pattern

In [8]:
maker = DataHelper.get_data_maker()

# maker.add_fake_fund return itself, so we can continue to use .add_fake_fund again and again.
maker.add_fake_fund(holding_asset_row, daily_returns_ts, fund_mrnstar_row)\
    .add_fake_fund(holding_asset_row_2, daily_returns_ts_2, fund_mrnstar_row_2);

# When every fake funds has been added, convert to actual DataCache

In [9]:
preprocessor = DataHelper.get_data_preprocessor()
clustering_year = 2020
cache = maker.convert_to_data_cache(preprocessor, clustering_year)

# Expected results

In [10]:
cache.returns

Unnamed: 0,0,1
2020-10-01,0.586803,0.748527
2020-10-02,0.120362,0.980786
2020-10-03,0.831593,0.286497
2020-10-04,0.296304,0.683949
2020-10-05,0.109187,0.820992
2020-10-06,0.636765,0.301519
2020-10-07,0.149235,0.896867
2020-10-08,0.566731,0.275698
2020-10-09,0.969903,0.618464


In [11]:
cache.cumul_returns

Unnamed: 0,0,1
2020-10-01,1.586803,1.748527
2020-10-02,1.777794,3.463456
2020-10-03,3.256196,4.455727
2020-10-04,4.221021,7.503217
2020-10-05,4.681901,13.663302
2020-10-06,7.663171,17.783042
2020-10-07,8.806784,33.732066
2020-10-08,13.797859,43.031939
2020-10-09,27.180444,69.645637


In [12]:
cache.holding_asset

Unnamed: 0,crsp_fundno,caldt,cash,equity,bond,security
0,0,2020-12-31,50,25,12.5,12.5
1,1,2020-12-31,100,0,0.0,0.0


In [13]:
cache.asset_type

['cash', 'equity', 'bond', 'security']

In [14]:
cache.fund_mrnstar

Unnamed: 0,crsp_fundno,caldt,lipper_class_name,lipser_class_name
0,0,2020-12-31,,No one cares
1,1,2020-12-31,,No one cares


In [15]:
cache.fundno_ticker

{0: 0, 1: 1}

# Use this new data in the models

In [16]:
from Models import TwoLayerFundClustering

In [17]:
model = TwoLayerFundClustering(clustering_year)
model.fit(source_type='CustomCache', cache=cache) # Doesn't work with this fake data.

ValueError: Number of labels is 2. Valid values are 2 to n_samples - 1 (inclusive)