# Analyzing Chilean Mutual Funds

## Imports

Most of the code required to run our analysis is located within the `dva` package, so we only need to import that and a few others.

In [None]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

import dva

In [None]:
# Directory layout used throughout the notebook, anchored at the current
# working directory: <cwd>/data holds processed files, <cwd>/data/raw the
# raw downloads.
HERE = pathlib.Path.cwd()
DATA_DIR = HERE / 'data'
RAW_FILES = DATA_DIR / 'raw'


## Pull Data from Web

Here we connect to the AAFM Public Daily Statistics API to pull daily mutual fund data from 2015 through our project start (Feb 13, 2021).  Each JSON response is converted to a Pandas DataFrame and exported to CSV in the `RAW_FILES` directory. 

In [None]:
# Build the dates to request (2015-01-01 through 2021-02-13) and hand them to
# `dva.pull_all`, which is given RAW_FILES as its output location.
# NOTE(review): presumably one API request / CSV per date -- confirm in dva.pull_all.
dates = dva.get_dates('2015-01-01', '2021-02-13')
dva.pull_all(dates, RAW_FILES)

## Convert Raw Data to Parquet

We use Dask to extract and subset raw data into separate, compressed parquet files:

- `fund_data.parq`, containing static, descriptive and categorical data on each unique mutual fund;
- `fund_flows.parq`, containing time series data on mutual fund inflows and outflows, 2015 - 2021;
- `fund_prices.parq`, containing time series data on mutual fund prices, 2015 - 2021.

In [None]:
# Convert the raw exports in RAW_FILES into the parquet datasets stored in
# DATA_DIR (fund_data.parq, fund_flows.parq, fund_prices.parq).
dva.raw_to_parq(RAW_FILES, DATA_DIR)

## Transform and Join Data

We join our static fund data with time series fund data, downsample to monthly price observations, and transform monthly prices into monthly percentage returns.

We keep only observations that have **at least** 36 months of data and still exist in the most recent month.  We'll also filter out any observations whose price does not change across the entire time period.


In [None]:
# Parquet inputs for this step.
price_data = DATA_DIR / 'fund_prices.parq'
fund_data = DATA_DIR / 'fund_data.parq'

# Monthly prices -> joined with static fund data -> closed funds removed ->
# cleaned (read the nested call from the inside out).
prices = dva.get_monthly_prices(price_data)
fund_data_monthly_prices = dva.clean_from_monthly_prices_raw(
    dva.remove_closed_funds(
        dva.join_fund_data(prices, fund_data)
    )
)

# Drop the two listed columns from the frame in place.
fund_data_monthly_prices.drop(columns=['svsCategoryId', 'currency'], inplace=True)

In [None]:
# Split the cleaned monthly-price table into a static part and a price part,
# derive returns from the prices, and recombine both into one dataset.
# NOTE(review): the layout of each intermediate is defined by the dva helpers,
# which are not visible here.
static = dva.get_static_data(fund_data_monthly_prices)
prices = dva.get_price_data(fund_data_monthly_prices)  # rebinds `prices` from the previous cell
returns = dva.get_return_data(prices)
dataset = dva.make_dataset(static, returns)
# Category means are computed from the dataset and passed back in to fill gaps.
# NOTE(review): presumably missing returns are replaced by their category's
# mean -- confirm in dva.fill_na.
cat_means = dva.get_category_means(dataset)
monthly_returns = dva.fill_na(dataset, cat_means)

## Map Fund Return Data to Lower Dimensions, Join with Fund Flows Data



In [None]:
# Map the monthly returns (remaining gaps replaced by 0) with the 'tsne'
# method, then attach the fund-flow data by series identifier.
returns_no_gaps = monthly_returns.fillna(0)
mapped = dva.map_all_by_group(returns_no_gaps, 'tsne')

flows_path = DATA_DIR / 'fund_flows.parq'
fund_flows = dva.get_fund_flows_2020(flows_path)
fund_data_monthly_returns_tsne = mapped.merge(fund_flows, on='fundRUNSeries')

# Rescale net patrimony into the range [10, 50]; the scaler expects a 2-D
# column, hence the reshape.
sc = MinMaxScaler(feature_range=(10, 50))
net_patrimony = fund_data_monthly_returns_tsne['netPatrimony'].values.reshape(-1, 1)
fund_data_monthly_returns_tsne['netPatrimony_scaled'] = sc.fit_transform(net_patrimony)

## Calculate Annualized Fund Metrics, Join



In [None]:
# Compute per-fund performance metrics and join the two annualized ones onto
# the t-SNE / fund-flow frame by series identifier.
fund_stats = dva.add_perf_metrics(fund_data_monthly_prices)
metric_cols = ['ann_return', 'ann_stdev']
data = fund_data_monthly_returns_tsne.merge(
    fund_stats[['fundRUNSeries'] + metric_cols],
    on='fundRUNSeries',
)

# Scale both metrics by 100 (presumably fraction -> percent).
data[metric_cols] = data[metric_cols] * 100

## Find Anomalies



In [None]:
def _score_category(category):
    """Return a copy of one AAFM category's rows with a `dist_anomaly` column."""
    subset = data[data['aafmCategory'] == category].copy()
    # NOTE(review): 1.75 is passed straight to dva.get_anomalies; its meaning
    # (presumably a distance threshold) is defined there.
    subset['dist_anomaly'] = dva.get_anomalies(subset, 1.75)
    return subset


# Score every category on its own, then stitch the pieces together and
# restore index order.
all_data = [_score_category(category) for category in data['aafmCategory'].unique()]
scored = pd.concat(all_data).sort_index()

## Clustering with K-Means

Here we'll cluster our funds for the purpose of re-creating a covariance and/or correlation matrix.  We'll start with 31 unique fund categories, and re-group into 10 unique clusters.

## Selecting subset of columns

We'll use the annualized returns and standard deviation, along with the TSNE components, to cluster the funds.  We're not including the monthly returns as that information is already encoded in the TSNE components.

In [None]:
# Features used for clustering: annualized return and standard deviation plus
# the two t-SNE coordinates.
#
# The annualized metrics are selected by name. They are appended to `data` by
# the merge with `fund_stats`, which makes them its LAST two columns, so the
# previous positional slice `data.columns[-4:-2]` picked up other columns.
cols = pd.Index(['ann_return', 'ann_stdev'])
# NOTE(review): the t-SNE coordinates are still selected by position because
# their column names are not visible here -- confirm that columns [-9:-7] of
# `data` really are the two t-SNE components (the offset may need the same
# correction as above).
cols = cols.append(data.columns[-9:-7])  # adds tSNE coordinates
df = data[cols]

## Run K-means pipeline and plot results

We'll use our original TSNE mapping, but replace the `aafmCategory` factor with our new cluster labels.

### Choosing Number of Clusters

We examined results of four methods to determine appropriate number of clusters:
- Elbow method
- Calinski-Harabasz
- Silhouette
- Davies-Bouldin

The elbow method is rather subjective.  Results indicate the optimal number could lie between 7 and 10 clusters.

The Calinski-Harabasz score is inconclusive-- the score exhibits multiple, increasing peaks.  The Silhouette and Davies-Bouldin scores indicate that 10 clusters may be appropriate; therefore, we choose to use 10 clusters.

In [None]:
# Standardize the selected features, then partition the funds into 10 clusters.
# A fixed random_state with n_init=20 restarts keeps the labelling reproducible.
pl = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('cluster', KMeans(n_clusters=10, n_init=20, random_state=7)),
    ]
)
pl.fit(df)
labels = pl.named_steps['cluster'].labels_
n_clusters = len(np.unique(labels))

# The labels follow the row order of `df`, while `scored` is a separately
# re-assembled frame. Attach them by index instead of by position so that each
# fund receives its own label even if the two frames ever differ in order or
# row count.
scored['cluster'] = pd.Series(labels, index=df.index)

In [None]:
# Persist the scored and clustered fund table in the data directory.
file_name = 'FundDataWithMonthlyReturnsTSNEScoredScaledClustered.csv'
scored.to_csv(DATA_DIR / file_name)

## Efficient Frontier Portfolio

We'll now generate portfolios of mutual fund assets to create efficient frontiers per Modern Portfolio Theory.  We'll first use our K-Means generated clusters and then the original AAFM categories.

In [None]:
# Average monthly return per K-Means cluster, laid out with one row per month
# and one column per cluster.
#
# `.assign` returns a new frame, so the cluster column is never written into a
# slice of `scored` (the previous column assignment on the `.loc` result
# triggered pandas' SettingWithCopyWarning).
data_cluster = (
    scored.loc[:, '2015-01-31':'2021-01-31']
    .assign(cluster=scored['cluster'])
    .groupby('cluster')
    .mean()
    .T
)
# Skip the first month.
# NOTE(review): presumably because the first month has no usable return --
# confirm against dva.get_return_data.
data_cluster = data_cluster.iloc[1:]
data_cluster.head()

In [None]:
# Labels for the simulated weights. They are taken from `data_cluster` so that
# the i-th weight carries the label of the cluster whose returns it multiplies;
# `scored['cluster'].unique()` is in order of first appearance, which does not
# match the column order of `data_cluster`.
cluster_funds = data_cluster.columns

# Create 6000 random portfolios so that we sample almost every portfolio
# condition in terms of volatility and return.
np.random.seed(42)
num_ports = 6000
num_assets = len(cluster_funds)  # follows the data instead of a hard-coded 10
all_weights = np.zeros((num_ports, num_assets))
ret_arr = np.zeros(num_ports)
vol_arr = np.zeros(num_ports)
sharpe_arr = np.zeros(num_ports)

# Loop-invariant inputs, computed once instead of on every iteration.
mean_returns = data_cluster.mean()
annual_cov = data_cluster.cov() * 12  # monthly covariance scaled to 12 months

for x in range(num_ports):
    # Random weights, normalized to sum to 1
    weights = np.random.random(num_assets)
    weights = weights / np.sum(weights)

    # Save weights
    all_weights[x, :] = weights

    # Expected return (monthly mean scaled to 12 months)
    ret_arr[x] = np.sum(mean_returns * weights * 12)

    # Expected volatility: sqrt(w' * Cov * w)
    vol_arr[x] = np.sqrt(np.dot(weights.T, np.dot(annual_cov, weights)))

    # Sharpe ratio (no risk-free rate is subtracted)
    sharpe_arr[x] = ret_arr[x] / vol_arr[x]

In [None]:
# Report the highest Sharpe ratio among the simulated portfolios and the row
# at which it occurs.
best_sharpe = sharpe_arr.max()
best_position = sharpe_arr.argmax()
print(f'Max sharpe ratio in the array: {best_sharpe}')
print(f'Location in the array: {best_position}')

In [None]:
# Weights (scaled by 100) of the max-Sharpe portfolio. The row is looked up
# with argmax rather than a hard-coded index, so it stays correct if the
# simulation inputs, seed or number of portfolios change.
print(all_weights[sharpe_arr.argmax(), :] * 100)

In [None]:
# Return / volatility coordinates of the max-Sharpe portfolio.
max_sr_idx = sharpe_arr.argmax()
max_sr_ret = ret_arr[max_sr_idx]
max_sr_vol = vol_arr[max_sr_idx]

### Sample Portfolios Using K-Mean Clusters

In [None]:
# Scatter of all simulated portfolios (volatility vs. return), coloured by
# Sharpe ratio, with the max-Sharpe portfolio marked in red.
fig, ax = plt.subplots(figsize=(12, 8))
points = ax.scatter(vol_arr, ret_arr, c=sharpe_arr, cmap='viridis')
fig.colorbar(points, ax=ax, label='Sharpe Ratio')
ax.set_xlabel('Volatility')
ax.set_ylabel('Return')
ax.scatter(max_sr_vol, max_sr_ret, c='red', s=50)  # red dot
plt.show()

In [None]:
# One row per simulated portfolio: weights (scaled by 100) per cluster, plus
# the portfolio's volatility and return as plot coordinates.
#
# The weight columns are labelled with `data_cluster.columns` because the
# weights were drawn in that column order; labelling them with the clusters'
# order of appearance would attach weights to the wrong cluster.
efficient_frontier_cluster = pd.DataFrame(all_weights * 100, columns=data_cluster.columns)
efficient_frontier_cluster['x_coord_vol'] = vol_arr
efficient_frontier_cluster['y_coord_ret'] = ret_arr
efficient_frontier_cluster.head()

In [None]:
# Export the cluster-based portfolio sample to the data directory.
file_name = 'efficient_frontier_kmeans_cluster_withweights.csv'
efficient_frontier_cluster.to_csv(DATA_DIR / file_name)

We now repeat the same procedure using the original AAFM categories.

In [None]:
# Average monthly return per AAFM category, laid out with one row per month
# and one column per category.
#
# `.assign` returns a new frame, so the category column is never written into
# a slice of `data` (the previous column assignment on the `.loc` result
# triggered pandas' SettingWithCopyWarning).
data_aafm = (
    data.loc[:, '2015-01-31':'2021-01-31']
    .assign(aafmCategory=data['aafmCategory'])
    .groupby('aafmCategory')
    .mean()
    .T
)
# Skip the first month.
# NOTE(review): presumably because the first month has no usable return --
# confirm against dva.get_return_data.
data_aafm = data_aafm.iloc[1:]
data_aafm.head()

In [None]:
# Category labels, in the column order of `data_aafm`; used to size and label
# the simulated weight vectors.
aafm_funds=data_aafm.columns

In [None]:
# Create 6000 random portfolios so that we sample almost every portfolio
# condition in terms of volatility and return.
np.random.seed(42)
num_ports = 6000
num_assets = len(data_aafm.columns)
all_weights = np.zeros((num_ports, num_assets))
ret_arr = np.zeros(num_ports)
vol_arr = np.zeros(num_ports)
sharpe_arr = np.zeros(num_ports)

# Loop-invariant inputs, computed once instead of on every iteration.
mean_returns = data_aafm.mean()
annual_cov = data_aafm.cov() * 12  # monthly covariance scaled to 12 months

for x in range(num_ports):
    # Random weights, normalized to sum to 1
    weights = np.random.random(num_assets)
    weights = weights / np.sum(weights)

    # Save weights
    all_weights[x, :] = weights

    # Expected return (monthly mean scaled to 12 months)
    ret_arr[x] = np.sum(mean_returns * weights * 12)

    # Expected volatility: sqrt(w' * Cov * w)
    vol_arr[x] = np.sqrt(np.dot(weights.T, np.dot(annual_cov, weights)))

    # Sharpe ratio (no risk-free rate is subtracted)
    sharpe_arr[x] = ret_arr[x] / vol_arr[x]

In [None]:
# Report the highest Sharpe ratio among the simulated portfolios and the row
# at which it occurs.
top_sharpe = sharpe_arr.max()
top_position = sharpe_arr.argmax()
print(f'Max sharpe ratio in the array: {top_sharpe}')
print(f'Location in the array: {top_position}')

In [None]:
# Weights (scaled by 100) of the max-Sharpe portfolio. The row is looked up
# with argmax rather than a hard-coded index, so it stays correct if the
# simulation inputs, seed or number of portfolios change.
print(all_weights[sharpe_arr.argmax(), :] * 100)

In [None]:
# Return / volatility coordinates of the max-Sharpe portfolio.
max_sr_idx = sharpe_arr.argmax()
max_sr_ret, max_sr_vol = ret_arr[max_sr_idx], vol_arr[max_sr_idx]

### Sample Portfolios Using Original AAFM Categories

In [None]:
# Scatter of all simulated portfolios (volatility vs. return), coloured by
# Sharpe ratio, with the max-Sharpe portfolio marked in red.
fig, ax = plt.subplots(figsize=(12, 8))
points = ax.scatter(vol_arr, ret_arr, c=sharpe_arr, cmap='viridis')
fig.colorbar(points, ax=ax, label='Sharpe Ratio')
ax.set_xlabel('Volatility')
ax.set_ylabel('Return')
ax.scatter(max_sr_vol, max_sr_ret, c='red', s=50)  # red dot
plt.show()

In [None]:
# One row per simulated portfolio: weights (scaled by 100) per AAFM category,
# followed by the portfolio's volatility and return as plot coordinates.
weights_pct = pd.DataFrame(all_weights * 100, columns=aafm_funds)
efficient_frontier_aafm = weights_pct.assign(
    x_coord_vol=vol_arr,
    y_coord_ret=ret_arr,
)
efficient_frontier_aafm.head()

In [None]:
# Export the AAFM-category portfolio sample. This must write
# `efficient_frontier_aafm`; the cluster-based frame has its own CSV and was
# previously written here by mistake.
# NOTE(review): the file name still contains "kmeans"; it is left unchanged in
# case downstream consumers rely on it.
file_name = 'efficient_frontier_kmeans_aafmcat_withweights.csv'
efficient_frontier_aafm.to_csv(DATA_DIR.joinpath(file_name))