# Imports and Configurations 

In [None]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt
import matplotlib as mpl 
import seaborn as sns 

In [None]:
# Show floats with two decimal places wherever pandas renders them.
pd.set_option("display.float_format", "{:.2f}".format)

# Data 

In [None]:
# Load the mutual-fund dataset; the path is relative to the notebook's working directory.
# (The file name suggests it was cleaned upstream — that step is not part of this notebook.)
mut_funds = pd.read_csv("./Data/MutualFunds_Cleaned.csv")

In [None]:
# Preview the first three rows as a quick sanity check of the load.
mut_funds.head(3)

Unnamed: 0,fund_symbol,inception_date,fund_net_annual_expense_ratio,fund_yield,fund_return_3years,fund_r_squared_3years,fund_standard_deviation_3years,fund_sharpe_ratio_3years,fund_treynor_ratio_3years,fund_alpha_3years,fund_beta_3years,fund_mean_annual_return_3years,rating,risk_rating,return_rating,investment_type,size_type,median_market_cap,net_asset_value,fund_return_ytd
0,AAAAX,2007-07-29,1.22,1.22,3.03,85.8,12.46,0.18,1.32,-2.51,1.09,0.31,3.0,3.0,3.0,Blend,Large,18592.11,697910000.0,-7.6
1,AAADX,2011-12-29,1.43,2.8,9.71,87.23,8.46,0.93,6.82,0.17,1.17,0.8,4.0,5.0,5.0,Blend,Large,168478.91,89380000.0,0.3
2,AAAGX,1999-10-28,1.12,0.0,19.44,90.82,19.81,0.92,16.87,7.68,1.06,1.65,3.0,4.0,3.0,Growth,Large,340488.16,1490000000.0,24.2


In [None]:
# Inspect column dtypes: object columns are either dropped or encoded further down.
mut_funds.dtypes

fund_symbol                        object
inception_date                     object
fund_net_annual_expense_ratio     float64
fund_yield                        float64
fund_return_3years                float64
fund_r_squared_3years             float64
fund_standard_deviation_3years    float64
fund_sharpe_ratio_3years          float64
fund_treynor_ratio_3years         float64
fund_alpha_3years                 float64
fund_beta_3years                  float64
fund_mean_annual_return_3years    float64
rating                            float64
risk_rating                       float64
return_rating                     float64
investment_type                    object
size_type                          object
median_market_cap                 float64
net_asset_value                   float64
fund_return_ytd                   float64
dtype: object

In [None]:
# Work on a copy without the identifier and inception-date columns.
df = mut_funds.drop(columns=["fund_symbol", "inception_date"])

# Initial Data Transformations 

## Remove Outliers

In [None]:
def reject_outliers(data, n_std=3):
    """Keep only the values that lie within ``n_std`` standard deviations of the mean.

    The mean and standard deviation come from ``data.mean()`` and ``data.std()``,
    i.e. per column when ``data`` is a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Numeric data to filter.
    n_std : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        For a DataFrame the shape is preserved and every out-of-band cell is
        replaced by NaN; for a Series the out-of-band entries are removed.
    """
    u = data.mean()
    s = data.std()
    lower = u - n_std * s
    upper = u + n_std * s

    # Inclusive bounds: with strict inequalities a zero-variance column
    # (std == 0, lower == upper == mean) would have every value rejected.
    within_band = (data >= lower) & (data <= upper)
    return data[within_band]

def reject_outliers_for_pipe(data, n_std=3):
    """Mask outliers in the numeric columns of a mixed-dtype frame.

    Intended for ``DataFrame.pipe``: takes a frame and returns a new frame of the
    same shape and column order. Non-numeric columns are passed through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    n_std : float, default 3
        Half-width of the accepted band, in standard deviations of each column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which every numeric cell outside
        ``mean ± n_std * std`` of its column is replaced by NaN.
    """
    df = data.copy()
    numeric_columns = df.select_dtypes([np.number]).columns
    if numeric_columns.empty:
        # Nothing to filter.
        return df

    df_num = df[numeric_columns]

    # Statistics are taken on the numeric subset only; calling mean()/std()
    # on the full frame would involve the non-numeric columns.
    u = df_num.mean()
    s = df_num.std()

    # Inclusive bounds so that a zero-variance column is kept rather than wiped out.
    within_band = (df_num >= u - n_std * s) & (df_num <= u + n_std * s)
    df_num_filtered = df_num.where(within_band)

    # Replace the numeric columns wholesale (integer columns become float where NaN appears).
    df[numeric_columns] = df_num_filtered

    return df

In [None]:
# Numeric columns only — the outlier filter is applied to these.
df_num = df.select_dtypes(include=[np.number])

In [None]:
# Apply the outlier filter to the numeric frame; because the input is a DataFrame,
# rejected cells come back as NaN and the shape is unchanged.
df_num_outliers_nan = reject_outliers(df_num)

In [None]:
# Shape before vs. after dropping every row that holds a NaN. In the recorded run
# 22689 - 20977 = 1712 rows are removed (rows with at least one masked outlier,
# plus any rows that already had a missing numeric value).
print(df_num.shape,df_num_outliers_nan.dropna(axis=0).shape)

(22689, 16) (20977, 16)


In [None]:
# Write the masked values back over the numeric columns, then discard every row
# that contains a NaN and renumber the index from 0.
numeric_columns = df.select_dtypes(include=[np.number]).columns
df.loc[:, numeric_columns] = df_num_outliers_nan
df = df.dropna(axis=0, how="any").reset_index(drop=True)

## Numeric Transformations 

In [None]:
# Log-transform the positively skewed variables and replace the raw columns.
# Zeros are swapped for 1 beforehand so log(0) = -inf never occurs (they map to 0).
# NOTE(review): this makes 0 indistinguishable from 1 and ranks it above values
# in (0, 1) — confirm that is acceptable for fund_yield.
df = df.assign(
    net_asset_value_log=np.log(df.net_asset_value.replace(0, 1)),
    fund_yield_log=np.log(df.fund_yield.replace(0, 1)),
).drop(columns=['net_asset_value', 'fund_yield'])

# Data Pre-Processing

## Encode Categorical Variables

In [None]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder

In [None]:
# Size type: Large, .. -> Ordinal
# NOTE(review): without an explicit `categories=` argument OrdinalEncoder numbers
# the categories in sorted order, not by fund size — confirm the resulting order
# is the intended one.
ord_enc = OrdinalEncoder()

# The encoder expects a 2-D input, hence the single-column reshape.
df["size_type"] = ord_enc.fit_transform(df["size_type"].to_numpy().reshape(-1, 1))

In [None]:
# Investment type: Blend, .. -> Dummy
one_enc = OneHotEncoder()

# Dense indicator matrix, one column per category learned by the encoder.
investment_onehot = one_enc.fit_transform(df[['investment_type']]).toarray()

# Name the columns after the categories the encoder actually found (instead of a
# hard-coded 0/1/2 -> name mapping) and reuse df's index so the join aligns rows by label.
dummies = pd.DataFrame(
    investment_onehot,
    columns=one_enc.categories_[0],
    index=df.index,
)
df = df.join(dummies).drop('investment_type', axis=1)

## Normalization

In [None]:
# DataFrame -> NumPy arrays: target (y) and feature matrix (X)

y_ser = df["fund_return_ytd"]
y = y_ser.to_numpy()

# Everything except the target is a feature; keep the names for later reference.
X_df = df.drop(columns="fund_return_ytd")
X = X_df.to_numpy()
feature_names = X_df.columns.tolist()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max rescale every feature column.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set minima/maxima influence the scaling — consider fitting on the training
# split only.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

## Data Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=41,
)

# Model Fitting

## Simple Linear Model

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: linear regression with default settings.
reg = LinearRegression()

# Fit on the training split; as the cell's last expression it also displays the fitted estimator.
reg.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# Evaluate on the held-out test split (for scikit-learn regressors, `score` is R^2).
reg.score(X_test,y_test)

0.8213772828078421

## Ridge

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with regularisation strength alpha=1.
ridge_reg = Ridge(alpha=1)

# Fit on the same training split as the plain linear model so the scores are comparable.
ridge_reg.fit(X_train,y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [None]:
# Evaluate on the held-out test split (for scikit-learn regressors, `score` is R^2).
ridge_reg.score(X_test,y_test)

0.8215152442564434