In [2]:
import numpy as np
import pandas as pd

In [3]:
def clean_data(df):
    """Return a gap-free daily copy of ``df`` with missing values forward-filled.

    The ``Date`` column is parsed to datetimes, the rows are aligned on a
    daily calendar running from the first row's date to the last row's date,
    and every NaN (calendar gaps as well as NaNs already present in the data)
    is replaced by the most recent earlier value of the same column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Date`` column that ``pd.to_datetime`` can parse.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day with a fresh 0..n-1 index, the original
        value columns in their original order and ``Date`` (datetime) as the
        last column. An empty input yields an empty copy.
    """
    if len(df) == 0:
        return df.copy()

    # Work on a copy so the caller keeps its 'Date' column and row order.
    frame = df.copy()
    # Parse before indexing: the join below must compare datetimes with
    # datetimes, not datetimes with the raw strings read from a CSV.
    frame['Date'] = pd.to_datetime(frame['Date'])
    first_day = frame['Date'].iloc[0]
    last_day = frame['Date'].iloc[-1]
    frame = frame.set_index('Date')

    # Left-join the data onto a complete daily calendar, then carry the last
    # known values forward into the days that had no data.
    calendar = pd.DataFrame(index=pd.date_range(start=first_day, end=last_day))
    frame = calendar.join(frame, how='left').ffill()

    # Expose the calendar as a regular 'Date' column again (kept as the last
    # column, as callers of the previous implementation received it).
    frame['Date'] = frame.index
    return frame.reset_index(drop=True)

In [4]:
def clean_data_with_outliers(df, column=None):
    """Fill calendar gaps like ``clean_data`` and flatten spikes in one column.

    Steps:

    1. Parse ``Date``, align the rows on a daily calendar from the first to
       the last row's date and forward-fill every NaN.
    2. Walk through the target column in order. A point whose absolute
       difference to its predecessor exceeds the interquartile range (IQR) of
       the column is treated as an outlier and replaced by the predecessor.
       The very first point is compared with its successor instead and, if
       flagged, replaced by it. The IQR is re-evaluated after every
       replacement, so the threshold always reflects the corrected data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Date`` column that ``pd.to_datetime`` can parse.
        The frame passed in is not modified.
    column : label, optional
        Column to clean. When omitted, the fifth column after ``Date`` has
        been moved to the index is used (``df.columns[4]`` of the indexed
        frame), which is what the previous implementation hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per calendar day with a fresh 0..n-1 index, the value columns
        in their original order and ``Date`` (datetime) as the last column.
        An empty input yields an empty copy.
    """
    if len(df) == 0:
        return df.copy()

    def _iqr(arr):
        # Threshold that decides whether a day-to-day jump is an outlier.
        return np.quantile(arr, q=0.75) - np.quantile(arr, q=0.25)

    # Work on a copy so the caller keeps its 'Date' column.
    frame = df.copy()
    # Parse before indexing so the calendar join compares like with like.
    frame['Date'] = pd.to_datetime(frame['Date'])
    first_day = frame['Date'].iloc[0]
    last_day = frame['Date'].iloc[-1]
    frame = frame.set_index('Date')

    ### Replace missing data by previous dates
    calendar = pd.DataFrame(index=pd.date_range(start=first_day, end=last_day))
    frame = calendar.join(frame, how='left').ffill()

    ### Replace outliers by previous data point
    target = frame.columns[4] if column is None else column
    # Operate on a plain array: chained ``frame[col].iloc[n] = ...`` writes
    # are unreliable in pandas (they may hit a temporary copy).
    values = frame[target].to_numpy(copy=True)
    if len(values) > 1:
        max_diff = _iqr(values)
        # The first point has no predecessor; compare it with its successor.
        if abs(values[0] - values[1]) > max_diff:
            values[0] = values[1]
            max_diff = _iqr(values)
        for n in range(1, len(values)):
            if abs(values[n] - values[n - 1]) > max_diff:
                values[n] = values[n - 1]
                # The data changed, so the threshold has to be refreshed.
                max_diff = _iqr(values)
    frame[target] = values

    # Expose the calendar as a regular 'Date' column again (last column).
    frame['Date'] = frame.index
    return frame.reset_index(drop=True)

In [5]:
def simulate_investment(start_date, end_date, df, interval='monthly', amount=100, costs=0.01, TER=0.01):
    """Simulate a savings plan that invests a fixed amount at a regular interval.

    Starting at ``start_date`` and stepping ``interval`` by ``interval`` (always
    measured from ``start_date``) up to and including ``end_date``, ``amount``
    is invested at the ``Adj Close`` price of that day. Between two investment
    dates the accumulated value is first reduced by the pro-rata TER and then
    scaled by the price change.

    Parameters
    ----------
    start_date, end_date : anything ``pd.to_datetime`` accepts
        First investment date and upper bound for later ones.
    df : pandas.DataFrame
        Price data with the columns ``Date`` and ``Adj Close``. It is copied,
        never modified. If it contains NaNs it is passed through
        ``clean_data`` first.
    interval : {'monthly', 'quarterly'}
        Distance between two investments.
    amount : float
        Money invested at every step.
    costs : float
        Order fee as a fraction of ``amount``, charged on every investment.
    TER : float
        Total expense ratio per year; the share for one interval
        (``TER * months / 12``) is deducted at every step after the first.

    Returns
    -------
    pandas.DataFrame
        Indexed by investment date (index name ``Index``) with the columns
        ``Amount``, ``Price`` and ``Value`` (portfolio value right after the
        investment of that date).

    Raises
    ------
    ValueError
        If ``interval`` is not supported.
    KeyError
        If an investment date has no row in the price data.
    """
    months_per_step = {'monthly': 1, 'quarterly': 3}
    if interval not in months_per_step:
        raise ValueError(
            "Unsupported interval {0!r}; expected one of {1}".format(
                interval, sorted(months_per_step)))
    offset = months_per_step[interval]
    # TER is a yearly rate: charge only the share that belongs to one step.
    TER_period = TER * offset / 12.

    startdate = pd.to_datetime(start_date)
    enddate = pd.to_datetime(end_date)
    df_temp = df.copy()

    nan_total = df_temp.isna().sum().sum()
    if nan_total > 0:
        print("Cleaning dataframe: ", nan_total, " NaNs in total.")
        df_temp = clean_data(df_temp)
    # Make sure the lookup below compares Timestamps with Timestamps even when
    # 'Date' still holds the strings read from a CSV file.
    df_temp['Date'] = pd.to_datetime(df_temp['Date'])
    df_temp = df_temp.set_index('Date')

    ### financial things
    portfolio = {'Index': [], 'Amount': [], 'Price': [], 'Value': []}
    value = 0.
    oldprice = None
    step = 0
    date = startdate
    while date <= enddate:
        ### price at this date
        price = df_temp['Adj Close'].loc[date]
        if oldprice is not None:
            # Fees first, then the market move since the previous investment.
            value -= TER_period * value
            value = price / oldprice * value
        value += amount - amount * costs

        portfolio['Index'].append(date)
        portfolio['Amount'].append(amount)
        portfolio['Price'].append(price)
        portfolio['Value'].append(value)

        ### go to next date
        oldprice = price
        step += 1
        # Always offset from the start date so that month-end clipping
        # (e.g. Jan 31 -> Feb 28) does not drift into later months.
        date = startdate + pd.DateOffset(months=step * offset)

    inv = pd.DataFrame.from_dict(portfolio)
    inv = inv.set_index('Index')
    return inv
    
    ## number of stocks to buy at given price
    #stocks = int(amount/df['Adj Close'])
    ## total money invested at given price
    #invested = stocks * df['Adj Close']
    

In [6]:
def invest_return_yearly(inv):
    """Return the simple (non-compounded) yearly return of a simulation in percent.

    Parameters
    ----------
    inv : pandas.DataFrame
        Result of ``simulate_investment``: dates available as ``Index`` after
        ``reset_index()`` and the columns ``Amount`` and ``Value``.

    Returns
    -------
    float
        ``(final value / total amount invested - 1) / years * 100`` where
        ``years`` is the number of whole days between the first and the last
        entry divided by 365. NaN when first and last entry are less than a
        day apart, because a yearly rate is undefined for a zero-length span.
    """
    expenses = inv['Amount'].sum()
    value = inv['Value'].iloc[-1]
    dates = inv.reset_index()['Index']
    # Timedelta.days gives the whole-day count directly; no string parsing.
    elapsed_days = (dates.iloc[-1] - dates.iloc[0]).days
    if elapsed_days == 0:
        return float('nan')
    years = elapsed_days / 365.
    annual_return = (value / expenses - 1) / years * 100
    return annual_return

In [None]:
def prepare_portfolio_data(mindate, maxdate, portfolio, data_dir="data"):
    """Load, clean and stack the ``Adj Close`` series of every ETF in ``portfolio``.

    For each name in ``portfolio`` the file ``<data_dir>/<name>.csv`` is read,
    passed through ``clean_data_with_outliers`` and cut to the window
    ``mindate`` .. ``maxdate`` (both inclusive).

    Parameters
    ----------
    mindate, maxdate : anything ``pd.to_datetime`` accepts
        Bounds of the window.
    portfolio : sequence of str
        ETF names; each must have a CSV file in ``data_dir``.
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to ``"data"``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(portfolio), number of days in the window)``;
        each row holds the data of one ETF.

    Raises
    ------
    ValueError
        If the cleaned data of an ETF does not provide exactly one value for
        every day of the window.
    """
    mindate = pd.to_datetime(mindate)
    maxdate = pd.to_datetime(maxdate)
    range_days = int((maxdate - mindate).days) + 1
    ### each row represents data of one stock
    num_data = np.zeros((len(portfolio), range_days))
    for n, etf in enumerate(portfolio):
        df = pd.read_csv("{0}/{1}.csv".format(data_dir, etf))
        df = clean_data_with_outliers(df)
        df['Date'] = pd.to_datetime(df['Date'])
        in_window = (df['Date'] >= mindate) & (df['Date'] <= maxdate)
        series = np.array(df.loc[in_window, 'Adj Close'])
        # Fail with a message that names the offending file instead of an
        # anonymous numpy broadcasting error.
        if len(series) != range_days:
            raise ValueError(
                "{0}: expected {1} daily values between {2} and {3}, found {4}".format(
                    etf, range_days, mindate, maxdate, len(series)))
        num_data[n, :] = series
    return num_data

In [7]:
def portfolio_return_risk(stocks, parameters, total_amount):
    """Compute the (return, risk) pair of a weighted portfolio.

    Every stock is run through ``simulate_investment`` (default monthly
    interval, dates synthesised as consecutive days starting 1970-01-01) and
    ``invest_return_yearly``. The portfolio return is the weighted sum of
    these yearly returns; the risk is the weighted variance
    ``sum_n w_n^2 var_n + 2 sum_{m<n} w_n w_m cov_nm`` of the series in
    ``stocks`` (population statistics, i.e. normalised by N).

    Parameters
    ----------
    stocks : np.array of shape (num_stocks, num_datapoints)
        One row of values per stock.
    parameters : array-like of shape (num_stocks, 3)
        One row ``[percentage, costs, TER]`` per stock.
    total_amount : float
        Money invested per interval over all stocks; stock ``n`` receives
        ``percentage[n] * total_amount``.

    Returns
    -------
    tuple
        ``(return, variance)``. When the portfolio holds a single stock, or
        one stock has a percentage of exactly 1, the return and variance of
        that stock alone are returned.
    """
    # Accept plain nested lists as well as arrays.
    parameters = np.asarray(parameters)
    num_stocks = len(stocks)
    percentage = parameters.transpose()[0]

    returns = []
    for n in range(num_stocks):
        time = pd.date_range(start='1970-01-01', periods=len(stocks[n]))
        df = pd.DataFrame({'Date': time, 'Adj Close': stocks[n]})
        amount = parameters[n][0] * total_amount
        costs = parameters[n][1]
        TER = parameters[n][2]
        r = simulate_investment(time.min(), time.max(), df, amount=amount, costs=costs, TER=TER)
        returns.append(invest_return_yearly(r))
    ret = np.array(returns)

    total_return = np.dot(percentage, ret)

    # The covariance matrix does not change inside the loops below, so it is
    # computed once. bias=1 normalises by N, matching np.var.
    cov = np.cov(stocks, bias=1) if num_stocks > 1 else None
    total_var = 0.
    for n in range(num_stocks):
        total_var += percentage[n]**2 * np.var(stocks[n])
        for m in range(n):
            total_var += 2. * percentage[n] * percentage[m] * cov[n][m]

    if num_stocks == 1:
        print("Only one stock is part of portfoilio")
        return_risk = (returns[0], np.var(stocks[0]))
    elif percentage.max() == 1:
        # All weight sits on one stock: report that stock on its own.
        idx = np.where(percentage == 1)[0][0]
        print("Only one stock is part of portfolio, because of stock mix!")
        return_risk = (returns[idx], np.var(stocks[idx]))
    else:
        return_risk = (total_return, total_var)
    return return_risk
        

In [8]:
####
# Portfolio function: receives a list of stocks and computes mean, var and cov for each stock
#    Another function: computes the optimal mixing ratio for a given selection of stocks

In [9]:
def get_lows_highs(df, X):
    """Return the X lowest and the X highest ``Adj Close`` values of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``Adj Close`` column. The frame is not modified.
    X : int
        Number of entries to return on each side.

    Returns
    -------
    tuple of pandas.Series
        ``(low_list, high_list)``. Both keep the original index labels and are
        sorted ascending, so the lowest value comes first in ``low_list`` and
        the highest value comes last in ``high_list``. With ``X == 0`` both
        are empty; with ``X`` larger than the frame both hold every value.
    """
    # Sort a copy of the column instead of reordering the caller's frame.
    ordered = df['Adj Close'].sort_values(ascending=True)
    low_list = ordered.iloc[:X]
    # An explicit start position avoids the ``iloc[-0:]`` trap, which would
    # select the whole series for X == 0.
    high_list = ordered.iloc[max(len(ordered) - X, 0):]

    return (low_list, high_list)

In [10]:
def pearson(stocks):
    """Pearson correlation matrix of several series covering the same dates.

    stocks: np.array of shape (num_stocks, num_datapoints)

    Returns the scalar 1 when only one series is given, otherwise a
    (num_stocks, num_stocks) array whose entry [n][m] is
    cov(n, m) / sqrt(var(n) * var(m)), using population statistics
    (normalised by N) throughout.
    """
    if len(stocks) == 1:
        return 1

    # bias=1 -> covariance normalised by 1/N, consistent with np.var below
    covariance = np.cov(stocks, bias=1)
    variances = np.array([np.var(series) for series in stocks])
    # Entry [n][m] of the outer product is var(n) * var(m).
    scale = np.sqrt(np.outer(variances, variances))
    return covariance / scale

In [None]:
def normalize(stocks):
    """Min-max normalise every series to the range [0, 1], in place.

    stocks: np.array of shape (num_stocks, num_datapoints)

    Each row is shifted by its minimum and divided by its value range. A
    constant row has no range to scale by and is therefore left at all zeros
    instead of being turned into NaNs by a 0/0 division.

    The rows are modified in place (float data is expected, since the
    division is performed in place); the same ``stocks`` object is also
    returned for convenience.
    """
    num_stocks = len(stocks)
    for n in range(num_stocks):
        Min = stocks[n].min()
        Max = stocks[n].max()
        stocks[n] -= Min
        span = Max - Min
        # Skip the division for constant series: 0/0 would yield NaN.
        if span != 0:
            stocks[n] /= span
    return stocks