# Data analysis
### Load packages

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import os

from datetime import datetime
from scipy.stats import skew
from scipy.stats import kurtosis

### Load data

In [None]:
# Folder containing the CSV inputs read below
Directory = 'C:/.../TFT_for_Stock_Movement_Prediction/data'

# Target and return feature (first CSV column is used as the row index)
CCR = pd.read_csv(
    os.path.join(Directory, 'CCR.csv'),
    index_col=[0],
)

# Portfolio: constituent data has a two-row column header, hence header=[0, 1]
constituents_data = pd.read_csv(
    os.path.join(Directory, 'constituents_data.csv'),
    index_col=[0],
    header=[0, 1],
)
portfolio_table = pd.read_csv(
    os.path.join(Directory, 'portfolio_table.csv'),
    index_col=[0],
)

### Define datasets

In [None]:
# Drop the first row (an unused leading date); all columns are kept
constituents_data = constituents_data.iloc[1:]

# Study periods as row offsets: ten rolling windows of 1000 rows each,
# with consecutive windows starting 250 rows apart
period_b = tuple(range(0, 2500, 250))     # window begin offsets: 0 ... 2250
period_e = tuple(range(1000, 3500, 250))  # window end offsets: 1000 ... 3250

# Split sizes within a window
training_size = 750
test_size = 250
validation_split = 0.2

# Number of training rows left after setting aside the validation share
training_cutoff = int(training_size - validation_split * training_size)

### Plots
#### Portfolio closing price

In [None]:
def _date_to_plotly_ms(date_label):
    """Convert a 'YYYY-MM-DD' index label to epoch milliseconds for ``fig.add_vline``.

    The x position is passed as a number because that is what the original code
    did for annotated vertical lines on the date axis. The naive date is
    converted through ``pd.Timestamp``, whose ``timestamp()`` treats naive
    values as UTC; ``datetime.timestamp()`` would apply the machine's local
    timezone and shift the line by the local UTC offset.
    """
    parsed = datetime.strptime(date_label, '%Y-%m-%d')  # still validates the format
    return pd.Timestamp(parsed).timestamp() * 1000


# Portfolio closing price per date = sum of all constituents' closing prices.
# Computed in one vectorised call instead of writing row by row through
# ``portfolio_closing['Close'].iloc[i] = ...`` (chained assignment, which does
# not update the frame under pandas Copy-on-Write). ``skipna=False`` keeps the
# NaN propagation of the built-in ``sum`` used before.
portfolio_closing = pd.DataFrame(
    {'Close': constituents_data['Close'].sum(axis=1, skipna=False).to_numpy()}
)

fig = go.Figure(
    go.Scatter(
        name='Portfolio',
        line_width=10,
        x=constituents_data.index,
        y=np.log(portfolio_closing['Close'].astype('float')),
        marker=dict(color='cadetblue', size=2),
        showlegend=True,
    )
)

# Shared look of the dashed markers that separate training / validation / test spans
_vline_style = dict(line_width=0.5, line_dash='7', line_color='lightgrey')

# One marker at the start of every test window (window end minus the test size)
for test_number, window_end in enumerate(period_e, start=1):
    fig.add_vline(
        x=_date_to_plotly_ms(constituents_data.index[window_end - test_size]),
        annotation_text=f'Test {test_number}',
        **_vline_style,
    )

# Start of the first validation span
fig.add_vline(
    x=_date_to_plotly_ms(constituents_data.index[period_b[0] + training_cutoff]),
    annotation_text='Val. 1',
    **_vline_style,
)

# Start of the first training span (placed 3 rows in, as in the original layout)
fig.add_vline(
    x=_date_to_plotly_ms(constituents_data.index[period_b[0] + 3]),
    annotation_text='Training 1',
    **_vline_style,
)

# Re-emit every vline annotation with y = 1.02 so the labels sit just above the plot area
fig.update_layout(
    annotations=[{**annotation, 'y': 1.02} for annotation in fig.to_dict()['layout']['annotations']],
    yaxis_title='ln (closing price)',
    xaxis_title='Date',
    plot_bgcolor='white',
    xaxis_range=[constituents_data.index[0], constituents_data.index[-1]],
    showlegend=False,
)
fig.update_xaxes(ticks='outside', showgrid=False, ticklen=5)
fig.update_yaxes(gridwidth=0.5, gridcolor='lightgrey')

# Export images at 10x scale when using the toolbar's "download plot" button
config = {'toImageButtonOptions': {'scale': 10}}
fig.show(config=config)

### Tables
#### Portfolio summary

In [None]:
def _population_moments(returns):
    """Return [standard deviation in %, skewness, kurtosis] of ``returns``.

    Each value is rounded to 4 decimals. ``np.std`` and scipy's ``skew`` /
    ``kurtosis`` are called with their defaults, and ``fisher=False`` selects
    Pearson's definition of kurtosis (no subtraction of 3).
    """
    values = np.asarray(returns, dtype=float)
    return [
        round(np.std(values) * 100, 4),
        round(skew(values), 4),
        round(kurtosis(values, fisher=False), 4),
    ]


# Portfolio sectors, ordered as returned by value_counts()
industry_counts = portfolio_table['Industry'].value_counts()
portfolio_sector = industry_counts.to_frame(name='Number of Stocks')

# Mean daily return per stock: summed CCR divided by the last study-period end.
# skipna=False keeps the NaN propagation of the built-in sum used previously.
symbols = constituents_data['Close'].columns
stock_return = pd.DataFrame(
    [(CCR[symbols].sum(axis=0, skipna=False) / period_e[-1]).to_numpy()],
    index=[0],
    columns=symbols,
)
mean_daily = stock_return.iloc[0].to_numpy(dtype=float)

# Sector of every constituent, looked up by symbol rather than by assuming that
# portfolio_table's row labels coincide with the column positions.
# .loc raises KeyError if a constituent is missing from portfolio_table.
# NOTE(review): assumes 'Symbol' values are unique in portfolio_table - confirm.
stock_sector = portfolio_table.set_index('Symbol')['Industry'].loc[symbols].to_numpy()

# Mean return, standard deviation, skewness and kurtosis by sector (single pass)
sector_rows = []
for sector, n_stocks in industry_counts.items():
    sector_returns = mean_daily[stock_sector == sector]
    # Round once after summing; rounding each stock's share before adding it
    # (as done before) accumulates rounding error in the 4th decimal.
    sector_mean = round(sector_returns.sum() / n_stocks * 100, 4)
    sector_rows.append([sector_mean] + _population_moments(sector_returns))

sector_statistics = pd.DataFrame(
    sector_rows,
    index=industry_counts.index,
    columns=['Mean return', 'Standard deviation', 'Skewness', 'Kurtosis'],
)

# Single-column frames kept under their original names
portfolio_return = sector_statistics[['Mean return']].copy()
portfolio_sd = sector_statistics[['Standard deviation']].copy()
portfolio_skew = sector_statistics[['Skewness']].copy()
portfolio_kurtosis = sector_statistics[['Kurtosis']].copy()

# Portfolio table
portfolio_statistics_table = pd.concat(
    [portfolio_sector, portfolio_return, portfolio_sd, portfolio_skew, portfolio_kurtosis],
    axis=1,
)
portfolio_statistics_table.loc['All'] = (
    [len(portfolio_table), round(mean_daily.sum() / len(mean_daily) * 100, 4)]
    + _population_moments(mean_daily)
)
# Appending the 'All' row can upcast the count column; restore integer dtype
portfolio_statistics_table['Number of Stocks'] = portfolio_statistics_table['Number of Stocks'].astype('int')
portfolio_statistics_table