In [None]:
import pandas as pd
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import seaborn as sns
import matplotlib.pyplot as plt

# Parse 'date' while loading: without parse_dates the column stays a plain
# string column and matplotlib treats every row as its own category on the
# x-axis of the time-series plots instead of drawing a datetime axis.
df = pd.read_csv('../cleandata/processed_data.csv', parse_dates=['date'])

# Log return of the 'adjusted' column: ln(adjusted_t / adjusted_{t-1}).
df["log_return"] = np.log(df['adjusted'] / df['adjusted'].shift(1))
# The first row has no predecessor, so its log return is NaN. Rows with a NaN
# in any column are dropped. Rebinding (rather than inplace=True) keeps the
# statement a plain expression with no hidden mutation.
df = df.dropna()

df.head()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
# Summary statistics, shown as the cell's rich output.
df.describe()

In [None]:
# Reshape the four sentiment columns into long format (one row per
# column/value pair) so the value counts of every column fit in one chart.
sentiment_cols = ["sentiment_negative", "sentiment_neutral", "sentiment_none", "sentiment_positive"]
df_long = pd.melt(df[sentiment_cols], var_name='variable', value_name='value')

sns.countplot(x='variable', hue='value', data=df_long)
plt.title("Distribuce sentiment proměnných")

In [None]:
fig, ax = plt.subplots(figsize=(12,6))

cols1 = ['adjusted', 'volume', 'vix', 'g_trends', 'bull_bear_spread_surv']
# Z-score every series (subtract its mean, divide by its standard deviation)
# so columns on very different scales can share one y-axis.
df_norm1 = df[cols1].apply(lambda x: (x - x.mean()) / x.std())
for col in cols1:
    ax.plot(df['date'], df_norm1[col], label=col)
ax.legend()
ax.set_title('Porovnání časových řad (skupina 1)')
# The x-axis label was missing here although the group 2 figure has one.
ax.set_xlabel('Datum')
ax.set_ylabel('Standardizovaná hodnota')

In [None]:
cols2 = [
    'adjusted', 'sma_20', 'sma_50', 'ema_20', 'basic_volatility', 'atr',
    'macd', 'macd_signal', 'bb_up', 'bb_dn', 'obv', 'adx',
]
# Z-score all selected columns at once: subtract the column means and divide
# by the column standard deviations.
group2 = df[cols2]
df_norm2 = (group2 - group2.mean()) / group2.std()

fig, ax = plt.subplots(figsize=(12,6))
for col in cols2:
    ax.plot(df['date'], df_norm2[col], label=col)
ax.legend()
ax.set_title('Porovnání časových řad (skupina 2)')
ax.set_xlabel('Datum')
ax.set_ylabel('Standardizovaná hodnota')

In [None]:
# Distribution of the predicted variable (log_return) next to the series in
# levels (adjusted): one histogram with a KDE overlay per panel.
fig, ax = plt.subplots(2, 1, figsize=(10, 6))

panels = [("adjusted", "Adjusted"), ("log_return", "Log return")]
for panel, (column, title) in zip(ax, panels):
    sns.histplot(df[column], kde=True, ax=panel)
    panel.set_title(title)

plt.tight_layout()

In [None]:
# Regression target.
y = df["log_return"]

# Class balance of the classification target: 1 where the log return is
# strictly greater than 0.005, otherwise 0.
y_clf = y.gt(0.005).astype(int)
sns.countplot(x=y_clf)
plt.show()

In [None]:
# Autocorrelation (top panel) and partial autocorrelation (bottom panel) of
# the 'adjusted' series for lags up to 40.
fig, ax = plt.subplots(2, 1, figsize=(10,6))
for panel, (label, plot_fn) in zip(ax, [("ACF", plot_acf), ("PACF", plot_pacf)]):
    plot_fn(df["adjusted"], ax=panel, lags=40)
    panel.set_title(f"{label} pro adjusted")

plt.tight_layout()

In [None]:
# Autocorrelation and partial autocorrelation of the log-return target `y`
# for lags up to 40, stacked in two panels.
fig, ax = plt.subplots(2, 1, figsize=(10,6))
acf_ax, pacf_ax = ax

plot_acf(y, lags=40, ax=acf_ax)
acf_ax.set_title("ACF pro log_return")

plot_pacf(y, lags=40, ax=pacf_ax)
pacf_ax.set_title("PACF pro log_return")

plt.tight_layout()

In [None]:
# Feature correlations. Cells whose absolute correlation is below 0.5 are
# masked out, so only the pairs with |r| >= 0.5 are drawn.
feature_corr = df.drop(columns=["date", "adjusted"]).corr()
mask = feature_corr.abs() < 0.5
sns.heatmap(feature_corr, mask=mask, cmap="YlOrRd")


In [None]:
# Unmasked correlation matrix of every column except 'date' and 'adjusted'.
corr_all = df.drop(columns=["date", "adjusted"]).corr()
sns.heatmap(corr_all, cmap="YlOrRd")

In [None]:
# Correlation of each x (lagged feature) with the current y (log return)
# -> potential predictive power.

# Columns of df that are turned into lagged predictors when X is built.
features = [
    'vix',
    'sentiment_neutral',
    'sentiment_positive',
    'sentiment_negative',
    'g_trends',
    'bull_bear_spread_surv',
    'volume',
    'sma_20',
    'sma_50',
    'ema_20',
    'basic_volatility',
    'atr',
    'rsi',
    'macd',
    'macd_signal',
    'bb_up',
    'bb_dn',
    'obv',
    'stochrsi',
    'adx',
]

def create_lags(lags, data=None, columns=None):
    """Build a frame holding lagged copies of the feature columns.

    For every lag ``l`` in ``1..lags`` and every feature column ``col`` the
    result contains a column ``f"{col}_lag{l}"`` equal to ``col`` shifted down
    by ``l`` rows. Columns are ordered lag-major (all features at lag 1, then
    all features at lag 2, ...). Rows containing any NaN are removed, which
    includes the leading rows that have no lagged value.

    Parameters
    ----------
    lags : int
        Largest lag to generate.
    data : pandas.DataFrame, optional
        Frame to take the columns from. Defaults to the notebook-level ``df``.
    columns : sequence of str, optional
        Columns to lag. Defaults to the notebook-level ``features`` list.

    Returns
    -------
    pandas.DataFrame
        Lagged features, indexed like the surviving rows of ``data``.
    """
    source = df if data is None else data
    names = features if columns is None else columns

    # Collect every shifted column first and build the frame in a single
    # constructor call. Inserting hundreds of columns one at a time fragments
    # the frame and can trigger pandas' PerformanceWarning.
    lagged = {
        f'{col}_lag{l}': source[col].shift(l)
        for l in range(1, lags + 1)
        for col in names
    }
    return pd.DataFrame(lagged).dropna()

lags = 10

X = create_lags(lags)
features_names = X.columns

# Restrict the targets to the rows that survived the lagging. Both targets are
# rebuilt from df and selected by X's index instead of slicing the existing
# `y` positionally: the old `y = y[lags:]` trimmed `y` again on every re-run of
# this cell, silently misaligning it with X.
y = df["log_return"].loc[X.index]
y_clf = (y > 0.005).astype(int)


In [None]:
# Correlation of every lagged feature with the log-return target, ordered
# from the strongest to the weakest absolute correlation.
corrs = X.corrwith(y)
corrs_sorted = corrs.sort_values(key=abs, ascending=False)
print(corrs_sorted.head(20))

# Persist the full ranking.
corrs_sorted.to_csv("python/plots_tabs/correlations_log_return.csv")

# Bar chart of the ten strongest correlations.
top_corrs = corrs_sorted.iloc[:10]
sns.barplot(top_corrs)
plt.ylabel("Correlation with y")
plt.show()

In [None]:
# Same ranking as for the regression target, but against the binary
# classification target y_clf.
corrs = X.corrwith(y_clf)
corrs_sorted = corrs.sort_values(key=abs, ascending=False)
print(corrs_sorted.head(20))

# Persist the full ranking.
corrs_sorted.to_csv("python/plots_tabs/correlations_log_return_clf.csv")

# Bar chart of the ten strongest correlations.
sns.barplot(corrs_sorted.head(10))
plt.ylabel("Correlation with y_clf")
plt.show()

In [None]:
# Pairwise scatter plots of the first five non-date columns.
no_date = df.drop(columns="date")
sns.pairplot(no_date, vars=no_date.columns[:5], height=2)

In [None]:
# Pairwise scatter plots of non-date columns at positions 5 through 9.
no_date = df.drop(columns="date")
sns.pairplot(no_date, vars=no_date.columns[5:10], height=2)

In [None]:
# Pairwise scatter plots of non-date columns at positions 15 through 20.
# NOTE(review): positions 10-14 are not covered by any pairplot in this
# notebook -- confirm whether that gap is intentional.
no_date = df.drop(columns="date")
sns.pairplot(no_date, vars=no_date.columns[15:21], height=2)

In [None]:
# Feature importance on the first window
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Number of leading rows that form the first window (previously the bare
# literal 2630 repeated in every slice).
FIRST_WINDOW = 2630
X_first = X.iloc[:FIRST_WINDOW]

# --- Regression forest on the log-return target ---
rfr = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
rfr.fit(X_first, y.iloc[:FIRST_WINDOW])

rfr_importances = pd.Series(rfr.feature_importances_, index=features_names)
# Keep only features the forest actually used (importance > 0), largest first.
rfr_importances_used = rfr_importances[rfr_importances > 0].sort_values(ascending=False)

# --- Classification forest on the binary target ---
rfc = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
rfc.fit(X_first, y_clf.iloc[:FIRST_WINDOW])

rfc_importances = pd.Series(rfc.feature_importances_, index=features_names)
# Keep only features the forest actually used (importance > 0), largest first.
# Fix: this previously indexed rfr_importances with the classifier mask, so the
# "classifier" ranking contained the regressor's importance values.
rfc_importances_used = rfc_importances[rfc_importances > 0].sort_values(ascending=False)



In [None]:
# Bar chart of the 20 largest regression-forest feature importances.
top_rfr = rfr_importances_used.head(20)

plt.figure(figsize=(25, 6))
sns.barplot(top_rfr)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("First Window Feature Importances By Regression")
plt.show()


In [None]:
# Bar chart of the 20 largest classification-forest feature importances.
plt.figure(figsize=(25, 6))
sns.barplot(rfc_importances_used[:20])
# The title now names the model, matching the "By Regression" figure; before,
# the title did not say which model this figure shows.
plt.title("First Window Feature Importances By Classification")
plt.ylabel("Importance")
plt.xlabel("Feature")
plt.show()
