# Simple correlation analysis of variables

## We perform this analysis to decide which variables to keep

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Directory holding the three time-series CSV files. Change this single value
# to point the notebook at another copy of the data.
DATA_DIR = '/Users/arielguerra/Downloads/data-3/timeseries/2015'


def _load_sorted(csv_name):
    """Read `csv_name` from DATA_DIR and return it ordered by week, then weekday."""
    frame = pd.read_csv(f'{DATA_DIR}/{csv_name}')
    return frame.sort_values(by=['week', 'weekday'])


external_df = _load_sorted('external.csv')
global_df = _load_sorted('global.csv')
by_actor_df = _load_sorted('by_actor.csv')

In [None]:
# Index every frame by its 'date' column so the frames can be aligned on it.
# The membership check keeps this cell safe to re-run: once 'date' has become
# the index it is no longer a column, and an unconditional set_index would
# raise KeyError on the second execution.
for frame in (external_df, global_df, by_actor_df):
    if 'date' in frame.columns:
        frame.set_index('date', inplace=True)

# Keep only the variables of interest from each source.
external_relevant = external_df[['PriceUSD', 'HashRate']]
global_relevant = global_df[['nb_transactions', 'total_received_satoshi', 'total_sent_satoshi', 'mean_feeUSD']]
by_actor_relevant = by_actor_df[['received', 'spent', 'nb_transactions', 'self_spent']]

# Sum all by-actor rows that share a date into a single row per date.
# 'nb_transactions' is also selected from the global frame, so the by-actor
# total is renamed; otherwise the merged frame would carry two columns with
# the same label and they could not be told apart.
by_actor_aggregated = (
    by_actor_relevant
    .groupby('date')
    .sum()
    .rename(columns={'nb_transactions': 'nb_transactions_by_actor'})
)

# Place the three frames side by side, aligned on the date index.
merged_df = pd.concat([external_relevant, global_relevant, by_actor_aggregated], axis=1)

In [ ]:
# Human-readable axis labels for the heatmap, keyed by raw column name.
styled_column_names = {
    'PriceUSD': 'Price USD',
    'HashRate': 'Hash Rate',
    'nb_transactions': 'Transactions',
    'total_received_satoshi': 'Total Received Satoshi',
    'total_sent_satoshi': 'Total Sent Satoshi',
    'mean_feeUSD': 'Mean Fee USD',
    'received': 'Total Received',
    'spent': 'Total Spent',
    'self_spent': 'Self Spent',
}

# Columns without an entry above keep their raw name; rename() ignores keys
# that do not match any column, so re-running this cell is harmless.
merged_df.rename(columns=styled_column_names, inplace=True)

# Pairwise correlation between all columns (DataFrame.corr defaults to Pearson).
correlation_matrix_styled = merged_df.corr()

plt.figure(figsize=(12, 8))
# Correlations live in [-1, 1]. Pinning the colour scale to that range puts the
# neutral midpoint of the diverging 'coolwarm' palette at zero correlation, so
# positive and negative values are shaded symmetrically.
sns.heatmap(
    correlation_matrix_styled,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)
plt.title('Correlation Matrix of Bitcoin Dataset Variables')
plt.show()