<a href="https://colab.research.google.com/github/daekee0325/Algorithmic-Trading---Python/blob/main/C4F8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Select the relevant columns (r to w correspond to indices 17 to 22)
columns_to_analyze = ['C4F8_r', 'CF4_r', 'C2F6_r', 'CHF3_r', 'CH2F2_r', 'CH3F_r']

# Correlate every selected column with the 'plasma' column.
# DataFrame.corrwith uses Pearson correlation by default and returns one
# coefficient per column in `columns_to_analyze`.
correlation_results = data[columns_to_analyze].corrwith(data['plasma'])

# Display the results to the user.
# NOTE(review): `ace_tools` appears to exist only in the sandbox where this
# code was generated, not in a regular Colab/Jupyter runtime -- confirm.
# When it cannot be imported, fall back to plain printing so the cell still runs.
try:
    import ace_tools as tools
except ImportError:
    print("Correlation Analysis Results")
    print(correlation_results)
else:
    tools.display_dataframe_to_user("Correlation Analysis Results", correlation_results)


In [None]:
# Calculate the pairwise correlation matrix for columns r to w (C4F8_r to CH3F_r).
# DataFrame.corr uses Pearson correlation by default.
correlation_matrix = data[columns_to_analyze].corr()

# Display the correlation matrix to the user.
# NOTE(review): `ace_tools` appears to exist only in the sandbox where this
# code was generated, not in a regular Colab/Jupyter runtime -- confirm.
# When it cannot be imported, fall back to plain printing so the cell still runs.
try:
    import ace_tools as tools
except ImportError:
    print("Correlation Matrix for Columns r to w")
    print(correlation_matrix)
else:
    tools.display_dataframe_to_user("Correlation Matrix for Columns r to w", correlation_matrix)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Create a 10x8 inch figure with a single axes to draw the heatmap on
heatmap_fig = plt.figure(figsize=(10, 8))
heatmap_ax = heatmap_fig.add_subplot(1, 1, 1)

# Render the correlation matrix: each cell is annotated with its value
# (two decimals), coloured with the diverging 'coolwarm' map, and separated
# from its neighbours by thin lines.
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=.5,
    ax=heatmap_ax,
)

# Title the plot and show it
heatmap_ax.set_title('Correlation Matrix Heatmap for Columns r to w')
plt.show()


In [None]:
import itertools

import numpy as np
import scipy.stats as stats

# NOTE(review): `ace_tools` appears to exist only in the sandbox where this
# code was generated, not in a regular Colab/Jupyter runtime -- confirm.
# Keep using it when present, otherwise fall back to plain printing.
try:
    import ace_tools as tools
except ImportError:
    tools = None


def _show_table(title, table):
    """Display *table* under *title* via ace_tools, or print it when unavailable."""
    if tools is not None:
        tools.display_dataframe_to_user(title, table)
    else:
        print(title)
        print(table)


# Distinct plasma levels to compare. Missing values are dropped first:
# NaN never compares equal to itself, so a NaN level would always select an
# empty group and turn the test statistics into NaN.
plasma_values = data['plasma'].dropna().unique()
if len(plasma_values) < 2:
    raise ValueError(
        "Need at least two distinct non-missing 'plasma' levels to compare means, "
        f"found {len(plasma_values)}"
    )

mean_test_results = {}
# column -> {plasma level -> non-missing observations of that column};
# built once here and reused by the pairwise tests below.
groups_by_column = {}

# Test each column for a difference in means across plasma levels
for column in columns_to_analyze:
    groups = {
        level: data.loc[data['plasma'] == level, column].dropna()
        for level in plasma_values
    }
    groups_by_column[column] = groups
    samples = list(groups.values())

    if len(plasma_values) > 2:
        # More than two levels: one-way ANOVA
        _stat, p_val = stats.f_oneway(*samples)
    else:
        # Exactly two levels: Welch's t-test (no equal-variance assumption)
        _stat, p_val = stats.ttest_ind(*samples, equal_var=False)
    mean_test_results[column] = p_val

# Display the results of the mean tests
mean_test_results_df = pd.DataFrame(list(mean_test_results.items()), columns=['Column', 'P-Value'])

_show_table("Mean Test Results by Plasma Levels", mean_test_results_df)

# If ANOVA is significant (p < 0.05), perform pairwise Welch t-tests.
# NOTE: the pairwise p-values are reported as-is, with no correction for
# multiple comparisons.
pairwise_test_results = {}

if len(plasma_values) > 2:
    significant_columns = mean_test_results_df.loc[mean_test_results_df['P-Value'] < 0.05, 'Column']

    for column in significant_columns:
        groups = groups_by_column[column]
        pairwise_results = {}
        # combinations() yields each unordered pair of levels exactly once
        for level_a, level_b in itertools.combinations(plasma_values, 2):
            _stat, p_val = stats.ttest_ind(groups[level_a], groups[level_b], equal_var=False)
            pairwise_results[f'{level_a} vs {level_b}'] = p_val
        pairwise_test_results[column] = pairwise_results

if pairwise_test_results:
    # Rows are columns of the data set, columns are the compared level pairs
    pairwise_test_results_df = pd.DataFrame(pairwise_test_results).T
    _show_table("Pairwise Mean Test Results by Plasma Levels", pairwise_test_results_df)


In [None]:
# Density histograms of every analyzed column (r to w), one panel per column
# on a 2x3 grid, with a separate coloured outline for each plasma level.

dist_fig = plt.figure(figsize=(15, 10))

for position, column in enumerate(columns_to_analyze, start=1):
    panel = dist_fig.add_subplot(2, 3, position)
    # common_norm=False normalizes each plasma level's density on its own
    sns.histplot(
        data=data,
        x=column,
        hue='plasma',
        element='step',
        stat="density",
        common_norm=False,
        palette="Set2",
        ax=panel,
    )
    panel.set_title(f'Distribution of {column} by Plasma Levels')

dist_fig.tight_layout()
plt.show()


In [None]:
# Same per-column distribution plot as a 2x3 grid: for each column from r to w,
# draw step-style density histograms coloured by plasma level.

# Options shared by every panel; each plasma level is normalized separately.
histogram_options = {
    'hue': 'plasma',
    'element': 'step',
    'stat': "density",
    'common_norm': False,
    'palette': "Set2",
}

plt.figure(figsize=(15, 10))

for slot, column in enumerate(columns_to_analyze, 1):
    current_axes = plt.subplot(2, 3, slot)
    sns.histplot(data=data, x=column, ax=current_axes, **histogram_options)
    current_axes.set_title(f'Distribution of {column} by Plasma Levels')

plt.tight_layout()
plt.show()


In [None]:
# Side-by-side comparison per column: the left panel shows the distribution of
# the full data, the right panel the distribution after `remove_outliers`.
# One grid row per analyzed column, two panels per row.
# NOTE(review): `remove_outliers` is defined outside this cell; presumably it
# returns `data` with the outlier rows for the given column removed -- confirm.

n_rows = len(columns_to_analyze)
outlier_fig = plt.figure(figsize=(15, 15))

# Histogram settings shared by both panels of every row
hist_kwargs = {
    'hue': 'plasma',
    'element': 'step',
    'stat': "density",
    'common_norm': False,
    'palette': "Set2",
}

for row, column in enumerate(columns_to_analyze):
    # Left panel: all observations
    left_ax = outlier_fig.add_subplot(n_rows, 2, 2 * row + 1)
    sns.histplot(data=data, x=column, ax=left_ax, **hist_kwargs)
    left_ax.set_title(f'{column} Distribution with Outliers')

    # Right panel: observations left after outlier removal for this column
    right_ax = outlier_fig.add_subplot(n_rows, 2, 2 * row + 2)
    data_no_outliers = remove_outliers(data, column)
    sns.histplot(data=data_no_outliers, x=column, ax=right_ax, **hist_kwargs)
    right_ax.set_title(f'{column} Distribution without Outliers')

outlier_fig.tight_layout()
plt.show()


In [None]:
# Performing MANOVA with plasma as the independent variable and columns r to w as dependent variables

# Preparing the data.
# `X` and `y` are kept in case later cells reference them; the model itself is
# now built from a formula (see below).
X = data[['plasma']]
y = data[columns_to_analyze]

# Performing MANOVA.
# MANOVA(endog=..., exog=...) does not add an intercept and would treat the raw
# 'plasma' column as a single numeric regressor, which does not test for
# differences between plasma groups. The formula interface adds the intercept
# automatically, and C(plasma) encodes plasma as a categorical factor, matching
# how the other cells group observations by plasma level.
# NOTE(review): `MANOVA` is not imported in this cell; it is assumed to come
# from statsmodels.multivariate.manova via an earlier cell -- confirm.
dependent_terms = ' + '.join(columns_to_analyze)
manova_formula = f"{dependent_terms} ~ C(plasma)"
manova = MANOVA.from_formula(manova_formula, data=data)
manova_results = manova.mv_test()

manova_results
