In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Read every input file and keep only its 'Target Property' column.
# Each name below ends up bound to a pandas Series (one column), not to the
# full DataFrame that read_csv returns.
data1 = pd.read_csv('F1_collaborative_variable1.csv')['Target Property']
data2 = pd.read_csv('F1_collaborative_variable2.csv')['Target Property']
data3 = pd.read_csv('F1_collaborative_variable3.csv')['Target Property']
data4 = pd.read_csv('F1_collaborative_variable4.csv')['Target Property']
target_data = pd.read_csv('F1_target_variable.csv')['Target Property']

# --------------------------
# Data Preprocessing
# --------------------------

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# One shared instance of each scaler. Every fit_transform call below re-fits
# the instance, so once this section has run both scalers hold the fit of the
# last column processed (the target).
scaler = StandardScaler()
min_max_scaler = MinMaxScaler()

# Standardize each column. The Series values are reshaped to a single-column
# 2-D array before being handed to the scaler. Columns are processed in the
# order variable 1..4, then target.
(
    data1_standardized,
    data2_standardized,
    data3_standardized,
    data4_standardized,
    target_data_standardized,
) = [
    scaler.fit_transform(column.values.reshape(-1, 1))
    for column in (data1, data2, data3, data4, target_data)
]

# Min-max normalize the standardized columns, in the same order.
(
    data1_normalized,
    data2_normalized,
    data3_normalized,
    data4_normalized,
    target_data_normalized,
) = [
    min_max_scaler.fit_transform(standardized)
    for standardized in (
        data1_standardized,
        data2_standardized,
        data3_standardized,
        data4_standardized,
        target_data_standardized,
    )
]

# --------------------------
# Pearson Correlation Coefficients
# --------------------------

# Pearson correlation of every variable against the target, computed on the
# standardized columns. np.corrcoef returns a 2x2 matrix for two inputs; the
# off-diagonal entry [0, 1] is the variable-vs-target coefficient.
correlation1, correlation2, correlation3, correlation4 = (
    np.corrcoef(standardized.ravel(), target_data_standardized.ravel())[0, 1]
    for standardized in (
        data1_standardized,
        data2_standardized,
        data3_standardized,
        data4_standardized,
    )
)

# Report the standardized-data coefficients, one per line.
print(
    f'Correlation between Variable 1 and Target: {correlation1}',
    f'Correlation between Variable 2 and Target: {correlation2}',
    f'Correlation between Variable 3 and Target: {correlation3}',
    f'Correlation between Variable 4 and Target: {correlation4}',
    sep='\n',
)

# Same coefficient on the untouched Series (no standardization/normalization).
correlation1_raw, correlation2_raw, correlation3_raw, correlation4_raw = (
    np.corrcoef(raw_column, target_data)[0, 1]
    for raw_column in (data1, data2, data3, data4)
)

# Report the raw-data coefficients; the first line starts with a blank line
# to separate it from the previous report.
print(
    f'\nRaw data correlation between Variable 1 and Target: {correlation1_raw}',
    f'Raw data correlation between Variable 2 and Target: {correlation2_raw}',
    f'Raw data correlation between Variable 3 and Target: {correlation3_raw}',
    f'Raw data correlation between Variable 4 and Target: {correlation4_raw}',
    sep='\n',
)

# --------------------------
# QQ-Plot (Normality Check)
# --------------------------

# Draw one normal QQ plot per standardized variable on a 2x2 grid to check
# visually whether each variable follows a normal distribution.
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# axs.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1],
# which matches Variable 1..4.
qq_columns = (
    data1_standardized,
    data2_standardized,
    data3_standardized,
    data4_standardized,
)
for qq_number, (qq_ax, qq_values) in enumerate(zip(axs.flat, qq_columns), start=1):
    stats.probplot(qq_values.flatten(), dist="norm", plot=qq_ax)
    qq_ax.set_title(f'QQ Plot of Variable {qq_number}')

# Apply the layout BEFORE saving: tight_layout only affects what is drawn
# after it is called, so saving first would write the un-adjusted layout to
# disk while the notebook shows the adjusted one.
fig.tight_layout()
fig.savefig('QQ Plot.png')
plt.show()

# --------------------------
# Pearson and Spearman Correlation Coefficients (Final)
# --------------------------

# The four standardized variable columns, each compared against the
# standardized target below.
# Pearson coefficient: off-diagonal entry of the 2x2 matrix from np.corrcoef.
pearson_corr1, pearson_corr2, pearson_corr3, pearson_corr4 = (
    np.corrcoef(standardized.ravel(), target_data_standardized.ravel())[0, 1]
    for standardized in (
        data1_standardized,
        data2_standardized,
        data3_standardized,
        data4_standardized,
    )
)

# Report the Pearson coefficients, one per line.
print(
    f'Pearson correlation between Variable 1 and Target: {pearson_corr1}',
    f'Pearson correlation between Variable 2 and Target: {pearson_corr2}',
    f'Pearson correlation between Variable 3 and Target: {pearson_corr3}',
    f'Pearson correlation between Variable 4 and Target: {pearson_corr4}',
    sep='\n',
)

# Spearman rank coefficient: element [0] of the spearmanr result is the
# correlation (element [1], the p-value, is discarded here).
spearman_corr1, spearman_corr2, spearman_corr3, spearman_corr4 = (
    stats.spearmanr(standardized.ravel(), target_data_standardized.ravel())[0]
    for standardized in (
        data1_standardized,
        data2_standardized,
        data3_standardized,
        data4_standardized,
    )
)

# Report the Spearman coefficients; the leading newline separates this report
# from the Pearson one.
print(
    f'\nSpearman correlation between Variable 1 and Target: {spearman_corr1}',
    f'Spearman correlation between Variable 2 and Target: {spearman_corr2}',
    f'Spearman correlation between Variable 3 and Target: {spearman_corr3}',
    f'Spearman correlation between Variable 4 and Target: {spearman_corr4}',
    sep='\n',
)

# --------------------------
# Hexbin Plots (Correlation Visualization)
# --------------------------

# Hexbin density plots of each raw variable against the raw target on a 2x2
# grid. Each panel's title carries the Spearman coefficient computed earlier.
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# (axes, x-values, Spearman coefficient, axis/title label) for each panel.
hexbin_panels = (
    (axs[0, 0], data1, spearman_corr1, 'Variable 1'),
    (axs[0, 1], data2, spearman_corr2, 'Variable 2'),
    (axs[1, 0], data3, spearman_corr3, 'Variable 3'),
    (axs[1, 1], data4, spearman_corr4, 'Variable 4'),
)

hexbin_artists = []
for hb_ax, hb_values, hb_rho, hb_label in hexbin_panels:
    hb_artist = hb_ax.hexbin(hb_values, target_data, gridsize=50, cmap='Blues')
    hb_ax.set_title(f'{hb_label} vs Target (Spearman Correlation: {hb_rho:.2f})')
    hb_ax.set_xlabel(hb_label)
    hb_ax.set_ylabel('Target')
    # One colorbar per panel, attached to that panel's axes.
    fig.colorbar(hb_artist, ax=hb_ax, label='Count')
    hexbin_artists.append(hb_artist)

# Keep the per-panel handles available under their original names.
hb1, hb2, hb3, hb4 = hexbin_artists

# Apply the layout BEFORE saving: tight_layout only affects what is drawn
# after it is called, so saving first would write the un-adjusted layout
# (possibly overlapping titles/labels/colorbars) to disk.
fig.tight_layout()
fig.savefig('Hexagonal Bin Plot.png')
plt.show()
