In [15]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import numpy as np
import matplotlib.ticker as ticker
from scipy.stats import kendalltau
from scipy.stats import pearsonr
import dcor

In [2]:
def calculate_correlation_and_pvalue(dataframe, simlex999, standard_deviation, alpha=0.05):
    """Compute Spearman's rank correlation between two columns of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame or similar
        Object indexable by column label; the two selected columns are
        passed to ``scipy.stats.spearmanr``.
    simlex999 : hashable
        Label of the first column.
    standard_deviation : hashable
        Label of the second column.
    alpha : float, default 0.05
        Significance threshold. The result is labelled ``'significant'``
        only when the p-value is strictly below this value.

    Returns
    -------
    dict
        ``'simlex999'`` and ``'standard_deviation'`` echo the two column
        labels, ``'correlation'`` is the coefficient rounded to 5 decimals,
        ``'p_value'`` is the unrounded p-value, and ``'significance'`` is
        either ``'significant'`` or ``'not significant'``.
    """
    corr, p_value = spearmanr(dataframe[simlex999], dataframe[standard_deviation])
    # A NaN p-value compares False against alpha, so it is labelled
    # 'not significant' rather than raising.
    significance = 'significant' if p_value < alpha else 'not significant'
    return {
        'simlex999': simlex999,
        'standard_deviation': standard_deviation,
        'correlation': round(corr, 5),
        'p_value': p_value,
        'significance': significance
    }

In [None]:
def plot_scatter_plot(dataframe, column1, column2, x_label, y_label, x_min, x_max, y_min, y_max):
    """Display a scatter plot of two columns with a dashed reference line.

    ``column1`` is plotted on the x-axis and ``column2`` on the y-axis of a
    7x5 figure. A dashed red line is drawn from ``(x_min, y_min)`` to
    ``(x_max, y_max)``, the axes are clamped to those same bounds, and the
    axis labels are set from ``x_label`` / ``y_label``. No title is set.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    figure, axes = plt.subplots(figsize=(7, 5))

    # Data points.
    sns.scatterplot(
        data=dataframe,
        x=column1,
        y=column2,
        color='skyblue',
        edgecolor='black',
        ax=axes,
    )

    # Reference line joining the two corners given by the axis bounds.
    start, end = (x_min, y_min), (x_max, y_max)
    axes.plot([start[0], end[0]], [start[1], end[1]], color='red', linestyle='--')

    # Viewport matches the bounds used for the reference line.
    axes.set_xlim(x_min, x_max)
    axes.set_ylim(y_min, y_max)

    # Axis captions.
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)

    # No grid lines.
    axes.grid(False)

    figure.tight_layout()
    plt.show()

In [4]:
def print_correlation_and_pvalue(dataframe, column1, column2):
    """Print Spearman's rank correlation between two columns.

    Prints the coefficient rounded to 5 decimals, the raw p-value, and a
    sentence stating whether the p-value is below 0.05. Returns ``None``.
    """
    rho, pval = spearmanr(dataframe[column1], dataframe[column2])
    rounded_rho = round(rho, 5)

    # Pick the verdict sentence up front; a NaN p-value selects the
    # "not significant" wording because the comparison is False.
    verdict = (
        "The correlation is statistically significant.\n"
        if pval < 0.05
        else "The correlation is not statistically significant.\n"
    )

    print(f"Spearman's Rank correlation: {rounded_rho}")
    print(f"P-value: {pval}")
    print(verdict)

In [None]:
def print_pearson_correlation_and_pvalue(dataframe, column1, column2):
    """Print Pearson's correlation between two columns.

    Prints the coefficient rounded to 5 decimals, the raw p-value, and a
    sentence stating whether the p-value is below 0.05. Returns ``None``.
    """
    first = dataframe[column1]
    second = dataframe[column2]
    r_value, pval = pearsonr(first, second)

    print(f"Pearson's correlation: {round(r_value, 5)}")
    print(f"P-value: {pval}")

    # Guard-style reporting: handle the significant case and leave.
    if pval < 0.05:
        print("The correlation is statistically significant.\n")
        return
    print("The correlation is not statistically significant.\n")

In [5]:
def calculate_outliers(dataframe, column, iqr_multiplier=1.5):
    """Return the rows of ``dataframe`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies strictly below
    ``Q1 - iqr_multiplier * IQR`` or strictly above
    ``Q3 + iqr_multiplier * IQR``, where Q1 and Q3 are the 0.25 and 0.75
    quantiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to filter.
    column : hashable
        Label of the column to test.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the IQR.

    Returns
    -------
    pandas.DataFrame
        The matching rows with all columns and the original index. Rows
        whose value is NaN are never selected, because both comparisons
        evaluate to False for NaN.
    """
    values = dataframe[column]  # single lookup, reused below
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr
    return dataframe[(values < lower_bound) | (values > upper_bound)]

In [6]:
def print_outlier_counts(dataframe):
    """Print, for every numeric column, how many rows are outliers.

    Columns are taken from ``dataframe.select_dtypes(include='number')`` and
    each is passed to ``calculate_outliers``. A line with the outlier row
    count is printed for each column that has any; the columns without
    outliers are listed together in one final line. Returns ``None``.
    """
    columns_with_no_outliers = []
    for col in dataframe.select_dtypes(include='number').columns:
        outliers = calculate_outliers(dataframe, col)
        if outliers.empty:
            columns_with_no_outliers.append(col)
        else:
            print(f"Outliers in {col}: {outliers.shape[0]}")

    if columns_with_no_outliers:
        # Column labels are not guaranteed to be strings (e.g. the default
        # integer labels), and str.join rejects non-str items, so convert.
        clean_labels = ', '.join(map(str, columns_with_no_outliers))
        print(f"No outliers found in columns: {clean_labels}")

In [7]:
def print_outlier_data(dataframe, col):
    """Print the outlier rows of one column, as found by ``calculate_outliers``.

    If ``col`` is not among ``dataframe.columns`` nothing is printed. If the
    column has no outliers a one-line notice is printed; otherwise a header
    line followed by the outlier rows. Returns ``None``.
    """
    # Unknown column labels are skipped without any message.
    if col not in dataframe.columns:
        return

    flagged_rows = calculate_outliers(dataframe, col)
    if flagged_rows.empty:
        print(f"No outliers found in {col}")
        return

    print(f"\nOutliers in {col}:")
    print(flagged_rows)

In [None]:
def print_tau_correlation_and_pvalue(dataframe, column1, column2):
    """Print Kendall's tau between two columns.

    Prints the coefficient rounded to 5 decimals, the raw p-value, and a
    sentence stating whether the p-value is below 0.05. Returns ``None``.
    """
    tau, pval = kendalltau(dataframe[column1], dataframe[column2])

    # Both possible verdicts, keyed by the outcome of the threshold check.
    verdicts = {
        True: "The correlation is statistically significant.\n",
        False: "The correlation is not statistically significant.\n",
    }

    print(f"Kendall’s Tau correlation: {round(tau, 5)}")
    print(f"P-value: {pval}")
    print(verdicts[bool(pval < 0.05)])

In [None]:
def print_distance_correlation_and_pvalue(data, col1=None, col2=None, num_resamples=1000, random_state=None):
    """Print and return the distance correlation of two samples and its p-value.

    Parameters
    ----------
    data : pandas.DataFrame, mapping, or pair of array-likes
        When ``col1`` and ``col2`` are given, ``data[col1]`` and
        ``data[col2]`` are used. Otherwise ``data`` is unpacked as ``(x, y)``.
    col1, col2 : hashable, optional
        Column labels. They must be given together or both omitted.
    num_resamples : int, default 1000
        Forwarded to ``dcor.independence.distance_covariance_test``.
    random_state : optional
        Forwarded to ``dcor.independence.distance_covariance_test``; with
        the default ``None`` the p-value may differ between runs.

    Returns
    -------
    tuple
        ``(dist_corr, p_value)``: the unrounded distance correlation and the
        p-value of the independence test.

    Raises
    ------
    ValueError
        If exactly one of ``col1`` / ``col2`` is provided.
    """
    # Reject a half-specified column pair instead of silently falling back to
    # unpacking ``data`` (which for a DataFrame would yield column labels).
    if (col1 is None) != (col2 is None):
        raise ValueError("col1 and col2 must be provided together or both omitted")

    # With column labels, pull the two columns; otherwise use the pair directly.
    if col1 is not None:
        x = data[col1]
        y = data[col2]
    else:
        x, y = data

    # Make sure both inputs are NumPy arrays.
    x = np.asarray(x)
    y = np.asarray(y)

    # Distance correlation.
    dist_corr = dcor.distance_correlation(x, y)

    # P-value from the distance covariance independence test.
    test_result = dcor.independence.distance_covariance_test(
        x, y, num_resamples=num_resamples, random_state=random_state
    )
    # NOTE(review): newer dcor releases appear to expose ``pvalue`` and keep
    # ``p_value`` only as a deprecated alias - prefer the former when present.
    if hasattr(test_result, "pvalue"):
        p_value = test_result.pvalue
    else:
        p_value = test_result.p_value

    # Report.
    print(f"Distance Correlation: {round(dist_corr, 5)}")
    print(f"P-value: {p_value}")
    if p_value < 0.05:
        print("The correlation is statistically significant.\n")
    else:
        print("The correlation is not statistically significant.\n")

    return dist_corr, p_value