In [11]:
import pandas as pd 
import numpy as np 
import os 
import matplotlib.pyplot as plt 
import seaborn as sns

# Show the current working directory; the relative './feature_corr/...' paths used in this notebook resolve against it.
os.getcwd()

'/home/luke/JaneStreetKaggle'

In [12]:
def get_corr_df(partition_num, threshold):
    """Load one partition's pairwise-correlation table and keep the strong pairs.

    Reads ``./feature_corr/corr_csv/{partition_num}.csv`` (first CSV column is
    used as the index) and keeps the rows whose ``correlation`` value is
    strictly greater than ``abs(threshold)`` or strictly less than
    ``-abs(threshold)``. Rows sitting exactly on the threshold, and rows with a
    NaN correlation, are dropped because both comparisons are False for them.

    Args:
        partition_num: Value interpolated into the CSV file name.
        threshold: Correlation cutoff; only its absolute value is used.

    Returns:
        The filtered DataFrame, in the same row order as the CSV.
    """
    csv_path = f'./feature_corr/corr_csv/{partition_num}.csv'
    table = pd.read_csv(csv_path, index_col=0)

    cutoff = abs(threshold)
    values = table['correlation']
    is_strong = (values > cutoff) | (values < -cutoff)
    return table.loc[is_strong]

def get_feature_corr(df, feature):
    """Return the rows of ``df`` whose ``var2`` column contains ``feature``.

    Only ``var2`` is inspected; rows where the name appears solely in ``var1``
    are not returned. Matching uses ``Series.str.contains``, so ``feature`` is
    treated as a pattern that may match anywhere inside the value (for example
    ``"responder_3"`` also matches ``"responder_30"``). Missing ``var2`` values
    count as non-matches.
    """
    var2_matches = df['var2'].str.contains(feature, na=False)
    return df.loc[var2_matches]

def get_responder_corr(df):
    """Return the rows of ``df`` where either side of the pair mentions 'responder'.

    A row is kept when its ``var1`` or its ``var2`` value contains the text
    ``'responder'``; missing values count as non-matches.
    """
    in_var1 = df['var1'].str.contains('responder', na=False)
    in_var2 = df['var2'].str.contains('responder', na=False)
    return df.loc[in_var1 | in_var2]

# Partition 1 pairs with |correlation| > 0.1, then only the rows where var1 or var2 contains 'responder'.
df = get_corr_df(1, 0.1)
responder_corr_df = get_responder_corr(df)

In [14]:
# With a threshold of 0 the filter keeps every partition 1 pair whose correlation is non-zero (NaN rows are still dropped).
get_corr_df(1, 0.)

Unnamed: 0,var1,var2,correlation
3408,feature_75,feature_76,0.959233
3431,feature_77,feature_78,0.958595
3381,feature_73,feature_74,0.957765
980,feature_12,feature_67,0.922588
983,feature_12,feature_70,0.913237
...,...,...,...
2218,feature_37,feature_45,-0.539961
2278,feature_38,feature_56,-0.542600
2133,feature_35,feature_61,-0.544587
2081,feature_34,feature_61,-0.546120


In [4]:
# Responder-related pairs (partition 1, threshold 0.1) whose var2 contains "responder_6".
get_feature_corr(responder_corr_df, "responder_6")

Unnamed: 0,var1,var2,correlation
3473,responder_3,responder_6,0.64526
3477,responder_4,responder_6,0.298382
3480,responder_5,responder_6,0.28491


In [5]:
# Responder-related pairs (partition 1, threshold 0.1) whose var2 contains "responder_3".
get_feature_corr(responder_corr_df, "responder_3")

Unnamed: 0,var1,var2,correlation
3452,responder_0,responder_3,0.515294
3459,responder_1,responder_3,0.18563
3465,responder_2,responder_3,0.177735
2205,feature_36,responder_3,0.150929


In [6]:
# Responder-related pairs (partition 1, threshold 0.1) whose var2 contains "responder_0".
get_feature_corr(responder_corr_df, "responder_0")

Unnamed: 0,var1,var2,correlation
2202,feature_36,responder_0,0.259695
2736,feature_48,responder_0,0.13084
2774,feature_49,responder_0,0.126999
2396,feature_40,responder_0,0.121543
2531,feature_43,responder_0,0.120224
2442,feature_41,responder_0,0.119539
2574,feature_44,responder_0,0.1116
2349,feature_39,responder_0,0.111026
2487,feature_42,responder_0,0.100547


In [7]:
# Same responder filter as above, applied to partition 0 at threshold 0.1.
get_responder_corr(get_corr_df(0, 0.1))

Unnamed: 0,var1,var2,correlation
3067,responder_3,responder_5,0.590109
3047,responder_0,responder_3,0.530023
3073,responder_4,responder_7,0.503893
3066,responder_3,responder_4,0.492801
3062,responder_2,responder_5,0.485445
...,...,...,...
2371,feature_49,responder_2,0.102484
2174,feature_44,responder_5,0.101466
1697,feature_34,responder_3,-0.102215
1694,feature_34,responder_0,-0.109308


In [8]:
def get_partition_corr_df(threshold, partition_nums=range(10)):
    """Build a pair-by-partition table of thresholded correlations.

    For every partition number, ``get_corr_df(partition_num, threshold)`` is
    loaded and turned into one column named ``partition_{n}_corr``. Rows are
    labelled ``"<var1>-<var2>"``. A pair that did not pass the threshold in a
    given partition has NaN in that partition's column.

    Row order: pairs with the fewest NaN values come first. Ties are broken by
    the NaN count over all but the last partition, then all but the last two,
    and so on; pairs that are still tied keep their order of first appearance.
    This is exactly the order produced by stably re-sorting the table on its
    NaN count after each partition is appended, without paying for a concat
    and a sort on every iteration.

    Args:
        threshold: Correlation cutoff forwarded to ``get_corr_df``.
        partition_nums: Iterable of partition numbers to load. Defaults to
            ``range(10)``.

    Returns:
        DataFrame indexed by pair label with one column per partition. An
        empty DataFrame is returned when ``partition_nums`` is empty.
    """
    partition_columns = []
    for partition_num in partition_nums:
        corr_df = get_corr_df(partition_num, threshold)

        # Label each row with its feature pair, e.g. "feature_75-feature_76".
        pair_labels = corr_df['var1'].astype(str) + '-' + corr_df['var2'].astype(str)

        partition_columns.append(
            pd.Series(
                data=corr_df['correlation'].values,
                index=pair_labels,
                name=f'partition_{partition_num}_corr',
            )
        )

    if not partition_columns:
        return pd.DataFrame()

    # A single outer-join concat: pairs missing from a partition become NaN there.
    partition_corr_df = pd.concat(partition_columns, axis=1)

    # running_nan[i, j] = number of NaN values in row i over columns 0..j.
    running_nan = np.cumsum(partition_corr_df.isnull().to_numpy(), axis=1)

    # np.lexsort treats the LAST key as the primary one, so passing the columns
    # in order sorts by the total NaN count first and falls back to the counts
    # over progressively shorter column prefixes. The sort is stable, so fully
    # tied rows keep their order of first appearance.
    row_order = np.lexsort(running_nan.T)
    return partition_corr_df.iloc[row_order]

def get_feature_partition_corr(df, feature):
    """Return the rows of ``df`` whose index label contains ``feature``.

    The index is expected to hold strings (the ``"<var1>-<var2>"`` pair
    labels); matching uses ``Index.str.contains``, so ``feature`` may match
    anywhere inside the label, on either side of the pair.
    """
    label_matches = df.index.str.contains(feature)
    return df.loc[label_matches]

# Build the pair-by-partition correlation table at threshold 0.1 and preview its first three rows.
partition_df = get_partition_corr_df(0.1)
partition_df.head(3)

Unnamed: 0,partition_0_corr,partition_1_corr,partition_2_corr,partition_3_corr,partition_4_corr,partition_5_corr,partition_6_corr,partition_7_corr,partition_8_corr,partition_9_corr
feature_75-feature_76,0.959255,0.959233,0.956333,0.959547,0.959114,0.95742,0.956412,0.956777,0.95745,0.957691
feature_77-feature_78,0.957313,0.958595,0.957977,0.958907,0.955392,0.958103,0.955845,0.956017,0.955487,0.957426
feature_73-feature_74,0.956324,0.957765,0.954481,0.958396,0.957564,0.95593,0.954443,0.954943,0.95616,0.957587


In [191]:
# First five rows of partition_df whose pair label contains "responder_0".
get_feature_partition_corr(partition_df, "responder_0").head()

Unnamed: 0,partition_0_corr,partition_1_corr,partition_2_corr,partition_3_corr,partition_4_corr,partition_5_corr,partition_6_corr,partition_7_corr,partition_8_corr,partition_9_corr
responder_0-responder_2,0.369169,0.37224,0.377248,0.379434,0.389143,0.390155,0.377695,0.389754,0.379999,0.3822
responder_0-responder_1,0.283991,0.295701,0.305925,0.324306,0.352259,0.344863,0.328464,0.352757,0.341577,0.336695
feature_36-responder_0,0.235029,0.259695,0.24986,0.231607,0.211058,0.196047,0.220712,0.161693,0.162607,0.130812
responder_0-responder_3,0.530023,0.515294,0.377265,0.26,0.214886,0.126161,0.13992,0.232893,0.160087,
responder_0-responder_5,0.29247,0.261016,0.213304,0.165609,0.132905,,,0.124299,,


In [189]:
def save_corr_plot(df, nan_threshold, save_dir):
    """Save line plots of each pair's correlation across partitions, five pairs per image.

    Rows of ``df`` with more than ``nan_threshold`` NaN values are discarded.
    The remaining rows are split, in their existing order, into groups of five;
    each group is drawn as one figure (one line per pair, one x position per
    column of ``df``) and written to ``<save_dir>/<group index>.png``, with the
    group index starting at 0. Nothing is written when no row qualifies.

    Args:
        df: Table with one row per feature pair and one column per partition.
        nan_threshold: Maximum number of NaN values a row may have to be plotted.
        save_dir: Output directory; created if it does not exist.

    Returns:
        None.
    """
    # Imported locally, as in the original, so the function does not depend on
    # a notebook-level ``plt`` name being defined.
    import matplotlib.pyplot as plt

    pairs_per_plot = 5

    complete_pairs = df[df.isnull().sum(axis=1) <= nan_threshold]
    n_pairs = len(complete_pairs)
    if n_pairs == 0:
        return

    os.makedirs(save_dir, exist_ok=True)

    for plot_idx, start_idx in enumerate(range(0, n_pairs, pairs_per_plot)):
        # Pairs belonging to the current group (the last group may be shorter).
        current_pairs = complete_pairs.iloc[start_idx:start_idx + pairs_per_plot]

        # Create the figure explicitly and hand its axes to pandas. Calling
        # plt.figure() and then DataFrame.plot() without ``ax`` makes pandas
        # open a second figure, so the requested size is never applied and the
        # first, empty figure is left open.
        fig, ax = plt.subplots(figsize=(15, 8))
        try:
            current_pairs.T.plot(
                ax=ax,
                linewidth=2,
                marker='o',
                markersize=8,
                alpha=0.7,
                grid=True
            )

            # Titles and axis labels.
            ax.set_title(f'Correlation Changes Across Partitions (Group {plot_idx + 1})',
                         fontsize=15,
                         pad=20)
            ax.set_xlabel('Partition', fontsize=12)
            ax.set_ylabel('Correlation', fontsize=12)

            # Tilt the partition labels on the x axis.
            ax.tick_params(axis='x', labelrotation=45)

            # Legend placed outside the plotting area, to the right.
            ax.legend(
                bbox_to_anchor=(1.05, 1),
                loc='upper left',
                borderaxespad=0.,
                fontsize=10,
                title='Feature Pairs',
                title_fontsize=12
            )

            ax.grid(True, linestyle='--', alpha=0.7)

            fig.tight_layout()
            fig.savefig(os.path.join(save_dir, f'{plot_idx}.png'),
                        dpi=300,
                        bbox_inches='tight',
                        format='png')
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

In [196]:
def save_feature_corr_plot(df, feature, corr_threshold, nan_threshold, save_dir):
    """Save cross-partition correlation plots for the pairs that involve one feature.

    The partition table is narrowed to the rows whose pair label contains
    ``feature`` (via ``get_feature_partition_corr``) and then handed to
    ``save_corr_plot``, which drops rows with more than ``nan_threshold`` NaN
    values and writes one PNG per group of five pairs into ``save_dir``.

    Args:
        df: Pair-by-partition table as returned by ``get_partition_corr_df``.
            It is used as given, so the caller's precomputed table is no
            longer discarded and rebuilt. Pass ``None`` to have the table built
            here from ``corr_threshold``.
        feature: Text to look for in the pair labels.
        corr_threshold: Threshold passed to ``get_partition_corr_df``; only
            used when ``df`` is ``None``.
        nan_threshold: Maximum number of NaN values a row may have to be plotted.
        save_dir: Output directory for the PNG files.

    Returns:
        None.
    """
    if df is None:
        df = get_partition_corr_df(corr_threshold)

    feature_pairs = get_feature_partition_corr(df, feature)

    # Filtering, grouping, drawing and saving are shared with save_corr_plot
    # instead of being kept as a second copy of the same code.
    save_corr_plot(feature_pairs, nan_threshold, save_dir)
    return



In [204]:
# Save the cross-partition correlation plots for pairs whose label contains `feature`,
# keeping only pairs with at most `nan_threshold` missing partitions.
feature = "responder_0"
nan_threshold = 5

save_feature_corr_plot(df=get_partition_corr_df(0.1), 
                       feature=feature,
                       corr_threshold=0.1,
                       nan_threshold=nan_threshold,
                       save_dir=f"./feature_corr/partition_{feature}_corr")

<Figure size 1500x800 with 0 Axes>