# Analyzing Datasets
This script analyzes the shares of the different RNA, protein and interaction types and outputs figures.
Mainly needed for thesis figures :-)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
# Directory into which the analyse_* functions write their PDF figures.
dataset_path = "../results/figures"

In [None]:
def compute_shares(df: pd.DataFrame):
    """Print the number and percentage of positive and negative pairs in ``df``.

    A pair counts as positive when both ``Sequence_1_shuffle`` and
    ``Sequence_2_shuffle`` are 0, and as negative when at least one of the two
    flags is 1. For an empty frame both percentages are reported as 0.0
    instead of dividing by zero.

    Args:
        df: Frame providing the ``Sequence_1_shuffle`` and
            ``Sequence_2_shuffle`` columns.

    Raises:
        AssertionError: If some row is neither positive nor negative, which
            requires a shuffle flag outside {0, 1}.
    """
    total = df.shape[0]
    is_positive = (df['Sequence_1_shuffle'] == 0) & (df['Sequence_2_shuffle'] == 0)
    is_negative = (df['Sequence_1_shuffle'] == 1) | (df['Sequence_2_shuffle'] == 1)
    num_positives = int(is_positive.sum())
    num_negatives = int(is_negative.sum())
    assert num_negatives + num_positives == total, (
        "some rows are neither positive nor negative; check the shuffle flags"
    )

    def percentage(count: int) -> float:
        # An empty frame has no meaningful share; report 0.0 rather than crash.
        return round(count * 100 / total, 2) if total else 0.0

    print(f"Num of positive pairs: {num_positives} ({percentage(num_positives)} %)")
    print(f"Num of negative pairs: {num_negatives} ({percentage(num_negatives)} %)")

In [None]:
def analyse_rna_type(df: pd.DataFrame, name: str):
    """Plot and save a bar chart of positive-pair counts per RNA category.

    Only rows whose two shuffle flags are both 0 are counted. The bars are
    ordered from the largest to the smallest ``Category1`` count, and the
    figure is written as a PDF into ``dataset_path`` before it is shown.

    Args:
        df: Frame with the ``Sequence_1_shuffle``, ``Sequence_2_shuffle``,
            ``Category1`` and ``RNAInterID`` columns.
        name: Label of the data split; used in the plot title and, with
            spaces replaced by dashes, in the PDF file name.
    """
    unshuffled = (df['Sequence_1_shuffle'] == 0) & (df['Sequence_2_shuffle'] == 0)
    per_category = df[unshuffled].groupby(['Category1'])['RNAInterID'].count().reset_index()

    # Largest category first; equal counts keep the order produced by groupby.
    ranked = sorted(
        zip(per_category['Category1'].tolist(), per_category['RNAInterID'].tolist()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    labels = [label for label, _ in ranked]
    heights = [height for _, height in ranked]

    fig, ax = plt.subplots()
    ax.bar(labels, heights)
    fig.autofmt_xdate(rotation=45)  # tilt the category labels
    ax.set_title(f"RNA categories on {name}")
    ax.set_ylabel('number of positive interactions')
    # A logarithmic y-axis is deliberately not enabled for this figure.

    target = os.path.join(dataset_path, name.replace(' ', '-') + "-RNAs.pdf")
    fig.savefig(target, bbox_inches='tight')
    plt.show()

In [None]:
def analyse_protein_type(df: pd.DataFrame, name: str):
    """Plot and save a bar chart of positive-pair counts per protein category.

    Rows are restricted to those with both shuffle flags equal to 0, grouped
    by ``Category2`` and drawn in descending order of their count. The figure
    is saved as a PDF inside ``dataset_path`` and then displayed.

    Args:
        df: Frame with the ``Sequence_1_shuffle``, ``Sequence_2_shuffle``,
            ``Category2`` and ``RNAInterID`` columns.
        name: Label of the data split; appears in the title and, with spaces
            turned into dashes, in the output file name.
    """
    keep = (df['Sequence_1_shuffle'] == 0) & (df['Sequence_2_shuffle'] == 0)
    tally = df[keep].groupby(['Category2'])['RNAInterID'].count().reset_index()

    # Sort descending by count; ties stay in groupby order (stable sort).
    pairs = list(zip(tally['Category2'].tolist(), tally['RNAInterID'].tolist()))
    pairs.sort(key=lambda item: item[1], reverse=True)
    bar_labels = [item[0] for item in pairs]
    bar_heights = [item[1] for item in pairs]

    fig, ax = plt.subplots()
    ax.bar(bar_labels, bar_heights)
    fig.autofmt_xdate(rotation=45)  # tilt the category labels
    ax.set_title(f"protein categories on {name}")
    ax.set_ylabel('number of positive interactions')
    # A logarithmic y-axis is deliberately not enabled for this figure.

    pdf_name = name.replace(' ', '-') + "-proteins.pdf"
    fig.savefig(os.path.join(dataset_path, pdf_name), bbox_inches='tight')
    plt.show()

In [None]:
def analyse_interaction_type(df: pd.DataFrame, name: str):
    """Plot and save a horizontal bar chart of positive pairs per interaction type.

    An interaction type is the string ``"<Category1>-<Category2>"``. Only rows
    whose two shuffle flags are both 0 are counted. The bars are listed from
    the largest to the smallest count on a logarithmic count axis; the figure
    is written as a PDF into ``dataset_path`` and then shown.

    Args:
        df: Frame with the ``Sequence_1_shuffle``, ``Sequence_2_shuffle``,
            ``Category1``, ``Category2`` and ``RNAInterID`` columns.
        name: Label of the data split; used in the plot title and, with
            spaces replaced by dashes, in the PDF file name.
    """
    # use only positive pairs
    df = df[(df['Sequence_1_shuffle'] == 0) & (df['Sequence_2_shuffle'] == 0)]
    group = df.groupby(['Category2', 'Category1'])['RNAInterID'].count().reset_index()
    group['interaction-type'] = group['Category1'].astype(str) + "-" + group['Category2'].astype(str)

    # Largest count first; equal counts keep the order produced by groupby.
    entries = sorted(
        zip(group['interaction-type'].tolist(), group['RNAInterID'].tolist()),
        key=lambda entry: entry[1],
        reverse=True,
    )
    categories = [category for category, _ in entries]
    counts = [count for _, count in entries]

    fig, ax = plt.subplots(figsize=(8, 12))
    ax.barh(categories, counts)
    ax.set_title(f"interaction types on {name}")
    # barh draws the counts along the x-axis, so the count label belongs there
    # (it was previously attached to the category axis).
    ax.set_xlabel('number of positive interactions')
    ax.set_xscale("log")

    filename = f"{'-'.join(name.split(' '))}-interaction-type.pdf"
    fig.savefig(os.path.join(dataset_path, filename), bbox_inches='tight')
    plt.show()

In [None]:
def check_protein_appearance(df: pd.DataFrame, df_2: pd.DataFrame):
    """Print how many unique proteins each frame holds and how many they share.

    Proteins are identified by the values of the ``Sequence_2_ID_Unique``
    column.

    Args:
        df: First frame ("set 1").
        df_2: Second frame ("set 2").
    """
    first_ids = set(df['Sequence_2_ID_Unique'].unique())
    second_ids = set(df_2['Sequence_2_ID_Unique'].unique())
    shared_ids = second_ids & first_ids

    print(f"Number of unique proteins in set 1: {len(first_ids)}")
    print(f"Number of unique proteins in set 2: {len(second_ids)}")
    print(f"Set 1 and Set 2 have {len(shared_ids)} unique protein in common.")

In [None]:
def check_rna_appearance(df: pd.DataFrame, df_2: pd.DataFrame):
    """Print how many unique RNAs each frame holds and how many they share.

    RNAs are identified by the values of the ``Sequence_1_ID_Unique`` column.

    Args:
        df: First frame ("set 1").
        df_2: Second frame ("set 2").
    """
    # get RNAs
    rna_1 = set(df['Sequence_1_ID_Unique'].unique())
    rna_2 = set(df_2['Sequence_1_ID_Unique'].unique())
    print(f"Number of unique rna in set 1: {len(rna_1)}")
    print(f"Number of unique rna in set 2: {len(rna_2)}")
    # The overlap is between RNA ids, so the message must say "rna", not "protein".
    print(f"Set 1 and Set 2 have {len(rna_1 & rna_2)} unique rna in common.")

In [None]:
# Parquet file of every analysed split, paired with the label used in plot
# titles and output file names.
split_files = [
    ('../results/dataset_v4/final_train_set_reduced.parquet', "Training set"),
    # Validation split is currently left out:
    # ('../results/final_valid_set.parquet', "Validation set"),
    ('../results/dataset_v4/final_test_set_reduced.parquet', "Test set"),
]
sets = [(pd.read_parquet(path, engine='pyarrow'), label) for path, label in split_files]

# Produce one kind of figure for all splits before moving on to the next kind.
for analyse in (analyse_interaction_type, analyse_rna_type, analyse_protein_type):
    for dataset, name in sets:
        analyse(dataset, name)

In [None]:
# Cross-checking datasets:
# which proteins from split y also appear in split x?

In [None]:
# Compare how the interaction types ("<Category1>-<Category2>") are distributed
# over the training, random test and test splits: build a LaTeX table of the
# shares and save a grouped horizontal bar chart.
def _interaction_type_shares(parquet_path, count_column, share_column):
    """Return the per-interaction-type count/share table of one split and its row count."""
    split = pd.read_parquet(parquet_path)
    n_rows = len(split)
    split['interaction-type'] = split['Category1'].astype(str) + "-" + split['Category2'].astype(str)
    table = split.groupby(['interaction-type'])['RNAInterID'].count().reset_index()
    table = table.rename(columns={"RNAInterID": count_column})
    table[share_column] = table[count_column] / n_rows
    return table, n_rows


train_set, len_train_set = _interaction_type_shares(
    "../results/dataset_v4/final/train_set.parquet", "count-train", 'Training Set')
random_set, len_random_set = _interaction_type_shares(
    "../results/dataset_v4/final/test_set_random.parquet", "count-random", 'Random Test Set')
test_set, len_test_set = _interaction_type_shares(
    "../results/dataset_v4/final/test_set.parquet", "count-test", 'Test Set')

# Outer joins keep types that are missing from a split; their values become 0.
new_df = train_set.set_index('interaction-type')
for split_table in (random_set, test_set):
    new_df = new_df.join(split_table.set_index('interaction-type'), how='outer').fillna(0)

share_columns = ['Training Set', 'Random Test Set', 'Test Set']

# NOTE(review): the percentages are written into new_df itself, so the bar
# chart below is in percent as well - confirm this is intended given the
# 'Share of interactions' axis label.
for column in share_columns:
    new_df[column] = new_df[column] * 100
latex_df = new_df.sort_values(by='Training Set', ascending=False)

# The table is split after the first 23 rows into two LaTeX tables.
latex_format = dict(decimal='.', thousands=',', precision=4)
latex_out = latex_df.iloc[:23].style.format(**latex_format).to_latex()
latex_out_2 = latex_df.iloc[23:].style.format(**latex_format).to_latex()

new_df = new_df[share_columns].sort_values(by='Training Set', ascending=True)
fig, ax = plt.subplots(figsize=(8, 12))
new_df.plot.barh(ax=ax)
ax.set_ylabel('Interaction Types')
ax.set_xlabel('Share of interactions')
ax.set_xscale("log")
fig.savefig("../results/dataset_v4/figures/dataset-interaction-type-share.pdf", bbox_inches='tight')

In [None]:
# Compare how the RNA categories (Category1) are distributed over the
# training, random test and test splits: build a LaTeX table of the shares and
# save a grouped horizontal bar chart.
def _rna_category_shares(parquet_path, count_column, share_column):
    """Return the per-Category1 count/share table of one split and its row count."""
    split = pd.read_parquet(parquet_path)
    n_rows = len(split)
    table = split.groupby(['Category1'])['RNAInterID'].count().reset_index()
    table = table.rename(columns={"RNAInterID": count_column})
    table[share_column] = table[count_column] / n_rows
    return table, n_rows


train_set, len_train_set = _rna_category_shares(
    "../results/dataset_v4/final/train_set.parquet", "count-train", 'Training Set')
random_set, len_random_set = _rna_category_shares(
    "../results/dataset_v4/final/test_set_random.parquet", "count-random", 'Random Test Set')
test_set, len_test_set = _rna_category_shares(
    "../results/dataset_v4/final/test_set.parquet", "count-test", 'Test Set')

# Outer joins keep categories that are missing from a split; their values become 0.
new_df = train_set.set_index('Category1')
for split_table in (random_set, test_set):
    new_df = new_df.join(split_table.set_index('Category1'), how='outer').fillna(0)

share_columns = ['Training Set', 'Random Test Set', 'Test Set']

# NOTE(review): the percentages are written into new_df itself, so the bar
# chart below is in percent as well - confirm this is intended given the
# 'Share of RNA interactors' axis label.
for column in share_columns:
    new_df[column] = new_df[column] * 100
latex_df = new_df.sort_values(by='Training Set', ascending=False)
latex_out = latex_df.style.format(**dict(decimal='.', thousands=',', precision=4)).to_latex()

new_df = new_df[share_columns].sort_values(by='Training Set', ascending=True)
fig, ax = plt.subplots(figsize=(8, 4))
new_df.plot.barh(ax=ax)
ax.set_ylabel('RNA interactors')
ax.set_xlabel('Share of RNA interactors')
ax.set_xscale("log")
fig.savefig("../results/dataset_v4/figures/dataset-rna-interactors-share.pdf", bbox_inches='tight')

In [None]:
# Compare how the protein categories (Category2) are distributed over the
# training, random test and test splits: build a LaTeX table of the shares and
# save a grouped horizontal bar chart.
def _protein_category_shares(parquet_path, count_column, share_column):
    """Return the per-Category2 count/share table of one split and its row count."""
    split = pd.read_parquet(parquet_path)
    n_rows = len(split)
    table = split.groupby(['Category2'])['RNAInterID'].count().reset_index()
    table = table.rename(columns={"RNAInterID": count_column})
    table[share_column] = table[count_column] / n_rows
    return table, n_rows


train_set, len_train_set = _protein_category_shares(
    "../results/dataset_v4/final/train_set.parquet", "count-train", 'Training Set')
random_set, len_random_set = _protein_category_shares(
    "../results/dataset_v4/final/test_set_random.parquet", "count-random", 'Random Test Set')
test_set, len_test_set = _protein_category_shares(
    "../results/dataset_v4/final/test_set.parquet", "count-test", 'Test Set')

# Outer joins keep categories that are missing from a split; their values become 0.
new_df = train_set.set_index('Category2')
for split_table in (random_set, test_set):
    new_df = new_df.join(split_table.set_index('Category2'), how='outer').fillna(0)

share_columns = ['Training Set', 'Random Test Set', 'Test Set']

# NOTE(review): the percentages are written into new_df itself, so the bar
# chart below is in percent as well - confirm this is intended given the
# 'Share of protein interactors' axis label.
for column in share_columns:
    new_df[column] = new_df[column] * 100
latex_df = new_df.sort_values(by='Training Set', ascending=False)
latex_out = latex_df.style.format(**dict(decimal='.', thousands=',', precision=4)).to_latex()

new_df = new_df[share_columns].sort_values(by='Training Set', ascending=True)

fig, ax = plt.subplots(figsize=(8, 2))
new_df.plot.barh(ax=ax)
ax.set_ylabel('Protein interactors')
ax.set_xlabel('Share of protein interactors')
ax.set_xscale("log")
fig.savefig("../results/dataset_v4/figures/dataset-protein-interactors-share.pdf", bbox_inches='tight')

In [None]:
# Print summary statistics over the three final splits: sizes, number of
# positive/negative pairs, unique sequences and categories, and the share of
# each split in the concatenated data.
train_set = pd.read_parquet("../results/dataset_v4/final/train_set.parquet")
print("Len train set", len(train_set), sep="\n")

test_set = pd.read_parquet("../results/dataset_v4/final/test_set.parquet")
print("Len test set", len(test_set), sep="\n")

random_set = pd.read_parquet("../results/dataset_v4/final/test_set_random.parquet")
print("Len random set", len(random_set), sep="\n")

all_df = pd.concat([train_set, random_set, test_set])
print("Len all", len(all_df), sep="\n")

# Positive: neither sequence is shuffled. Negative: at least one flag is 1.
both_unshuffled = (all_df['Sequence_1_shuffle'] == 0) & (all_df['Sequence_2_shuffle'] == 0)
any_shuffled = (all_df['Sequence_1_shuffle'] == 1) | (all_df['Sequence_2_shuffle'] == 1)
positive = all_df[both_unshuffled]
print("positive interactions:", len(positive), sep="\n")
negative = all_df[any_shuffled]
print("negative interactions", len(negative), sep="\n")

for label, column in (("Unique RNA sequences", 'Sequence_1'),
                      ("Unique Protein sequences", 'Sequence_2')):
    print(label, all_df[column].nunique(), sep="\n")

print("Interaction types")
all_df['interaction-type'] = all_df['Category1'].astype(str) + "-" + all_df['Category2'].astype(str)
print(all_df['interaction-type'].nunique())

for label, column in (("RNA types", 'Category1'), ("protein types", 'Category2')):
    print(label, all_df[column].nunique(), sep="\n")

# Percentage of all rows that belongs to each split.
for label, split in (("Share train", train_set),
                     ("Share test", test_set),
                     ("Share random", random_set)):
    print(label, round(len(split) / len(all_df) * 100, 2), sep="\n")

In [None]:
# Random test split: count the rows per interaction type, turn the counts into
# shares of the split, and attach them to the per-interaction-type metrics.
random_set = pd.read_parquet("../results/dataset_v4/final/test_set_random.parquet")
len_random_set = len(random_set)
random_set['interaction-type'] = random_set['Category1'].astype(str) + "-" + random_set['Category2'].astype(str)
random_set = (
    random_set.groupby(['interaction-type'])['RNAInterID']
    .count()
    .reset_index()
    .rename(columns={"RNAInterID": "count-random"})
)
random_set['Random Test Set'] = random_set['count-random'] / len_random_set

random_set_results = pd.read_json("../results/dataset_v4/final/test-set-random-metrics-per-interaction-type.json")
random_set_results['share'] = random_set_results['total_num'] / len_random_set

# Left join: keeps exactly the interaction types listed in the metrics file.
metrics_by_type = random_set_results.set_index("interaction-type")
counts_by_type = random_set.set_index("interaction-type")
df = metrics_by_type.join(counts_by_type)

In [None]:
# Test split: count the rows per interaction type, turn the counts into shares
# of the split, and attach them to the per-interaction-type metrics.
test_set = pd.read_parquet("../results/dataset_v4/final/test_set.parquet")
len_test_set = len(test_set)
test_set['interaction-type'] = test_set['Category1'].astype(str) + "-" + test_set['Category2'].astype(str)
test_set = (
    test_set.groupby(['interaction-type'])['RNAInterID']
    .count()
    .reset_index()
    .rename(columns={"RNAInterID": "count-test"})
)
test_set['Test Set'] = test_set['count-test'] / len_test_set

test_set_results = pd.read_json("../results/dataset_v4/final/test-set-metrics-per-interaction-type.json")
test_set_results['share'] = test_set_results['total_num'] / len_test_set

# Left join: keeps exactly the interaction types listed in the metrics file.
metrics_by_type = test_set_results.set_index("interaction-type")
counts_by_type = test_set.set_index("interaction-type")
df = metrics_by_type.join(counts_by_type)

In [None]:
# Build a LaTeX table that puts F1, accuracy and share (in percent) per
# interaction type side by side for the test split and the random test split.

# The splits are loaded only for their row counts and released right away.
test_set = pd.read_parquet("../results/dataset_v4/final/test_set.parquet")
len_test_set = len(test_set)
del test_set

random_set = pd.read_parquet("../results/dataset_v4/final/test_set_random.parquet")
len_random_set = len(random_set)
del random_set

test_set_results = pd.read_json("../results/dataset_v4/final/test-set-metrics-per-interaction-type.json")
test_set_results['share'] = round(test_set_results['total_num'] / len_test_set * 100, 2)
test_set_results['f1'] = test_set_results['f1'].round(4)
# Round the accuracy column itself; it was previously overwritten with the F1 values.
test_set_results['accuracy'] = test_set_results['accuracy'].round(4)

random_set_results = pd.read_json("../results/dataset_v4/final/test-set-random-metrics-per-interaction-type.json")
random_set_results['share'] = round(random_set_results['total_num'] / len_random_set * 100, 2)
random_set_results['f1'] = random_set_results['f1'].round(4)
random_set_results['accuracy'] = random_set_results['accuracy'].round(4)


# Outer join keeps interaction types present in only one split; their missing
# metrics are filled with 0. Overlapping column names get the split as suffix.
new_df = test_set_results.set_index("interaction-type").join(random_set_results.set_index("interaction-type"), how="outer", lsuffix=" Test Set", rsuffix=" Random Test Set").fillna(0)
new_df = new_df[['f1 Test Set', 'accuracy Test Set', 'share Test Set', 'f1 Random Test Set', 'accuracy Random Test Set', 'share Random Test Set']]
new_df = new_df.sort_values(by='share Random Test Set', ascending=False)

# The table is split after the first 10 rows into two LaTeX tables.
latex_out = new_df.head(10).style.format(decimal='.', thousands=',', precision=4).to_latex()
latex_out_2 = new_df.tail(-10).style.format(decimal='.', thousands=',', precision=4).to_latex()