# Further Analysis ⚛
In this notebook, we try to provide some insights, also with the help of PyMOL, into why the RING software is unable to classify certain interactions.



In [29]:
# Mount Google Drive inside the Colab runtime so that the dataset paths
# under /content/drive used further below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
import os
import pandas as pd

In [31]:
# Never truncate the column list when a DataFrame is rendered
pd.options.display.max_columns = None

In [32]:
def load_tsv_as_df(folder, filename, **read_csv_kwargs):
    """Load a tab-separated file from ``folder`` into a DataFrame.

    Parameters
    ----------
    folder : str
        Directory that contains the file.
    filename : str
        File name; the '.tsv' extension is appended when it is missing.
    **read_csv_kwargs
        Optional keyword arguments forwarded to ``pandas.read_csv``.
        Identifier columns whose values look numeric (for example a PDB ID
        such as ``1e29``) can be parsed as floats by pandas' type inference;
        pass e.g. ``dtype={'pdb_id': str}`` to keep them as text.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    if not filename.endswith('.tsv'):
        filename += '.tsv'
    path = os.path.join(folder, filename)
    # Tab is the default separator, but an explicit caller choice wins.
    read_csv_kwargs.setdefault('sep', '\t')
    return pd.read_csv(path, **read_csv_kwargs)

In [33]:
def save_df_as_tsv(df, folder, filename):
    """Write ``df`` to ``folder`` as a tab-separated file without the index.

    The folder is created when it does not exist, '.tsv' is appended to
    ``filename`` when missing, and the destination path is printed.
    """
    os.makedirs(folder, exist_ok=True)
    tsv_name = filename if filename.endswith('.tsv') else filename + '.tsv'
    path = os.path.join(folder, tsv_name)
    df.to_csv(path, sep='\t', index=False)
    print(f"Saved TSV to: {path}")

# Original data

In [34]:
# Root folder of the course material on the mounted Google Drive
folder_path = '/content/drive/MyDrive/Corsi del Semestre/STRUCTURAL BIOINFORMATICS'

# Folder that holds the project's TSV datasets
tsv_path = f'{folder_path}/Structural Bioinfo PROJECT/datasets'

# Read the original combined dataset
combined_df = load_tsv_as_df(folder=tsv_path, filename='combinated_df_origin.tsv')

In [35]:
# Label unclassified contacts explicitly, then store the column as categorical
combined_df['Interaction'] = (
    combined_df['Interaction']
    .fillna('MISSING')
    .astype('category')
)
list(combined_df['Interaction'].cat.categories)

['HBOND',
 'IONIC',
 'MISSING',
 'PICATION',
 'PIHBOND',
 'PIPISTACK',
 'SSBOND',
 'VDW']

Creation of a DataFrame containing all the missing interactions.

In [36]:
# Keep only the contacts labelled as MISSING
missing_interactions_df = combined_df.loc[combined_df['Interaction'].eq('MISSING')]

In [37]:
# Report the size of the subset and preview its first rows
print(f"Shape of the dataset: {missing_interactions_df.shape}")
missing_interactions_df.head()

Shape of the dataset: (1089547, 33)


Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,s_3di_state,s_3di_letter,t_ch,t_resi,t_ins,t_resn,t_ss8,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,t_3di_state,t_3di_letter,Interaction,same_chain
6,1b0y,A,8,,A,P,0.368,-1.494,2.583,-0.591,-1.302,-0.733,1.57,-0.146,2.0,C,A,14,,A,H,0.009,-0.956,-0.846,-0.591,-1.302,-0.733,1.57,-0.146,11.0,L,MISSING,1
9,1b0y,A,49,,M,E,0.197,-1.156,2.443,-0.663,-1.524,2.219,-1.005,1.212,18.0,S,A,62,,G,E,0.012,-1.348,2.896,-0.384,1.652,1.33,1.045,2.064,2.0,C,MISSING,1
18,1b0y,A,17,,L,H,0.189,-1.578,-0.017,-1.019,-0.987,-1.505,1.266,-0.912,4.0,E,A,79,,S,T,0.254,-1.639,0.06,-0.228,1.399,-4.76,0.67,-2.647,12.0,M,MISSING,1
19,1b0y,A,36,,L,S,0.25,-2.841,2.446,-1.019,-0.987,-1.505,1.266,-0.912,2.0,C,A,40,,E,G,0.546,-1.86,-0.062,1.357,-1.453,1.477,0.113,-0.837,12.0,M,MISSING,1
22,1b0y,A,48,,F,T,0.259,-1.496,-0.211,-1.006,-0.59,1.891,-0.397,0.412,13.0,N,A,63,,C,E,0.096,-2.476,2.319,-1.343,0.465,-0.862,-1.02,-0.255,5.0,F,MISSING,1


## Missing values per feature
The number of missing values represents only a small fraction of the unclassified interactions, explaining only a limited part of the problem.

In [38]:
# Size of the MISSING subset
total_missing_interactions = len(missing_interactions_df)

# Rows of the subset that carry a NaN in at least one column
rows_with_nan = missing_interactions_df.isna().any(axis=1).sum()

# Share of the subset affected, expressed as a percentage
percent_interactions_with_nan = (rows_with_nan / total_missing_interactions) * 100

print(f"Total number of missing interactions: {total_missing_interactions}")
print(f"Number of missing interactions with at least one NaN: {rows_with_nan}")
print(f"Percentage of missing interactions with at least one NaN: {percent_interactions_with_nan:.2f}%")

Total number of missing interactions: 1089547
Number of missing interactions with at least one NaN: 27555
Percentage of missing interactions with at least one NaN: 2.53%


### Missing interactions subset

In [39]:
# NaN count per column inside the MISSING subset; show only affected columns
missing_counts_undefined = missing_interactions_df.isna().sum()
print(missing_counts_undefined.loc[missing_counts_undefined.gt(0)])

s_ss8              31
s_rsa              63
s_phi            6856
s_psi            2252
s_3di_state     13005
s_3di_letter    13005
t_ss8              56
t_rsa              66
t_phi            2951
t_psi            7370
t_3di_state     15292
t_3di_letter    15292
dtype: int64


### On the entire dataset

In [40]:
# NaN count per column over the whole dataset; show only affected columns
missing_counts = combined_df.isna().sum()
print(missing_counts.loc[missing_counts.gt(0)])

s_ss8              31
s_rsa              63
s_phi           17807
s_psi            6736
s_3di_state     37025
s_3di_letter    37025
t_ss8              56
t_rsa              75
t_phi            6167
t_psi           21474
t_3di_state     44036
t_3di_letter    44036
dtype: int64


### Comparison
For each feature, the percentage of its NaN values in the whole dataset that fall within the missing (unclassified) interactions is reported below.

In [41]:
# Per-column NaN counts: MISSING subset vs. whole dataset
missing_counts_undefined = missing_interactions_df.isna().sum()
missing_counts = combined_df.isna().sum()

# Share (in %) of each column's NaNs that belongs to MISSING interactions
percent_nan_from_total = missing_counts_undefined.div(missing_counts).mul(100)

# Columns without any NaN are left out; highest share first
print(percent_nan_from_total.loc[missing_counts.gt(0)].sort_values(ascending=False))

s_ss8           100.000000
s_rsa           100.000000
t_ss8           100.000000
t_rsa            88.000000
t_phi            47.851467
s_phi            38.501713
s_3di_state      35.124916
s_3di_letter     35.124916
t_3di_letter     34.726133
t_3di_state      34.726133
t_psi            34.320574
s_psi            33.432304
dtype: float64


## Missing interactions with more than one missing feature
There are also cases where missing interactions have more than one missing feature.

In [42]:
# Number of NaN cells in each row of the MISSING subset
row_nan_counts = missing_interactions_df.isna().sum(axis=1)
# Rows in which two or more features are NaN
more_than_one_nan = missing_interactions_df.loc[row_nan_counts.gt(1)]
print(f"Total number of missing interactions: {len(missing_interactions_df)}")
print(f"Number of missing interactions with more than one NaN: {len(more_than_one_nan)}")
print(f"Percentage of missing interactions with more than one NaN: {(len(more_than_one_nan) / len(missing_interactions_df)) * 100:.2f}%")

Total number of missing interactions: 1089547
Number of missing interactions with more than one NaN: 25896
Percentage of missing interactions with more than one NaN: 2.38%


# Integrated data

In [43]:
# Read the integrated dataset; from here on `combined_df` refers to it
combined_df = load_tsv_as_df(folder=tsv_path, filename='combinated_df_new.tsv')

In [44]:
# Label unclassified contacts explicitly, then store the column as categorical
combined_df['Interaction'] = (
    combined_df['Interaction']
    .fillna('MISSING')
    .astype('category')
)
list(combined_df['Interaction'].cat.categories)

['HBOND',
 'IONIC',
 'MISSING',
 'PICATION',
 'PIHBOND',
 'PIPISTACK',
 'SSBOND',
 'VDW']

Creation of a DataFrame containing all the missing interactions.

In [45]:
# Keep only the contacts labelled as MISSING
missing_interactions_df = combined_df.loc[combined_df['Interaction'].eq('MISSING')]

In [46]:
# Report the size of the subset and preview its first rows
print(f"Shape of the dataset: {missing_interactions_df.shape}")
missing_interactions_df.head()

Shape of the dataset: (1089547, 44)


Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,s_3di_state,s_3di_letter,t_ch,t_resi,t_ins,t_resn,t_ss8,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,t_3di_state,t_3di_letter,Interaction,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y
6,1b0y,A,8,,A,P,0.368,-1.494,2.583,-0.591,-1.302,-0.733,1.57,-0.146,2.0,C,A,14,,A,H,0.009,-0.956,-0.846,-0.591,-1.302,-0.733,1.57,-0.146,11.0,L,MISSING,1,0.359,0.0,0.0,0.0,0.0,0.0,7.132915,0.494826,-0.420486,1.776884,-1.303724
9,1b0y,A,49,,M,E,0.197,-1.156,2.443,-0.663,-1.524,2.219,-1.005,1.212,18.0,S,A,62,,G,E,0.012,-1.348,2.896,-0.384,1.652,1.33,1.045,2.064,2.0,C,MISSING,1,0.185,0.279,3.176,0.889,2.05,0.852,5.700675,-2.303026,0.38133,0.494826,-0.420486
18,1b0y,A,17,,L,H,0.189,-1.578,-0.017,-1.019,-0.987,-1.505,1.266,-0.912,4.0,E,A,79,,S,T,0.254,-1.639,0.06,-0.228,1.399,-4.76,0.67,-2.647,12.0,M,MISSING,1,0.065,0.791,2.386,3.255,0.596,1.735,6.6636,-1.662143,-0.425868,0.690114,-1.255422
19,1b0y,A,36,,L,S,0.25,-2.841,2.446,-1.019,-0.987,-1.505,1.266,-0.912,2.0,C,A,40,,E,G,0.546,-1.86,-0.062,1.357,-1.453,1.477,0.113,-0.837,12.0,M,MISSING,1,0.296,2.376,0.466,2.982,1.153,0.075,6.793983,0.494826,-0.420486,0.690114,-1.255422
22,1b0y,A,48,,F,T,0.259,-1.496,-0.211,-1.006,-0.59,1.891,-0.397,0.412,13.0,N,A,63,,C,E,0.096,-2.476,2.319,-1.343,0.465,-0.862,-1.02,-0.255,5.0,F,MISSING,1,0.163,0.337,1.055,2.753,0.623,0.667,5.470046,-1.106118,-1.339661,2.139425,0.048612


## Missing values per feature
The number of missing values represents only a small fraction of the unclassified interactions, explaining only a limited part of the problem.

In [47]:
# Size of the MISSING subset
total_missing_interactions = len(missing_interactions_df)

# Rows of the subset that carry a NaN in at least one column
rows_with_nan = missing_interactions_df.isna().any(axis=1).sum()

# Share of the subset affected, expressed as a percentage
percent_interactions_with_nan = (rows_with_nan / total_missing_interactions) * 100

print(f"Total number of missing interactions: {total_missing_interactions}")
print(f"Number of missing interactions with at least one NaN: {rows_with_nan}")
print(f"Percentage of missing interactions with at least one NaN: {percent_interactions_with_nan:.2f}%")

Total number of missing interactions: 1089547
Number of missing interactions with at least one NaN: 51546
Percentage of missing interactions with at least one NaN: 4.73%


### Missing interactions subset

In [48]:
# NaN count per column inside the MISSING subset; show only affected columns
missing_counts_undefined = missing_interactions_df.isna().sum()
print(missing_counts_undefined.loc[missing_counts_undefined.gt(0)])

s_ss8              31
s_rsa              63
s_phi            6856
s_psi            2252
s_3di_state     13005
s_3di_letter    13005
t_ss8              56
t_rsa              66
t_phi            2951
t_psi            7370
t_3di_state     15292
t_3di_letter    15292
delta_rsa         129
ca_distance     24338
s_centroid_x    13005
s_centroid_y    13005
t_centroid_x    15292
t_centroid_y    15292
dtype: int64


### On the entire dataset

In [49]:
# NaN count per column over the whole dataset; show only affected columns
missing_counts = combined_df.isna().sum()
print(missing_counts.loc[missing_counts.gt(0)])

s_ss8              31
s_rsa              63
s_phi           17807
s_psi            6736
s_3di_state     37025
s_3di_letter    37025
t_ss8              56
t_rsa              75
t_phi            6167
t_psi           21474
t_3di_state     44036
t_3di_letter    44036
delta_rsa         138
ca_distance     50115
s_centroid_x    37025
s_centroid_y    37025
t_centroid_x    44036
t_centroid_y    44036
dtype: int64


### Comparison
For each feature, the percentage of its NaN values in the whole dataset that fall within the missing (unclassified) interactions is reported below.

In [50]:
# Per-column NaN counts: MISSING subset vs. whole dataset
missing_counts_undefined = missing_interactions_df.isna().sum()
missing_counts = combined_df.isna().sum()

# Share (in %) of each column's NaNs that belongs to MISSING interactions
percent_nan_from_total = missing_counts_undefined.div(missing_counts).mul(100)

# Columns without any NaN are left out; highest share first
print(percent_nan_from_total.loc[missing_counts.gt(0)].sort_values(ascending=False))

s_ss8           100.000000
s_rsa           100.000000
t_ss8           100.000000
delta_rsa        93.478261
t_rsa            88.000000
ca_distance      48.564302
t_phi            47.851467
s_phi            38.501713
s_3di_state      35.124916
s_3di_letter     35.124916
s_centroid_y     35.124916
s_centroid_x     35.124916
t_3di_state      34.726133
t_3di_letter     34.726133
t_centroid_x     34.726133
t_centroid_y     34.726133
t_psi            34.320574
s_psi            33.432304
dtype: float64


## Missing interactions with more than one missing feature
There are also cases where missing interactions have more than one missing feature.

In [51]:
# Number of NaN cells in each row of the MISSING subset
row_nan_counts = missing_interactions_df.isna().sum(axis=1)
# Rows in which two or more features are NaN
more_than_one_nan = missing_interactions_df.loc[row_nan_counts.gt(1)]
print(f"Total number of missing interactions: {len(missing_interactions_df)}")
print(f"Number of missing interactions with more than one NaN: {len(more_than_one_nan)}")
print(f"Percentage of missing interactions with more than one NaN: {(len(more_than_one_nan) / len(missing_interactions_df)) * 100:.2f}%")

Total number of missing interactions: 1089547
Number of missing interactions with more than one NaN: 25913
Percentage of missing interactions with more than one NaN: 2.38%


# Comments
For both datasets, it is observed that:

- The features **s_ss8**, **s_rsa** and **t_ss8** exhibit 100% of their NaN values within unclassified interactions;

- The feature **t_rsa** shows a similar trend, with approximately 88% of its missing values found in unclassified interactions;

- Other features, such as **s_phi**, **t_phi** and **3Di_state**, have missing values distributed across both classified and unclassified interactions. However, a substantial proportion (30–50%) of their NaNs are still associated with MISSING interactions, suggesting a partial link between these features and unclassified cases.

It is very likely that the lack of certain *structural/chemical* features (especially rsa and ss8) makes it difficult for RING to classify some interactions. In contrast, the lack of features related to *distance/3D space* seems to have a lesser impact.



However, it is important to note that the presence of NaN values in the features is not sufficient to explain the occurrence of missing interactions. In particular, most of the missing interactions do not contain any missing feature values.

# Experiments

## Missing values in some features
In most of the experiments of this kind, visualization of the sequences and corresponding structures in PyMOL revealed the following noteworthy aspects:

- **Unresolved regions** (red dots): residues present in the theoretical sequence (i.e., listed in the PDB file) but not observed in the 3D structure;
- **Gaps** (black holes): missing positions absent both in the sequence and in the 3D model.

This may suggest the presence of disordered or flexible regions within the protein structure.

In [52]:
# MISSING interactions whose source residue has no SS8 annotation.
# In PyMOL, the sequence of these entries shows red and black points
# (i.e. unmapped positions and gaps).
missing_s_ss8 = missing_interactions_df.loc[missing_interactions_df['s_ss8'].isna()]
missing_s_ss8.head()

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,s_3di_state,s_3di_letter,t_ch,t_resi,t_ins,t_resn,t_ss8,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,t_3di_state,t_3di_letter,Interaction,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y
121594,2xj3,B,1,,M,,,,,-0.663,-1.524,2.219,-1.005,1.212,,,B,63,,W,-,0.181,-1.772,2.438,-0.595,0.009,0.672,-2.128,-0.184,2.0,C,MISSING,1,,0.068,1.533,1.547,1.123,1.396,5.917708,,,0.494826,-0.420486
369205,4xpz,A,372,,P,,,,,0.189,2.081,-1.628,0.421,-1.392,,,A,378,,H,-,0.56,-2.505,2.62,0.336,-0.417,-1.673,-1.474,-0.078,2.0,C,MISSING,1,,0.147,2.498,0.045,1.895,1.314,8.322514,,,0.494826,-0.420486
369267,4xpz,A,372,,P,,,,,0.189,2.081,-1.628,0.421,-1.392,,,A,375,,T,-,0.232,-1.566,2.332,-0.032,0.326,2.213,0.908,1.313,5.0,F,MISSING,1,,0.221,1.755,3.841,0.487,2.705,5.401435,,,2.139425,0.048612
510411,6gaj,A,158,,I,,,-1.521,-1.227,-1.239,-0.547,2.131,0.393,0.816,,,C,157,,L,H,0.707,-1.421,-0.286,-1.019,-0.987,-1.505,1.266,-0.912,,,MISSING,0,,0.22,0.44,3.636,0.873,1.728,7.014524,,,,
529502,6l4v,A,180,,H,,,,,0.336,-0.417,-1.673,-1.474,-0.078,,,A,450,,Q,H,0.298,-1.083,-0.683,0.931,-0.179,-3.005,-0.503,-1.853,12.0,M,MISSING,1,,0.595,0.238,1.332,0.971,1.775,6.488179,,,0.690114,-1.255422


In [53]:
# MISSING interactions whose target residue has no phi angle.
# Here too, PyMOL shows many unmapped positions.
missing_t_phi = missing_interactions_df.loc[missing_interactions_df['t_phi'].isna()]
missing_t_phi.head()

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,s_3di_state,s_3di_letter,t_ch,t_resi,t_ins,t_resn,t_ss8,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,t_3di_state,t_3di_letter,Interaction,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y
3054,1ekq,B,195,,G,T,0.048,1.589,0.13,-0.384,1.652,1.33,1.045,2.064,12.0,M,B,199,,L,H,0.0,,-0.673,-1.019,-0.987,-1.505,1.266,-0.912,,,MISSING,1,0.048,0.635,2.639,2.835,0.221,2.976,5.745141,0.690114,-1.255422,,
3408,1ekq,A,31,,F,H,0.076,-1.11,-0.731,-1.006,-0.59,1.891,-0.397,0.412,17.0,R,A,199,,L,H,0.0,,-0.648,-1.019,-0.987,-1.505,1.266,-0.912,,,MISSING,1,0.076,0.013,0.397,3.396,1.663,1.324,7.255955,0.778631,-2.165999,,
3419,1ekq,A,195,,G,T,0.024,1.618,-0.077,-0.384,1.652,1.33,1.045,2.064,11.0,L,A,199,,L,H,0.0,,-0.648,-1.019,-0.987,-1.505,1.266,-0.912,,,MISSING,1,0.024,0.635,2.639,2.835,0.221,2.976,5.688924,1.776884,-1.303724,,
4843,1fg7,A,234,,N,H,0.618,-1.108,-0.534,0.945,0.828,1.299,-0.169,0.933,17.0,R,A,238,,K,H,0.849,,-0.333,1.831,-0.561,0.533,-0.277,1.648,,,MISSING,1,0.231,0.886,1.389,0.766,0.108,0.715,6.55437,0.778631,-2.165999,,
4861,1fg7,A,262,,I,H,0.089,-1.176,-0.7,-1.239,-0.547,2.131,0.393,0.816,17.0,R,A,266,,R,H,0.605,,-0.552,1.538,-0.055,1.502,0.44,2.897,,,MISSING,1,0.516,2.777,0.492,0.629,0.047,2.081,6.197097,0.778631,-2.165999,,


In [54]:
# MISSING interactions without a C-alpha distance.
# Authors' observation: this happens when the pdb_id is not valid or when the
# structure could not be loaded with Biopython; a '-' in the SS8 columns marks
# a disordered region.
# NOTE(review): the displayed rows include pdb_id `1e+29`, which looks like the
# PDB ID 1e29 coerced to a float somewhere upstream — TODO confirm.
missing_ca_distance = missing_interactions_df.loc[missing_interactions_df['ca_distance'].isna()]
missing_ca_distance

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,s_3di_state,s_3di_letter,t_ch,t_resi,t_ins,t_resn,t_ss8,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,t_3di_state,t_3di_letter,Interaction,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y
2469,1e+29,A,5,,E,T,0.619,-1.075,-0.694,1.357,-1.453,1.477,0.113,-0.837,13.0,N,A,9,,T,E,0.246,-1.834,2.135,-0.032,0.326,2.213,0.908,1.313,18.0,S,MISSING,1,0.373,1.389,1.779,0.736,0.795,2.150,,-1.106118,-1.339661,-2.303026,0.381330
2477,1e+29,A,63,,E,S,0.613,-2.091,1.977,1.357,-1.453,1.477,0.113,-0.837,17.0,R,A,83,,D,T,0.589,-1.387,-0.018,1.050,0.302,-3.656,-0.259,-3.242,12.0,M,MISSING,1,0.024,0.307,1.755,5.133,0.372,2.405,,0.778631,-2.165999,0.690114,-1.255422
2479,1e+29,A,65,,R,-,0.444,-1.143,2.526,1.538,-0.055,1.502,0.440,2.897,14.0,O,A,71,,A,H,0.189,-1.165,-0.617,-0.591,-1.302,-0.733,1.570,-0.146,15.0,P,MISSING,1,0.255,2.129,1.247,2.235,1.130,3.043,,2.149514,-0.802992,2.305979,-1.498816
2482,1e+29,A,57,,A,H,0.792,-1.119,-0.682,-0.591,-1.302,-0.733,1.570,-0.146,17.0,R,A,61,,G,T,0.524,-1.601,-0.049,-0.384,1.652,1.330,1.045,2.064,13.0,N,MISSING,1,0.268,0.207,2.954,2.063,0.525,2.210,,0.778631,-2.165999,-1.106118,-1.339661
2483,1e+29,A,73,,V,H,0.035,-1.020,-0.800,-1.337,-0.279,-0.544,1.242,-1.262,9.0,J,A,76,,L,H,0.159,-1.367,-0.180,-1.019,-0.987,-1.505,1.266,-0.912,13.0,N,MISSING,1,0.124,0.318,0.708,0.961,0.024,0.350,,-1.140001,-2.006822,-1.106118,-1.339661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2937590,8s77,B,391,,G,T,0.476,1.863,-0.451,-0.384,1.652,1.330,1.045,2.064,12.0,M,B,395,,A,H,0.594,-1.222,-0.732,-0.591,-1.302,-0.733,1.570,-0.146,17.0,R,MISSING,1,0.118,0.207,2.954,2.063,0.525,2.210,,0.690114,-1.255422,0.778631,-2.165999
2937593,8s77,A,238,,S,E,0.008,-1.363,2.155,-0.228,1.399,-4.760,0.670,-2.647,2.0,C,A,253,,V,E,0.007,-1.948,2.643,-1.337,-0.279,-0.544,1.242,-1.262,16.0,Q,MISSING,1,0.001,1.109,1.678,4.216,0.572,1.385,,0.494826,-0.420486,2.552175,0.604622
2937595,8s77,A,425,,A,H,0.415,-1.124,-0.510,-0.591,-1.302,-0.733,1.570,-0.146,1.0,B,A,430,,V,T,0.465,-0.942,-0.702,-1.337,-0.279,-0.544,1.242,-1.262,15.0,P,MISSING,1,0.050,0.746,1.023,0.189,0.328,1.116,,-0.135632,-1.891373,2.305979,-1.498816
2937596,8s77,B,330,,V,E,0.035,-1.374,2.323,-1.337,-0.279,-0.544,1.242,-1.262,0.0,A,B,354,,A,E,0.000,-2.880,2.768,-0.591,-1.302,-0.733,1.570,-0.146,3.0,D,MISSING,1,0.035,0.746,1.023,0.189,0.328,1.116,,-1.072910,-0.359984,-0.987449,0.812764


In [55]:
# MISSING interactions whose target residue has no 3Di state.
# PyMOL shows disordered regions and many unmapped points for these entries.
missing_t_3di_state = missing_interactions_df.loc[missing_interactions_df['t_3di_state'].isna()]
missing_t_3di_state.head()

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,s_3di_state,s_3di_letter,t_ch,t_resi,t_ins,t_resn,t_ss8,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,t_3di_state,t_3di_letter,Interaction,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y
361,1bs9,A,200,,A,H,0.547,-1.022,-0.748,-0.591,-1.302,-0.733,1.57,-0.146,17.0,R,A,203,,K,H,0.532,-1.008,-0.698,1.831,-0.561,0.533,-0.277,1.648,,,MISSING,1,0.015,2.422,0.741,1.266,1.847,1.794,5.075436,0.778631,-2.165999,,
840,1byi,A,26,,A,H,0.0,-1.159,-0.704,-0.591,-1.302,-0.733,1.57,-0.146,1.0,B,A,221,,L,G,0.189,-1.137,-0.338,-1.019,-0.987,-1.505,1.266,-0.912,,,MISSING,1,0.189,0.428,0.315,0.772,0.304,0.766,7.178825,-0.135632,-1.891373,,
1514,1c0p,A,1357,,Q,H,0.626,-1.048,-0.79,0.931,-0.179,-3.005,-0.503,-1.853,17.0,R,A,1361,,G,-,1.0,1.498,,-0.384,1.652,1.33,1.045,2.064,,,MISSING,1,0.374,1.315,1.831,4.335,1.548,3.917,6.072919,0.778631,-2.165999,,
1590,1c0p,A,1356,,F,H,0.091,-1.094,-0.751,-1.006,-0.59,1.891,-0.397,0.412,1.0,B,A,1361,,G,-,1.0,1.498,,-0.384,1.652,1.33,1.045,2.064,,,MISSING,1,0.909,0.622,2.242,0.561,1.442,1.652,6.307737,-0.135632,-1.891373,,
2149,1dj0,A,119,,R,E,0.06,-2.0,2.431,1.538,-0.055,1.502,0.44,2.897,8.0,I,A,270,,D,-,0.785,-1.096,,1.05,0.302,-3.656,-0.259,-3.242,,,MISSING,1,0.725,0.488,0.357,5.158,0.699,6.139,11.019987,-2.881375,0.995632,,


## Absence of missing values

Here we examine the missing interactions that do not have any missing values within the features (the majority). Nevertheless, the use of PyMOL was not sufficient for us to reach definitive conclusions.

In [56]:
# MISSING interactions for which every feature is available (no NaN in the row)
missing_complete = missing_interactions_df.dropna(axis=0, how='any')
missing_complete.head()

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,s_3di_state,s_3di_letter,t_ch,t_resi,t_ins,t_resn,t_ss8,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,t_3di_state,t_3di_letter,Interaction,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y
6,1b0y,A,8,,A,P,0.368,-1.494,2.583,-0.591,-1.302,-0.733,1.57,-0.146,2.0,C,A,14,,A,H,0.009,-0.956,-0.846,-0.591,-1.302,-0.733,1.57,-0.146,11.0,L,MISSING,1,0.359,0.0,0.0,0.0,0.0,0.0,7.132915,0.494826,-0.420486,1.776884,-1.303724
9,1b0y,A,49,,M,E,0.197,-1.156,2.443,-0.663,-1.524,2.219,-1.005,1.212,18.0,S,A,62,,G,E,0.012,-1.348,2.896,-0.384,1.652,1.33,1.045,2.064,2.0,C,MISSING,1,0.185,0.279,3.176,0.889,2.05,0.852,5.700675,-2.303026,0.38133,0.494826,-0.420486
18,1b0y,A,17,,L,H,0.189,-1.578,-0.017,-1.019,-0.987,-1.505,1.266,-0.912,4.0,E,A,79,,S,T,0.254,-1.639,0.06,-0.228,1.399,-4.76,0.67,-2.647,12.0,M,MISSING,1,0.065,0.791,2.386,3.255,0.596,1.735,6.6636,-1.662143,-0.425868,0.690114,-1.255422
19,1b0y,A,36,,L,S,0.25,-2.841,2.446,-1.019,-0.987,-1.505,1.266,-0.912,2.0,C,A,40,,E,G,0.546,-1.86,-0.062,1.357,-1.453,1.477,0.113,-0.837,12.0,M,MISSING,1,0.296,2.376,0.466,2.982,1.153,0.075,6.793983,0.494826,-0.420486,0.690114,-1.255422
22,1b0y,A,48,,F,T,0.259,-1.496,-0.211,-1.006,-0.59,1.891,-0.397,0.412,13.0,N,A,63,,C,E,0.096,-2.476,2.319,-1.343,0.465,-0.862,-1.02,-0.255,5.0,F,MISSING,1,0.163,0.337,1.055,2.753,0.623,0.667,5.470046,-1.106118,-1.339661,2.139425,0.048612


# Possible suggestion
It is evident that no single, definitive explanation exists. Multiple factors may lead RING to leave some interactions unclassified (some clearer than others):

- The type of encoded features, since the absence of certain information may negatively affect the outcome to varying degrees;

- The presence of missing residues, unresolved regions, and structural disorder, as highlighted by PyMOL analyses;

- Inaccuracies in the software’s handling of long-distance interactions or limitations due to the geometrical constraints on which it is based.


Given the large number of interactions, it is also challenging, using only visualization tools, to identify a rule robust enough to generalize across all the data.




