# UniProt Data Preprocessing

This notebook preprocesses a UniProt TSV file with the columns (Protein families, Binding site, Active site, Sequence). Sequences whose family annotation is missing are filtered out by the code. Missing binding sites are not acceptable for this notebook, so make sure all of your sequences have binding site annotations. If the Active site annotation is missing, the sequence will be included without issue. Missing sequences are not handled by this notebook.

In [1]:
import pandas as pd
import numpy as np

# Read the tab-separated UniProt export into a DataFrame
file_path = 'uniprotkb_family_AND_ft_binding_AND_pro_2023_09_19.tsv'
data = pd.read_csv(file_path, delimiter='\t')

# Preview the leading rows of the loaded table
data.head()

In [2]:
# Number of rows in the table as loaded from the TSV file
data.shape[0]

In [4]:
import pandas as pd

# Re-read the raw TSV so this cell starts from the unfiltered table
file_path = 'uniprotkb_family_AND_ft_binding_AND_pro_2023_09_19.tsv'
data = pd.read_csv(file_path, delimiter='\t')

# Keep only the rows that carry a 'Protein families' annotation
has_family = data['Protein families'].notna()
data = data[has_family]

# Preview the leading rows of the filtered table
data.head()

In [5]:
# Number of rows left after dropping entries without a family annotation
data.shape[0]

In [6]:
# Count how many rows belong to each protein family
family_sizes = data.groupby('Protein families').size()

# Sort by family size (largest first), breaking ties alphabetically by family
# name. The size is attached with `assign`, which operates on a copy, so the
# temporary 'Family size' column is never written into the filtered `data`
# frame (no SettingWithCopyWarning, no leftover helper column) and it is
# dropped again once the sort is done.
data_sorted = (
    data.assign(**{'Family size': data['Protein families'].map(family_sizes)})
    .sort_values(by=['Family size', 'Protein families'], ascending=[False, True])
    .drop(columns='Family size')
)

# Define a function to extract the location from the binding and active site columns
def extract_location(site_info):
    """Extract the location tokens from a site-annotation string.

    The string is split on ';'. A segment contributes a location only when
    its first whitespace-separated token is exactly ``BINDING`` or
    ``ACT_SITE`` and a second token (the location, e.g. ``12..15`` or
    ``57``) follows it. Other segments (e.g. ``/ligand=...`` qualifiers) are
    ignored, even if their text happens to contain one of the keywords.

    Args:
        site_info: Annotation string, or a null value (None / NaN).

    Returns:
        None when ``site_info`` is null; otherwise the locations joined with
        '; ' (an empty string when no segment qualifies).
    """
    if pd.isnull(site_info):
        return None
    locations = []
    for info in site_info.split(';'):
        tokens = info.split()
        # Require the keyword as the leading token and a location after it;
        # this avoids an IndexError on a bare keyword and false hits on
        # qualifier text that merely mentions BINDING / ACT_SITE.
        if len(tokens) >= 2 and tokens[0] in ('BINDING', 'ACT_SITE'):
            locations.append(tokens[1])
    return '; '.join(locations)

# Reduce both annotation columns to their bare location strings
for _site_col in ('Binding site', 'Active site'):
    data_sorted[_site_col] = data_sorted[_site_col].apply(extract_location)

# Preview the leading rows of the transformed table
data_sorted.head()

In [7]:
def _join_site_locations(binding, active):
    """Join the available binding/active site locations with '; '.

    Null or empty parts are skipped, so a missing annotation never ends up
    in the result as the literal text 'None' or 'nan'. Returns None when
    neither part is available.
    """
    parts = [part for part in (binding, active) if pd.notna(part) and part != '']
    return '; '.join(parts) if parts else None


# Create a new column that combines the 'Binding site' and 'Active site'
# columns. Building the values explicitly (instead of astype(str) followed by
# a string replace) keeps missing annotations as real nulls.
data_sorted['Binding-Active site'] = pd.Series(
    [
        _join_site_locations(binding, active)
        for binding, active in zip(data_sorted['Binding site'], data_sorted['Active site'])
    ],
    index=data_sorted.index,
    dtype=object,
)

# Display the first few rows of the updated dataframe
data_sorted.head()

In [8]:
# Flag the rows whose combined site annotation contains '<' or '>'
entries_with_angle_brackets = data_sorted['Binding-Active site'].str.contains('<|>', na=False)

# Count the flagged rows and report the total
num_entries_with_angle_brackets = entries_with_angle_brackets.sum()
print(f"Number of entries with angle brackets: {num_entries_with_angle_brackets}")

# Keep only the rows that were not flagged
data_filtered = data_sorted[~entries_with_angle_brackets]

# Report how many rows survive the filter
num_remaining_rows = len(data_filtered)
print(f"Number of remaining rows: {num_remaining_rows}")

# Report how many different protein families are still represented
num_distinct_families = data_filtered['Protein families'].nunique()
print(f"Number of distinct protein families: {num_distinct_families}")

# Define the target number of rows for the test set (approximately 20% of the data)
target_test_rows = int(0.20 * num_remaining_rows)

# Map every family to the index labels of its rows in one pass, instead of
# re-scanning the whole frame with a boolean mask for each family.
family_to_rows = data_filtered.groupby('Protein families', sort=False).groups

# Get the unique protein families and shuffle them to randomize the selection.
# A fixed seed makes the train/test split reproducible between runs.
SPLIT_SEED = 42
unique_families = data_filtered['Protein families'].unique()
np.random.default_rng(SPLIT_SEED).shuffle(unique_families)

# Whole families are assigned to the test set, so no family is shared between
# the test and train sets.
test_rows = []
current_test_rows = 0

# Add families until the target is reached; the family that crosses the target
# is still included in full, so the test set may slightly exceed 20%.
for family in unique_families:
    family_rows = family_to_rows[family].tolist()
    test_rows.extend(family_rows)
    current_test_rows += len(family_rows)
    if current_test_rows >= target_test_rows:
        break

# Everything not selected for the test set goes to the train set. A vectorized
# membership test replaces the per-row `in list` lookup, which was quadratic.
is_test_row = data_filtered.index.isin(test_rows)
train_rows = data_filtered.index[~is_test_row].tolist()

# Create the test and train datasets as independent copies, so later column
# assignments on them neither warn nor affect `data_filtered`.
test_df = data_filtered.loc[test_rows].copy()
train_df = data_filtered.loc[~is_test_row].copy()

test_df.shape[0], train_df.shape[0]

In [9]:
# Preview the first few rows of the test split to check its structure
test_df.head()

In [10]:
# Preview the first few rows of the train split to check its structure
train_df.head()

In [11]:
# Find rows whose "Binding-Active site" value contains a literal "?".
# regex=False requests a plain substring match, so nothing needs escaping; the
# previous pattern '\?' relied on an invalid escape sequence in a non-raw
# string literal, which newer Python versions warn about.
test_has_question_mark = test_df['Binding-Active site'].str.contains('?', na=False, regex=False)
train_has_question_mark = train_df['Binding-Active site'].str.contains('?', na=False, regex=False)
test_rows_with_question_mark = test_df[test_has_question_mark]
train_rows_with_question_mark = train_df[train_has_question_mark]

# Get the number of such rows in both datasets
num_test_rows_with_question_mark = len(test_rows_with_question_mark)
num_train_rows_with_question_mark = len(train_rows_with_question_mark)

print(f"Number of test rows with question mark: {num_test_rows_with_question_mark}")
print(f"Number of train rows with question mark: {num_train_rows_with_question_mark}")

# Delete the rows containing '?' in the "Binding-Active site" column
test_df = test_df.drop(test_rows_with_question_mark.index)
train_df = train_df.drop(train_rows_with_question_mark.index)

# Check the number of remaining rows in both datasets
remaining_test_rows = test_df.shape[0]
remaining_train_rows = train_df.shape[0]

print(f"Number of remaining test rows: {remaining_test_rows}")
print(f"Number of remaining train rows: {remaining_train_rows}")

import re

def expand_ranges(s):
    """Spell out every ``start..end`` span found in ``str(s)``.

    Each span is replaced by the integers from start to end (inclusive),
    joined with ', '. Text outside the spans is left untouched. The input
    is converted with ``str`` first, so non-string values are accepted.
    """
    def _spell_out(match):
        first = int(match.group(1))
        last = int(match.group(2))
        return ', '.join(str(position) for position in range(first, last + 1))

    return re.sub(r'(\d+)\.\.(\d+)', _spell_out, str(s))

# Rewrite every range in the "Binding-Active site" column of both splits as
# an explicit list of positions
for _split_df in (test_df, train_df):
    _split_df['Binding-Active site'] = _split_df['Binding-Active site'].apply(expand_ranges)

# Optional sanity check of the result:
# print(test_df.head())
# print(train_df.head())

In [12]:
def convert_to_binary_list(binding_active_str, sequence_len):
    """Build a 0/1 mask of length ``sequence_len`` from a site-position string.

    The string is split on ';' and then on ','. Every piece that consists of
    digits (after stripping whitespace) is read as a 1-based position and the
    matching 0-based entry of the mask is set to 1. Positions outside the
    sequence and non-numeric pieces are ignored. A null input yields an
    all-zero mask.
    """
    binary_list = [0] * sequence_len

    # Nothing to mark when the annotation is missing
    if not pd.notna(binding_active_str):
        return binary_list

    for segment in binding_active_str.split(';'):
        for token in segment.split(','):
            if not token.strip().isdigit():
                continue
            # Annotation positions are 1-based; list indices are 0-based
            position = int(token) - 1
            if 0 <= position < sequence_len:
                binary_list[position] = 1

    return binary_list

def _row_to_binary_labels(row):
    """Turn one row's site string into a per-residue 0/1 list sized to its sequence."""
    return convert_to_binary_list(row['Binding-Active site'], len(row['Sequence']))


# Replace the site strings of both splits with their per-residue label lists
test_df['Binding-Active site'] = test_df.apply(_row_to_binary_labels, axis=1)
train_df['Binding-Active site'] = train_df.apply(_row_to_binary_labels, axis=1)


In [13]:
# Preview the test split after the site column was converted to 0/1 lists
test_df.head()

In [14]:
# Preview the train split after the site column was converted to 0/1 lists
train_df.head()

In [15]:
import pickle
import random

def split_into_chunks(sequences, labels, chunk_size=1000):
    """Split sequences and their labels into chunks of at most ``chunk_size``.

    Sequences and labels are paired positionally. A pair whose sequence is
    longer than ``chunk_size`` is cut into consecutive slices of that size
    (the last slice may be shorter), applying the same slice boundaries to
    the label list. Shorter pairs are passed through unchanged.

    Args:
        sequences: Iterable of sliceable sequences (e.g. strings).
        labels: Iterable of sliceable label containers, one per sequence.
        chunk_size: Maximum chunk length; defaults to 1000.

    Returns:
        A ``(new_sequences, new_labels)`` tuple of lists of equal length.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A non-positive step would either raise inside range() or silently drop
    # every sequence, so reject it up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    new_sequences = []
    new_labels = []

    for seq, lbl in zip(sequences, labels):
        if len(seq) > chunk_size:
            # Cut the sequence and its labels at identical offsets
            for start in range(0, len(seq), chunk_size):
                new_sequences.append(seq[start:start + chunk_size])
                new_labels.append(lbl[start:start + chunk_size])
        else:
            new_sequences.append(seq)
            new_labels.append(lbl)

    return new_sequences, new_labels

# Extract the necessary columns to create lists of sequences and labels
test_sequences_by_family = test_df['Sequence'].tolist()
test_labels_by_family = test_df['Binding-Active site'].tolist()
train_sequences_by_family = train_df['Sequence'].tolist()
train_labels_by_family = train_df['Binding-Active site'].tolist()

# Get the number of samples in each dataset
num_test_samples = len(test_sequences_by_family)
num_train_samples = len(train_sequences_by_family)

# Subsample each dataset down to 1/26.66 of its size (about 3.75%).
# Floor division by a float yields a float, which random.sample rejects as a
# sample size, so the counts are converted to int explicitly.
num_test_subsample = int(num_test_samples // 26.66)
num_train_subsample = int(num_train_samples // 26.66)

# A dedicated, seeded generator keeps the subsample reproducible between runs
subsample_rng = random.Random(42)
random_test_indices = subsample_rng.sample(range(num_test_samples), num_test_subsample)
random_train_indices = subsample_rng.sample(range(num_train_samples), num_train_subsample)

# Create smaller datasets using the random indices
test_sequences_small = [test_sequences_by_family[i] for i in random_test_indices]
test_labels_small = [test_labels_by_family[i] for i in random_test_indices]
train_sequences_small = [train_sequences_by_family[i] for i in random_train_indices]
train_labels_small = [train_labels_by_family[i] for i in random_train_indices]

# Apply the function to create new datasets with chunks of size 1000 or less
test_sequences_chunked, test_labels_chunked = split_into_chunks(test_sequences_small, test_labels_small)
train_sequences_chunked, train_labels_chunked = split_into_chunks(train_sequences_small, train_labels_small)

# Paths to save the new chunked pickle files
# (the '600K_data' directory must already exist)
test_labels_chunked_path = '600K_data/test_labels_chunked_by_family.pkl'
test_sequences_chunked_path = '600K_data/test_sequences_chunked_by_family.pkl'
train_labels_chunked_path = '600K_data/train_labels_chunked_by_family.pkl'
train_sequences_chunked_path = '600K_data/train_sequences_chunked_by_family.pkl'

# Save the chunked datasets as new pickle files
with open(test_labels_chunked_path, 'wb') as file:
    pickle.dump(test_labels_chunked, file)
with open(test_sequences_chunked_path, 'wb') as file:
    pickle.dump(test_sequences_chunked, file)
with open(train_labels_chunked_path, 'wb') as file:
    pickle.dump(train_labels_chunked, file)
with open(train_sequences_chunked_path, 'wb') as file:
    pickle.dump(train_sequences_chunked, file)

test_labels_chunked_path, test_sequences_chunked_path, train_labels_chunked_path, train_sequences_chunked_path


In [16]:
# Read each saved pickle back in and record how many entries it holds
with open(test_labels_chunked_path, 'rb') as fh:
    test_labels_chunked = pickle.load(fh)
num_test_labels_chunked = len(test_labels_chunked)

with open(test_sequences_chunked_path, 'rb') as fh:
    test_sequences_chunked = pickle.load(fh)
num_test_sequences_chunked = len(test_sequences_chunked)

with open(train_labels_chunked_path, 'rb') as fh:
    train_labels_chunked = pickle.load(fh)
num_train_labels_chunked = len(train_labels_chunked)

with open(train_sequences_chunked_path, 'rb') as fh:
    train_sequences_chunked = pickle.load(fh)
num_train_sequences_chunked = len(train_sequences_chunked)

# Entry counts: labels and sequences of each split should match pairwise
num_test_labels_chunked, num_test_sequences_chunked, num_train_labels_chunked, num_train_sequences_chunked
