# Collect Data
For speed, we will use downsampled data.
You can use any gene FASTA file with this notebook.

In [None]:
!pip install biopython

In [None]:
!pip install pandas

In [None]:
! pip install numpy==1.23.4

In [None]:
! pip install tensorflow

Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp39-cp39-win_amd64.whl (895 kB)
     -------------------------------------- 895.9/895.9 kB 5.1 MB/s eta 0:00:00
Collecting flatbuffers<2,>=1.12
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting keras-preprocessing>=1.1.1
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
     ---------------------------------------- 42.6/42.6 kB ? eta 0:00:00
Collecting gast<=0.4.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1
  Downloading tensorflow_io_gcs_filesystem-0.31.0-cp39-cp39-win_amd64.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 8.6 MB/s eta 0:00:00
Collecting libclang>=13.0.0
  Downloading libclang-18.1.1-py2.py3-none-win_amd64.whl (26.4 MB)
     ---------------------------------------- 26.4/26.4 MB 9.0 MB/s eta 0:00:00
Installing collected packages: libclang, flatbuffers, tensorflow-io-gcs-filesystem, pr

In [None]:
! pip install matplotlib

In [None]:
# Import necessary libraries
from Bio import SeqIO
import pandas as pd
import numpy as np
import tensorflow.compat.v1 as tf
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Disable TensorFlow v2 behavior
tf.compat.v1.disable_v2_behavior()

# Enable plotting in a separate window
%matplotlib qt

# Function to convert FASTA file to DataFrame
def fasta2df(infile):
    """Read a FASTA file into a two-column DataFrame.

    Parameters
    ----------
    infile
        Anything ``SeqIO.parse`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns 'strainName' (the record's
        description) and 'seq' (the sequence as an upper-cased string).
    """
    rows = [
        [entry.description, str(entry.seq).upper()]
        for entry in SeqIO.parse(infile, 'fasta')
    ]
    return pd.DataFrame(rows, columns=['strainName', 'seq'])

# Location of the input FASTA file (swap in any other FASTA file here)
FASTA_PATH = "data/alternative_splicing_human_10541.fasta"

# Parse the FASTA file into a DataFrame
df = fasta2df(FASTA_PATH)

# Preview the first rows
df.head()


In [None]:
# Pull every sequence string out of the 'seq' column into a plain list (the corpus)
corpus = df['seq'].tolist()

# Preview the first 10 sequences
print(corpus[:10])

# Report how many sequences the corpus holds
print(len(corpus))


In [None]:
# Base <-> integer code tables used by encode_sequence / decode_sequence.
# "E" maps to 0, so any "E" at the END of a sequence contributes nothing to the
# encoded integer and cannot be recovered by decode_sequence.
__mapping = {"A": 8, "C": 4, "G": 2, "T": 1, "N": 15, "E": 0}
__rmapping = {8: "A", 4: "C", 2: "G", 1: "T", 15: "N", 0: "E"}

# Radix of the positional encoding: every base occupies one base-256 digit
base_size = 2**8


def encode_sequence(sequence):
    """Encode a DNA sequence as a single integer in base ``base_size``.

    The base at position ``i`` becomes digit ``i`` (least significant first),
    i.e. ``sum(code(base_i) * base_size**i)``.

    Parameters
    ----------
    sequence : iterable of str
        Symbols drawn from the keys of the mapping table (A, C, G, T, N, E).

    Returns
    -------
    int
        The encoded value; 0 for an empty sequence.

    Raises
    ------
    KeyError
        If a symbol is not present in the mapping table.
    """
    val = 0
    for i, base in enumerate(sequence):
        # `2**8**i` would parse as 2**(8**i); the positional weight is base_size**i.
        val += __mapping[base] * base_size**i
    return val


def decode_sequence(val):
    """Decode an integer produced by ``encode_sequence`` back into a sequence.

    Uses exact integer arithmetic (no floating-point logarithm or division), so
    arbitrarily long sequences round-trip without precision loss.

    Parameters
    ----------
    val : int
        Non-negative encoded value.

    Returns
    -------
    str
        The decoded sequence; an empty string for 0. Trailing "E" symbols of
        the original sequence are not recoverable because their code is 0.

    Raises
    ------
    ValueError
        If ``val`` is negative.
    KeyError
        If a base-256 digit of ``val`` has no entry in the reverse mapping.
    """
    if val < 0:
        raise ValueError("val must be a non-negative integer")

    decoded = []
    while val > 0:
        # Peel off the least significant digit, which is the next base in order.
        val, digit = divmod(val, base_size)
        decoded.append(__rmapping[digit])

    return "".join(decoded)


# Downsample: A Larger n Results in a Longer Training Time

In [None]:
# Number of training pairs kept after downsampling
n = 2500

# Remove _

In [None]:
# Function to remove null amino acids from sequences
def remove_null_AA(corpus_dna_new):
    """Strip every null placeholder character ('_') from each sequence.

    Filtering character by character removes all occurrences; repeated
    ``list.remove`` calls only delete the first match on each pass, which
    leaves placeholders behind in sequences that contain many of them.

    Parameters
    ----------
    corpus_dna_new : sequence of iterables of str
        The sequences to clean (typically a list of strings).

    Returns
    -------
    list of str
        One cleaned string per input sequence, in the same order.

    Side effects
    ------------
    Prints the number of sequences received.
    """
    null_AAs = {'_'}
    print(len(corpus_dna_new))
    return [
        "".join(symbol for symbol in text if symbol not in null_AAs)
        for text in corpus_dna_new
    ]


In [None]:
# Function to extract unique amino acids from sequences
def amin(corpus_dna_new):
    """Collect the distinct symbols that occur anywhere in the corpus.

    The result is sorted so that the order (and therefore any index assigned
    from it) is identical on every run; converting a set of strings straight
    to a list yields an order that can change between interpreter sessions.

    Parameters
    ----------
    corpus_dna_new : iterable of iterables of str
        The sequences to scan (typically a list of strings).

    Returns
    -------
    list of str
        The unique symbols in ascending order; empty for an empty corpus.
    """
    symbols = set()
    for text in corpus_dna_new:
        symbols.update(text)
    return sorted(symbols)


In [None]:
# Collect the distinct symbols found in the corpus (called "amino acids"
# throughout this notebook) and display them as the cell output
amino_acids = amin(corpus)
amino_acids


# Data Generation

In [None]:
# Function to convert amino acids to integers and generate training data
def data_out(amino_acids, corpus_dna_new, window_size=2):
    """Build the symbol-to-index mapping and the (input, neighbor) training pairs.

    Parameters
    ----------
    amino_acids : iterable of str
        The vocabulary; each symbol is assigned its position as an integer id.
    corpus_dna_new : iterable of iterables of str
        The sequences to scan (typically a list of strings).
    window_size : int, optional
        How many positions to the left and to the right of each symbol are
        considered its context. Defaults to 2.

    Returns
    -------
    AA2int : dict
        Maps each symbol of ``amino_acids`` to its integer id.
    data : list of [str, str]
        One ``[symbol, neighbor]`` pair for every neighbor inside the window
        whose value differs from the symbol.

    Raises
    ------
    ValueError
        If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    AA2int = {AA: i for i, AA in enumerate(amino_acids)}

    data = []
    for sentence in corpus_dna_new:
        symbols = list(sentence)
        for idx, AA in enumerate(symbols):
            start = max(idx - window_size, 0)
            stop = min(idx + window_size, len(symbols)) + 1
            for neighbor in symbols[start:stop]:
                # The comparison is by value, so it skips the center position
                # and also any neighbor that is the same symbol as the center.
                if neighbor != AA:
                    data.append([AA, neighbor])

    return AA2int, data


In [None]:
# Generate the amino acid to integer mapping and training data
AA2int, data = data_out(amino_acids, corpus)

# Display the AA to integer mapping and only a preview of the training pairs;
# printing the full pair list would flood the notebook output
print(AA2int)
print(data[:10])
print(len(data))


In [None]:
# Function to convert data into a pandas DataFrame
def pandify(data):
    """Wrap the training pairs in a DataFrame with columns 'input' and 'label'."""
    column_names = ['input', 'label']
    return pd.DataFrame(data, columns=column_names)


In [None]:
# Convert the training pairs into a pandas DataFrame.
# NOTE: this rebinds `df`, which previously held the FASTA records, so the
# corpus-extraction cell must not be re-run after this one without reloading.
df = pandify(data)

# Downsample the DataFrame to the first 'n' rows
df_downsampled = df.head(n)

# Display the downsampled DataFrame
df_downsampled


# Define TensorFlow Graph

In [None]:
# Function to convert numbers to one hot vectors
def to_one_hot_encoding(data_point_index, amino_acids):
    """Return a one-hot vector for ``data_point_index``.

    The vector has one slot per entry of ``amino_acids``; the slot at
    ``data_point_index`` is 1 and every other slot is 0.
    """
    vector = np.zeros(len(amino_acids))
    vector[data_point_index] = 1
    return vector


In [None]:
# Function to define the computational graph for training the model
def define_graph(AA2int, amino_acids, df):
    """Build the one-hot training arrays and the TF1 graph that learns the embedding.

    Parameters
    ----------
    AA2int : dict
        Maps each symbol to its integer id (its one-hot position).
    amino_acids : sequence
        The vocabulary; its length is the one-hot dimension.
    df : pandas.DataFrame
        Training pairs with columns 'input' and 'label'.

    Returns
    -------
    tuple
        ``(X_train, Y_train, x, y_label, W1, b1, W2, b2, loss, train_op)`` where
        ``X_train``/``Y_train`` are the one-hot encoded inputs/labels,
        ``x``/``y_label`` are the placeholders they are fed into, ``W1``/``b1``
        and ``W2``/``b2`` are the hidden- and output-layer variables, ``loss``
        is the mean cross entropy and ``train_op`` is one gradient-descent step.
    """
    ONE_HOT_DIM = len(amino_acids)

    # One-hot encode the input symbol and the target (neighbor) symbol of every pair
    X = [to_one_hot_encoding(AA2int[source], amino_acids) for source in df['input']]
    Y = [to_one_hot_encoding(AA2int[target], amino_acids) for target in df['label']]

    # Convert them to numpy arrays
    X_train = np.asarray(X)
    Y_train = np.asarray(Y)

    # Placeholders for X_train and Y_train
    x = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))
    y_label = tf.placeholder(tf.float32, shape=(None, ONE_HOT_DIM))

    # AA embedding will be 3 dimension for 3D visualization
    EMBEDDING_DIM = 3

    # Hidden layer: which represents AA vector eventually
    W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM]))
    b1 = tf.Variable(tf.random_normal([1]))  # Bias
    hidden_layer = tf.add(tf.matmul(x, W1), b1)

    # Output layer (kept as raw logits; the softmax is folded into the loss)
    W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, ONE_HOT_DIM]))
    b2 = tf.Variable(tf.random_normal([1]))
    logits = tf.add(tf.matmul(hidden_layer, W2), b2)

    # Loss function: cross entropy. The fused op computes the same quantity as
    # -sum(y * log(softmax(logits))) but stays finite when a softmax output
    # underflows to 0, where an explicit log() would produce inf/NaN.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_label, logits=logits)
    )

    # Training operation
    train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

    return X_train, Y_train, x, y_label, W1, b1, W2, b2, loss, train_op


In [None]:
# Build the graph from the downsampled pairs and unpack every returned handle
(
    X_train_downsampled,
    Y_train_downsampled,
    x_downsampled,
    y_label_downsampled,
    W1_downsampled,
    b1_downsampled,
    W2_downsampled,
    b2_downsampled,
    loss_downsampled,
    train_op_downsampled,
) = define_graph(AA2int, amino_acids, df_downsampled)

# Report the shapes of the training data arrays
print(f"X_train_downsampled shape: {X_train_downsampled.shape}")
print(f"Y_train_downsampled shape: {Y_train_downsampled.shape}")


# Downsampled

## Train

In [None]:
# Open a session and initialize every variable in the graph
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# The same full batch is fed on every step:
# inputs are the one hot encoded AAs, labels are the one hot encoded neighbor AAs
feed_downsampled = {
    x_downsampled: X_train_downsampled,
    y_label_downsampled: Y_train_downsampled,
}

# Training loop
iteration = 20000
for i in range(iteration):
    sess.run(train_op_downsampled, feed_dict=feed_downsampled)
    # Report the loss every 3000 steps
    if i % 3000 == 0:
        loss_value = sess.run(loss_downsampled, feed_dict=feed_downsampled)
        print(f'Iteration {i}, loss: {loss_value}')


In [None]:
# The hidden layer computes x @ W1 + b1; because x is one-hot, that is just
# one row of W1 plus b1, so W1 + b1 is the AA lookup table (one row per AA)
vectors_downsampled = sess.run(W1_downsampled + b1_downsampled)

# Optionally, print the vectors
# print(vectors_downsampled)


## AA vector in table

In [None]:
# Put the learned vectors in a DataFrame, attach the AA label for each row,
# and move the label to the first column
w2v_df_downsampled = (
    pd.DataFrame(vectors_downsampled, columns=['x1', 'x2', 'x3'])
    .assign(AA=amino_acids)
    .loc[:, ['AA', 'x1', 'x2', 'x3']]
)

# Optionally, display the DataFrame
# w2v_df_downsampled


In [None]:
# Keep only the rows whose amino acid is not the "_" placeholder
w2v_df_downsampled = w2v_df_downsampled[w2v_df_downsampled['AA'] != "_"]

# Display the cleaned DataFrame
w2v_df_downsampled


In [None]:
# Amino acid labels of the cleaned DataFrame as a plain list
AA_lst_downsampled = w2v_df_downsampled['AA'].tolist()


In [None]:
# Color mapping for amino acids, built from (color, residues) groups
color_mp = {
    residue: color
    for color, residues in (
        ('b', 'DE'),          # Blue for acidic amino acids
        ('r', 'RKH'),         # Red for basic amino acids
        ('y', 'NQSTY'),       # Yellow for polar uncharged amino acids
        ('g', 'AVLIPFMWCG'),  # Green for nonpolar amino acids
    )
    for residue in residues
}


In [None]:
# One color code per amino acid label; labels missing from color_mp get 'w' (white)
color_code_downsampled = [color_mp.get(elt, 'w') for elt in AA_lst_downsampled]
AA_lst_downsampled

In [None]:
# Pair every amino acid label with its first two vector components (x1, x2),
# materialized as a list so it can be viewed and iterated more than once
z_data_downsampled = list(
    zip(
        w2v_df_downsampled['AA'],
        w2v_df_downsampled['x1'],
        w2v_df_downsampled['x2'],
    )
)

# Display the zipped data
z_data_downsampled


In [None]:
# Print each amino acid label followed by its x1 and x2 components, one per line
for (AA_downsampled, x1_downsampled, x2_downsampled) in z_data_downsampled:
    print(AA_downsampled, x1_downsampled, x2_downsampled)


### 3D chart

In [None]:
fig = plt.figure(figsize=(10, 10))
# Create the 3D axes with add_subplot: passing keyword arguments to fig.gca()
# is no longer supported on recent Matplotlib versions.
ax = fig.add_subplot(111, projection='3d')

# Iterate the columns positionally, in step with the color and label lists.
# Looking rows up with `series[i]` is label-based and breaks (or misaligns with
# the colors) once a row has been dropped from the DataFrame.
for x1, x2, x3, color, label in zip(
    w2v_df_downsampled['x1'],
    w2v_df_downsampled['x2'],
    w2v_df_downsampled['x3'],
    color_code_downsampled,
    AA_lst_downsampled,
):
    # Each point is drawn with its own amino acid letter as the marker
    sc = ax.scatter(x1, x2, x3, c=color, marker=r"$ {} $".format(label), s=100)

ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('x3')
plt.show()