this first part is just binary encoding

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib as plt
from tqdm import tqdm
import math

for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

print("Loading datasets...  ", end = "")
gdf = pd.read_csv("/kaggle/input/rice-genotype/dataset/genotype_data/12k_ld_imputed.csv")
tdf = pd.read_csv("/kaggle/input/rice-genotype/dataset/trait_data/quantitative_traits.csv")
gdf = gdf.rename(columns = {gdf.columns[0]: "ID"}).set_index("ID")
tdf = tdf.rename(columns = {tdf.columns[0]: "ID"}).set_index("ID").fillna(method = "ffill")
gdf = gdf[gdf.index.isin(tdf.index.values)]
print("Done")

enc_map = {0.0: "00", -1.0: "01", 1.0: "10"}
loc_map = list(gdf.columns)
enc_op = np.array(list(enc_map.keys()))

print("Cleaning up...  ", end = "")
cround = np.vectorize(lambda val: enc_op[np.abs(enc_op - val).argmin()] if val not in enc_op else val)
gdf = pd.DataFrame(cround(gdf.values), columns = gdf.columns, index = gdf.index)
print("Done")

/kaggle/input/rice-genotype/dataset/genotype_data/12k_ld_imputed.csv
/kaggle/input/rice-genotype/dataset/genotype_data/12k_unimputed.csv
/kaggle/input/rice-genotype/dataset/trait_data/readme.txt
/kaggle/input/rice-genotype/dataset/trait_data/quantitative_traits.csv
Loading datasets...  Done
Cleaning up...  Done


In [2]:
trait_map = {}

print("COLUMN NAME\tMIN\tMAX\tRANGE\tMEAN\n---------------------------------------------")
for column in tdf.columns:
    print(f"{column}     \t{tdf[column].min()}\t{tdf[column].max()}\t{round(tdf[column].max() - tdf[column].min())}\t{round(tdf[column].mean())}")
    
    max_val = math.ceil(tdf[column].max())
    #req_dec = not all(np.mod(x, 1) == 0 for x in tdf[column])
    req_dec = max_val < 15
    if (req_dec): max_val *= 10
    bit_length = round(math.log([n for n in [2**n for n in range(1, 100)] if n > max_val][0], 2))
    trait_map.update({column: {"bits": bit_length, "dec": req_dec}})

COLUMN NAME	MIN	MAX	RANGE	MEAN
---------------------------------------------
CUDI_REPRO     	2.0	9.1	7	5
CULT_REPRO     	27.0	204.0	177	114
CUNO_REPRO     	5.0	40.0	35	16
GRLT     	4.7	12.7	8	9
GRWD     	1.5	4.4	3	3
GRWT100     	1.0	5.0	4	2
HDG_80HEAD     	50.0	184.0	134	102
LIGLT     	4.0	47.0	43	18
LLT     	13.0	93.0	80	55
LWD     	0.4	2.5	2	1
PLT_POST     	13.0	37.0	24	25
SDHT     	12.0	74.0	62	39


In [3]:
def encode_gene(row):
    string = ""
    for value in row:
        string += enc_map[float(value)]
    return string

def decode_gene(b_string):
    gene_map = {}
    for i in range(0, len(string), 2):
        target_encoded = string[i:i + 2]
        gene = list(enc_map.keys())[list(enc_map.values()).index(target_encoded)]
        gene_map.update({loc_map[round(i/2)]: gene})
    return gene_map

def encode_trait(row):
    string = ""
    for column in trait_map.keys():
        bits = trait_map[column]["bits"]
        dec = trait_map[column]["dec"]
        val = round(row[column]*(10 if dec else 1))
        string += "{0:b}".format(val).zfill(bits)
    return string

def decode_trait(b_string):
    dec_map = {}
    p = 0
    for column in trait_map.keys():
        bits = trait_map[column]["bits"]
        dec = trait_map[column]["dec"]
        b_seq = b_string[p:p + bits]
        trait = int(b_seq, 2)/(10 if dec else 1)
        dec_map.update({column: trait})
        p += bits
    return dec_map

In [4]:
master = pd.DataFrame(columns = ["GENE_STRING", "TRAIT_STRING"])

print("Generating master...  ")
for index, row in tqdm(gdf.iterrows(), total = gdf.shape[0]):
    master.loc[index] = [encode_gene(row), encode_trait(tdf.loc[index])]
print("Done")

print(master.head())

Generating master...  


100%|██████████| 2266/2266 [00:15<00:00, 143.52it/s]

Done
                                                     GENE_STRING  \
IRIS_313.7684  1000011010100100100000100010101010010100100100...   
IRIS_313.7685  1000010000000100100010100010101010010101100100...   
IRIS_313.7688  1000010010100100010001010000010110001010010110...   
IRIS_313.7689  1001010010100100100010100010101010010100100100...   
IRIS_313.7690  1000010010000000100000100001010101100110100101...   

                                                    TRAIT_STRING  
IRIS_313.7684  0101000001111000100110101100101100001010101001...  
IRIS_313.7685  0101000001110100110000101101001010101011001010...  
IRIS_313.7688  0101000010100110100000101110101101001100101011...  
IRIS_313.7689  0101000010001000101010101100101100001100001010...  
IRIS_313.7690  0011110010001110100010110010001110001011101001...  





In [5]:
predict_reserve = 10

print("Generating samples...")
b_inputs = master["GENE_STRING"].tolist()[predict_reserve + 1:]
b_outputs = master["TRAIT_STRING"].tolist()[predict_reserve + 1:]
print(f"{len(b_inputs)} input samples, {len(b_outputs)} output samples")

Generating samples...
2255 input samples, 2255 output samples


In [6]:
from tensorflow import keras
from sklearn.model_selection import train_test_split

inputs = np.array([[int(digit) for digit in string] for string in b_inputs])
outputs = np.array([[int(digit) for digit in string] for string in b_outputs])

train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(inputs, outputs, test_size = 0.2, random_state = 42)
train_inputs = np.reshape(train_inputs, (train_inputs.shape[0], train_inputs.shape[1], 1))
test_inputs = np.reshape(test_inputs, (test_inputs.shape[0], test_inputs.shape[1], 1))

model = keras.Sequential([
    keras.layers.Conv1D(16, kernel_size=3, activation='relu', input_shape=(train_inputs.shape[1], 1)),
    keras.layers.MaxPooling1D(pool_size=2),
    keras.layers.Flatten(),
    keras.layers.Dense(train_outputs.shape[1], activation='sigmoid')
])

model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])
model.fit(train_inputs, train_outputs, epochs = 10, batch_size = 32)

loss, accuracy = model.evaluate(test_inputs, test_outputs)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.8869703412055969
Test Accuracy: 0.008869179524481297


In [7]:
test_input = [master.loc[master.index[n]]["GENE_STRING"] for n in range(predict_reserve)]
test_inputs = np.array([[int(digit) for digit in string] for string in test_input])
test_inputs = np.reshape(test_inputs, (test_inputs.shape[0], test_inputs.shape[1], 1))
predictions = model.predict(test_inputs)
predicted_strings = [decode_trait("".join(str(int(round(value))) for value in prediction)) for prediction in predictions]
actual_strings = [decode_trait(master.loc[master.index[n]]["TRAIT_STRING"]) for n in range(predict_reserve)]
print("Actual Labels:", list(actual_strings[0].values()))
print("Predicted Labels:", list(predicted_strings[0].values()))

Actual Labels: [4.0, 60.0, 19.0, 8.9, 2.4, 2.1, 75.0, 14.0, 39.0, 1.0, 25.0, 30.0]
Predicted Labels: [5.8, 74.0, 19.0, 9.0, 2.8, 2.4, 88.0, 27.0, 34.0, 1.2, 19.0, 50.0]


Different approach - no binary encoding

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras
# Merge the SNP and trait data based on the sample ID
merged_data = gdf.merge(tdf, on="ID")

# Extract SNP features and trait labels
snp_features = merged_data.iloc[:, :-12].values
trait_labels = merged_data.iloc[:, -12:].values  # Assuming the last 12 columns are trait labels

# Step 2: Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(
    snp_features, trait_labels, test_size=0.2, random_state=42
)

# Step 3: Design and train the model
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(train_features.shape[1],)),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(train_labels.shape[1])
])

model.compile(optimizer='adam', loss='mean_squared_error', metrics = ["accuracy"])  # Use appropriate loss function based on your task

model.fit(train_features, train_labels, epochs=20, batch_size=16)

# Step 4: Evaluate the model
loss, accuracy = model.evaluate(test_features, test_labels)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: 84.39857482910156
Test Accuracy: 0.7268722653388977


In [9]:
sample = test_features[0].reshape(1, -1)
actual_labels = test_labels[0]
prediction = model.predict(sample)
print("Actual Labels:", actual_labels)
print("Predicted Labels:", [round(p, 1) for p in prediction[0]])

Actual Labels: [  4.  112.   11.    7.7   2.9   1.9  82.   19.   46.    1.3  20.   42. ]
Predicted Labels: [4.6, 95.0, 15.0, 8.6, 3.4, 2.8, 82.8, 13.8, 47.5, 1.4, 23.4, 33.8]
