<a href="https://colab.research.google.com/github/cod3astro/ML_practice/blob/main/codon_usage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [77]:
df = pd.read_csv('codon_usage.csv', index_col='SpeciesID')
df.drop_duplicates(inplace=True)
df.head()

Unnamed: 0_level_0,Kingdom,DNAtype,Ncodons,SpeciesName,UUU,UUC,UUA,UUG,CUU,CUC,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
SpeciesID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100217,vrl,0,1995,Epizootic haematopoietic necrosis virus,0.01654,0.01203,0.0005,0.00351,0.01203,0.03208,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.0005,0.0
100220,vrl,0,1474,Bohle iridovirus,0.02714,0.01357,0.00068,0.00678,0.00407,0.02849,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.0156,0.0441,0.00271,0.00068,0.0
100755,vrl,0,4862,Sweet potato leaf curl virus,0.01974,0.0218,0.01357,0.01543,0.00782,0.01111,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.0,0.00144
100880,vrl,0,1915,Northern cereal mosaic virus,0.01775,0.02245,0.01619,0.00992,0.01567,0.01358,...,0.00366,0.0141,0.01671,0.0376,0.01932,0.03029,0.03446,0.00261,0.00157,0.0
100887,vrl,0,22831,Soil-borne cereal mosaic virus,0.02816,0.01371,0.00767,0.03679,0.0138,0.00548,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.0,0.00044,0.00131


In [78]:
df.shape

(13028, 68)

In [79]:
df.columns

Index(['Kingdom', 'DNAtype', 'Ncodons', 'SpeciesName', 'UUU', 'UUC', 'UUA',
       'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA', 'AUG', 'GUU',
       'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU', 'CCC', 'CCA',
       'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC', 'UCA', 'UCG',
       'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC', 'CAA', 'CAG',
       'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG', 'CGU', 'CGC',
       'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG', 'UAA', 'UAG',
       'UGA'],
      dtype='object')

In [80]:
df.isnull().any().sum()

np.int64(0)

In [81]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE

In [82]:
features = ['DNAtype', 'Ncodons', 'UUU', 'UUC', 'UUA',
       'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA', 'AUG', 'GUU',
       'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU', 'CCC', 'CCA',
       'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC', 'UCA', 'UCG',
       'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC', 'CAA', 'CAG',
       'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG', 'CGU', 'CGC',
       'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG', 'UAA', 'UAG',
       'UGA']
mi_features = ['DNAtype', 'Ncodons', 'UUU', 'UUC']
target = 'Kingdom'
X = df[features]
y = df[target]
smote = SMOTE(random_state=42)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=42, stratify=y, test_size=0.2)

In [83]:
for col in X:
  if X[col].dtype == 'object':
    print(col)

UUU
UUC


In [84]:
for col in X.columns:
    if X[col].dtype == 'object':
        # Find rows with non-numeric values
        non_numeric = X[col][~X[col].apply(lambda x: str(x).replace(';', '').isdigit())]
        if not non_numeric.empty:
            print(f"Column '{col}' has non-numeric values:")
            print(non_numeric.unique())

Column 'UUU' has non-numeric values:
['0.01654' '0.02714' '0.01974' ... 0.0138 0.01757 0.01778]
Column 'UUC' has non-numeric values:
['0.01203' '0.01357' '0.0218' ... 0.02591 0.03446 0.03555]


In [85]:
# Convert object-type codons to numeric if possible
X['UUU'] = pd.to_numeric(X['UUU'], errors='coerce')  # 'coerce' turns invalid strings to NaNs
X['UUC'] = pd.to_numeric(X['UUC'], errors='coerce')  # 'coerce' turns invalid strings to NaNs

In [86]:
print("train_X dtypes:", train_X.dtypes)
print("val_X dtypes:", val_X.dtypes)

train_X dtypes: DNAtype      int64
Ncodons      int64
UUU         object
UUC         object
UUA        float64
            ...   
GAA        float64
GAG        float64
UAA        float64
UAG        float64
UGA        float64
Length: 66, dtype: object
val_X dtypes: DNAtype      int64
Ncodons      int64
UUU         object
UUC         object
UUA        float64
            ...   
GAA        float64
GAG        float64
UAA        float64
UAG        float64
UGA        float64
Length: 66, dtype: object


In [87]:
print(X['UUU'].describe())
print(X['UUC'].describe())

count    13026.000000
mean         0.024818
std          0.017628
min          0.000000
25%          0.013910
50%          0.021750
75%          0.031308
max          0.217300
Name: UUU, dtype: float64
count    13027.000000
mean         0.023442
std          0.011599
min          0.000000
25%          0.015380
50%          0.021910
75%          0.029215
max          0.091690
Name: UUC, dtype: float64


In [88]:
columns_to_fill = ['UUU', 'UUC']
for col in columns_to_fill:
    X[col].fillna(X[col].mean(), inplace=True)

In [89]:
copy_X = X.copy()
copy_y = y.copy()
for col in copy_X.select_dtypes('float'):
    copy_X[col] = copy_X[col].round().astype(int)
for char in copy_X.select_dtypes('object'):
  copy_X.loc[:, char], _ = copy_X[char].factorize()
discrete_values = copy_X.dtypes == int
copy_y, _ = pd.factorize(y)

def make_mi_score(copy_X, copy_y, discrete_values):
    mi_scores = mutual_info_classif(copy_X, copy_y, discrete_features=discrete_values)
    mi_scores = pd.Series(mi_scores, name='MI SCORES', index=copy_X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_score(copy_X, copy_y, discrete_values)
mi_scores[:7]

Unnamed: 0,MI SCORES
Ncodons,1.384182
DNAtype,0.403521
UUU,0.0
UUC,0.0
UUA,0.0
UUG,0.0
CUU,0.0


In [90]:
y.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Kingdom,Unnamed: 1_level_1
bct,0.224133
vrl,0.217378
pln,0.19366
vrt,0.159426
inv,0.103239
mam,0.043905
phg,0.016887
rod,0.016503
pri,0.013816
arc,0.009671


In [91]:
X_resampled, y_resampled = smote.fit_resample(X, y)
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, random_state=42, stratify=y_resampled, test_size=0.2)