<a href="https://colab.research.google.com/github/crissy09/Machine_Learning_Crissy_Rani/blob/main/Copy_of_Machine_Learning(Crisy_Rani).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide plotting defaults: seaborn "whitegrid" style and a 10x6 inch default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10,6)

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV


In [None]:
# Colab-only step: attach Google Drive so the dataset file is reachable from the runtime
from google.colab import drive
drive.mount('/content/drive')

# Read the CSV from the mounted Drive, report its dimensions and preview the first rows
CSV_PATH = '/content/drive/MyDrive/codon_usage.csv'
df = pd.read_csv(CSV_PATH)
print("Dataset shape:", df.shape)
head_preview = df.head()
display(head_preview)

In [None]:
# Metadata columns are listed explicitly; every other column of df is treated as a codon feature
meta_cols = ['Kingdom', 'DNAtype', 'SpeciesID', 'Ncodons', 'SpeciesName']
codon_cols = list(filter(lambda name: name not in meta_cols, df.columns))
print("Meta columns:", meta_cols)
print("Number of codon feature columns:", len(codon_cols))

# Preprocessing

**Check for missing values**

In [None]:
# Per-column count and percentage of missing entries; only columns that have gaps are printed
print("MISSING VALUES")
missing_values = df.isna().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_df = missing_values.to_frame('Missing Values').assign(Percentage=missing_percentage)
has_gaps = missing_df['Missing Values'].gt(0)
print(missing_df.loc[has_gaps])


**Outlier Handling**

In [None]:
# BOXPLOTS BEFORE outlier handling
# Non-numeric entries in the codon columns are coerced to NaN so every column is numeric
df_num = df[codon_cols].apply(lambda col: pd.to_numeric(col, errors='coerce'))
# Only the first six codon columns are plotted to keep the figure readable
sample_cols = codon_cols[:6]
fig, ax = plt.subplots()
sns.boxplot(data=df_num[sample_cols], ax=ax)
ax.set_title("Boxplots BEFORE Outlier Handling (numeric coercion applied)")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# IQR capping function and application (winsorize)

def cap_iqr(series, whisker=1.5):
    """Winsorize a numeric Series at its IQR fences.

    Values below ``Q1 - whisker * IQR`` are raised to that bound and values
    above ``Q3 + whisker * IQR`` are lowered to it; values in between are
    returned unchanged. The input Series is not modified.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to cap.
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range when building the
        lower and upper fences. The default reproduces the usual 1.5 * IQR
        rule; ``0`` clips everything into ``[Q1, Q3]``.

    Returns
    -------
    pandas.Series
        A new Series with the same index, clipped to the fences.

    Raises
    ------
    ValueError
        If ``whisker`` is negative (the lower fence would exceed the upper).
    """
    if whisker < 0:
        raise ValueError("whisker must be non-negative")
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    spread = whisker * (q3 - q1)
    return series.clip(lower=q1 - spread, upper=q3 + spread)

# Work on a copy so df_num keeps the uncapped values
df_capped = df_num.copy()
for col in codon_cols:
    column = df_capped[col]
    # Columns without a single non-NaN value are left untouched
    if column.notna().any():
        df_capped[col] = cap_iqr(column)

# BOXPLOTS AFTER outlier handling
# Same sample columns as the "before" figure, now drawn from the capped frame
fig, ax = plt.subplots()
sns.boxplot(data=df_capped[sample_cols], ax=ax)
ax.set_title("Boxplots AFTER Outlier Handling (IQR capped)")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

**Mean Imputation**

In [None]:
# Fill the NaNs still present in each codon column with that column's mean
# NOTE(review): the imputer is fitted on all rows, i.e. before the train/test split
imputer = SimpleImputer(strategy='mean')
codon_filled = imputer.fit_transform(df_capped[codon_cols])
df_capped[codon_cols] = codon_filled
print("Mean imputation done for codon features.")
nan_left = df_capped[codon_cols].isna().sum().sum()
print("Remaining NaNs in codon features:", nan_left)

**Label Encoding for Target Variable**

In [None]:
# Bring the target column over from the raw frame (df_capped only holds codon features)
target_col = 'Kingdom'
df_capped[target_col] = df[target_col]

# Encode the class names as integer codes stored in column 'y'
le = LabelEncoder()
kingdom_labels = df_capped[target_col].astype(str)
df_capped['y'] = le.fit_transform(kingdom_labels)

class_names = list(le.classes_)
class_codes = le.transform(le.classes_)
print("Target classes:", class_names)
print("Encoded mapping:", dict(zip(le.classes_, class_codes)))


**Feature Scaling**

In [None]:
# Feature matrix (codon columns as float) and integer-encoded target
X = df_capped.loc[:, codon_cols].astype(float)
y = df_capped['y']

# Standardise every feature to zero mean / unit variance
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
print("Scaling completed. Sample scaled values (first row, first 10 features):")
print(X_scaled[0, :10])

**Splitting data into training and testing**

In [None]:
# Split the UNSCALED features first, then learn the standardisation from the
# training rows only. Splitting X_scaled (fitted on every row) would let
# test-set means/variances leak into the inputs the models are trained on.
# The split itself is unchanged: same test_size, seed and stratification.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Re-fit the scaler on the training partition and apply it to both partitions;
# transform() returns NumPy arrays, as the models below expect.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
print("Train/Test shapes:", X_train.shape, X_test.shape)

In [None]:
# Distribution of Kingdom Classes
plt.figure(figsize=(8,5))
# `palette` without `hue` is deprecated in seaborn 0.13; mapping hue to the same
# column (legend suppressed) gives the same one-colour-per-class bars.
# Requires seaborn >= 0.13 for the `legend` argument.
sns.countplot(data=df, x='Kingdom', hue='Kingdom', palette="viridis", legend=False)
plt.title("Distribution of Kingdom Classes")
plt.xlabel("Kingdom")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Correlation Matrix
# Pairwise correlations are computed on the raw codon columns after numeric coercion
codon_numeric = df[codon_cols].apply(lambda col: pd.to_numeric(col, errors='coerce'))
corr = codon_numeric.corr()
fig, ax = plt.subplots(figsize=(16,12))
sns.heatmap(corr, cmap='coolwarm', linewidths=0.2, ax=ax)
ax.set_title("Correlation Matrix of Codon Frequency Features (64 Codons)")
plt.show()


# CLASSIFICATION MODELS

**LOGISTIC REGRESSION**

In [None]:
# Baseline logistic regression: fit on the training split, predict the test split
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
print(classification_report(y_test, y_pred_lr))

# Confusion matrix with class names on both axes
cm = confusion_matrix(y_test, y_pred_lr)
fig, ax = plt.subplots(figsize=(7,6))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Oranges',
    xticklabels=le.classes_, yticklabels=le.classes_, ax=ax
)
ax.set_title("Logistic Regression - Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


**LINEAR DISCRIMINANT ANALYSIS**

In [None]:
# Linear Discriminant Analysis: fit on the training split, predict the test split
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred_lda = lda.predict(X_test)

print("Linear Discriminant Analysis Accuracy:", round(accuracy_score(y_test, y_pred_lda),4))
print(classification_report(y_test, y_pred_lda))

# Confusion matrix labelled with the class names from the label encoder and
# with named axes, matching the logistic-regression confusion-matrix plots.
cm = confusion_matrix(y_test, y_pred_lda)
plt.figure(figsize=(7,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='BuPu', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title("LDA - Confusion Matrix")
plt.xlabel("Predicted"); plt.ylabel("Actual")
plt.show()


**DECISION TREE**

In [None]:
# Decision tree: fit on the training split, predict the test split
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

print("Decision Tree Accuracy:", round(accuracy_score(y_test, y_pred_dt),4))
print(classification_report(y_test, y_pred_dt))

# Confusion matrix labelled with the class names from the label encoder and
# with named axes, matching the logistic-regression confusion-matrix plots.
cm = confusion_matrix(y_test, y_pred_dt)
plt.figure(figsize=(7,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title("Decision Tree - Confusion Matrix")
plt.xlabel("Predicted"); plt.ylabel("Actual")
plt.show()


**HYPERPARAMETER TUNING: LOGISTIC REGRESSION (GridSearchCV)**

In [None]:
# Search space: only C actually varies; the single-valued entries pin the
# penalty, solver and iteration budget for every candidate.
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [300]
}

# Base estimator for the search.
# - `multi_class='multinomial'` is omitted: the argument is deprecated since
#   scikit-learn 1.5, and lbfgs already fits a multinomial model on
#   multiclass targets by default.
# - `warm_start=True` is omitted: GridSearchCV clones the estimator before
#   every fit, so no previous solution exists to warm-start from.
logreg = LogisticRegression(random_state=42)

# 3-fold cross-validated accuracy for each candidate, using all CPU cores
gs = GridSearchCV(
    logreg, param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

gs.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on the full training split
best_lr = gs.best_estimator_
print("Best Logistic Regression params:", gs.best_params_)
print("Best CV score:", round(gs.best_score_, 4))

# Score the refitted best estimator on the held-out test split
y_pred_best = best_lr.predict(X_test)
tuned_accuracy = round(accuracy_score(y_test, y_pred_best), 4)
tuned_report = classification_report(y_test, y_pred_best, target_names=le.classes_)
print("\nTUNED LOGISTIC REGRESSION - Test Accuracy:", tuned_accuracy)
print("\nClassification Report:\n", tuned_report)

# Confusion matrix plot
cm = confusion_matrix(y_test, y_pred_best)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Oranges',
    xticklabels=le.classes_, yticklabels=le.classes_, ax=ax
)
ax.set_title("Tuned Logistic Regression - Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
# Tilt the class names on both axes so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.setp(ax.get_yticklabels(), rotation=45)
plt.show()