<a href="https://colab.research.google.com/github/erfanzohrabi/Bioinformatics-Analysis-Aplication-Work-with-Machine-Learning-/blob/main/DNA_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score

# Import necessary libraries

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
names = ['Class', 'id', 'Sequence']

# Load the dataset
data = pd.read_csv(url, names=names)

# Extract the 'Class' column and count the occurrences of each class
classes = data.loc[:, 'Class']
class_counts = classes.value_counts()
print(class_counts)

# Generate a list of DNA sequences
sequences = data.loc[:, 'Sequence'].tolist()

# Create a dictionary to store the dataset
dataset = {}
for i, seq in enumerate(sequences):
    nucleotides = list(seq)
    nucleotides = [x for x in nucleotides if x != '\t']
    nucleotides.append(classes[i])
    dataset[i] = nucleotides

# Print the first entry in the dataset dictionary
print(dataset[0])

# Create a DataFrame from the dataset dictionary
df = pd.DataFrame(dataset).T
df.rename(columns={57: 'Class'}, inplace=True)

# Generate value counts for each column in the DataFrame
series = []
for name in df.columns:
    series.append(df[name].value_counts())

# Create a DataFrame to store the value counts
info = pd.DataFrame(series)
details = info.T

# Convert categorical variables into numerical representation
numerical_df = pd.get_dummies(df)
df = numerical_df.drop(columns=['Class_-'])
df.rename(columns={'Class_+': 'Class'}, inplace=True)

# Create X and y dataset for training
X = df.drop(['Class'], axis=1).to_numpy()
y = df['Class'].to_numpy()

# Split the data into training and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

# Define scoring method
scoring = 'accuracy'

# Define models to train
names = ['K Nearest Neighbors', 'Gaussian Process', 'Decision Tree', 'Random Forest',
         'Neural Network', 'AdaBoost', 'Naive Bayes', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']

classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=500),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid')
]

# Evaluate each model in turn
results = []
for name, model in zip(names, classifiers):
    kfold = KFold(n_splits=10, shuffle=True)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    msg = '{0}: {1} ({2})'.format(name, cv_results.mean(), cv_results.std())
    print(msg)

# Test the algorithm on the validation dataset
for name, model in zip(names, classifiers):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))


+    53
-    53
Name: Class, dtype: int64
['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']
K Nearest Neighbors: 0.8196428571428571 (0.11136044162620619)
Gaussian Process: 0.7982142857142857 (0.08146067662826459)
Decision Tree: 0.7125 (0.18582585934148133)
Random Forest: 0.5089285714285714 (0.13628872787922988)
Neural Network: 0.9125 (0.09762812094883318)
AdaBoost: 0.7964285714285715 (0.12931333564451195)
Naive Bayes: 0.8839285714285714 (0.09355847783878382)
SVM Linear: 0.8839285714285714 (0.10898710371190805)
SVM RBF: 0.9125 (0.08003905296791061)
SVM Sigmoid: 0.9107142857142858 (0.09845749108635873)
K Nearest Neighbors
Accuracy: 0.7407407407407407
              precision    recall  f1-score   support

           0       1.00      0.56      0.72    