In [None]:
# Mount Google Drive into the Colab runtime so the dataset files are reachable.
from google.colab import drive
drive.mount('/content/gdrive')
# Switch the working directory to the dataset folder and list its contents;
# later cells read and write CSV files by bare filename relative to this folder.
%cd '/content/gdrive/MyDrive/ML datasets'
%ls

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any emitted by the model-fitting cells below - consider narrowing
# the filter to specific categories.
warnings.filterwarnings('ignore')



Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/ML datasets
 abalone.data               mnist_train.csv
 data_1.csv                 more_than_50k.csv
 data_2.csv                 output1.csv
'Dataset Description.csv'   output.csv
 diabetes2.csv              output_log.csv
 DT_A_1.png                 output_reg.csv
 DT_B_1.png                 output_svm.csv
 DT-B-2-CC.png              population.csv
 DT-B-2-XX.png              PRSA_data_2010.1.1-2014.12.31.csv
 DT_C_1.pkl                'ROC Curve.png'
 fashion-mnist_test.csv     sigmoid
 fashion-mnist_train.csv    test.csv
 imagename.png              test_data.csv
 mbti_1.csv                 train.csv
 [0m[01;34mml-latest-small[0m/           train_data.csv
 mnist_test.csv             [01;34mweights[0m/


In [None]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from collections import Counter
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from keras.preprocessing.sequence import pad_sequences

In [None]:
#loading the training dataset
data=pd.read_csv('train.csv')

#loading the testing dataset
data_test=pd.read_csv('test.csv')

#storing IDs from testing dataset (same row order as data_test, so they line
#up with the predictions made on it later).
#Taken straight from the column: the previous loop bound a variable named `id`
#at notebook scope, shadowing the builtin for every later cell.
test_data_id = data_test['ID'].tolist()


In [None]:
# Missing-value check: number of nulls in each column of the training frame.
null_counts = data.isnull().sum()
print("Number of null values:")
print(null_counts)

# Class-balance check: how many training rows carry each label value.
label_counts = Counter(data[' Label'])
print("Checking data imbalance:")
print(label_counts)
print("\n\n")

Number of null values:
Sequence    0
 Label      0
dtype: int64
Checking data imbalance:
Counter({0: 3197, 1: 3197})





In [None]:
# Sequence-length survey for both datasets; the largest observed length
# informs the padding length used during pre-processing.
import statistics

# Train set: length histogram followed by the mean length.
a = [len(seq) for seq in data['Sequence']]
print(Counter(a))
print(statistics.mean(a))

# Test set: same two statistics (note the column name has a leading space).
b = [len(seq) for seq in data_test[' Sequence']]
print(Counter(b))
print(statistics.mean(b))
print("\n\n")

Counter({15: 1035, 13: 631, 14: 600, 12: 550, 18: 395, 11: 393, 16: 393, 17: 384, 19: 357, 20: 344, 22: 303, 21: 268, 25: 268, 24: 245, 23: 228})
16.69659055364404
Counter({15: 208, 12: 203, 11: 170, 13: 159, 14: 153, 16: 94, 17: 78, 19: 78, 18: 75, 20: 71, 23: 69, 25: 63, 21: 61, 22: 59, 24: 57})
16.161451814768462





In [None]:
#Pre-Processing

#Integer Encoding
# Maps each single-letter residue symbol to an integer code in 1..21.
# Code 0 is deliberately unused here: it is the fallback for unknown symbols.
char_dict= {'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'X':20, 'Y': 21}
def feature_extraction(data, column, maxlen=25):
  """Turn a column of sequence strings into per-sequence feature vectors.

  Each string is integer-encoded character by character with ``char_dict``
  (characters missing from the dictionary become 0), the encoded rows are
  padded/truncated at the end to ``maxlen`` positions, and the fixed-length
  rows are handed to ``feature_extraction_2``.

  Args:
    data: DataFrame holding the sequences.
    column: name of the column in ``data`` that contains the sequence strings.
    maxlen: number of positions every encoded sequence is padded or truncated
      to. Defaults to 25, the value that was previously hard-coded.

  Returns:
    Whatever ``feature_extraction_2`` returns for the padded rows.
  """
  encode_list = [
    np.array([char_dict.get(code, 0) for code in row])
    for row in data[column].values
  ]
  #Sequence Padding (shorter rows are extended at the end, longer ones cut at the end)
  encode_list= pad_sequences(encode_list, maxlen=maxlen, padding='post', truncating='post')
  return feature_extraction_2(encode_list)

#Feature Extraction: Binary Profiling, Frequency count, Molecular Weight
# Weight contributed by each integer residue code; codes without an entry
# (0 and 20) contribute nothing to the total.
molecular_weight_dict={1:89.1,15:174.2,12:132.1,3:133.1,2:121.2,4:147.1,14:146.2,6:75.1,7:155.2,8:131.2,10:131.2,9:146.2,11:149.2,5:165.2,13:115.1,16:105.1,17:119.1,19:204.2,21:181.2,18:117.1}
def feature_extraction_2(e_list):
  """Build one numeric feature vector per integer-encoded sequence.

  The vector for a row of length ``n`` has ``1 + 21 + 21 * n`` entries:

    * index 0: sum of ``molecular_weight_dict`` weights over the row,
    * indices 1..21: how many times each code 1..21 occurs in the row,
    * then one 21-wide one-hot block per position, in row order.

  Codes outside 1..21 (in particular the 0 used for padding/unknown symbols)
  produce an all-zero one-hot block. Previously code 0 indexed slot ``-1`` and
  was therefore encoded exactly like code 21.

  Args:
    e_list: iterable of integer-coded rows (arrays or plain sequences).

  Returns:
    A list with one 1-D numpy array per input row.
  """
  num_codes = 21
  encode_list = []
  for row in e_list:
    # Accept plain lists as well as arrays so the elementwise compare below works.
    row = np.asarray(row)
    weight = 0
    # Frequency count of every residue code in this row.
    temp_array = [(row == d).sum() for d in range(1, num_codes + 1)]
    for code in row:
      gene_string = [0] * num_codes
      # Guard against code 0: without it, index code-1 == -1 would light up
      # the last slot and make padding indistinguishable from code 21.
      if 1 <= code <= num_codes:
        gene_string[code - 1] = 1
      temp_array.extend(gene_string)
      weight += molecular_weight_dict.get(code, 0)
    encode_list.append(np.array([weight] + temp_array))
  return encode_list


In [None]:
# Labels and feature vectors for the full training set, plus feature vectors
# for the unlabeled test set.
Y_data = data[' Label'].values
X_data = feature_extraction(data, 'Sequence')
X_data_test = feature_extraction(data_test, ' Sequence')

# Hold out 20% of the labeled data as a validation set; the split is
# stratified on the label and seeded so it is the same on every run.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_data,
    Y_data,
    test_size=0.2,
    random_state=123,
    stratify=Y_data,
)

#Best Model:

In [None]:
#Best model: RandomForestRegressor

print("Best model: RandomForestRegressor: \n")
#100 trees on all cores; every other hyperparameter is left at its default.
#NOTE(review): an earlier comment mentioned min_samples_split=10, but it was
#never passed to the estimator - add it here if that was the intent.
#random_state is fixed so the forest, and the submission file, are reproducible.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=123)

#The parameter grid is empty, so there is a single candidate and no tuning:
#GridSearchCV only runs 3-fold cross-validation and refits on all the data.
grid = GridSearchCV(estimator = rf, param_grid = {}, cv = 3, verbose=2, n_jobs = -1)
grid.fit(X_data, Y_data)

#Estimator refitted on the full labeled dataset
forest = grid.best_estimator_
print(forest)

#Predicting values (continuous scores, since this is a regressor)
predictions = forest.predict(X_data_test)

#Converting output into csv file: one [ID, score] row per test sequence
output_data = [[row_id, score] for row_id, score in zip(test_data_id, predictions)]
output_df=pd.DataFrame(output_data,columns=['ID','Label'])
output_df.to_csv('output_RFR.csv', sep=',', index=False)
print("\n Output File: \n")
print(output_df)
print("\n\n\n")


Best model: RandomForestRegressor: 

Fitting 3 folds for each of 1 candidates, totalling 3 fits
RandomForestRegressor(n_jobs=-1)

 Output File: 

         ID  Label
0     10001   0.28
1     10002   0.14
2     10003   0.41
3     10004   0.17
4     10005   0.29
...     ...    ...
1593  11594   0.50
1594  11595   0.68
1595  11596   0.73
1596  11597   0.85
1597  11598   0.73

[1598 rows x 2 columns]






#Other Models:

In [None]:
#Model: RandomForestClassifier

print("Model:  RandomForestClassifier \n")
#100 trees on all cores; every other hyperparameter is left at its default.
#NOTE(review): an earlier comment mentioned min_samples_split=10, but it was
#never passed to the estimator - add it here if that was the intent.
#random_state is fixed so the validation report and predictions are reproducible.
rc = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=123)

#The parameter grid is empty, so there is a single candidate and no tuning:
#GridSearchCV only runs 3-fold cross-validation and refits on the training split.
grid = GridSearchCV(estimator = rc, param_grid = {}, cv = 3, verbose=2, n_jobs = -1)
grid.fit(X_train, Y_train)

#Estimator refitted on the whole training split
forestc = grid.best_estimator_
print(forestc)

#Checking accuracy on val data
Y_pred_val= forestc.predict(X_val)
print("\n Classification Report for Validation dataset: \n")
print(classification_report(Y_val, Y_pred_val))


#Predicting values
predictions = forestc.predict(X_data_test)

#Converting output into csv file: one [ID, label] row per test sequence
output_data = [[row_id, label] for row_id, label in zip(test_data_id, predictions)]
output_df=pd.DataFrame(output_data,columns=['ID','Label'])
output_df.to_csv('output_RFC.csv', sep=',', index=False)
print("\n Output File: \n")
print(output_df)
print("\n\n\n")

Model:  RandomForestClassifier 

Fitting 3 folds for each of 1 candidates, totalling 3 fits
RandomForestClassifier(n_jobs=-1)

 Classification Report for Validation dataset: 

              precision    recall  f1-score   support

           0       0.69      0.71      0.70       320
           1       0.70      0.68      0.69       320

    accuracy                           0.70       640
   macro avg       0.70      0.70      0.70       640
weighted avg       0.70      0.70      0.70       640


 Output File: 

         ID  Label
0     10001      0
1     10002      0
2     10003      0
3     10004      0
4     10005      0
...     ...    ...
1593  11594      0
1594  11595      1
1595  11596      1
1596  11597      1
1597  11598      1

[1598 rows x 2 columns]






In [None]:
# Model: Logistic Regression

print("Model: Logistic Regression: \n")

log = LogisticRegression(n_jobs=-1)

# GridSearchCV is given an empty parameter grid, i.e. a single candidate:
# it cross-validates that candidate over 10 folds on the training split.
grid = GridSearchCV(estimator=log, param_grid={}, cv=10, verbose=2, n_jobs=-1)
grid.fit(X_train, Y_train)

# Show the selected estimator, then keep a handle on it.
print(grid.best_estimator_)
logistic = grid.best_estimator_

# Evaluate on the held-out validation split.
Y_pred_val = logistic.predict(X_val)
val_report = classification_report(Y_val, Y_pred_val)
print("\n Classification Report for Validation dataset: \n")
print(val_report)

# Predict labels for the unlabeled test set.
predictions = logistic.predict(X_data_test)

# Write one [ID, label] row per test sequence to the submission CSV.
output_data = [[row_id, label] for row_id, label in zip(test_data_id, predictions)]
output_df = pd.DataFrame(output_data, columns=['ID', 'Label'])
output_df.to_csv('output_Logistic.csv', sep=',', index=False)
print("\n Output File: \n")
print(output_df)
print("\n\n\n")




Model: Logistic Regression: 

Fitting 10 folds for each of 1 candidates, totalling 10 fits
LogisticRegression(n_jobs=-1)

 Classification Report for Validation dataset: 

              precision    recall  f1-score   support

           0       0.70      0.70      0.70       320
           1       0.70      0.69      0.70       320

    accuracy                           0.70       640
   macro avg       0.70      0.70      0.70       640
weighted avg       0.70      0.70      0.70       640


 Output File: 

         ID  Label
0     10001      0
1     10002      0
2     10003      0
3     10004      0
4     10005      0
...     ...    ...
1593  11594      1
1594  11595      1
1595  11596      1
1596  11597      1
1597  11598      1

[1598 rows x 2 columns]






In [None]:
# Model: SVM

print("Model: SVM: \n")

# Standardize the features, then fit a support vector classifier on the
# training split (fit returns the fitted pipeline itself).
svm = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train, Y_train)

# Evaluate on the held-out validation split.
Y_pred_val = svm.predict(X_val)
val_report = classification_report(Y_val, Y_pred_val)
print("\n Classification Report for Validation dataset: \n")
print(val_report)

# Predict labels for the unlabeled test set.
predictions = svm.predict(X_data_test)

# Write one [ID, label] row per test sequence, labels cast to plain ints.
output_data = [[row_id, int(label)] for row_id, label in zip(test_data_id, predictions)]
output_df = pd.DataFrame(output_data, columns=['ID', 'Label'])
output_df.to_csv('output_SVM.csv', sep=',', index=False)
print("\n Output File: \n")
print(output_df)
print("\n\n\n")




Model: SVM: 


 Classification Report for Validation dataset: 

              precision    recall  f1-score   support

           0       0.71      0.68      0.70       320
           1       0.69      0.72      0.71       320

    accuracy                           0.70       640
   macro avg       0.70      0.70      0.70       640
weighted avg       0.70      0.70      0.70       640


 Output File: 

         ID  Label
0     10001      1
1     10002      0
2     10003      0
3     10004      0
4     10005      0
...     ...    ...
1593  11594      0
1594  11595      1
1595  11596      1
1596  11597      1
1597  11598      1

[1598 rows x 2 columns]






In [None]:
# Model: k-nearest neighbours

print("Model: KNN \n")

# 20-neighbour classifier fitted on the training split (fit returns the
# fitted estimator itself).
# NOTE(review): features are not scaled here, unlike in the SVM pipeline -
# confirm that is intended for a distance-based model.
knn = KNeighborsClassifier(n_neighbors=20).fit(X_train, Y_train)

# Evaluate on the held-out validation split.
Y_pred_val = knn.predict(X_val)
val_report = classification_report(Y_val, Y_pred_val)
print("\n Classification Report for Validation dataset: \n")
print(val_report)

# Predict labels for the unlabeled test set.
predictions = knn.predict(X_data_test)

# Write one [ID, label] row per test sequence, labels cast to plain ints.
output_data = [[row_id, int(label)] for row_id, label in zip(test_data_id, predictions)]
output_df = pd.DataFrame(output_data, columns=['ID', 'Label'])
output_df.to_csv('output_KNN.csv', sep=',', index=False)
print("\n Output File: \n")
print(output_df)
print("\n\n\n")




Model: KNN 


 Classification Report for Validation dataset: 

              precision    recall  f1-score   support

           0       0.59      0.62      0.61       320
           1       0.60      0.56      0.58       320

    accuracy                           0.59       640
   macro avg       0.59      0.59      0.59       640
weighted avg       0.59      0.59      0.59       640


 Output File: 

         ID  Label
0     10001      0
1     10002      0
2     10003      0
3     10004      0
4     10005      0
...     ...    ...
1593  11594      0
1594  11595      1
1595  11596      1
1596  11597      0
1597  11598      1

[1598 rows x 2 columns]




