In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
# Importing files

X_train_0 = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\X_train_kaggle.csv')
y_train_0 = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\y_train_kaggle.csv')
X_test_0 = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\X_test_kaggle.csv')
X_train_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33029 entries, 0 to 33028
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   CunstructedAASeq_cln  33029 non-null  object
 1   Id                    33029 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 516.2+ KB


In [3]:
# Discarding records with length 235 
length = [len(X_train_0.CunstructedAASeq_cln[i]) for i in range(len(X_train_0))]
X_train_0['_length'] = length
X_train_0 = X_train_0[X_train_0['_length'] == 237]

# removed 2000 records with length 235, only ones with 237 are remaining
X_train_0['_length'].unique()
X_train_0.drop(columns = ['_length'] , inplace = True)
X_train_0 = X_train_0.sort_values(by = ["Id"])

## Use X_train to get only IDs that we want for the model
y_train = y_train_0[y_train_0['Id'].isin(list(X_train_0.Id))].sort_values(by = ["Id"])


In [4]:
# Reading descriptor files

zscale = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\descriptors\Z-scale.csv',skiprows=2,usecols = range(1,5))
dpps = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\descriptors\DPPS.csv',skiprows=2,usecols = range(2,12))
mswhim = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\descriptors\MS-WHIM.csv',skiprows=2,usecols = range(2,5))
physical = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\descriptors\Physical.csv',skiprows=2,usecols = range(2,4))
stscale = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\descriptors\ST-scale.csv',skiprows=2,usecols = range(2,10))
tscale = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\descriptors\T-scale.csv',skiprows=2,usecols = range(2,7))
vhsescale = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\descriptors\VHSE-scale.csv',skiprows=2,usecols = range(2,10))


In [5]:
# Creating a single file with all descriptors

descriptor_tables = [zscale, dpps, mswhim, physical, stscale, tscale, vhsescale]
consol_desc = pd.concat(descriptor_tables, axis = 'columns')
consol_desc = consol_desc.set_index(consol_desc.columns[0])

In [6]:
# Creating function to assign descriptor according to amino acid

consol_desc = consol_desc.T
def encode(seq):    
    x = pd.DataFrame([consol_desc[i] for i in seq]).reset_index(drop=True)
    x = x.T
    e = x.values.flatten()
    e = list(e)
    return e

In [7]:
# Using the function created above to create a matrix for the training data
X_train = X_train_0.copy()

X_train['features'] = X_train['CunstructedAASeq_cln'].progress_apply(encode)
X_train = pd.DataFrame(X_train['features'].to_list(), columns=range(0,9243))
X_train.to_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\X_train_output.csv',index=None, header=True)

  0%|          | 0/31029 [00:00<?, ?it/s]

In [8]:
X_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9233,9234,9235,9236,9237,9238,9239,9240,9241,9242
0,1.96,2.84,2.23,3.08,3.08,-4.19,-1.39,0.92,2.23,-2.69,...,-0.13,0.39,0.65,-1.34,-0.68,0.56,0.02,3.56,-0.52,0.13
1,1.96,2.84,2.23,3.08,3.08,-4.19,-4.92,0.92,2.23,-2.69,...,-0.13,0.39,0.65,-1.34,-0.68,0.56,0.02,-0.62,-0.52,0.13
2,1.96,2.84,2.23,3.08,3.08,-4.19,-4.92,0.92,2.23,-2.69,...,-0.13,0.39,0.65,-1.34,-0.68,0.56,0.02,-0.62,-0.52,0.13
3,1.96,2.84,2.23,3.08,2.23,-4.19,-4.92,0.92,2.23,-2.69,...,-0.13,0.39,0.65,-1.34,-0.68,0.56,0.02,-0.62,-0.52,0.13
4,1.96,2.84,2.23,3.08,3.08,-4.19,-4.92,0.92,2.23,-2.69,...,-0.13,0.39,0.65,-1.34,-0.68,0.56,0.02,-0.62,-0.52,0.13


In [9]:
# Similarly, creating matrix for the test data
X_test = X_test_0.copy()

X_test['features'] = X_test['CunstructedAASeq_cln'].progress_apply(encode)
X_test = pd.DataFrame(X_test['features'].to_list(), columns=range(0,9243))
X_test.to_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\X_test_output.csv',index=None, header=True)

  0%|          | 0/20686 [00:00<?, ?it/s]

In [10]:
X_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9233,9234,9235,9236,9237,9238,9239,9240,9241,9242
0,1.96,2.84,2.23,3.08,3.08,-4.19,-4.92,0.92,2.23,-2.69,...,-0.13,0.39,0.65,-1.34,-0.68,0.56,0.02,-0.62,-0.52,0.13
1,1.96,2.84,2.23,3.08,3.08,-4.19,-4.92,0.92,2.23,-2.69,...,-0.13,0.39,0.65,-1.34,-0.68,0.56,0.02,-0.62,-0.52,0.13
2,1.96,2.84,2.23,3.08,3.08,-4.19,-4.92,0.92,2.23,-2.69,...,-0.13,0.39,0.65,-1.34,-0.68,0.56,0.02,-0.62,-0.52,0.13
3,1.96,2.84,2.23,3.08,3.08,-4.19,-4.92,0.92,2.23,-2.69,...,-0.13,0.39,0.65,-1.34,-0.68,0.56,0.02,-0.62,-0.52,0.13
4,1.96,2.84,2.23,3.08,3.08,-4.19,-4.92,0.92,2.23,-2.69,...,-0.13,0.39,0.65,-1.34,-0.68,0.56,0.02,-0.62,-0.52,0.13


In [11]:
# Importing training and test data matrix from local folder

X_train = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\X_train_output.csv')
X_test = pd.read_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\X_test_output.csv')

In [12]:
# Checking for nulls
X_train.isnull().any().sum()

0

In [14]:
#Scaling the training and test data set

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),columns= X_train.columns, index = X_train.index) 

scaler = preprocessing.StandardScaler().fit(X_test)
X_test = pd.DataFrame(scaler.transform(X_test),columns= X_test.columns, index = X_test.index)

yy_train = y_train[['Brightness_Class']]


In [81]:
# Splitting into training and test data set for model validation and tuning

X_trn, X_tt, y_trn, y_tt = train_test_split(X_train, yy_train, test_size=0.3, random_state=18)

In [87]:
# Logistic Regression
# Kaggle public score base model: 0.87619

log_reg_base = LogisticRegression(random_state=6, max_iter=10000)
log_reg_base.fit(X_trn, y_trn.values.ravel())

#Predicting the test set results
y_pred = log_reg_base.predict(X_tt)

# Obtaining the precision, recall, f1-score respectively for test data
precision_recall_fscore_support(y_tt, y_pred, pos_label=1, average='binary')


(0.8485971413446268, 0.8851463279955826, 0.8664864864864865, None)

In [95]:
# Logistic regression with hyperparameter tuning
# Kaggle public score tuned model: 0.87283

param_dist = {"C": [0.5,1,2],
              "random_state": [4,6],
              "max_iter": [8000]}

# Instantiating logistic regression
log_reg = LogisticRegression()

# Instantiating RandomizedSearchCV object
random_search = RandomizedSearchCV(log_reg, param_dist, cv = 2).fit(X_trn, y_trn.values.ravel())

# Access the best set of parameters
best_params = random_search.best_params_
print(best_params)
# Stores the optimum model in best_pipe
best_pipe = random_search.best_estimator_
print(best_pipe)

y_pred = best_pipe.predict(X_tt)

# Obtaining the precision, recall, f1-score respectively for best model
precision_recall_fscore_support(y_tt, y_pred, pos_label=1, average='binary')




{'random_state': 4, 'max_iter': 8000, 'C': 0.5}
LogisticRegression(C=0.5, max_iter=8000, random_state=4)


(0.8489018258798624, 0.8856985091109884, 0.8669098770436428, None)

In [114]:
# Getting csv output for the model

y_test = best_pipe.predict(X_test)

y_test = pd.DataFrame(y_test, columns = ['Brightness_Class'])
y_test_final = pd.concat([X_test_0['Id'], y_test['Brightness_Class']], axis = 1)
y_test_final.to_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\y_test result.csv',index=None)

In [117]:
# Random forest
# Kaggle public score base model : 0.82283

ran_for_base = RandomForestClassifier(random_state=6)
ran_for_base.fit(X_trn, y_trn.values.ravel())

#Predicting the test set results
y_pred = ran_for_base.predict(X_tt)

# Obtaining the precision, recall, f1-score respectively for test data
precision_recall_fscore_support(y_tt, y_pred, pos_label=1, average='binary')


(0.7702569169960475, 0.860850358917725, 0.8130378096479791, None)

In [121]:
# Random forest with hyperparameter tuning
# Kaggle public score tuned model: 0.82176

param_dist = {"n_estimators": [200,600,1000],
              "random_state": [6],
              "n_jobs": [-1],
              "min_samples_split": [2,4]}

# Instantiating logistic regression
ran_for = RandomForestClassifier()

# Instantiating RandomizedSearchCV object
random_search = RandomizedSearchCV(ran_for, param_dist, cv = 2).fit(X_trn, y_trn.values.ravel())

# Access the best set of parameters
best_params = random_search.best_params_
print(best_params)
# Stores the optimum model in best_pipe
best_pipe = random_search.best_estimator_
print(best_pipe)

y_pred = best_pipe.predict(X_tt)

# Obtaining the precision, recall, f1-score respectively for best model
precision_recall_fscore_support(y_tt, y_pred, pos_label=1, average='binary')



{'random_state': 6, 'n_jobs': -1, 'n_estimators': 1000, 'min_samples_split': 4}
RandomForestClassifier(min_samples_split=4, n_estimators=1000, n_jobs=-1,
                       random_state=6)


(0.8086910439851617, 0.8426283821093319, 0.8253109789075176, None)

In [122]:
# Getting csv output for the model
y_test = best_pipe.predict(X_test)

y_test = pd.DataFrame(y_test, columns = ['Brightness_Class'])
y_test_final = pd.concat([X_test_0['Id'], y_test['Brightness_Class']], axis = 1)
y_test_final.to_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\y_test result.csv',index=None)

In [123]:
# Decision Tree
# Test Accuracy : 0.77804 

dec_tree = DecisionTreeClassifier(random_state=6)
dec_tree.fit(X_train, yy_train.values.ravel())

#Predicting the test set results
y_pred = dec_tree.predict(X_tt)
# Obtaining the precision, recall, f1-score respectively for test data
print(precision_recall_fscore_support(y_tt, y_pred, pos_label=1, average='binary'))

#Predicting the test set results
y_test = dec_tree.predict(X_test)
y_test = pd.DataFrame(y_test, columns = ['Brightness_Class'])
y_test_final = pd.concat([X_test_0['Id'], y_test['Brightness_Class']], axis = 1)
y_test_final.to_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\y_test result.csv',index=None)

(1.0, 1.0, 1.0, None)


In [78]:
# Support Vector Classifier
# Test Accuracy : 

model = SVC(random_state=6)
model.fit(X_train, yy_train.values.ravel())

#Predicting the test set results
y_test = model.predict(X_test)

y_test = pd.DataFrame(y_test, columns = ['Brightness_Class'])
y_test_final = pd.concat([X_test_0['Id'], y_test['Brightness_Class']], axis = 1)
y_test_final.to_csv(r'C:\Users\kantg\OneDrive\Desktop\CMU\ML for Science\HW4\y_test result.csv',index=None)

Training_accuracy = model.score(X_train, yy_train)
print(Training_accuracy)

0.9318379580392536
