### Problem-1

You are provided with a protein dataset. Learn a Decision Tree regressor/classifier 
and Random Forest (RF) regressor/classifier on the dataset separately and report 
your results with observation as mentioned below. You should optimize 
hyperparameters available for both Decision tree and RF regressor/classifier 
should report best results only.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
#importing train data
X_train = pd.read_csv('Dataset-assignment-6/Train data/X_train.csv', sep=' ')
y_train = pd.read_csv('Dataset-assignment-6/Train data/Y_train.csv')
X_train.head(), y_train.head()

(     gauss1      gauss2  repulsion  hydrophobic  hydrogen
 0  48.02108   434.90009    1.00229     17.16027   1.04153
 1  45.86394   906.54910    4.54990      0.00000   7.21115
 2  49.45446   708.90695    4.56065     10.12192   5.42312
 3  54.99922   768.05907    5.70052     31.01157   2.34365
 4  53.45864  1053.90858    1.63114      0.00000   2.94989,
    affinity
 0 -2.745424
 1 -2.745424
 2 -2.745424
 3 -2.745424
 4 -2.814060)

In [5]:
#importing test data
X_test = pd.read_csv('Dataset-assignment-6/test data/X_test.csv', sep=' ')
y_test = pd.read_csv('Dataset-assignment-6/test data/Y_test.csv')
X_test.head(), y_test.head()

(     gauss1      gauss2  repulsion  hydrophobic  hydrogen
 0  89.12005  1011.21787    4.03982     20.12528   2.20970
 1  77.28506   885.98707    7.25603      0.00000   7.58789
 2  41.04995   572.64999    0.76219      6.28351   1.36514
 3  91.11996   979.83764    7.17185      0.00000  10.09426
 4  65.07444   776.79753    5.83319      0.00000   8.87687,
    affinity
 0 -2.841514
 1 -3.061148
 2 -3.116057
 3 -3.129784
 4 -3.225874)

In [22]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.fit_transform(X_test)

X_train

array([[0.13948503, 0.0419371 , 0.0410062 , 0.0959188 , 0.04567277],
       [0.13206004, 0.16187828, 0.2061618 , 0.        , 0.31622054],
       [0.1444188 , 0.11161753, 0.20666226, 0.05657734, 0.23781254],
       ...,
       [0.44118946, 0.31607728, 0.21063937, 0.19760863, 0.16320429],
       [0.46194503, 0.41481286, 0.16549963, 0.30679929, 0.14498921],
       [0.31672902, 0.2813191 , 0.38883103, 0.03427746, 0.42804302]])

In [23]:
from sklearn.metrics import mean_squared_error

#defining function for calculating mse for regressor
def mse_calculation(model, X, y):
    return mean_squared_error(y, model.predict(X))
    

### Decision Tree Regression

###### Without hyper parameter tuning

In [25]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

tree = DecisionTreeRegressor(random_state=10)
tree.fit(X_train, y_train)

print("MSE for train data   :{}".format(mse_calculation(tree, X_train, y_train)))
print("MSE for test data    :{}".format(mse_calculation(tree, X_test, y_test)))

MSE for train data   :0.0
MSE for test data    :11.065065213808497


###### With hyper parameter tuning

In [26]:
#parameter for tuning
tree_params = {
    "criterion":("mse", "mae"), 
    "splitter":("best", "random"), 
    "max_depth":(list(range(1, 20))), 
    "min_samples_split":[2, 3, 4], 
    "min_samples_leaf":list(range(1, 20)), 
}

#grid search for best parameters
model_tree = DecisionTreeRegressor(random_state=10)
tree_cv = GridSearchCV(model_tree, tree_params, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=1)
tree_cv.fit(X_train, y_train)

best_tree_params = tree_cv.best_params_
print('\nBest Parameters for Decision Tree:\n{}'.format(best_tree_params))

Fitting 3 folds for each of 4332 candidates, totalling 12996 fits

Best Parameters for Decision Tree:
{'criterion': 'mae', 'max_depth': 10, 'min_samples_leaf': 15, 'min_samples_split': 2, 'splitter': 'random'}


In [27]:
best_tree = DecisionTreeRegressor(**best_tree_params)                  #best DT regressor
best_tree.fit(X_train, y_train)

#mse for train and test data
print("MSE for train data   :{}".format(mse_calculation(best_tree, X_train, y_train)))
print("MSE for test data    :{}".format(mse_calculation(best_tree, X_test, y_test)))

MSE for train data   :4.841743362893848
MSE for test data    :6.928168829427344


### Random Forest Regression

###### Without hyper parameter tuning

In [28]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(random_state=10)
forest.fit(X_train, y_train)

#mse for train and test data
print("MSE for train data   :{}".format(mse_calculation(forest, X_train, y_train)))
print("MSE for test data    :{}".format(mse_calculation(forest, X_test, y_test)))

  forest.fit(X_train, y_train)


MSE for train data   :0.6231256019693173
MSE for test data    :6.630470026107928


###### With hyper parameter tuning

In [17]:
#parameters for tuning
forest_params = {
    "n_estimators":list(range(40,101,20)), 
    "max_depth":list(range(1, 20)), 
    "min_samples_split":[2, 3, 4], 
    "min_samples_leaf":list(range(1, 20)), 
}

#grid search for best parametere
model_forest = RandomForestRegressor(random_state=1)
forest_cv = GridSearchCV(model_forest, forest_params, scoring='neg_mean_squared_error', cv=3, n_jobs=-1,verbose=1)
forest_cv.fit(X_train, y_train)

best_forest_params = forest_cv.best_params_
print('\nBest Parameters for Random Forest:\n{}'.format(best_forest_params))

Fitting 3 folds for each of 4332 candidates, totalling 12996 fits

Best Parameters for Random Forest:
{'max_depth': 13, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 80}


In [18]:
best_forest = RandomForestRegressor(**best_forest_params)                #best random forest regressor
best_forest.fit(X_train, y_train)

#calculation of mse for train and test data
print("MSE for train data    :{}".format(mse_calculation(best_forest, X_train, y_train)))
print("MSE for test data     :{}".format(mse_calculation(best_forest, X_test, y_test)))

MSE for train data    :0.024854683729660536
MSE for test data     :0.06975705797089972


### Problem-2

Generate a random n-class classification problem(Hint: may use 
make_classification method from sklearn.datasets) and implement 
AdaBoostClassifier on this custom dataset.

In [93]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#define a function for printing different classification performace parameters
def print_score(clf, X, y):
    y_pred = clf.predict(X)
    clf_report = pd.DataFrame(classification_report(y, y_pred,output_dict=True)).round(2)
    print(f"Accuracy Score: {accuracy_score(y, y_pred) * 100:.2f}%")
    print(f"\nClassification Report:\n{clf_report}")
    print(f"\nConfusion Matrix:\n{confusion_matrix(y, y_pred)}")
       

In [104]:
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

#creating a dataset with 3 classes and 6 features
X, y = make_classification(n_samples=3000, n_classes=2, n_features=6, n_informative=3, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [105]:
#Ada boost classificaion
clf = AdaBoostClassifier(n_estimators=100, random_state=42)              
clf.fit(X_train, y_train)

#printing performance parameters
print('Train Data\n______________________')
print_score(clf, X_train, y_train)

print('\nTest Data\n______________________')
print_score(clf, X_test, y_test)


Train Data
______________________
Accuracy Score: 96.71%

Classification Report:
                 0        1  accuracy  macro avg  weighted avg
precision     0.96     0.97      0.97       0.97          0.97
recall        0.97     0.96      0.97       0.97          0.97
f1-score      0.97     0.97      0.97       0.97          0.97
support    1045.00  1055.00      0.97    2100.00       2100.00

Confusion Matrix:
[[1016   29]
 [  40 1015]]

Test Data
______________________
Accuracy Score: 94.44%

Classification Report:
                0       1  accuracy  macro avg  weighted avg
precision    0.95    0.94      0.94       0.94          0.94
recall       0.94    0.95      0.94       0.94          0.94
f1-score     0.94    0.94      0.94       0.94          0.94
support    450.00  450.00      0.94     900.00        900.00

Confusion Matrix:
[[421  29]
 [ 21 429]]
