## CS677 - MACHINE LEARNING
## EXTRA CREDITS ASSIGNMENT - 1
### AAYUSH DESAI

## TASK 1 Train and fine-tune a Decision Tree for the moons dataset

In [25]:
from sklearn.datasets import make_moons
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

### Generate a moons dataset using `make_moons(n_samples=10000, noise=0.4)`

In [26]:
# 10,000 two-moon samples with noise 0.4; the fixed seed keeps the data reproducible.
x, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

### Split the dataset into a training set and a test set.

In [27]:
# Hold out 20% of the samples as the test set; the seed fixes the shuffle.
(x_train, x_test,
 y_train, y_test) = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### Use grid search with cross-validation (with the help of the GridSearchCV class) to find good hyperparameter values for a DecisionTreeClassifier.

In [28]:
# Search space: every max_leaf_nodes value from 5 to 99 combined with three
# min_samples_split settings (95 x 3 = 285 candidates).
param_grid = {
    'max_leaf_nodes': [*range(5, 100)],
    'min_samples_split': [2, 4, 6],
}

In [29]:
# Exhaustive search over param_grid with 3-fold cross-validation.
# n_jobs=-1 asks for parallel fitting; verbose=1 prints a progress summary.
hyperparameter_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [30]:
# Run the grid search on the training portion only; the test set stays untouched.
hyperparameter_search.fit(X=x_train, y=y_train)

Fitting 3 folds for each of 285 candidates, totalling 855 fits


In [31]:
# Display the estimator selected by the grid search (its repr shows the chosen
# hyperparameters). This object is reused below as the template for the forest.
hyperparameter_search.best_estimator_

In [32]:
# Predict on the held-out test set with the tuned search object and report
# the fraction of test labels it gets right.
grid_pred = hyperparameter_search.predict(x_test)
accuracy_score(y_true=y_test, y_pred=grid_pred)

0.8695

## TASK 2 Grow a forest.

#### Generate 1,000 subsets of the training set, each containing 100 randomly selected instances (using `ShuffleSplit`). Train one Decision Tree on each subset, using the best hyperparameter values found above. Evaluate these 1,000 Decision Trees on the test set. Since they were trained on smaller sets, these Decision Trees will likely perform worse than the first Decision Tree, achieving only about 80% accuracy.


In [33]:
from sklearn.model_selection import ShuffleSplit
import numpy as np
from sklearn.base import clone
from scipy.stats import mode

In [34]:
# Container for the (features, labels) training subsets, one pair per tree.
dataset_collection = list()

In [35]:
# Forest size and the number of training instances drawn for each tree.
trees, instances = 1000, 100

In [36]:
# One random split per tree. Everything except `instances` rows is sent to the
# "test" side, so each split's train side holds exactly `instances` rows.
splitter = ShuffleSplit(
    n_splits=trees,
    test_size=x_train.shape[0] - instances,
    random_state=42,
)

In [37]:
# Build one (features, labels) training subset per split.
# The list is rebuilt from scratch here rather than appended to, so re-running
# this cell cannot stack a second batch of subsets on top of the first.
# ShuffleSplit also yields the complementary "test" indices; they are not needed.
dataset_collection = [
    (x_train[train_index], y_train[train_index])
    for train_index, _ in splitter.split(x_train)
]

In [38]:
# The forest: `trees` independent, unfitted copies of the tuned decision tree.
f_pred = [
    clone(hyperparameter_search.best_estimator_)
    for _ in range(trees)
]

In [39]:
# Per-tree test-set accuracies are collected here.
accuracy_scores = list()

In [40]:
# Fit each tree on its own small subset and record its accuracy on the full
# test set. The score list is reset first so that re-running this cell does
# not append a duplicate set of scores to the previous run's results.
accuracy_scores = []
for tree_estimator, (x_train_split, y_train_split) in zip(f_pred, dataset_collection):
    tree_estimator.fit(x_train_split, y_train_split)
    y_predict = tree_estimator.predict(x_test)
    accuracy_scores.append(accuracy_score(y_test, y_predict))

In [41]:
# Mean test accuracy across all individual trees.
average_accuracy = np.asarray(accuracy_scores).mean()
average_accuracy

0.805471

#### For each test set instance, generate the predictions of the 1,000 Decision Trees, and keep only the most frequent prediction. This gives you majority-vote predictions over the test set.

In [42]:
# Prediction matrix: one row per tree, one column per test instance.
# Stored as uint8, matching the original preallocated array's dtype.
predict_trees = np.array(
    [tree_estimator.predict(x_test) for tree_estimator in f_pred],
    dtype=np.uint8,
)

In [43]:
# Majority vote: take the most frequent prediction down each column (axis=0),
# i.e. across all trees for each test instance. `n_votes` receives the count
# of the winning label and is not used further in the visible cells.
# NOTE(review): the shape of the returned arrays depends on the installed SciPy
# version (a leading length-1 axis may be kept) - the later reshape copes with both.
predictions_trees_majority, n_votes = mode(predict_trees, axis=0)

#### Evaluate these predictions on the test set

In [44]:
# Flatten the voted labels to 1-D before comparing them with the true test labels.
accuracy_score(y_test, predictions_trees_majority.ravel())

0.872