<a href="https://colab.research.google.com/github/fabiorodp/IN5550_Neural_Methods_in_Natural_Language_Processing/blob/main/assignment2/IN_STK5000_9000_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Assignment 2 - IN-STK5000/9000 - Autumn21**

**Students**: 
- Fábio Rodrigues Pereira
- Nicholas Walker
- Aurora Poggi

**Data**: 
- Ionosphere
- URL for description: https://archive.ics.uci.edu/ml/datasets/ionosphere
- URL for data: https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data
- URL for names: https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.names

**Task type**:
- Binary Classification



# Tasks
In this file, we will set up a pipeline for
1. Collecting data for an experiment **(DONE)**
2. Processing the data **(DONE)**
3. Running a machine learning algorithm **(DONE)**
4. Evaluating the algorithm **(DONE)**
5. Choosing the best algorithm and hyperparameters **(DONE)**

The main questions we need to answer are
1. How will data be collected?
2. How much data would we need? **(2 x 224 samples were enough to achieve 97% accuracy)**
3. What algorithm would be best? **(rf)**
4. How would the amount of data influence algorithm selection? **(Yes: oversampling the minority class up to the size of the majority class, i.e. a balanced class distribution, increased the accuracy from 93% to 97%.)**
5. How robust is our procedure to assumptions?

## Dependencies

In [None]:
!pip install pipelinehelper
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from pipelinehelper import PipelineHelper
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Shared random seed: passed as `random_state` to the resampling step,
# the classifiers and the randomized searches below.
seed = 1
# Passed as `n_jobs` to the classifiers and the randomized searches;
# -1 asks joblib to use every available core.
n_jobs = -1

import numpy as np
import abc
import scipy.stats

# This is a base class for generating classification problems
class BaseClassificationGenerator(abc.ABC):
    """Interface for generators of synthetic classification problems.

    The class derives from ``abc.ABC`` so that ``@abc.abstractmethod`` is
    actually enforced: the base class cannot be instantiated, and a subclass
    must implement ``generate`` before it can be.
    """

    ## generate n_points (x,y) pairs
    @abc.abstractmethod
    def generate(self, n_points):
        """Generate ``n_points`` (x, y) pairs. Must be overridden."""

    ## generate an (x,y) at x.
    def generate_at_point(self, x):
        """Generate an (x, y) pair at the given ``x``.

        Optional hook: this default implementation does nothing and
        returns ``None``; subclasses may override it.
        """
        pass

# Generates data where $X|Y=i \sim Normal(\mu_i, \Sigma_i)$
class GaussianClassificationGenerator(BaseClassificationGenerator):
    """Synthetic classification data with one Gaussian distribution per class.

    Class means are drawn from Uniform([0, 1]^n) and class covariances from a
    Wishart distribution with ``n_dimensions`` degrees of freedom and an
    identity scale matrix.

    Parameters
    ----------
    n_dimensions : int
        Number of features of every generated point.
    class_proportions : array-like of shape (n_classes,)
        Probability of each class; used as ``p`` when sampling the labels.
    random_state : None, int or numpy.random.RandomState, optional
        ``None`` (default) keeps the previous behaviour of drawing from
        NumPy's global random state. An int or ``RandomState`` makes the
        generated problem and data reproducible.
    """

    # Initialise the class centers
    def __init__(self, n_dimensions, class_proportions, random_state=None):
        self.n_dimensions = n_dimensions
        # asarray accepts plain lists and leaves an ndarray argument untouched
        self.class_proportions = np.asarray(class_proportions)
        self.n_classes = self.class_proportions.shape[0]

        if random_state is None:
            # the np.random module functions share the global random state
            self._rng = np.random
            wishart_rng = None
        elif isinstance(random_state, np.random.RandomState):
            self._rng = wishart_rng = random_state
        else:
            self._rng = wishart_rng = np.random.RandomState(random_state)

        self.means = np.zeros([self.n_classes, self.n_dimensions])
        self.covariances = np.zeros([self.n_classes, self.n_dimensions, self.n_dimensions])
        # Generate means   $\mu_i \sim Uniform([0,1]^n)$
        # and covariances  $\Sigma_i \sim Wishart(n, I_n)$
        for i in range(self.n_classes):
            self.means[i] = self._rng.uniform(size=self.n_dimensions)
            self.covariances[i] = scipy.stats.wishart.rvs(
                self.n_dimensions,
                np.identity(self.n_dimensions),
                random_state=wishart_rng,
            )

    # generate data
    def generate(self, n_points):
        """Sample ``n_points`` labelled points.

        Returns
        -------
        list
            ``[X, Y]`` where ``X`` has shape ``(n_points, n_dimensions)`` and
            ``Y`` holds the ``n_points`` integer class labels in
            ``range(n_classes)``.
        """
        Y = self._rng.choice(self.n_classes, p=self.class_proportions, size=n_points)
        X = np.zeros([n_points, self.n_dimensions])
        # one draw per point, from the Gaussian belonging to that point's class
        for t in range(n_points):
            X[t] = self._rng.multivariate_normal(self.means[Y[t]], self.covariances[Y[t]])
        return [X, Y]



## Importing data

In [None]:
# importing data
# The UCI file has no header row: without header=None pandas would consume the
# first sample as column names and silently drop it from the data.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data",
    header=None,
)
# Every column but the last is a feature; the last one is the class label,
# which the rest of the notebook accesses as `df.g`.
df.columns = [f"x{i}" for i in range(df.shape[1] - 1)] + ["g"]

## Dealing with raw data and resampling

In [None]:
# sanity check: the label column should contain exactly two distinct values
df["g"].unique()

array(['b', 'g'], dtype=object)

In [None]:
# converting to binary values 0 and 1 ('b' -> 0, anything else -> 1)
# Labels that are already 0/1 map to themselves, so re-running this cell
# no longer turns every label into 1.
df["g"] = (~df["g"].isin(["b", 0])).astype("int64")

# checking the balance between classes
df["g"].value_counts()  # unbalanced

1    224
0    126
Name: g, dtype: int64

In [None]:
# balancing classes
# oversampling the minority class (label 0) up to the size of the majority
# class (label 1); the target size is read from the data instead of hard-coded
# NOTE(review): sampling with replacement duplicates rows, so when this frame
# is later cross-validated the same row can land in both the training and the
# validation fold -- scores on the resampled data are likely optimistic.
n_majority = int((df.g == 1).sum())
df_minority = resample(
    df[df.g == 0],
    replace=True,
    n_samples=n_majority,
    random_state=seed,
    stratify=None
)

# putting both together
df_resampled = pd.concat([df_minority, df[df.g == 1]])
del df_minority

# checking the balance between classes
df_resampled.g.value_counts()  # balanced

1    224
0    224
Name: g, dtype: int64

In [None]:
# Synthetic Gaussian data with the same number of features and the same class
# balance as the real data set.
# The generator draws from NumPy's global random state, so seed it first;
# otherwise the artificial data (and every score computed on it) changes
# between runs.
np.random.seed(seed)

class_counts = df.g.value_counts().sort_index()  # counts ordered by label: 0, 1
total = class_counts.sum()
n_features = df.shape[1] - 1  # every column except the label `g`

generator = GaussianClassificationGenerator(n_features, (class_counts / total).to_numpy())
df_artificial = generator.generate(500)

In [None]:
# separating the feature columns from the class column
def _split_features_target(frame):
    """Return (all columns but the last, the `g` label column) of ``frame``."""
    return frame.iloc[:, :-1], frame.g


X, Y = _split_features_target(df)
X_resampled, Y_resampled = _split_features_target(df_resampled)

# the full frames are not needed beyond this point
del df, df_resampled

In [None]:
# the generator returned a two-element list: features first, labels second
X_artificial, Y_artificial = df_artificial[0], df_artificial[1]
del df_artificial

## Creating pipelines

In [None]:
# Two-step pipeline: a scaler followed by a classifier. Each step is a
# PipelineHelper holding several named candidates for the search to pick from.
scaler_step = PipelineHelper([
    ('std', StandardScaler()),
    ('maxabs', MaxAbsScaler()),
    ('minmax', MinMaxScaler()),
])
classifier_step = PipelineHelper([
    ('lr', LogisticRegression(solver='saga', random_state=seed, n_jobs=n_jobs)),
    ('rf', RandomForestClassifier(random_state=seed, n_jobs=n_jobs)),
])
pipe = Pipeline([
    ('scaler', scaler_step),
    ('classifier', classifier_step),
])

# Hyper-parameter values per candidate, keyed as '<candidate>__<parameter>'.
scaler_grid = {
    'std__with_mean': [True, False],
    'std__with_std': [True, False],
    'maxabs__copy': [True],
    'minmax__copy': [True],
}
classifier_grid = {
    'lr__penalty': ['l1', 'l2'],
    'lr__max_iter': [100, 300, 500],
    'rf__n_estimators': [100, 300, 500],
    'rf__criterion': ['gini', 'entropy'],
    'rf__bootstrap': [True, False]
}

# search space handed to the randomized search
params = {
    'scaler__selected_model': pipe.named_steps['scaler'].generate(scaler_grid),
    'classifier__selected_model': pipe.named_steps['classifier'].generate(classifier_grid),
}

## Creating random grid search with cross validation

In [None]:
# Randomized search over the scaler/classifier candidates, scored by
# 10-fold cross-validated accuracy, on the original (unbalanced) classes.
search_settings = dict(
    estimator=pipe,
    param_distributions=params,
    n_iter=100,
    scoring='accuracy',
    n_jobs=n_jobs,
    refit=True,
    cv=10,
    random_state=seed,
)
rgs = RandomizedSearchCV(**search_settings)
rgs.fit(X, Y)  # using unbalanced classes

# best configuration and its mean cross-validated accuracy
print(rgs.best_params_, rgs.best_score_, sep="\n")

{'scaler__selected_model': ('minmax', {'copy': True}), 'classifier__selected_model': ('rf', {'bootstrap': False, 'criterion': 'gini', 'n_estimators': 500})}
0.9371428571428572


In [None]:
# Same randomized search, this time fitted on the oversampled (balanced) data.
search_settings = dict(
    estimator=pipe,
    param_distributions=params,
    n_iter=100,
    scoring='accuracy',
    n_jobs=n_jobs,
    refit=True,
    cv=10,
    random_state=seed,
)
rgs1 = RandomizedSearchCV(**search_settings)
rgs1.fit(X_resampled, Y_resampled)  # using balanced classes

# best configuration and its mean cross-validated accuracy
print(rgs1.best_params_, rgs1.best_score_, sep="\n")

{'scaler__selected_model': ('std', {'with_mean': True, 'with_std': False}), 'classifier__selected_model': ('rf', {'bootstrap': False, 'criterion': 'gini', 'n_estimators': 100})}
0.9798989898989898


In [None]:
# Same randomized search, this time fitted on the synthetic Gaussian data.
search_settings = dict(
    estimator=pipe,
    param_distributions=params,
    n_iter=100,
    scoring='accuracy',
    n_jobs=n_jobs,
    refit=True,
    cv=10,
    random_state=seed,
)
rgs2 = RandomizedSearchCV(**search_settings)
rgs2.fit(X_artificial, Y_artificial)  # using artificial classes

# best configuration and its mean cross-validated accuracy
print(rgs2.best_params_, rgs2.best_score_, sep="\n")

{'scaler__selected_model': ('minmax', {'copy': True}), 'classifier__selected_model': ('rf', {'bootstrap': False, 'criterion': 'gini', 'n_estimators': 500})}
0.8520000000000001


## Evaluating results

In [None]:
# Results on the unbalanced data: keep only the columns needed for the
# comparison and order the candidates from best to worst mean test score.
rts = (
    pd.DataFrame(rgs.cv_results_)[
        ['mean_test_score', 'param_scaler__selected_model', 'param_classifier__selected_model']
    ]
    .sort_values(by='mean_test_score', ascending=False)
)
rts.head(50)

Unnamed: 0,mean_test_score,param_scaler__selected_model,param_classifier__selected_model
44,0.937143,"(std, {'with_mean': True, 'with_std': False})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
90,0.937143,"(std, {'with_mean': True, 'with_std': True})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
3,0.937143,"(minmax, {'copy': True})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
6,0.937143,"(maxabs, {'copy': True})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
23,0.937143,"(std, {'with_mean': False, 'with_std': True})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
12,0.937143,"(std, {'with_mean': False, 'with_std': False})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
46,0.934286,"(std, {'with_mean': False, 'with_std': False})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
77,0.934286,"(std, {'with_mean': False, 'with_std': False})","(rf, {'bootstrap': True, 'criterion': 'entropy..."
65,0.934286,"(std, {'with_mean': False, 'with_std': False})","(rf, {'bootstrap': True, 'criterion': 'entropy..."
22,0.934286,"(minmax, {'copy': True})","(rf, {'bootstrap': True, 'criterion': 'entropy..."


In [None]:
# Results on the balanced data: keep only the columns needed for the
# comparison and order the candidates from best to worst mean test score.
rts_balanced = (
    pd.DataFrame(rgs1.cv_results_)[
        ['mean_test_score', 'param_scaler__selected_model', 'param_classifier__selected_model']
    ]
    .sort_values(by='mean_test_score', ascending=False)
)
rts_balanced

Unnamed: 0,mean_test_score,param_scaler__selected_model,param_classifier__selected_model
97,0.979899,"(maxabs, {'copy': True})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
34,0.979899,"(std, {'with_mean': True, 'with_std': False})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
45,0.979899,"(std, {'with_mean': False, 'with_std': True})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
66,0.979899,"(minmax, {'copy': True})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
0,0.975455,"(maxabs, {'copy': True})","(rf, {'bootstrap': False, 'criterion': 'entrop..."
...,...,...,...
47,0.892929,"(minmax, {'copy': True})","(lr, {'max_iter': 300, 'penalty': 'l2'})"
89,0.892929,"(minmax, {'copy': True})","(lr, {'max_iter': 100, 'penalty': 'l2'})"
25,0.892929,"(minmax, {'copy': True})","(lr, {'max_iter': 500, 'penalty': 'l2'})"
87,0.884040,"(minmax, {'copy': True})","(lr, {'max_iter': 500, 'penalty': 'l1'})"


In [None]:
# Results on the artificial data, ordered from best to worst mean test score.
rts_artificial = (
    pd.DataFrame(rgs2.cv_results_)[
        ['mean_test_score', 'param_scaler__selected_model', 'param_classifier__selected_model']
    ]
    .sort_values(by='mean_test_score', ascending=False)
)
rts_artificial

Unnamed: 0,mean_test_score,param_scaler__selected_model,param_classifier__selected_model
90,0.852,"(std, {'with_mean': True, 'with_std': True})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
44,0.852,"(std, {'with_mean': True, 'with_std': False})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
3,0.852,"(minmax, {'copy': True})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
23,0.852,"(std, {'with_mean': False, 'with_std': True})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
6,0.852,"(maxabs, {'copy': True})","(rf, {'bootstrap': False, 'criterion': 'gini',..."
...,...,...,...
69,0.662,"(std, {'with_mean': False, 'with_std': False})","(lr, {'max_iter': 100, 'penalty': 'l1'})"
83,0.662,"(std, {'with_mean': True, 'with_std': False})","(lr, {'max_iter': 300, 'penalty': 'l1'})"
94,0.662,"(std, {'with_mean': True, 'with_std': False})","(lr, {'max_iter': 500, 'penalty': 'l1'})"
98,0.662,"(std, {'with_mean': True, 'with_std': False})","(lr, {'max_iter': 100, 'penalty': 'l1'})"
