### Estid Lozano
### David Herrera
### Nicolas Gonzalez

In [None]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openml as oml
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
# from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Exercise 1 (Up-Sampling)

Hint: You might want to have a look at the imblearn.over_sampling package

**1.1.** Write a function `visualize_data(df, class_att)` that receives a dataframe with three columns (the one named `class_att` is the label column) and creates two plots: the first is a scatter plot of the data over the first two attributes; the second is a bar chart of the class distribution.

In [None]:
def visualize_data(df, class_att, dfUp=None):
    """Show a scatter plot of the first two features and a bar chart of class counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Original data: the feature columns plus the label column ``class_att``.
    class_att : str
        Name of the label column in ``df`` (and in ``dfUp`` when given).
    dfUp : pandas.DataFrame, optional
        Up-sampled version of ``df`` with the same columns. Rows not already
        present in ``df`` are drawn in orange on the same scatter plot, and
        their class counts are shown next to the original counts.

    Raises
    ------
    ValueError
        If ``df`` has fewer than two columns besides ``class_att``.

    Notes
    -----
    Two figures are displayed with ``plt.show()``; nothing is returned.
    """
    # Pick the plot axes from the feature columns only, so the result does not
    # depend on where the label column sits in the frame.
    features = [col for col in df.columns if col != class_att]
    if len(features) < 2:
        raise ValueError("visualize_data needs at least two feature columns")
    x_col, y_col = features[0], features[1]

    _, ax = plt.subplots()
    df.plot(ax=ax, kind='scatter', x=x_col, y=y_col, label="original")
    counts = df[class_att].value_counts()

    if dfUp is not None:
        # DataFrame.isin(DataFrame) compares cell by cell on matching index and
        # column labels. Rows that are identical to df at the same index become
        # all-NaN after masking and are dropped; what remains are the rows the
        # up-sampler added.
        # NOTE(review): this relies on dfUp keeping the original rows at their
        # original index labels -- confirm for the up-sampler in use.
        added = dfUp[~dfUp.isin(df)].dropna(how='all')
        added.plot(ax=ax, kind='scatter', x=x_col, y=y_col, color="orange", label="up")
        counts = pd.concat([counts, added[class_att].value_counts()], axis=1)
        counts.columns = ["original", "up"]

    plt.show()
    counts.plot(kind="bar", title=class_att)
    plt.show()

**1.2.** Load the rmftsa_sleepdata dataset from openml.org and visualize it with the above function. Then use the SMOTE method from imbalanced-learn (`imblearn.over_sampling.SMOTE`) to up-sample the minority class. Use the above function to plot the data again. Use a different color/symbol for the up-sampled instances.

Was the upsampling successful in that it generated reasonable new instances?

In [None]:
# Fetch rmftsa_sleepdata (OpenML id 679), attach the target as a "label"
# column and plot the untouched data.
dtset = oml.datasets.get_dataset(679)
X, y, catInd, attrs = dtset.get_data(target=dtset.default_target_attribute)
label_frame = pd.DataFrame({"label": y})
df = pd.concat([X, label_frame], axis=1)
visualize_data(df, "label")

In [None]:
# Over-sample with SMOTE (fixed seed), then plot the original data together
# with the rows SMOTE added.
smote = SMOTE(random_state=0)
XUp, yUp = smote.fit_resample(X, y)
up_label_frame = pd.DataFrame({"label": yUp})
dfUp = pd.concat([XUp, up_label_frame], axis=1)
visualize_data(df, "label", dfUp)

**Answer:** 

# Exercise 2 (Feature Scaling)

Load the amazon-commerce-reviews dataset (1457). Compare the prediction accuracy (5-fold CV) of a decision tree and logistic regression when using none or any of the feature scaling techniques seen in class.

Report the performance of all these combinations. Does feature scaling bring an advantage?

In [None]:
# Fetch amazon-commerce-reviews (OpenML id 1457). X and y are read by the
# cells below.
dtset = oml.datasets.get_dataset(1457)
target_name = dtset.default_target_attribute
X, y, catInd, attrs = dtset.get_data(target=target_name)

In [None]:
def predictAndCompare(technique, X):
    """Print the 5-fold cross-validated accuracy of a decision tree and a logistic regression.

    Parameters
    ----------
    technique : str
        Name of the feature-scaling technique that produced ``X``; only used
        in the printed header.
    X : array-like
        Feature matrix to evaluate.

    Notes
    -----
    The labels are read from the notebook-level variable ``y``, so the
    dataset-loading cell must have been run before this function is called.
    Nothing is returned; the mean and standard deviation of the fold scores
    are printed for each model.
    """
    print("Technique:", technique)
    models = (
        ("DecisionTree", DecisionTreeClassifier(max_leaf_nodes=3, min_samples_split=5)),
        ("LogisticRegression", LogisticRegression()),
    )
    for name, clf in models:
        # Without a `scoring` argument cross_val_score uses the estimator's own
        # score() method, which is accuracy for classifiers -- so the values
        # printed here are accuracies, not error rates.
        scores = cross_val_score(clf, X, y, cv=5)
        print("%s accuracy: mean of %0.2f with stand. dev. of %0.2f" % (name, scores.mean(), scores.std()))

In [None]:
# Baseline: the features exactly as loaded, without any scaling.
predictAndCompare(technique="None", X=X)

In [None]:
# Min-max scaling with the scaler's default settings.
min_max_scaler = preprocessing.MinMaxScaler()
X_min_max = min_max_scaler.fit_transform(X)
predictAndCompare("MinMax", X_min_max)

In [None]:
# Standardization with the scaler's default settings.
standard_scaler = preprocessing.StandardScaler()
X_standard = standard_scaler.fit_transform(X)
predictAndCompare("Standard", X_standard)

In [None]:
# Mean normalization: (x - mean) / (max - min), computed per feature.
# MinMaxScaler(feature_range=(-1, 1)) only rescales each feature to [-1, 1];
# it does not centre on the mean, so it is not the technique named here.
X_values = np.asarray(X, dtype=float)
feature_span = X_values.max(axis=0) - X_values.min(axis=0)
# Constant features have a zero span; divide by 1 instead so they map to 0
# rather than producing NaN from 0/0.
feature_span[feature_span == 0] = 1.0
predictAndCompare("Mean", (X_values - X_values.mean(axis=0)) / feature_span)

In [None]:
# Element-wise square-root transform of the features.
X_sqrt = np.sqrt(X)
predictAndCompare("Sqrt", X_sqrt)

In [None]:
# Power transform with the transformer's default settings.
power_transformer = preprocessing.PowerTransformer()
X_power = power_transformer.fit_transform(X)
predictAndCompare("Power", X_power)

In [None]:
# Unit-length scaling with the normalizer's default settings.
unit_normalizer = preprocessing.Normalizer()
X_unit = unit_normalizer.fit_transform(X)
predictAndCompare("Unit-Length", X_unit)

**Answer:**

# Exercise 3 (Feature Extraction)

Load the amazon-commerce-reviews dataset. Compare the prediction accuracy (5-fold CV) of a decision tree and logistic regression when using the original against landmark features (using all the (training) datapoints as landmarks).

Report the performances. Does landmarking bring an advantage?

# Exercise 4 (Feature selection)

Load the madelon dataset (1485). Compare the prediction accuracy (5-fold CV) of a decision tree and logistic regression when using none or the sklearn.feature_selection.chi2 or the sklearn.feature_selection.mutual_info_classif
criterion.

Use different selectors, e.g., SelectKBest, SelectPercentile, and GenericUnivariateSelect with different parameters.

Report the performance of all these combinations. Does feature selection bring an advantage?