In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.ticker import FormatStrFormatter

from preamble import *
import pandas as pd
import graphviz
import math
from scipy.stats import wilcoxon
from scipy import arange

from sklearn import preprocessing
from sklearn.preprocessing import Imputer, LabelEncoder, MinMaxScaler, StandardScaler, Normalizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score  

from pymongo import MongoClient
import json

The following landmarking meta-features are calculated (Reif et al. 2012; Abdelmessih et al. 2010).

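The performance values of the following simple learners are used as meta-features (the code below scores each learner with the 10-fold cross-validated ROC AUC rather than plain accuracy): Naive Bayes, Linear Discriminant Analysis, One-Nearest Neighbor, Decision Node, Random Node.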

- **Naive Bayes Learner** is a probabilistic classifier, based on Bayes’ Theorem:
$$ p(X|Y ) = \frac{p(Y|X) \cdot p(X)}{p(Y )} $$

    where $p(X)$ is the prior probability and $p(X|Y)$ is the posterior probability. It is called naive because it
    assumes that all attributes are independent of each other given the class.
- **Linear Discriminant Learner** is a type of discriminant analysis, which is understood as the grouping and separation of categories according to specific features. Linear discriminant is basically finding a linear combination of features that separates the classes best. The resulting separation model is a line, a plane, or a hyperplane, depending on the number of features combined. 

- **One Nearest Neighbor Learner** is a classifier based on instance-based learning. A test point is assigned to the class of the nearest point within the training set. 

- **Decision Node Learner** is a classifier based on the information gain of attributes. The information gain indicates how informative an attribute is with respect to the classification task using its entropy. The higher the variability of the attribute values, the higher its information gain. This learner selects the attribute with the highest information gain. Then, it creates a single node decision tree consisting of the chosen attribute as a split node. 

- **Randomly Chosen Node Learner** is a classifier that also results in a single decision node, but the split attribute is chosen at random.

In [None]:
def pipeline(dataset, X, y, estimator):
    """Cross-validate ``estimator`` behind a mean imputer and record the result.

    The mean of the 10-fold ``roc_auc`` scores is appended to the
    module-level lists ``scores`` and ``score_tmp``; the estimator's class
    name is appended to the module-level list ``classifier``.

    Parameters
    ----------
    dataset : identifier of the dataset; accepted but not used in the body.
    X, y : features and target passed straight to ``cross_val_score``.
    estimator : the final step of the pipeline.

    Returns
    -------
    None
    """
    # Only imputation precedes the estimator; no scaling or one-hot
    # encoding step is part of the pipeline.
    steps = [
        ('Imputer', preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),
        ('classifiers', estimator),
    ]
    model = Pipeline(steps)

    fold_scores = cross_val_score(model, X, y, cv=10, scoring='roc_auc', n_jobs=-1)
    mean_score = np.mean(fold_scores)

    # Global running list first, then the per-dataset list.
    scores.append(mean_score)
    score_tmp.append(mean_score)

    estimator_name = str(estimator.__class__.__name__)
    classifier.append(estimator_name)

# Accumulators that pipeline() appends to across all datasets.
scores = []
classifier = []
# Ids of datasets that were skipped because their target has more than two
# distinct values. Initialised here because the else-branch of the loop
# below appends to it; without this the first such dataset raises NameError.
multiclass = []
# atleast_1d keeps the loop working when the CSV holds a single id:
# genfromtxt then returns a 0-d array, which cannot be iterated.
datasets = np.atleast_1d(np.genfromtxt('datasetID2.csv', delimiter=',', dtype=int))
db = connet_mongoclient('109.238.10.185')

for dataset in datasets:
    # Per-dataset scores; pipeline() appends to this module-level list, so it
    # must be reset before the landmarkers of each dataset are evaluated.
    score_tmp = []
    data = oml.datasets.get_dataset(dataset) 
    X, y = data.get_data(target=data.default_target_attribute)
    # scoring='roc_auc' inside pipeline() is only run for targets with at most
    # two distinct values; everything else is recorded in `multiclass`.
    if len(set(y)) <= 2:
        pipeline(dataset, X, y, KNeighborsClassifier(n_neighbors = 1)) # One-nearest neighbor
        pipeline(dataset, X, y, LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')) # Linear Discriminant Analysis
        pipeline(dataset, X, y, GaussianNB()) # Gaussian Naive Bayes
        pipeline(dataset, X, y, DecisionTreeClassifier(criterion='entropy', splitter='best', 
                                    max_depth=1, random_state=0)) # Decision Node Learner
        pipeline(dataset, X, y, DecisionTreeClassifier(criterion='entropy', splitter='random',
                                    max_depth=1, random_state=0)) # Randomly Chosen Node Learner
        print(dataset, score_tmp)
        # One document per dataset; 'score' holds the five landmarker scores
        # in the order the learners were evaluated above.
        db.landmarkers.insert_one({'dataset': str(dataset),
                                   'score': score_tmp})
    else: 
        multiclass.append(dataset)