# *Aufgabe: SVM und Interpretation ML*

0. Imports

In [17]:
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
import time
from sklearn.datasets import load_diabetes
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from scipy import stats
import os

### Aufgabe 1. SVM

1. Load and split the data

In [18]:
data = pd.read_csv(os.path.join("..", "..", "..", "data", "mpg.csv"))
X  = data.drop(["mpg"],axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, data["mpg"])

minmax = MinMaxScaler()

x_train = pd.DataFrame(minmax.fit_transform(x_train), columns=X.columns)
x_test = minmax.fit_transform(x_test)

print('X Train: {}'.format(x_train.shape)) 
print('Y Train: {}'.format(y_train.shape)) 
print('X Test: {}'.format(x_test.shape)) 
print('Y Test: {}'.format(y_test.shape))

X Train: (298, 7)
Y Train: (298,)
X Test: (100, 7)
Y Test: (100,)


In [11]:
# An alternative data
features, target = load_diabetes(return_X_y=True, as_frame=True)

X = features
x_train, x_test, y_train, y_test = train_test_split(features, target)

minmax = MinMaxScaler()

x_train = pd.DataFrame(minmax.fit_transform(x_train), columns=X.columns)
x_test = minmax.fit_transform(x_test)
print('X Train: {}'.format(x_train.shape)) 
print('Y Train: {}'.format(y_train.shape)) 
print('X Test: {}'.format(x_test.shape)) 
print('Y Test: {}'.format(y_test.shape))

X Train: (331, 10)
Y Train: (331,)
X Test: (111, 10)
Y Test: (111,)


2. Train and evaluate all the models

In [12]:
import warnings
warnings.filterwarnings('ignore')

def mse(y_pred,y):
    return np.mean((y_pred-y)**2)

def train_evaluate_time(method):
    start_time = time.time()
    methods[method].fit(x_train, y_train)
    running_time = round(time.time() - start_time,2)
    model_predict = methods[method].predict(x_test)
    model_mse = mse(model_predict,y_test)
    print("Genauigkeit von {}: {}; running time: {}".format(method, model_mse, running_time))


methods = {"knn": KNeighborsRegressor(), "linregr": LinearRegression(), "tree": DecisionTreeRegressor(), 
           "forest": RandomForestRegressor(), "ada": AdaBoostRegressor(), "svm": svm.SVR()}


for method in methods:
    train_evaluate_time(method)


Genauigkeit von knn: 4478.1070270270275; running time: 0.0
Genauigkeit von linregr: 4207.273198744859; running time: 0.0
Genauigkeit von tree: 8631.81081081081; running time: 0.0
Genauigkeit von forest: 3613.722252252252; running time: 0.23
Genauigkeit von ada: 3678.2506772723686; running time: 0.06
Genauigkeit von svm: 5288.983567312791; running time: 0.0


### Aufgabe 2: Probabilistische Methoden

### Aufgabe 1. Gaussian Naive Bayes

1. Import and Train/Test split

In [19]:
census = pd.read_csv(os.path.join("..", "..", "..", "data", "census.csv"))[['age', 'hours-per-week', 'target']]
census["target"] = census["target"].map({" <=50K":0," >50K":1})
census.head(5)

Unnamed: 0,age,hours-per-week,target
0,39,40,0
1,50,13,0
2,38,40,0
3,53,40,0
4,28,40,0


In [14]:
x_train, x_test, y_train, y_test = train_test_split(census.drop(["target"],axis=1),census["target"])
print('X Train: {}'.format(x_train.shape))
print('Y Train: {}'.format(y_train.shape))
print('X Test: {}'.format(x_test.shape))
print('Y Test: {}'.format(y_test.shape))

# Normalize: Age und hours-per-week
minmax = MinMaxScaler()

x_train['age'] = minmax.fit_transform(x_train['age'].to_numpy().reshape(-1,1))
x_train['hours-per-week'] = minmax.fit_transform(x_train['hours-per-week'].to_numpy().reshape(-1,1))

x_test['age'] = minmax.fit_transform(x_test['age'].to_numpy().reshape(-1,1))
x_test['hours-per-week'] = minmax.fit_transform(x_test['hours-per-week'].to_numpy().reshape(-1,1))

X Train: (24420, 2)
Y Train: (24420,)
X Test: (8141, 2)
Y Test: (8141,)


2. Gaussian Naive Bayes:

In [15]:
def calculate_probability(prior_high, prior_low, x):
    proba_high = np.log(prior_high*age_density_high(x[0])*hours_density_high(x[1]))
    proba_low = np.log(prior_low*age_density_low(x[0])*hours_density_low(x[1]))
    if proba_high >= proba_low:
        return 1
    else:
        return 0

# Prior Probabilities:

data_shape = y_train.shape[0]
prior_probability_high_income = y_train.loc[y_train == 1].shape[0]/data_shape
prior_probability_low_income = y_train.loc[y_train == 0].shape[0]/data_shape
print("Prior probability high income: {:.2f}".format(prior_probability_high_income*100))
print("Prior probability low income: {:.2f}".format(prior_probability_low_income*100))

data_train = x_train
data_train['target'] = y_train
age_density_high = stats.kde.gaussian_kde(data_train.loc[data_train["target"]==1]["age"])
age_density_low = stats.kde.gaussian_kde(data_train.loc[data_train["target"]==0]["age"])
hours_density_high = stats.kde.gaussian_kde(data_train.loc[data_train["target"]==1]["hours-per-week"])
hours_density_low = stats.kde.gaussian_kde(data_train.loc[data_train["target"]==0]["hours-per-week"])

# Prediction:
predicted = []
for element in x_test.to_numpy():
    predicted.append(calculate_probability(prior_probability_high_income, prior_probability_low_income, element))

print("Accuracy score: {}".format(accuracy_score(predicted, y_test)))

Prior probability high income: 23.94
Prior probability low income: 76.06
Accuracy score: 0.7640339024689842
