## Imports

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression, LinearRegression
from numpy import mean, std
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

## Local Options

In [19]:
local_options = {
    "plot_confusion": True,
    "multi_class"   : True,
    "k_fold_value"  : 5,
    "scoring"       : 'f1_macro',
    "parallel_jobs" : 8
    }

## Dataset Loading

In [20]:
# read dataframe from...
df = pd.read_csv('ml_dataset.csv')
#df.head()
y = df.Sensor
X = df.drop(['Type', 'Sensor'] , axis=1)
#.3 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=8675309, shuffle=True)

## Grid search wrapper functions

In [21]:
def kfold_grid_search(X_train, y_train, parameters, model):
    print('Grid Search started...')
    cross_validator = KFold(n_splits=5, random_state=405782, shuffle=True)
    grid = GridSearchCV(estimator=model, param_grid=parameters, n_jobs=8, cv=cross_validator, scoring=local_options['scoring'])
    return grid.fit(X_test,y_test)

def print_search_results(res):
    print("Best: %f using %s" % (res.best_score_, res.best_params_))
    means = res.cv_results_['mean_test_score']
    stds = res.cv_results_['std_test_score']
    params = res.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    return

## Support Vector Machine
Tuning using __grid search__ for both _linear_ and _poly_ kernel with logarithmic _C_

In [None]:
kernel = ['linear', 'poly','rbf']#, 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
#gamma = ['scale']
p_span = dict(kernel=kernel,C=C)#,gamma=gamma)

svc = svm.SVC()
svc_search_result = kfold_grid_search(X_train, y_train, p_span, svc)

print_search_results(svc_search_result)


Grid Search started...


## Logistic Regression

In [22]:
#grid definition
#solvers = ['newton-cg', 'lbfgs', 'liblinear']
solvers = ['liblinear', 'saga']
penalty = ['l1', 'l2', 'elasticnet']
c_values = [1000, 100]
param_grid = dict(solver=solvers,penalty=penalty,C=c_values)

logreg = LogisticRegression()
lgrg_search_results = kfold_grid_search(X_train, y_train, param_grid, logreg)
print_search_results(lgrg_search_results)

Grid Search started...


 0.51662218 0.51248004 0.51251686 0.51199788        nan        nan]


Best: 0.519135 using {'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
0.519135 (0.006812) with: {'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
0.512673 (0.006785) with: {'C': 1000, 'penalty': 'l1', 'solver': 'saga'}
0.514053 (0.006602) with: {'C': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
0.512741 (0.006923) with: {'C': 1000, 'penalty': 'l2', 'solver': 'saga'}
nan (nan) with: {'C': 1000, 'penalty': 'elasticnet', 'solver': 'liblinear'}
nan (nan) with: {'C': 1000, 'penalty': 'elasticnet', 'solver': 'saga'}
0.516622 (0.005282) with: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.512480 (0.007166) with: {'C': 100, 'penalty': 'l1', 'solver': 'saga'}
0.512517 (0.006470) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.511998 (0.007278) with: {'C': 100, 'penalty': 'l2', 'solver': 'saga'}
nan (nan) with: {'C': 100, 'penalty': 'elasticnet', 'solver': 'liblinear'}
nan (nan) with: {'C': 100, 'penalty': 'elasticnet', 'solver': 'saga'}


## Linear Regression
Actually does not need tuning

In [10]:
# evaluate a logistic regression model using k-fold cross-validation

# prepare the cross-validation procedure
cv = KFold(n_splits=5, random_state=405782, shuffle=True)
# create model
model = LinearRegression()
# evaluate model
scores = cross_val_score(model, X_train, y_train, scoring=local_options['scoring'], cv=cv, n_jobs=-1)
# report performance
print('%s: %.3f (%.3f)' % (local_options['scoring'],mean(scores), std(scores)))

accuracy: nan (nan)


## Random Forest

In [23]:
n_estimators = [10, 100, 1000, 5000]
max_features = ['sqrt', 'log2']
grid = dict(n_estimators=n_estimators,max_features=max_features)

rfc=RandomForestClassifier()

rf_search_results = kfold_grid_search(X_train, y_train, grid, rfc)
print_search_results(rf_search_results)

Grid Search started...




Best: 0.613925 using {'max_features': 'log2', 'n_estimators': 5000}
0.603317 (0.008591) with: {'max_features': 'sqrt', 'n_estimators': 10}
0.613650 (0.009961) with: {'max_features': 'sqrt', 'n_estimators': 100}
0.613239 (0.008575) with: {'max_features': 'sqrt', 'n_estimators': 1000}
0.613765 (0.009541) with: {'max_features': 'sqrt', 'n_estimators': 5000}
0.605166 (0.009663) with: {'max_features': 'log2', 'n_estimators': 10}
0.611823 (0.008666) with: {'max_features': 'log2', 'n_estimators': 100}
0.613265 (0.008654) with: {'max_features': 'log2', 'n_estimators': 1000}
0.613925 (0.009942) with: {'max_features': 'log2', 'n_estimators': 5000}


## K-Nearest Neighbors

In [25]:
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)

knn = KNeighborsClassifier()
knn_search_results = kfold_grid_search(X_train, y_train, grid, knn)
print_search_results(knn_search_results)

Grid Search started...
Best: 0.564423 using {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
0.548791 (0.009565) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.548791 (0.009565) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.560359 (0.005047) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.550043 (0.006886) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.553291 (0.011118) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.549142 (0.009914) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.553627 (0.008545) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.550103 (0.009029) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.560427 (0.004968) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.551810 (0.006498) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights':

# Results so far
## Accuracy
### Logistic Regression
`Best: 0.696873 using {'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'}`

### Support Vector Machine

### Random Forest
`Best: 0.728772 using {'max_features': 'log2', 'n_estimators': 5000}`

### K-Nearest Neighbors
`Best: 0.687071 using {'metric': 'manhattan', 'n_neighbors': 17, 'weights': 'uniform'}`

## f1_macro
### Logistic Regression
`Best: 0.519135 using {'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'}`

### Support Vector Machine

### Random Forest
`Best: 0.613925 using {'max_features': 'log2', 'n_estimators': 5000}`

### K-Nearest Neighbors
`Best: 0.564423 using {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}`

