In [1]:
import sqlite3
from ast import literal_eval
import pandas
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, matthews_corrcoef
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

## numerator_np

In [2]:
with sqlite3.connect("CI.db") as db:
    c = db.cursor()
    df = pandas.read_sql_query("SELECT hs,ci FROM numerator_np", db)

# cast 'hilb'-column from a column of strings into a column of lists of ints
df['hs'] = df['hs'].transform(literal_eval)

hs=df['hs'].to_list();
ci=df['ci'].to_list();

hs_train, hs_test, ci_train, ci_test = train_test_split(\
    df['hs'].to_list(), df['ci'].to_list(), test_size=0.2,
    shuffle=True)

# MLP classifier:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)
clf.fit(hs_train, ci_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [3]:
# predictions and metrics:
ci_pred = clf.predict(hs_test)
print(f'MCC: {matthews_corrcoef(ci_test, ci_pred)}')
print(f'accuracy score: {accuracy_score(ci_test, ci_pred)}')

MCC: 0.9960337469185694
accuracy score: 0.9980732177263969


## taylor_np_20

In [26]:
with sqlite3.connect("CI.db") as db:
    c = db.cursor()
    df = pandas.read_sql_query("SELECT hs,ci FROM taylor_np_20", db)

# cast 'hilb'-column from a column of strings into a column of lists of ints
df['hs'] = df['hs'].transform(literal_eval)

# normalisation: divide entries by last entry and remove last entry
df['hs'] = df['hs'].transform(lambda h: [h[i] / h[-1]
                                             for i in range(0, len(h) - 1)])

hs=df['hs'].to_list();
ci=df['ci'].to_list();

hs_train, hs_test, ci_train, ci_test = train_test_split(\
    df['hs'].to_list(), df['ci'].to_list(), test_size=0.2,
    shuffle=True)

# MLP classifier:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)
clf.fit(hs_train, ci_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [12]:
# predictions and metrics:
ci_pred = clf.predict(hs_test)
print(f'MCC: {matthews_corrcoef(ci_test, ci_pred)}')
print(f'accuracy score: {accuracy_score(ci_test, ci_pred)}')

MCC: 0.25447126372869
accuracy score: 0.6107404936624417


In [27]:
# random forest classifier. n_estimators random tree of max_depth:
clf = RandomForestClassifier(n_estimators=100, max_depth=100)
clf.fit(hs_train, ci_train)

# predictions and metrics:
ci_pred = clf.predict(hs_test)
print(f'MCC: {matthews_corrcoef(ci_test, ci_pred)}')
print(f'accuracy score: {accuracy_score(ci_test, ci_pred)}')

MCC: 0.45735164298039954
accuracy score: 0.7174783188792528


## numerator

In [4]:
with sqlite3.connect("CI.db") as db:
    c = db.cursor()
    df = pandas.read_sql_query("SELECT hs,ci FROM numerator", db)

# cast 'hilb'-column from a column of strings into a column of lists of ints
df['hs'] = df['hs'].transform(literal_eval)

hs=df['hs'].to_list();
ci=df['ci'].to_list();

hs_train, hs_test, ci_train, ci_test = train_test_split(\
    df['hs'].to_list(), df['ci'].to_list(), test_size=0.1,
    shuffle=True)

# MLP classifier:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)
clf.fit(hs_train, ci_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [5]:
# predictions and metrics:
ci_pred = clf.predict(hs_test)
print(f'MCC: {matthews_corrcoef(ci_test, ci_pred)}')
print(f'accuracy score: {accuracy_score(ci_test, ci_pred)}')

MCC: -0.029640861239295824
accuracy score: 0.5232432432432432


In [6]:
# random forest classifier. n_estimators random tree of max_depth:
clf = RandomForestClassifier(n_estimators=70, max_depth=70)
clf.fit(hs_train, ci_train)

# predictions and metrics:
ci_pred = clf.predict(hs_test)
print(f'MCC: {matthews_corrcoef(ci_test, ci_pred)}')
print(f'accuracy score: {accuracy_score(ci_test, ci_pred)}')

MCC: 0.7844349597692939
accuracy score: 0.8918918918918919


### Optional: find the best hyperparameters for random forest

In [49]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 150, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(50, 150, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [50, 61, 72, 83, 94, 105, 116, 127, 138, 150, None],
 'max_features': ['auto', 'sqrt', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [50, 61, 72, 83, 94, 105, 116, 127, 138, 150]}


In [51]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using cv fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=0, 
                               random_state=None, n_jobs = None)
# Fit the random search model
rf_random.fit(hs, ci)

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [52]:
rf_random.best_params_

{'n_estimators': 105,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 150,
 'bootstrap': False}

In [7]:
# random forest classifier. n_estimators random tree of max_depth:
clf = RandomForestClassifier(n_estimators=105, max_depth=150,min_samples_split=2,min_samples_leaf=1,bootstrap=False)
clf.fit(hs_train, ci_train)

# predictions and metrics:
ci_pred = clf.predict(hs_test)
print(f'MCC: {matthews_corrcoef(ci_test, ci_pred)}')
print(f'accuracy score: {accuracy_score(ci_test, ci_pred)}')

MCC: 0.8038251714399988
accuracy score: 0.9016216216216216


## taylor_20

In [31]:
with sqlite3.connect("CI.db") as db:
    c = db.cursor()
    df = pandas.read_sql_query("SELECT hs,ci FROM taylor_20", db)

# cast 'hilb'-column from a column of strings into a column of lists of ints
df['hs'] = df['hs'].transform(literal_eval)

# normalisation: divide entries by last entry and remove last entry
df['hs'] = df['hs'].transform(lambda h: [h[i] / h[-1]
                                             for i in range(0, len(h) - 1)])

hs=df['hs'].to_list();
ci=df['ci'].to_list();

hs_train, hs_test, ci_train, ci_test = train_test_split(\
    df['hs'].to_list(), df['ci'].to_list(), test_size=0.2,
    shuffle=True)

# MLP classifier:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)
clf.fit(hs_train, ci_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [32]:
# predictions and metrics:
ci_pred = clf.predict(hs_test)
print(f'MCC: {matthews_corrcoef(ci_test, ci_pred)}')
print(f'accuracy score: {accuracy_score(ci_test, ci_pred)}')

MCC: 0.11409832748943859
accuracy score: 0.5579991375592928


In [33]:
# random forest classifier. n_estimators random tree of max_depth:
clf = RandomForestClassifier(n_estimators=100, max_depth=100)
clf.fit(hs_train, ci_train)

# predictions and metrics:
ci_pred = clf.predict(hs_test)
print(f'MCC: {matthews_corrcoef(ci_test, ci_pred)}')
print(f'accuracy score: {accuracy_score(ci_test, ci_pred)}')

MCC: 0.4916855468240053
accuracy score: 0.7460112117291936


## taylor_100

In [2]:
with sqlite3.connect("CI.db") as db:
    c = db.cursor()
    df = pandas.read_sql_query("SELECT hs,ci FROM taylor_100", db)

# cast 'hilb'-column from a column of strings into a column of lists of ints
df['hs'] = df['hs'].transform(literal_eval)

# normalisation: divide entries by last entry and remove last entry
df['hs'] = df['hs'].transform(lambda h: [h[i] / h[-1]
                                             for i in range(0, len(h) - 1)])

hs=df['hs'].to_list();
ci=df['ci'].to_list();

hs_train, hs_test, ci_train, ci_test = train_test_split(\
    df['hs'].to_list(), df['ci'].to_list(), test_size=0.2,
    shuffle=True)

In [19]:
# random forest classifier. n_estimators random tree of max_depth:
clf = RandomForestClassifier(n_estimators=100, max_depth=100)
clf.fit(hs_train, ci_train)

# predictions and metrics:
ci_pred = clf.predict(hs_test)
print(f'MCC: {matthews_corrcoef(ci_test, ci_pred)}')
print(f'accuracy score: {accuracy_score(ci_test, ci_pred)}')

MCC: 0.9131205780261956
accuracy score: 0.9564655172413793


In [3]:
# MLP classifier:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)
clf.fit(hs_train, ci_train)

# predictions and metrics:
ci_pred = clf.predict(hs_test)
print(f'MCC: {matthews_corrcoef(ci_test, ci_pred)}')
print(f'accuracy score: {accuracy_score(ci_test, ci_pred)}')

MCC: 0.1582294622426255
accuracy score: 0.5780172413793103


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
