In [529]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [443]:
df = pd.read_csv('ceneval_data.csv', index_col=0, encoding='utf-8', low_memory=False)
df

Unnamed: 0,TIPO_EXA,ANO_NAC,SEXO,LENG_MA,LENG_PA,PLT_PROC,RAZ_RAPT,RAZ_RACT,RAZ_OPT,RAZ_PCUM,...,SER_TVP,SER_TABL,CUARTOS,SER_PC,SER_TV,SER_AUTO,SER_BANO,VAC_RM,DICTAMEN,VECES_EXAM
0,0.0077,0.90625,0,0.0,0.0,0.0355,0.0,1.0,1.0,0.0,...,1.0,1.0,0.22,1.0,1.0,1.0,1.0,0.0,0,1.0
1,0.0705,1.00000,1,0.0,0.0,0.0900,1.0,0.0,0.0,0.0,...,0.0,0.0,0.22,1.0,1.0,1.0,1.0,1.0,0,1.0
2,0.0705,0.90625,0,0.0,0.0,0.0900,1.0,1.0,1.0,0.0,...,0.0,0.0,0.41,0.0,1.0,0.0,1.0,0.0,1,1.0
3,0.0705,1.00000,1,0.0,0.0,0.0900,1.0,1.0,1.0,1.0,...,1.0,1.0,0.41,1.0,1.0,0.0,1.0,1.0,1,1.0
4,0.0705,0.84375,0,0.0,0.0,0.0900,1.0,1.0,1.0,0.0,...,0.0,0.0,0.22,1.0,1.0,0.0,1.0,0.0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15608,0.0303,1.00000,1,0.0,0.0,0.0434,0.0,1.0,1.0,0.0,...,1.0,0.0,0.41,0.0,1.0,1.0,1.0,1.0,1,1.0
15609,0.0705,1.00000,1,0.0,0.0,0.0900,0.0,1.0,1.0,0.0,...,0.0,0.0,0.22,0.0,1.0,1.0,1.0,1.0,0,1.0
15610,0.0705,1.00000,1,0.0,0.0,0.0900,1.0,1.0,1.0,0.0,...,1.0,0.0,0.41,1.0,1.0,1.0,1.0,1.0,1,1.0
15611,0.0705,1.00000,1,0.0,0.0,0.0900,0.0,0.0,1.0,1.0,...,1.0,0.0,0.22,1.0,1.0,1.0,1.0,1.0,0,1.0


In [502]:
elements = df.size
elements

1857947

In [338]:
def generate_random_alpha():
    return (10 ** np.random.randint(-5,0)) * np.random.randint(1,10)

In [458]:
def generate_random_config(layer_sizes=100, batch='auto', n_samples=1500):
    hidden_layer_sizes = (layer_sizes,)
    activation = [
        'identity',
        'logistic',
        'tanh',
        'relu'
    ]
    solver = [
        'lbfgs',
        'sgd',
        'adam'
    ]
    learning_rate = [
        'constant',
        'invscaling',
        'adaptive'
    ]
    alpha = np.random.rand()
    batch_size=batch if batch is 'auto' else (np.random.randint(10, 200), n_samples)
    learning_rate_init = generate_random_alpha()
    max_iter = np.random.randint(10, 200)
    momentum = generate_random_alpha()
    return { 
        'hidden_layer_sizes': hidden_layer_sizes,
        'activation': activation[np.random.randint(0,4)],
        'solver': solver[np.random.randint(0,3)],
        'learning_rate': learning_rate[np.random.randint(0,3)],
        'alpha': alpha,
        'batch_size': batch_size,
        'learning_rate_init': learning_rate_init,
        'max_iter': max_iter,
        'momentum': momentum
    }
    

In [546]:
def generate_models_config(samples, n_samples):
    return [generate_random_config(n_samples=n_samples) for i in range(samples)]

In [537]:
def find_better_model(x_train, y_train, x_test, y_test,models_config=[]):
    better_score = -1
    best_model = None
    for model_config in models_config:
        mlp = MLPClassifier(**model_config)
        mlp.fit(x_train, y_train)
        y_pred=mlp.predict(x_test)
        accuracy=accuracy_score(y_test, y_pred)
        if accuracy > better_score:
            better_score = accuracy
            best_model = mlp
    return mlp, better_score

In [523]:
# Selecting features
y = df['DICTAMEN']
x = df.drop(["DICTAMEN"], axis=1).values 

# Spliting dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3) # 70% training and 30% test

In [534]:
x_train

array([[0.1087    , 0.78125   , 1.        , ..., 1.        , 0.        ,
        1.        ],
       [0.0028    , 0.8125    , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [0.1087    , 0.875     , 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [0.0655    , 0.75      , 0.        , ..., 1.        , 0.        ,
        0.10887443],
       [0.0189    , 0.875     , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.0705    , 0.9375    , 1.        , ..., 1.        , 0.        ,
        1.        ]])

In [526]:
y_train

4438     1
2504     1
626      1
15167    1
10205    0
        ..
15018    1
4649     1
8789     1
1357     0
1592     1
Name: DICTAMEN, Length: 10929, dtype: int64

In [514]:
mlp = MLPClassifier(**generate_random_config(n_samples=elements))

In [527]:
mlp.fit(x_train, y_train)

MLPClassifier(activation='logistic', alpha=0.9028682871300118,
              batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False,
              epsilon=1e-08, hidden_layer_sizes=(100,),
              learning_rate='constant', learning_rate_init=0.003, max_iter=168,
              momentum=0.6000000000000001, n_iter_no_change=10,
              nesterovs_momentum=True, power_t=0.5, random_state=None,
              shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
              verbose=False, warm_start=False)

In [528]:
y_pred=mlp.predict(x_test)

In [531]:
mlp.score(x_test, y_test)

0.591801878736123

In [532]:
# MLP Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.591801878736123


In [547]:
models_config = generate_models_config(samples=1000, n_samples=elements)
models_config

[{'hidden_layer_sizes': (100,),
  'activation': 'identity',
  'solver': 'sgd',
  'learning_rate': 'invscaling',
  'alpha': 0.44587783092470923,
  'batch_size': 'auto',
  'learning_rate_init': 0.0007,
  'max_iter': 175,
  'momentum': 0.5},
 {'hidden_layer_sizes': (100,),
  'activation': 'identity',
  'solver': 'adam',
  'learning_rate': 'invscaling',
  'alpha': 0.08797414682989368,
  'batch_size': 'auto',
  'learning_rate_init': 0.004,
  'max_iter': 97,
  'momentum': 0.1},
 {'hidden_layer_sizes': (100,),
  'activation': 'tanh',
  'solver': 'adam',
  'learning_rate': 'adaptive',
  'alpha': 0.9455229627729469,
  'batch_size': 'auto',
  'learning_rate_init': 0.8,
  'max_iter': 57,
  'momentum': 0.00030000000000000003},
 {'hidden_layer_sizes': (100,),
  'activation': 'tanh',
  'solver': 'adam',
  'learning_rate': 'invscaling',
  'alpha': 0.3866672058040804,
  'batch_size': 'auto',
  'learning_rate_init': 0.02,
  'max_iter': 79,
  'momentum': 0.05},
 {'hidden_layer_sizes': (100,),
  'activat

In [548]:
best_model, better_score = find_better_model(x_train, y_train, x_test, y_test,models_config=models_config)
print("better Accuray", better_score)





  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  y = np.array(y > threshold, dtype=np.int)




  y = np.array(y > threshold, dtype=np.int)
  y = np.array(y > threshold, dtype=np.int)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  y = np.array(y > threshold, dtype=np.int)




  y = np.array(y > threshold, dtype=np.int)
  y = np.array(y > threshold, dtype=np.int)


  y = np.array(y > threshold, dtype=np.int)


  y = np.array(y > threshold, dtype=np.int)
  y = np.array(y > threshold, dtype=np.int)


  y = np.array(y > threshold, dtype=np.int)




better Accuray 0.6652433817250214


In [549]:
best_model

MLPClassifier(activation='logistic', alpha=0.13611383704284163,
              batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False,
              epsilon=1e-08, hidden_layer_sizes=(100,),
              learning_rate='adaptive', learning_rate_init=0.0007, max_iter=77,
              momentum=0.0004, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)