In [2]:
#data manipulation & storage
import pandas as pd
import numpy as np
#visualization
import seaborn as sb
from matplotlib import pyplot as plt
#performance and preprocessing
from sklearn import metrics, model_selection, preprocessing
from sklearn.utils import resample
#models
from sklearn import ensemble, tree, neighbors, naive_bayes, linear_model

In [3]:
def get_means(results_dictionary):
    '''
    Returns average of all keys inside dictionary. 
    results_dictionary must have keys with numeric values
    '''
    if type(results_dictionary) != dict: return None
    average_dict = {}
    for key, val in results_dictionary.items():
        key += ' average'
        average_dict[key] = val.mean()
    return average_dict

In [4]:
def print_dictionary_elts(dictionary):
    '''
    Pretty print key/value pairs of a dictionary
    '''
    for key,val in dictionary.items():
        print(f'{key}: {val}')

In [5]:
def run_model(model, x, y, score, cv):
    '''Run cross-validation on sklearn model
    Arguments:
        model - model to be run using cross-validation
        x - vector of independent variables/features
        y - vector of dependent variables/outcomes
        score - scoring metric(s) to estimate model performance
        cv - sklearn KFold cross-validation object
    Returns:
        dictionary with scoring metric summary based on specified cross-validation
    '''
    return model_selection.cross_validate(
        model,
        x,
        y,
        cv=cv,
        scoring=score)

In [2]:
def one_off_accuracy(predicted, actual):
    '''
    Calculates one-off accuracy of predictive model.  One-off accuracy is defined as follows:
    a prediction being correct if it is one more or one less than the actual value.
    '''
    if len(predicted) != len(actual): return None
    _len = len(predicted)
    num_correct = 0
    for i in range(_len):
        pred = predicted[i]
        act  = actual[i]
        tup  = (act-1, act, act+1)
        if pred in tup: num_correct += 1
    print(f'Number of correct predicitions is {num_correct} out of {_len} total predictions.')
    return num_correct/_len

In [6]:
cv = model_selection.KFold(n_splits=10, random_state=42, shuffle=True)

In [7]:
#scoring = ('accuracy', 'f1_weighted', 'roc_auc_ovo_weighted')

In [8]:
scoring = ('f1_weighted', 'accuracy')

In [9]:
df = pd.read_csv('imputed_volcanoes.csv')

In [10]:
df

Unnamed: 0,Volcano Number,Volcano Name,VEI,Eruption Category,Latitude,Longitude,Country,Recoded Volcano Type,Region,Recoded Dominant Rock Type,Tectonic Setting,Elevation (m)
0,264050,Sangeang Api,2.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
1,264050,Sangeang Api,2.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
2,264050,Sangeang Api,3.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
3,264050,Sangeang Api,2.0,Uncertain Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
4,264050,Sangeang Api,2.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
...,...,...,...,...,...,...,...,...,...,...,...,...
11109,327812,Red Hill,1.0,Confirmed Eruption,34.250,-108.830,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),2300
11110,327812,Red Hill,1.0,Confirmed Eruption,34.250,-108.830,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),2300
11111,327812,Red Hill,1.0,Confirmed Eruption,34.250,-108.830,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),2300
11112,283141,Nantaisan,4.0,Confirmed Eruption,36.765,139.491,Japan,Stratovolcano,"Japan, Taiwan, Marianas",Andesite / Basaltic Andesite,Subduction zone / Continental crust (>25 km),2486


In [11]:
df['VEI'].value_counts()

2.0    5434
1.0    1735
3.0    1491
0.0    1286
4.0     900
5.0     196
6.0      65
7.0       7
Name: VEI, dtype: int64

In [12]:
ind_cols = [
    #'Volcano Number',
    'Latitude',
    'Longitude',
    'Country',
    'Recoded Volcano Type',
    'Region',
    'Recoded Dominant Rock Type',
    'Tectonic Setting',
    'Elevation (m)'
]

In [13]:
dep_cols = [
    #'Volcano Number',
    'VEI'
]

In [14]:
scaler = preprocessing.MinMaxScaler()

In [15]:
# Get indendent variable columns
x = df[ind_cols]
x.shape

(11114, 8)

In [16]:
elevation = x['Elevation (m)'].values
x['Elevation'] = scaler.fit_transform(elevation.reshape(-1, 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Elevation'] = scaler.fit_transform(elevation.reshape(-1, 1))


In [17]:
latitude = x['Latitude'].values
x['Latitude'] = scaler.fit_transform(latitude.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Latitude'] = scaler.fit_transform(latitude.reshape(-1,1))


In [18]:
longitude = x['Longitude'].values
x['Longitude'] = scaler.fit_transform(longitude.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Longitude'] = scaler.fit_transform(longitude.reshape(-1,1))


In [19]:
x= x.drop(['Elevation (m)'], axis=1)
x

Unnamed: 0,Latitude,Longitude,Country,Recoded Volcano Type,Region,Recoded Dominant Rock Type,Tectonic Setting,Elevation
0,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
1,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
2,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
3,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
4,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
...,...,...,...,...,...,...,...,...
11109,0.685187,0.197858,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),0.635981
11110,0.685187,0.197858,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),0.635981
11111,0.685187,0.197858,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),0.635981
11112,0.700603,0.888502,Japan,Stratovolcano,"Japan, Taiwan, Marianas",Andesite / Basaltic Andesite,Subduction zone / Continental crust (>25 km),0.650767


In [20]:
x = pd.get_dummies(x)
x

Unnamed: 0,Latitude,Longitude,Elevation,Country_Antarctica,Country_Argentina,Country_Armenia,Country_Armenia-Azerbaijan,Country_Australia,Country_Burma (Myanmar),Country_Cameroon,...,Tectonic Setting_Intraplate / Continental crust (>25 km),Tectonic Setting_Intraplate / Intermediate crust (15-25 km),Tectonic Setting_Intraplate / Oceanic crust (< 15 km),Tectonic Setting_Rift zone / Continental crust (>25 km),Tectonic Setting_Rift zone / Intermediate crust (15-25 km),Tectonic Setting_Rift zone / Oceanic crust (< 15 km),Tectonic Setting_Subduction zone / Continental crust (>25 km),Tectonic Setting_Subduction zone / Crustal thickness unknown,Tectonic Setting_Subduction zone / Intermediate crust (15-25 km),Tectonic Setting_Subduction zone / Oceanic crust (< 15 km)
0,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11109,0.685187,0.197858,0.635981,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11110,0.685187,0.197858,0.635981,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11111,0.685187,0.197858,0.635981,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11112,0.700603,0.888502,0.650767,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [21]:
x.describe()

Unnamed: 0,Latitude,Longitude,Elevation,Country_Antarctica,Country_Argentina,Country_Armenia,Country_Armenia-Azerbaijan,Country_Australia,Country_Burma (Myanmar),Country_Cameroon,...,Tectonic Setting_Intraplate / Continental crust (>25 km),Tectonic Setting_Intraplate / Intermediate crust (15-25 km),Tectonic Setting_Intraplate / Oceanic crust (< 15 km),Tectonic Setting_Rift zone / Continental crust (>25 km),Tectonic Setting_Rift zone / Intermediate crust (15-25 km),Tectonic Setting_Rift zone / Oceanic crust (< 15 km),Tectonic Setting_Subduction zone / Continental crust (>25 km),Tectonic Setting_Subduction zone / Crustal thickness unknown,Tectonic Setting_Subduction zone / Intermediate crust (15-25 km),Tectonic Setting_Subduction zone / Oceanic crust (< 15 km)
count,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,...,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0
mean,0.579266,0.590946,0.623861,0.005938,0.00063,0.00027,0.00036,0.001979,0.00018,0.00189,...,0.022764,0.002429,0.055156,0.022224,0.002609,0.072611,0.637304,0.045888,0.050027,0.088717
std,0.18889,0.320867,0.110151,0.076836,0.02509,0.016428,0.018969,0.044449,0.013414,0.043429,...,0.149157,0.049231,0.228294,0.147419,0.051017,0.259509,0.4808,0.209252,0.21801,0.284348
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.437623,0.284561,0.551793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.586375,0.65548,0.605454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.725466,0.888233,0.687893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Get dependent variable columns
y = df[dep_cols]
y.shape

(11114, 1)

In [23]:
# split data into train and validation sets.  Validation will be held out to estimate model generalizability
x_train, x_val, y_train, y_val = \
    model_selection.train_test_split(
        x,
        y,
        test_size = 0.30,
        random_state = 42) 

In [24]:
x_train.shape

(7779, 135)

In [25]:
y_train.shape

(7779, 1)

In [26]:
x_val.shape

(3335, 135)

In [27]:
y_val.shape

(3335, 1)

In [28]:
y_train.value_counts()

VEI
2.0    3808
1.0    1194
3.0    1042
0.0     892
4.0     641
5.0     148
6.0      48
7.0       6
dtype: int64

In [29]:
y_val.value_counts()

VEI
2.0    1626
1.0     541
3.0     449
0.0     394
4.0     259
5.0      48
6.0      17
7.0       1
dtype: int64

In [30]:
np.ravel(y_train)

array([2., 0., 0., ..., 1., 1., 2.])

In [31]:
# Bring training data back together
training_data = pd.concat([x_train, y_train], axis=1)
training_data

Unnamed: 0,Latitude,Longitude,Elevation,Country_Antarctica,Country_Argentina,Country_Armenia,Country_Armenia-Azerbaijan,Country_Australia,Country_Burma (Myanmar),Country_Cameroon,...,Tectonic Setting_Intraplate / Intermediate crust (15-25 km),Tectonic Setting_Intraplate / Oceanic crust (< 15 km),Tectonic Setting_Rift zone / Continental crust (>25 km),Tectonic Setting_Rift zone / Intermediate crust (15-25 km),Tectonic Setting_Rift zone / Oceanic crust (< 15 km),Tectonic Setting_Subduction zone / Continental crust (>25 km),Tectonic Setting_Subduction zone / Crustal thickness unknown,Tectonic Setting_Subduction zone / Intermediate crust (15-25 km),Tectonic Setting_Subduction zone / Oceanic crust (< 15 km),VEI
540,0.676813,0.865176,0.579696,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2.0
7371,0.670708,0.889690,0.454011,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.0
9870,0.472588,0.446041,0.331664,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.0
8834,0.719532,0.892702,0.615152,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2.0
1146,0.426559,0.814685,0.638286,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.295823,0.005702,0.494157,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.0
5191,0.545967,0.262406,0.588282,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1.0
5390,0.741176,0.901079,0.572303,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1.0
860,0.814936,0.050274,0.651324,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1.0


In [32]:
# Since data is imbalanced (counts of outcome classes are very different)
# try upsampling the minority classes.  Note that majority class is
# that where VEI = 2.0
vei_0 = training_data[training_data.VEI == 0.0]
vei_1 = training_data[training_data.VEI == 1.0]
vei_2 = training_data[training_data.VEI == 2.0]
vei_3 = training_data[training_data.VEI == 3.0]
vei_4 = training_data[training_data.VEI == 4.0]
vei_5 = training_data[training_data.VEI == 5.0]
vei_6 = training_data[training_data.VEI == 6.0]
vei_7 = training_data[training_data.VEI == 7.0]

In [33]:
vei_0_upsample = resample(vei_0,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_0_upsample.shape

(3808, 136)

In [34]:
vei_1_upsample = resample(vei_1,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_1_upsample.shape

(3808, 136)

In [35]:
vei_3_upsample = resample(vei_3,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_3_upsample.shape

(3808, 136)

In [36]:
vei_4_upsample = resample(vei_4,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_4_upsample.shape

(3808, 136)

In [37]:
vei_5_upsample = resample(vei_5,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_5_upsample.shape

(3808, 136)

In [38]:
vei_6_upsample = resample(vei_6,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_6_upsample.shape

(3808, 136)

In [39]:
vei_7_upsample = resample(vei_7,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_7_upsample.shape

(3808, 136)

In [40]:
# Now, combine all of the upsampled classes again
classes = [
    vei_0_upsample,
    vei_1_upsample,
    vei_2,
    vei_3_upsample,
    vei_4_upsample,
    vei_5_upsample,
    vei_6_upsample,
    vei_7_upsample
]
upsampled = pd.concat(classes)
upsampled.shape

(30464, 136)

In [41]:
y_train = upsampled.VEI

In [42]:
x_train = upsampled.drop('VEI', axis=1)

In [43]:
y_train.shape

(30464,)

In [44]:
x_train.shape

(30464, 135)

In [45]:
## Now, construct models

In [46]:
# Figure out the best K for KNN
knn_cv = model_selection.KFold(n_splits=5, random_state=42, shuffle=True)
best_n, best_acc = None, 0
for n in range(40):
    n += 1
    knn = neighbors.KNeighborsClassifier(n_neighbors = n)
    results = get_means(run_model(knn, x_train, np.ravel(y_train), scoring, knn_cv))
    acc = results['test_accuracy average']
    print(f'Number of neighbors is {n}... & accuracy is {acc}')
    if acc > best_acc:
        best_n, best_acc = n, acc

Number of neighbors is 1... & accuracy is 0.3595716438969231
Number of neighbors is 2... & accuracy is 0.35770060128416636
Number of neighbors is 3... & accuracy is 0.3712904079566026
Number of neighbors is 4... & accuracy is 0.6542149484478867
Number of neighbors is 5... & accuracy is 0.6712513601014005
Number of neighbors is 6... & accuracy is 0.6696430647787054
Number of neighbors is 7... & accuracy is 0.6743696980022607
Number of neighbors is 8... & accuracy is 0.6744681070028694
Number of neighbors is 9... & accuracy is 0.6744352770619633
Number of neighbors is 10... & accuracy is 0.6775867089226207
Number of neighbors is 11... & accuracy is 0.67499358003043
Number of neighbors is 12... & accuracy is 0.6778168310211206
Number of neighbors is 13... & accuracy is 0.6788343274991624
Number of neighbors is 14... & accuracy is 0.6773569693821063
Number of neighbors is 15... & accuracy is 0.6802129641034528
Number of neighbors is 16... & accuracy is 0.6790640023819892
Number of neighbor

In [47]:
print(f'Best n is {best_n} and best accuracy is {best_acc}')

Best n is 15 and best accuracy is 0.6802129641034528


In [48]:
lr = linear_model.LogisticRegression(
    penalty='l2',
    C=0.5,
    random_state=42,
    multi_class='multinomial',
    max_iter = 1000
)
lr

LogisticRegression(C=0.5, max_iter=1000, multi_class='multinomial',
                   random_state=42)

In [49]:
nb = naive_bayes.MultinomialNB(
    alpha=5.0,
    fit_prior=True)
nb

MultinomialNB(alpha=5.0)

In [50]:
knn = neighbors.KNeighborsClassifier(n_neighbors = best_n)
knn

KNeighborsClassifier(n_neighbors=15)

In [51]:
tr = tree.DecisionTreeClassifier(
    random_state=42,
    min_samples_leaf=5)
tr

DecisionTreeClassifier(min_samples_leaf=5, random_state=42)

In [52]:
rf = ensemble.RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=5,
    random_state=42)
rf

RandomForestClassifier(min_samples_leaf=5, n_estimators=1000, random_state=42)

In [53]:
gbt = ensemble.GradientBoostingClassifier(
    n_estimators=500,
    min_samples_leaf=5,
    random_state=42,
    warm_start=True)
gbt

GradientBoostingClassifier(min_samples_leaf=5, n_estimators=500,
                           random_state=42, warm_start=True)

In [54]:
lr_results = run_model(lr, x_train, np.ravel(y_train), scoring, cv)
get_means(lr_results)

{'fit_time average': 14.635276603698731,
 'score_time average': 0.012486076354980469,
 'test_f1_weighted average': 0.5151901615681795,
 'test_accuracy average': 0.5332853149206964}

In [55]:
nb_results = run_model(nb, x_train, np.ravel(y_train), scoring, cv)
get_means(nb_results)

{'fit_time average': 0.06539862155914307,
 'score_time average': 0.012553167343139649,
 'test_f1_weighted average': 0.442602585662084,
 'test_accuracy average': 0.4604451037488625}

In [56]:
knn_results = run_model(knn, x_train, np.ravel(y_train), scoring, cv)
means = get_means(knn_results)
means

{'fit_time average': 0.05423803329467773,
 'score_time average': 2.755174446105957,
 'test_f1_weighted average': 0.6733639877282868,
 'test_accuracy average': 0.6804756559577346}

In [57]:
tr_results = run_model(tr, x_train, np.ravel(y_train), scoring, cv)
get_means(tr_results)

{'fit_time average': 0.2882028102874756,
 'score_time average': 0.014094209671020508,
 'test_f1_weighted average': 0.6998234774056759,
 'test_accuracy average': 0.7062439595386871}

In [58]:
rf_results = run_model(rf, x_train, np.ravel(y_train), scoring, cv)
get_means(rf_results)

{'fit_time average': 50.596342539787294,
 'score_time average': 0.9303753137588501,
 'test_f1_weighted average': 0.696846823317902,
 'test_accuracy average': 0.7036834073147307}

In [959]:
gbt_results = run_model(gbt, x_train, np.ravel(y_train), scoring, cv)
get_means(gbt_results)

{'fit_time average': 573.5064512968063,
 'score_time average': 0.37054612636566164,
 'test_f1_weighted average': 0.7015979024495752,
 'test_accuracy average': 0.7072612890497978}

In [67]:
# Pick GBT for validation
gbt.fit(x_train, y_train)
predictions = gbt.predict(x_val)
predictions.shape

(3335,)

In [68]:
metrics.accuracy_score(y_val, predictions)

0.512743628185907

In [69]:
metrics.f1_score(y_val, predictions, average='weighted')

0.5363473397455388

In [70]:
metrics.confusion_matrix(y_val, predictions)

array([[312,  14,  19,  17,  19,   5,   8,   0],
       [ 45, 240, 119,  61,  26,  28,  21,   1],
       [109, 249, 818, 215,  76, 105,  49,   5],
       [ 18,  43,  95, 200,  43,  39,   8,   3],
       [  7,  16,  27,  43, 111,  35,  19,   1],
       [  1,   1,   4,   8,   8,  25,   1,   0],
       [  0,   1,   4,   1,   3,   4,   4,   0],
       [  0,   1,   0,   0,   0,   0,   0,   0]], dtype=int64)