In [1]:
# Import xgboost library
#import sys
#!{sys.executable} -m pip install xgboost

In [3]:
#data manipulation & storage
import pandas as pd
import numpy as np
#visualization
import seaborn as sb
from matplotlib import pyplot as plt
#performance and preprocessing
from sklearn import metrics, model_selection, preprocessing
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
#import xgboost model
import xgboost as xgb

In [4]:
def get_means(results_dictionary):
    '''
    Returns average of all keys inside dictionary. 
    results_dictionary must have keys with numeric values
    '''
    if type(results_dictionary) != dict: return None
    average_dict = {}
    for key, val in results_dictionary.items():
        key += ' average'
        average_dict[key] = val.mean()
    return average_dict

In [5]:
def print_dictionary_elts(dictionary):
    '''
    Pretty print key/value pairs of a dictionary
    '''
    for key,val in dictionary.items():
        print(f'{key}: {val}')

In [6]:
def run_model(model, x, y, score, cv):
    '''Run cross-validation on sklearn model
    Arguments:
        model - model to be run using cross-validation
        x - vector of independent variables/features
        y - vector of dependent variables/outcomes
        score - scoring metric(s) to estimate model performance
        cv - sklearn KFold cross-validation object
    Returns:
        dictionary with scoring metric summary based on specified cross-validation
    '''
    return model_selection.cross_validate(
        model,
        x,
        y,
        cv=cv,
        scoring=score)

In [7]:
def one_off_accuracy(predicted, actual):
    '''
    Calculates one-off accuracy of predictive model.  One-off accuracy is defined as follows:
    a prediction being correct if it is one more or one less than the actual value.
    '''
    if len(predicted) != len(actual): return None
    _len = len(predicted)
    num_correct = 0
    for i in range(_len):
        pred = predicted[i]
        act  = actual[i]
        tup  = (act-1, act, act+1)
        if pred in tup: num_correct += 1
    print(f'Number of correct predicitions is {num_correct} out of {_len} total predictions.')
    return num_correct/_len

In [8]:
scoring = ('f1_weighted', 'accuracy')

In [9]:
cv = model_selection.KFold(n_splits=10, random_state=42, shuffle=True)

In [10]:
df = pd.read_csv('imputed_volcanoes.csv')

In [11]:
df

Unnamed: 0,Volcano Number,Volcano Name,VEI,Eruption Category,Latitude,Longitude,Country,Recoded Volcano Type,Region,Recoded Dominant Rock Type,Tectonic Setting,Elevation (m)
0,264050,Sangeang Api,2.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
1,264050,Sangeang Api,2.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
2,264050,Sangeang Api,3.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
3,264050,Sangeang Api,2.0,Uncertain Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
4,264050,Sangeang Api,2.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
...,...,...,...,...,...,...,...,...,...,...,...,...
11109,327812,Red Hill,1.0,Confirmed Eruption,34.250,-108.830,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),2300
11110,327812,Red Hill,1.0,Confirmed Eruption,34.250,-108.830,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),2300
11111,327812,Red Hill,1.0,Confirmed Eruption,34.250,-108.830,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),2300
11112,283141,Nantaisan,4.0,Confirmed Eruption,36.765,139.491,Japan,Stratovolcano,"Japan, Taiwan, Marianas",Andesite / Basaltic Andesite,Subduction zone / Continental crust (>25 km),2486


In [16]:
df[(df.VEI == 7.0) | (df.VEI == 6.0)]

Unnamed: 0,Volcano Number,Volcano Name,VEI,Eruption Category,Latitude,Longitude,Country,Recoded Volcano Type,Region,Recoded Dominant Rock Type,Tectonic Setting,Elevation (m)
69,262000,Krakatau,6.0,Confirmed Eruption,-6.102,105.423,Indonesia,Caldera,Indonesia,Andesite / Basaltic Andesite,Subduction zone / Continental crust (>25 km),155
166,257040,Ambrym,6.0,Confirmed Eruption,-16.250,168.120,Vanuatu,Shield,Melanesia and Australia,Basalt / Picro-Basalt,Subduction zone / Intermediate crust (15-25 km),1334
401,273083,Pinatubo,6.0,Confirmed Eruption,15.130,120.350,Philippines,Stratovolcano,Philippines and SE Asia,Dacite,Subduction zone / Continental crust (>25 km),1486
403,273083,Pinatubo,6.0,Confirmed Eruption,15.130,120.350,Philippines,Stratovolcano,Philippines and SE Asia,Dacite,Subduction zone / Continental crust (>25 km),1486
404,273083,Pinatubo,6.0,Confirmed Eruption,15.130,120.350,Philippines,Stratovolcano,Philippines and SE Asia,Dacite,Subduction zone / Continental crust (>25 km),1486
...,...,...,...,...,...,...,...,...,...,...,...,...
11022,312250,Kaguyak,6.0,Confirmed Eruption,58.608,-154.028,United States,Lava dome,Alaska,Dacite,Subduction zone / Continental crust (>25 km),901
11039,354000,Quimsachata,6.0,Confirmed Eruption,-14.133,-71.367,Peru,Lava dome,South America,Rhyolite,Subduction zone / Continental crust (>25 km),3848
11077,222060,Menengai,6.0,Confirmed Eruption,-0.200,36.070,Kenya,Shield,Africa and Red Sea,Trachyte / Trachydacite,Rift zone / Continental crust (>25 km),2278
11084,300023,Kurile Lake,7.0,Confirmed Eruption,51.450,157.120,Russia,Caldera,Kamchatka and Mainland Asia,Dacite,Subduction zone / Continental crust (>25 km),81


In [11]:
df['VEI'].value_counts()

2.0    5434
1.0    1735
3.0    1491
0.0    1286
4.0     900
5.0     196
6.0      65
7.0       7
Name: VEI, dtype: int64

In [12]:
ind_cols = [
    #'Volcano Number',
    'Latitude',
    'Longitude',
    'Country',
    'Recoded Volcano Type',
    'Region',
    'Recoded Dominant Rock Type',
    'Tectonic Setting',
    'Elevation (m)'
]

In [13]:
dep_cols = [
    'VEI'
]

In [14]:
# Get indendent variable columns
x = df[ind_cols]
x.shape

(11114, 8)

In [15]:
# Get dependent variable columns
y = df[dep_cols]
y.shape

(11114, 1)

In [16]:
# Scale the numeric independent variables
scaler = preprocessing.MinMaxScaler()

In [17]:
elevation = x['Elevation (m)'].values
x['Elevation'] = scaler.fit_transform(elevation.reshape(-1, 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Elevation'] = scaler.fit_transform(elevation.reshape(-1, 1))


In [18]:
latitude = x['Latitude'].values
x['Latitude'] = scaler.fit_transform(latitude.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Latitude'] = scaler.fit_transform(latitude.reshape(-1,1))


In [19]:
longitude = x['Longitude'].values
x['Longitude'] = scaler.fit_transform(longitude.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Longitude'] = scaler.fit_transform(longitude.reshape(-1,1))


In [20]:
x= x.drop(['Elevation (m)'], axis=1)
x

Unnamed: 0,Latitude,Longitude,Country,Recoded Volcano Type,Region,Recoded Dominant Rock Type,Tectonic Setting,Elevation
0,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
1,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
2,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
3,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
4,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
...,...,...,...,...,...,...,...,...
11109,0.685187,0.197858,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),0.635981
11110,0.685187,0.197858,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),0.635981
11111,0.685187,0.197858,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),0.635981
11112,0.700603,0.888502,Japan,Stratovolcano,"Japan, Taiwan, Marianas",Andesite / Basaltic Andesite,Subduction zone / Continental crust (>25 km),0.650767


In [21]:
x = pd.get_dummies(x)
x

Unnamed: 0,Latitude,Longitude,Elevation,Country_Antarctica,Country_Argentina,Country_Armenia,Country_Armenia-Azerbaijan,Country_Australia,Country_Burma (Myanmar),Country_Cameroon,...,Tectonic Setting_Intraplate / Continental crust (>25 km),Tectonic Setting_Intraplate / Intermediate crust (15-25 km),Tectonic Setting_Intraplate / Oceanic crust (< 15 km),Tectonic Setting_Rift zone / Continental crust (>25 km),Tectonic Setting_Rift zone / Intermediate crust (15-25 km),Tectonic Setting_Rift zone / Oceanic crust (< 15 km),Tectonic Setting_Subduction zone / Continental crust (>25 km),Tectonic Setting_Subduction zone / Crustal thickness unknown,Tectonic Setting_Subduction zone / Intermediate crust (15-25 km),Tectonic Setting_Subduction zone / Oceanic crust (< 15 km)
0,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11109,0.685187,0.197858,0.635981,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11110,0.685187,0.197858,0.635981,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11111,0.685187,0.197858,0.635981,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11112,0.700603,0.888502,0.650767,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [22]:
x.describe()

Unnamed: 0,Latitude,Longitude,Elevation,Country_Antarctica,Country_Argentina,Country_Armenia,Country_Armenia-Azerbaijan,Country_Australia,Country_Burma (Myanmar),Country_Cameroon,...,Tectonic Setting_Intraplate / Continental crust (>25 km),Tectonic Setting_Intraplate / Intermediate crust (15-25 km),Tectonic Setting_Intraplate / Oceanic crust (< 15 km),Tectonic Setting_Rift zone / Continental crust (>25 km),Tectonic Setting_Rift zone / Intermediate crust (15-25 km),Tectonic Setting_Rift zone / Oceanic crust (< 15 km),Tectonic Setting_Subduction zone / Continental crust (>25 km),Tectonic Setting_Subduction zone / Crustal thickness unknown,Tectonic Setting_Subduction zone / Intermediate crust (15-25 km),Tectonic Setting_Subduction zone / Oceanic crust (< 15 km)
count,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,...,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0
mean,0.579266,0.590946,0.623861,0.005938,0.00063,0.00027,0.00036,0.001979,0.00018,0.00189,...,0.022764,0.002429,0.055156,0.022224,0.002609,0.072611,0.637304,0.045888,0.050027,0.088717
std,0.18889,0.320867,0.110151,0.076836,0.02509,0.016428,0.018969,0.044449,0.013414,0.043429,...,0.149157,0.049231,0.228294,0.147419,0.051017,0.259509,0.4808,0.209252,0.21801,0.284348
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.437623,0.284561,0.551793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.586375,0.65548,0.605454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.725466,0.888233,0.687893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
for col in x: print(col)

Latitude
Longitude
Elevation
Country_Antarctica
Country_Argentina
Country_Armenia
Country_Armenia-Azerbaijan
Country_Australia
Country_Burma (Myanmar)
Country_Cameroon
Country_Canada
Country_Cape Verde
Country_Chile
Country_Chile-Argentina
Country_Chile-Bolivia
Country_Chile-Peru
Country_China
Country_China-North Korea
Country_Colombia
Country_Colombia-Ecuador
Country_Comoros
Country_Costa Rica
Country_DR Congo
Country_DR Congo-Rwanda
Country_Djibouti
Country_Dominica
Country_Ecuador
Country_El Salvador
Country_Equatorial Guinea
Country_Eritrea
Country_Ethiopia
Country_Ethiopia-Djibouti
Country_Fiji
Country_France
Country_Georgia
Country_Germany
Country_Greece
Country_Grenada
Country_Guatemala
Country_Honduras
Country_Iceland
Country_India
Country_Indonesia
Country_Iran
Country_Italy
Country_Japan
Country_Japan - administered by Russia
Country_Kenya
Country_Madagascar
Country_Mexico
Country_Mexico-Guatemala
Country_Mongolia
Country_Netherlands
Country_New Zealand
Country_Nicaragua
Coun

In [24]:
# Rename columns so that GridSearch can run properly
x.rename(
    columns = {
        'Region_Japan, Taiwan, Marianas': 'Region-Japan-Taiwan-Marianas',
        'Tectonic Setting_Intraplate / Continental crust (>25 km)': 'Tectonic Setting_Intraplate / Continental crust (grt 25 km)',
        'Tectonic Setting_Intraplate / Oceanic crust (< 15 km)': 'Tectonic Setting_Intraplate / Oceanic crust (less 15 km)',
        'Tectonic Setting_Rift zone / Continental crust (>25 km)': 'Tectonic Setting_Rift zone / Continental crust (grt 25 km)',
        'Tectonic Setting_Rift zone / Oceanic crust (< 15 km)': 'Tectonic Setting_Rift zone / Oceanic crust (less 15 km)',
        'Tectonic Setting_Subduction zone / Continental crust (>25 km)': 'Tectonic Setting_Subduction zone / Continental crust (grt 25 km)',
        'Tectonic Setting_Subduction zone / Oceanic crust (< 15 km)': 'Tectonic Setting_Subduction zone / Oceanic crust (less 15 km)'
    },
    inplace=True
)

In [25]:
for col in x: print(col)

Latitude
Longitude
Elevation
Country_Antarctica
Country_Argentina
Country_Armenia
Country_Armenia-Azerbaijan
Country_Australia
Country_Burma (Myanmar)
Country_Cameroon
Country_Canada
Country_Cape Verde
Country_Chile
Country_Chile-Argentina
Country_Chile-Bolivia
Country_Chile-Peru
Country_China
Country_China-North Korea
Country_Colombia
Country_Colombia-Ecuador
Country_Comoros
Country_Costa Rica
Country_DR Congo
Country_DR Congo-Rwanda
Country_Djibouti
Country_Dominica
Country_Ecuador
Country_El Salvador
Country_Equatorial Guinea
Country_Eritrea
Country_Ethiopia
Country_Ethiopia-Djibouti
Country_Fiji
Country_France
Country_Georgia
Country_Germany
Country_Greece
Country_Grenada
Country_Guatemala
Country_Honduras
Country_Iceland
Country_India
Country_Indonesia
Country_Iran
Country_Italy
Country_Japan
Country_Japan - administered by Russia
Country_Kenya
Country_Madagascar
Country_Mexico
Country_Mexico-Guatemala
Country_Mongolia
Country_Netherlands
Country_New Zealand
Country_Nicaragua
Coun

In [26]:
# split data into train and validation sets.  Validation will be held out to estimate model generalizability
x_train, x_val, y_train, y_val = \
    model_selection.train_test_split(
        x,
        y,
        test_size = 0.30,
        random_state = 42) 

In [27]:
x_train.shape

(7779, 135)

In [28]:
y_train.shape

(7779, 1)

In [29]:
x_val.shape

(3335, 135)

In [30]:
y_val.shape

(3335, 1)

In [31]:
y_train.value_counts()

VEI
2.0    3808
1.0    1194
3.0    1042
0.0     892
4.0     641
5.0     148
6.0      48
7.0       6
dtype: int64

In [32]:
y_val.value_counts()

VEI
2.0    1626
1.0     541
3.0     449
0.0     394
4.0     259
5.0      48
6.0      17
7.0       1
dtype: int64

In [33]:
np.ravel(y_train)

array([2., 0., 0., ..., 1., 1., 2.])

In [34]:
# Bring training data back together
training_data = pd.concat([x_train, y_train], axis=1)
training_data

Unnamed: 0,Latitude,Longitude,Elevation,Country_Antarctica,Country_Argentina,Country_Armenia,Country_Armenia-Azerbaijan,Country_Australia,Country_Burma (Myanmar),Country_Cameroon,...,Tectonic Setting_Intraplate / Intermediate crust (15-25 km),Tectonic Setting_Intraplate / Oceanic crust (less 15 km),Tectonic Setting_Rift zone / Continental crust (grt 25 km),Tectonic Setting_Rift zone / Intermediate crust (15-25 km),Tectonic Setting_Rift zone / Oceanic crust (less 15 km),Tectonic Setting_Subduction zone / Continental crust (grt 25 km),Tectonic Setting_Subduction zone / Crustal thickness unknown,Tectonic Setting_Subduction zone / Intermediate crust (15-25 km),Tectonic Setting_Subduction zone / Oceanic crust (less 15 km),VEI
540,0.676813,0.865176,0.579696,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2.0
7371,0.670708,0.889690,0.454011,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.0
9870,0.472588,0.446041,0.331664,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0.0
8834,0.719532,0.892702,0.615152,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2.0
1146,0.426559,0.814685,0.638286,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.295823,0.005702,0.494157,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.0
5191,0.545967,0.262406,0.588282,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1.0
5390,0.741176,0.901079,0.572303,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1.0
860,0.814936,0.050274,0.651324,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1.0


In [35]:
# Since data is imbalanced (counts of outcome classes are very different)
# try upsampling each of the minority classes.  
# Note that majority class is that where VEI = 2.0
vei_0 = training_data[training_data.VEI == 0.0]
vei_1 = training_data[training_data.VEI == 1.0]
vei_2 = training_data[training_data.VEI == 2.0]
vei_3 = training_data[training_data.VEI == 3.0]
vei_4 = training_data[training_data.VEI == 4.0]
vei_5 = training_data[training_data.VEI == 5.0]
vei_6 = training_data[training_data.VEI == 6.0]
vei_7 = training_data[training_data.VEI == 7.0]

In [36]:
vei_0_upsample = resample(vei_0,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_0_upsample.shape

(3808, 136)

In [37]:
vei_1_upsample = resample(vei_1,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_1_upsample.shape

(3808, 136)

In [38]:
vei_3_upsample = resample(vei_3,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_3_upsample.shape

(3808, 136)

In [39]:
vei_4_upsample = resample(vei_4,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_4_upsample.shape

(3808, 136)

In [40]:
vei_5_upsample = resample(vei_5,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_5_upsample.shape

(3808, 136)

In [41]:
vei_6_upsample = resample(vei_6,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_6_upsample.shape

(3808, 136)

In [42]:
vei_7_upsample = resample(vei_7,
                          replace=True, #sample w/ replacement
                          n_samples = len(vei_2), #match majority class count
                          random_state = 42)
vei_7_upsample.shape

(3808, 136)

In [43]:
# Now, combine all of the upsampled classes again
classes = [
    vei_0_upsample,
    vei_1_upsample,
    vei_2,
    vei_3_upsample,
    vei_4_upsample,
    vei_5_upsample,
    vei_6_upsample,
    vei_7_upsample
]
upsampled = pd.concat(classes)
upsampled.shape

(30464, 136)

In [44]:
y_train = upsampled.VEI

In [45]:
x_train = upsampled.drop('VEI', axis=1)

In [46]:
y_train.shape

(30464,)

In [47]:
x_train.shape

(30464, 135)

In [48]:
xgb_classifier = xgb.XGBClassifier(
    objective='multi:softmax',
    n_jobs=-1,
    random_state=42)

In [49]:
# Set tunable parameters and the allowed ranges
params = {
    'max_depth': range(2, 10, 2),
    'n_estimators': range(200, 500, 150),
    'learning_rate': [0.1, 0.01, 0.05]
}

In [50]:
# Set up GridSearch for hyperparameter tuning
grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=params,
    scoring = 'f1_weighted',
    n_jobs = 10,
    cv = 5,
    verbose=True
)

In [51]:
#gbt = ensemble.GradientBoostingClassifier(
#    n_estimators=500,
#    min_samples_leaf=5,
#    random_state=42,
#    warm_start=True)
#gbt

In [52]:
#gbt_results = run_model(gbt, x_train, np.ravel(y_train), scoring, cv)
#get_means(gbt_results)

In [53]:
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_ca...
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=-1,
                          

In [54]:
grid_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=350,
              n_jobs=-1, num_parallel_tree=1, objective='multi:softmax',
              predictor='auto', random_state=42, reg_alpha=0, ...)

In [55]:
tuned_classifier = grid_search.best_estimator_

In [56]:
tuned_classifier

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=350,
              n_jobs=-1, num_parallel_tree=1, objective='multi:softmax',
              predictor='auto', random_state=42, reg_alpha=0, ...)

In [57]:
tuned_classifier.fit(x_train, y_train)
predictions = tuned_classifier.predict(x_val)
predictions.shape

(3335,)

In [58]:
y_val.shape

(3335, 1)

In [1]:
metrics.f1_score(y_val, predictions, average='weighted')

NameError: name 'metrics' is not defined

In [60]:
metrics.confusion_matrix(y_val, predictions)

array([[319,  18,  18,  14,  12,   5,   8,   0],
       [ 44, 293, 104,  48,  13,  19,  19,   1],
       [112, 328, 780, 188,  67,  99,  47,   5],
       [ 18,  58,  94, 194,  36,  38,   8,   3],
       [  9,  21,  25,  38, 113,  34,  18,   1],
       [  1,   1,   3,   9,   8,  25,   1,   0],
       [  0,   3,   4,   1,   3,   4,   2,   0],
       [  0,   1,   0,   0,   0,   0,   0,   0]], dtype=int64)

In [61]:
# Regular accuracy
metrics.accuracy_score(y_val, predictions)

0.5175412293853073

In [62]:
#One off accuracy
one_off_accuracy(predictions, y_val.values)

Number of correct predicitions is 2623 out of 3335 total predictions.


0.7865067466266866

In [63]:
feature_importance = tuned_classifier.feature_importances_

In [64]:
column_names = x.columns

In [65]:
feature_importance_dict = {}
for i in range(len(column_names)):
    feature_name = column_names[i]
    importance = feature_importance[i]
    feature_importance_dict[feature_name] = importance
feature_importance_dict

{'Latitude': 0.007095976,
 'Longitude': 0.0068097585,
 'Elevation': 0.0074713775,
 'Country_Antarctica': 0.0,
 'Country_Argentina': 0.053348392,
 'Country_Armenia': 0.0029322747,
 'Country_Armenia-Azerbaijan': 0.0,
 'Country_Australia': 0.0009742594,
 'Country_Burma (Myanmar)': 0.0013872654,
 'Country_Cameroon': 0.00083904265,
 'Country_Canada': 0.0029646445,
 'Country_Cape Verde': 0.0007472073,
 'Country_Chile': 0.013273541,
 'Country_Chile-Argentina': 0.0020241179,
 'Country_Chile-Bolivia': 0.0,
 'Country_Chile-Peru': 0.0,
 'Country_China': 0.0008078856,
 'Country_China-North Korea': 0.01771785,
 'Country_Colombia': 0.00825297,
 'Country_Colombia-Ecuador': 0.0,
 'Country_Comoros': 0.0035147455,
 'Country_Costa Rica': 0.002595035,
 'Country_DR Congo': 0.0050228424,
 'Country_DR Congo-Rwanda': 0.0008180603,
 'Country_Djibouti': 0.00079789426,
 'Country_Dominica': 0.0051555987,
 'Country_Ecuador': 0.010357982,
 'Country_El Salvador': 0.0031913489,
 'Country_Equatorial Guinea': 0.0,
 'Co

In [66]:
_list = sorted(feature_importance_dict.items(), key=lambda x:x[1])
sorted_importance_dict = dict(_list)
for feat in sorted_importance_dict:
    print(feat, ':', sorted_importance_dict[feat])

Country_Antarctica : 0.0
Country_Armenia-Azerbaijan : 0.0
Country_Chile-Bolivia : 0.0
Country_Chile-Peru : 0.0
Country_Colombia-Ecuador : 0.0
Country_Equatorial Guinea : 0.0
Country_Ethiopia-Djibouti : 0.0
Country_Georgia : 0.0
Country_Germany : 0.0
Country_Grenada : 0.0
Country_Iran : 0.0
Country_Madagascar : 0.0
Country_South Africa : 0.0
Country_Syria-Jordan-Saudi Arabia : 0.0
Country_Uganda : 0.0
Country_Vietnam : 0.0
Region_Antarctica : 0.0
Region_Iceland and Arctic Ocean : 0.0
Recoded Dominant Rock Type_No Data (checked) : 0.00028688513
Country_Fiji : 0.00040586808
Country_Eritrea : 0.00044856366
Country_Saint Vincent and the Grenadines : 0.00048357563
Country_Saint Kitts and Nevis : 0.0004915696
Country_Greece : 0.0005447501
Country_Undersea Features : 0.00057950494
Country_Norway : 0.0006440203
Country_Spain : 0.0006572714
Country_Taiwan : 0.00070492394
Country_Cape Verde : 0.0007472073
Country_Djibouti : 0.00079789426
Country_China : 0.0008078856
Country_DR Congo-Rwanda : 0.00

In [67]:
plot_importance(tuned_classifier)
plt.show

NameError: name 'plot_importance' is not defined