In [1]:
#data manipulation & storage
import pandas as pd
import numpy as np
#visualization
import seaborn as sb
from matplotlib import pyplot as plt
#performance and preprocessing
from sklearn import metrics, model_selection, preprocessing
from sklearn.utils import resample
#models
from sklearn import ensemble, tree, neighbors, naive_bayes, linear_model

In [2]:
def get_means(results_dictionary):
    '''
    Returns average of all keys inside dictionary. 
    results_dictionary must have keys with numeric values
    '''
    if type(results_dictionary) != dict: return None
    average_dict = {}
    for key, val in results_dictionary.items():
        key += ' average'
        average_dict[key] = val.mean()
    return average_dict

In [3]:
def print_dictionary_elts(dictionary):
    '''
    Pretty print key/value pairs of a dictionary
    '''
    for key,val in dictionary.items():
        print(f'{key}: {val}')

In [4]:
def run_model(model, x, y, score, cv):
    '''Run cross-validation on sklearn model
    Arguments:
        model - model to be run using cross-validation
        x - vector of independent variables/features
        y - vector of dependent variables/outcomes
        score - scoring metric(s) to estimate model performance
        cv - sklearn KFold cross-validation object
    Returns:
        dictionary with scoring metric summary based on specified cross-validation
    '''
    return model_selection.cross_validate(
        model,
        x,
        y,
        cv=cv,
        scoring=score)

In [5]:
cv = model_selection.KFold(n_splits=10, random_state=42, shuffle=True)

In [6]:
#scoring = ('accuracy', 'f1_weighted', 'roc_auc_ovo_weighted')

In [7]:
scoring = ('f1_weighted', 'accuracy')

In [8]:
df = pd.read_csv('imputed_volcanoes.csv')

In [9]:
df

Unnamed: 0,Volcano Number,Volcano Name,VEI,Eruption Category,Latitude,Longitude,Country,Recoded Volcano Type,Region,Recoded Dominant Rock Type,Tectonic Setting,Elevation (m)
0,264050,Sangeang Api,2.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
1,264050,Sangeang Api,2.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
2,264050,Sangeang Api,3.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
3,264050,Sangeang Api,2.0,Uncertain Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
4,264050,Sangeang Api,2.0,Confirmed Eruption,-8.200,119.070,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),1912
...,...,...,...,...,...,...,...,...,...,...,...,...
11109,327812,Red Hill,1.0,Confirmed Eruption,34.250,-108.830,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),2300
11110,327812,Red Hill,1.0,Confirmed Eruption,34.250,-108.830,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),2300
11111,327812,Red Hill,1.0,Confirmed Eruption,34.250,-108.830,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),2300
11112,283141,Nantaisan,4.0,Confirmed Eruption,36.765,139.491,Japan,Stratovolcano,"Japan, Taiwan, Marianas",Andesite / Basaltic Andesite,Subduction zone / Continental crust (>25 km),2486


In [10]:
df['VEI'].value_counts()

2.0    5434
1.0    1735
3.0    1491
0.0    1286
4.0     900
5.0     196
6.0      65
7.0       7
Name: VEI, dtype: int64

In [11]:
ind_cols = [
    #'Volcano Number',
    'Latitude',
    'Longitude',
    'Country',
    'Recoded Volcano Type',
    'Region',
    'Recoded Dominant Rock Type',
    'Tectonic Setting',
    'Elevation (m)'
]

In [12]:
dep_cols = [
    #'Volcano Number',
    'VEI'
]

In [13]:
scaler = preprocessing.MinMaxScaler()

In [14]:
# Get indendent variable columns
x = df[ind_cols]
x.shape

(11114, 8)

In [15]:
elevation = x['Elevation (m)'].values
x['Elevation'] = scaler.fit_transform(elevation.reshape(-1, 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Elevation'] = scaler.fit_transform(elevation.reshape(-1, 1))


In [16]:
latitude = x['Latitude'].values
x['Latitude'] = scaler.fit_transform(latitude.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Latitude'] = scaler.fit_transform(latitude.reshape(-1,1))


In [17]:
longitude = x['Longitude'].values
x['Longitude'] = scaler.fit_transform(longitude.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['Longitude'] = scaler.fit_transform(longitude.reshape(-1,1))


In [18]:
x= x.drop(['Elevation (m)'], axis=1)
x

Unnamed: 0,Latitude,Longitude,Country,Recoded Volcano Type,Region,Recoded Dominant Rock Type,Tectonic Setting,Elevation
0,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
1,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
2,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
3,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
4,0.424978,0.831706,Indonesia,Complex,Indonesia,Trachybasalt / Tephrite Basanite,Subduction zone / Continental crust (>25 km),0.605136
...,...,...,...,...,...,...,...,...
11109,0.685187,0.197858,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),0.635981
11110,0.685187,0.197858,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),0.635981
11111,0.685187,0.197858,United States,Volcanic field,Canada and Western USA,Basalt / Picro-Basalt,Rift zone / Continental crust (>25 km),0.635981
11112,0.700603,0.888502,Japan,Stratovolcano,"Japan, Taiwan, Marianas",Andesite / Basaltic Andesite,Subduction zone / Continental crust (>25 km),0.650767


In [19]:
x = pd.get_dummies(x)
x

Unnamed: 0,Latitude,Longitude,Elevation,Country_Antarctica,Country_Argentina,Country_Armenia,Country_Armenia-Azerbaijan,Country_Australia,Country_Burma (Myanmar),Country_Cameroon,...,Tectonic Setting_Intraplate / Continental crust (>25 km),Tectonic Setting_Intraplate / Intermediate crust (15-25 km),Tectonic Setting_Intraplate / Oceanic crust (< 15 km),Tectonic Setting_Rift zone / Continental crust (>25 km),Tectonic Setting_Rift zone / Intermediate crust (15-25 km),Tectonic Setting_Rift zone / Oceanic crust (< 15 km),Tectonic Setting_Subduction zone / Continental crust (>25 km),Tectonic Setting_Subduction zone / Crustal thickness unknown,Tectonic Setting_Subduction zone / Intermediate crust (15-25 km),Tectonic Setting_Subduction zone / Oceanic crust (< 15 km)
0,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.424978,0.831706,0.605136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11109,0.685187,0.197858,0.635981,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11110,0.685187,0.197858,0.635981,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11111,0.685187,0.197858,0.635981,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11112,0.700603,0.888502,0.650767,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [20]:
x.describe()

Unnamed: 0,Latitude,Longitude,Elevation,Country_Antarctica,Country_Argentina,Country_Armenia,Country_Armenia-Azerbaijan,Country_Australia,Country_Burma (Myanmar),Country_Cameroon,...,Tectonic Setting_Intraplate / Continental crust (>25 km),Tectonic Setting_Intraplate / Intermediate crust (15-25 km),Tectonic Setting_Intraplate / Oceanic crust (< 15 km),Tectonic Setting_Rift zone / Continental crust (>25 km),Tectonic Setting_Rift zone / Intermediate crust (15-25 km),Tectonic Setting_Rift zone / Oceanic crust (< 15 km),Tectonic Setting_Subduction zone / Continental crust (>25 km),Tectonic Setting_Subduction zone / Crustal thickness unknown,Tectonic Setting_Subduction zone / Intermediate crust (15-25 km),Tectonic Setting_Subduction zone / Oceanic crust (< 15 km)
count,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,...,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0,11114.0
mean,0.579266,0.590946,0.623861,0.005938,0.00063,0.00027,0.00036,0.001979,0.00018,0.00189,...,0.022764,0.002429,0.055156,0.022224,0.002609,0.072611,0.637304,0.045888,0.050027,0.088717
std,0.18889,0.320867,0.110151,0.076836,0.02509,0.016428,0.018969,0.044449,0.013414,0.043429,...,0.149157,0.049231,0.228294,0.147419,0.051017,0.259509,0.4808,0.209252,0.21801,0.284348
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.437623,0.284561,0.551793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.586375,0.65548,0.605454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.725466,0.888233,0.687893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Get dependent variable columns
y = df[dep_cols]
y.shape

(11114, 1)

In [22]:
# split data into train and validation sets.  Validation will be held out to estimate model generalizability
x_train, x_val, y_train, y_val = \
    model_selection.train_test_split(
        x,
        y,
        test_size = 0.30,
        random_state = 42) 

In [23]:
x_train.shape

(7779, 135)

In [24]:
y_train.shape

(7779, 1)

In [25]:
x_val.shape

(3335, 135)

In [26]:
y_val.shape

(3335, 1)

In [27]:
y_train.value_counts()

VEI
2.0    3808
1.0    1194
3.0    1042
0.0     892
4.0     641
5.0     148
6.0      48
7.0       6
dtype: int64

In [28]:
y_val.value_counts()

VEI
2.0    1626
1.0     541
3.0     449
0.0     394
4.0     259
5.0      48
6.0      17
7.0       1
dtype: int64

In [29]:
np.ravel(y_train)

array([2., 0., 0., ..., 1., 1., 2.])

In [30]:
y_train.shape

(7779, 1)

In [31]:
x_train.shape

(7779, 135)

In [32]:
## Now, construct models

In [33]:
# Figure out the best K for KNN
knn_cv = model_selection.KFold(n_splits=5, random_state=42, shuffle=True)
best_n, best_acc = None, 0
for n in range(40):
    n += 1
    knn = neighbors.KNeighborsClassifier(n_neighbors = n)
    results = get_means(run_model(knn, x_train, np.ravel(y_train), scoring, knn_cv))
    acc = results['test_accuracy average']
    print(f'Number of neighbors is {n}... & accuracy is {acc}')
    if acc > best_acc:
        best_n, best_acc = n, acc

Number of neighbors is 1... & accuracy is 0.546469139272105
Number of neighbors is 2... & accuracy is 0.5460864282230801
Number of neighbors is 3... & accuracy is 0.5802793873316856
Number of neighbors is 4... & accuracy is 0.5914645516990552
Number of neighbors is 5... & accuracy is 0.604060870068359
Number of neighbors is 6... & accuracy is 0.6075329602658313
Number of neighbors is 7... & accuracy is 0.6117751841228644
Number of neighbors is 8... & accuracy is 0.6108770943717505
Number of neighbors is 9... & accuracy is 0.6156322171616562
Number of neighbors is 10... & accuracy is 0.6110045545094603
Number of neighbors is 11... & accuracy is 0.61267459641756
Number of neighbors is 12... & accuracy is 0.6111330892138305
Number of neighbors is 13... & accuracy is 0.616660825432513
Number of neighbors is 14... & accuracy is 0.6193605501781301
Number of neighbors is 15... & accuracy is 0.6160188131824531
Number of neighbors is 16... & accuracy is 0.6117763413484985
Number of neighbors is

In [34]:
print(f'Best n is {best_n} and best accuracy is {best_acc}')

Best n is 14 and best accuracy is 0.6193605501781301


In [35]:
lr = linear_model.LogisticRegression(
    penalty='l2',
    C=0.5,
    random_state=42,
    multi_class='multinomial',
    max_iter = 1000
)
lr

LogisticRegression(C=0.5, max_iter=1000, multi_class='multinomial',
                   random_state=42)

In [36]:
nb = naive_bayes.MultinomialNB(
    alpha=5.0,
    fit_prior=True)
nb

MultinomialNB(alpha=5.0)

In [37]:
knn = neighbors.KNeighborsClassifier(n_neighbors = best_n)
knn

KNeighborsClassifier(n_neighbors=14)

In [38]:
tr = tree.DecisionTreeClassifier(
    random_state=42,
    min_samples_leaf=5)
tr

DecisionTreeClassifier(min_samples_leaf=5, random_state=42)

In [39]:
rf = ensemble.RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=5,
    random_state=42)
rf

RandomForestClassifier(min_samples_leaf=5, n_estimators=1000, random_state=42)

In [40]:
gbt = ensemble.GradientBoostingClassifier(
    n_estimators=500,
    min_samples_leaf=5,
    random_state=42,
    warm_start=True)
gbt

GradientBoostingClassifier(min_samples_leaf=5, n_estimators=500,
                           random_state=42, warm_start=True)

In [41]:
lr_results = run_model(lr, x_train, np.ravel(y_train), scoring, cv)
get_means(lr_results)

{'fit_time average': 2.9664016723632813,
 'score_time average': 0.011438965797424316,
 'test_f1_weighted average': 0.4671224969362246,
 'test_accuracy average': 0.551995017419182}

In [42]:
nb_results = run_model(nb, x_train, np.ravel(y_train), scoring, cv)
get_means(nb_results)

{'fit_time average': 0.04517199993133545,
 'score_time average': 0.01067206859588623,
 'test_f1_weighted average': 0.4557218711597611,
 'test_accuracy average': 0.540684294283266}

In [43]:
knn_results = run_model(knn, x_train, np.ravel(y_train), scoring, cv)
means = get_means(knn_results)
means

{'fit_time average': 0.02775723934173584,
 'score_time average': 0.32340245246887206,
 'test_f1_weighted average': 0.5886252200706085,
 'test_accuracy average': 0.6161465725733077}

In [44]:
tr_results = run_model(tr, x_train, np.ravel(y_train), scoring, cv)
get_means(tr_results)

{'fit_time average': 0.18310353755950928,
 'score_time average': 0.009275317192077637,
 'test_f1_weighted average': 0.6083423580821525,
 'test_accuracy average': 0.6335015698768912}

In [45]:
rf_results = run_model(rf, x_train, np.ravel(y_train), scoring, cv)
get_means(rf_results)

{'fit_time average': 19.82608096599579,
 'score_time average': 0.487879204750061,
 'test_f1_weighted average': 0.5765478260106194,
 'test_accuracy average': 0.6275876500812234}

In [46]:
gbt_results = run_model(gbt, x_train, np.ravel(y_train), scoring, cv)
get_means(gbt_results)

{'fit_time average': 138.0058955192566,
 'score_time average': 0.08118314743041992,
 'test_f1_weighted average': 0.618353219924066,
 'test_accuracy average': 0.6469985409574098}