In [1]:
### Run this notebook to compute feature permutation importances for the ProppLearner Gold dataset

In [6]:
from sklearn.inspection import permutation_importance
from sklearn import svm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from imblearn.over_sampling import SMOTE

%matplotlib inline

In [7]:
import sys
sys.path.append('../src')

from SVM_functions import param_selection, combine_features
from misc import get_file_names
from settings import settings

In [8]:
# Experiment configuration for the 'ProppLearner_from_gold' run, plus the two
# directory entries that the cells below pass to the loading helpers.
expSettings = settings['ProppLearner_from_gold']
featuresDir, corefsDir = expSettings['featuresDir'], expSettings['corefDir']

#### Permutation testing

In [11]:
# Directory names handed to combine_features: ten feature sets followed by the
# animacy and character label entries from the experiment settings.
featureDirs = ['CL', 'DP', 'NE', 'SS', 'TP', 'WN', 'CN', 'CD', 'QU', 'CP']
featureDirs.append(expSettings['animacy labels dir extention'])
featureDirs.append(expSettings['character labels dir extention'])
fileNames = get_file_names(corefsDir, '.p')

featuresAll = combine_features(featuresDir, featureDirs, fileNames).transpose()

# every column but the last is a model input; the last column is the target
X = featuresAll[:, :-1]
y = featuresAll[:, -1]

# Short display labels for the columns of X, used by the plotting cell below.
# Built once here instead of on every loop iteration.
features = np.array(['CL', 'DP', 'NE', 'SS', 'TP', 'WN', 'CN', 'CD', 'QU', 'CP', 'AN'])
assert X.shape[1] == len(features), "feature labels do not match the columns of X"

# model hyper-parameters stored in the settings for the over-sampling set-up
params = expSettings['parameters']['over']

# experiment dimensions
numFolds = 5          # folds per cross-validation pass
numRepeats = 20       # number of cross-validation passes
numPermutations = 5   # shuffles per feature (sklearn's default, made explicit)

# KFold is left unshuffled, so every pass uses the same folds; passes differ
# only through SMOTE's synthetic samples and the permutation shuffles.
kf = KFold(n_splits = numFolds)

# One seeded generator shared by SMOTE and permutation_importance: each call
# draws fresh values, but a full re-run reproduces the same numbers.
seed = 0
rng = np.random.RandomState(seed)

# results[feature, permutation, fold, repeat]
results = np.zeros((X.shape[1], numPermutations, numFolds, numRepeats))

for repNum in range(numRepeats):
    for foldNum, (train_idx, test_idx) in enumerate(kf.split(X)):

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # oversample the training fold only; the test fold stays untouched
        over = SMOTE(sampling_strategy = 0.8, random_state = rng)
        X_train, y_train = over.fit_resample(X_train, y_train)

        # define and train model
        model = svm.SVC(kernel = params['kernel'], C = params['C'], gamma = params['gamma']).fit(X_train, y_train)

        # permutation importance on the held-out fold
        perm_importance = permutation_importance(
            model, X_test, y_test,
            n_repeats = numPermutations,
            random_state = rng,
        )

        results[:, :, foldNum, repNum] = perm_importance['importances']


In [12]:
# Average away the trailing axes one at a time (last axis first), leaving one
# mean importance per feature. Because the loop stops once the array is 1-D,
# re-running this cell is a no-op instead of raising an axis error.
while results.ndim > 1:
    results = np.mean(results, axis = -1)

In [None]:
# Horizontal bar chart of the mean permutation importances, with the bars in
# ascending order of importance, saved as a PDF.
order = np.argsort(results)

fig, ax = plt.subplots()
ax.barh(features[order], results[order])
ax.set_xlabel("Permutation Importance")
ax.set_ylabel("Features")
fig.savefig('../results/feature_importance/feature_permutation_importance_new_CEN.pdf')

#### PPCA

In [9]:
# Directory names for this section: nine feature sets ('CN' is not included
# here) followed by the animacy and character label entries from the settings.
features = ['CL', 'DP', 'NE', 'SS', 'TP', 'WN', 'CD', 'QU', 'CP'] + [
    expSettings['animacy labels dir extention'],
    expSettings['character labels dir extention'],
]
fileNames = get_file_names(corefsDir, '.p')

featuresAll = combine_features(featuresDir, features, fileNames).transpose()

# inputs are every column but the last; the last column is the target
X, y = featuresAll[:, :-1], featuresAll[:, -1]

# model hyper-parameters stored in the settings for the over-sampling set-up
params = expSettings['parameters']['over']

# hold out 20% of the rows, then oversample the training portion only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=109)
over = SMOTE(sampling_strategy = 0.8)
X_train, y_train = over.fit_resample(X_train, y_train)

# fit the SVM with the configured kernel, C and gamma
classifier = svm.SVC(kernel = params['kernel'], C = params['C'], gamma = params['gamma'])
model = classifier.fit(X_train, y_train)


In [10]:
# Rebind X to the full matrix, target column included, so that the label
# column is part of the DataFrame built from X further down.
# NOTE: this shadows the feature-only X defined in the previous cell.
X = featuresAll

In [14]:
import pandas as pd

In [3]:
from sklearn.decomposition import PCA

import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [23]:
import pandas as pd

In [36]:
X.shape

(1633, 11)

In [37]:
# Wrap the matrix in a DataFrame so columns can be addressed by name when
# plotting.
plotX = pd.DataFrame(X)

In [38]:
# Name the columns after the entries of `features`.
# NOTE(review): `features` is also rebound to an array of short labels by the
# permutation-testing cell; the later cells look up 'character_labels_gold',
# so the list from the PCA set-up cell must be the one in scope here.
plotX.columns = features

In [39]:
plotX.head()

Unnamed: 0,CL,DP,NE,SS,TP,WN,CD,QU,CP,animacy_labels_gold,character_labels_gold
0,-0.253005,-0.187994,0.0,-0.1655,-0.196803,0.0,4.440892e-16,-0.151884,-0.165256,0.0,0.0
1,-0.075399,-0.187994,0.0,-0.1655,-0.056796,0.0,0.2705045,-0.151884,-0.165256,0.0,0.0
2,6.67363,7.609654,0.0,7.281991,7.083605,0.0,0.8695139,8.196136,7.201778,1.0,1.0
3,-0.253005,-0.187994,0.0,-0.1655,-0.196803,1.0,4.440892e-16,-0.151884,-0.165256,0.0,0.0
4,-0.164202,-0.187994,0.0,-0.1655,-0.196803,1.0,0.3117528,-0.151884,-0.165256,0.0,0.0


In [40]:
# One PCA estimator per target dimensionality: 1, 2 and 3 components.
pca_1d, pca_2d, pca_3d = (PCA(n_components=k) for k in (1, 2, 3))

In [41]:
# Fit each PCA model on the data with the gold character label column removed
# and keep the projected coordinates as DataFrames.
pcaInput = plotX.drop(['character_labels_gold'], axis=1)

PCs_1d = pd.DataFrame(pca_1d.fit_transform(pcaInput))
PCs_2d = pd.DataFrame(pca_2d.fit_transform(pcaInput))
PCs_3d = pd.DataFrame(pca_3d.fit_transform(pcaInput))

In [42]:
# Give every principal-component column a name that records both the
# component number and the dimensionality of the model it came from.
for pcFrame, pcNames in (
    (PCs_1d, ["PC1_1d"]),
    (PCs_2d, ['PC1_2d', 'PC2_2d']),
    (PCs_3d, ['PC1_3d', 'PC2_3d', 'PC3_3d']),
):
    pcFrame.columns = pcNames

In [43]:
# Append the principal-component columns to plotX. Any PC columns left over
# from an earlier run of this cell are dropped first, so re-running the cell
# does not pile up duplicate columns.
pcFrames = [PCs_1d, PCs_2d, PCs_3d]
pcColumnNames = [name for pcFrame in pcFrames for name in pcFrame.columns]
plotX = pd.concat(
    [plotX.drop(columns=pcColumnNames, errors='ignore')] + pcFrames,
    axis=1,
    join='inner',
)

In [44]:
# Constant column used as the y coordinate of the one-dimensional scatter
# plot, so all points lie on a single horizontal line.
plotX['dummy'] = 0

In [45]:
# Split the rows by the gold character label (0 vs 1); each group becomes its
# own trace in the plots below.
goldLabel = plotX['character_labels_gold']
cluster0 = plotX.loc[goldLabel == 0]
cluster1 = plotX.loc[goldLabel == 1]

In [46]:
# Set up plotly's offline notebook mode before the iplot calls below.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from the
# online CDN instead of embedding it in the notebook — confirm that network
# access is acceptable where this notebook is run.
init_notebook_mode(connected=True)

In [48]:
# 1-D view: first principal component on x, the constant 'dummy' column on y,
# one marker trace per gold-label group.
traceSpecs = [
    (cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)'),
    (cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)'),
]

data = [
    go.Scatter(
        x=group["PC1_1d"],
        y=group["dummy"],
        mode="markers",
        name=label,
        marker={'color': colour},
        text=None,
    )
    for group, label, colour in traceSpecs
]
trace1, trace2 = data

title = "Visualizing Clusters in One Dimension Using PCA"

# tick length and zero-line setting shared by both axes
axisStyle = {'ticklen': 5, 'zeroline': False}
layout = {
    'title': title,
    'xaxis': {'title': 'PC1', **axisStyle},
    'yaxis': {'title': '', **axisStyle},
}

fig = {'data': data, 'layout': layout}

iplot(fig)

In [49]:
# 2-D scatter plot of the first two principal components

In [51]:
# 2-D view: first two principal components of the two-component model, one
# marker trace per gold-label group.
traceSpecs = [
    (cluster0, "Cluster 0", 'rgba(255, 128, 255, 0.8)'),
    (cluster1, "Cluster 1", 'rgba(255, 128, 2, 0.8)'),
]

data = [
    go.Scatter(
        x=group["PC1_2d"],
        y=group["PC2_2d"],
        mode="markers",
        name=label,
        marker={'color': colour},
        text=None,
    )
    for group, label, colour in traceSpecs
]
trace1, trace2 = data

title = "Visualizing Clusters in Two Dimensions Using PCA"

# tick length and zero-line setting shared by both axes
axisStyle = {'ticklen': 5, 'zeroline': False}
layout = {
    'title': title,
    'xaxis': {'title': 'PC1', **axisStyle},
    'yaxis': {'title': 'PC2', **axisStyle},
}

fig = {'data': data, 'layout': layout}

iplot(fig)

In [53]:
# 3-D view: the three principal components of the three-component model, one
# marker trace per gold-label group.

# trace1 is for 'Cluster 0' (rows with gold character label 0)
trace1 = go.Scatter3d(
                    x = cluster0["PC1_3d"],
                    y = cluster0["PC2_3d"],
                    z = cluster0["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

# trace2 is for 'Cluster 1' (rows with gold character label 1)
trace2 = go.Scatter3d(
                    x = cluster1["PC1_3d"],
                    y = cluster1["PC2_3d"],
                    z = cluster1["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

data = [trace1, trace2]

title = "Visualizing Clusters in Three Dimensions Using PCA"

# Axes of 3-D traces are configured under layout.scene; top-level xaxis/yaxis
# only apply to 2-D cartesian plots, so the axis titles were never shown. The
# z axis now gets its title as well.
layout = dict(title = title,
              scene = dict(
                  xaxis = dict(title = 'PC1', ticklen = 5, zeroline = False),
                  yaxis = dict(title = 'PC2', ticklen = 5, zeroline = False),
                  zaxis = dict(title = 'PC3', ticklen = 5, zeroline = False)
              )
             )

fig = dict(data = data, layout = layout)

iplot(fig)