In [0]:
import numpy as np
import pandas as pd
import os
import io
from sklearn.preprocessing import LabelEncoder
import time

import cv2
from PIL import Image

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import cohen_kappa_score

import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

from pprint import pprint
from joblib import dump, load

import warnings

%config InlineBackend.figure_format = 'retina'
%matplotlib inline
pd.options.display.max_rows = 5
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [5]:
ls gdrive/'My Drive'/MURA_files/

[0m[01;34mcam_images[0m/  [01;34mdata[0m/  [01;34mmodels[0m/


In [4]:
!cp -r /content/gdrive/My\ Drive/MURA_files/data/MURA-v1.1.zip MURA-v1.1.zip
!unzip -q MURA-v1.1.zip
!cp -r /content/gdrive/My\ Drive/MURA_files/data/processed processed

cp: cannot open '/content/gdrive/My Drive/MURA_files/data/processed/train_all.gsheet' for reading: Operation not supported
cp: cannot open '/content/gdrive/My Drive/MURA_files/data/processed/train_wrist.gsheet' for reading: Operation not supported


In [0]:
cp -r /content/gdrive/My\ Drive/MURA_files/models/rf_*.joblib models

In [0]:
pwd

'/content'

In [0]:
ls

[0m[01;34mgdrive[0m/  [01;34mMURA-v1.1[0m/  MURA-v1.1.zip  [01;34mprocessed[0m/  [01;34msample_data[0m/


In [0]:
dpath = '/content/'
flinks_path = '/content/processed/'

## Body part - Hand

In [6]:
data_hand_train = pd.read_csv(flinks_path + 'train_hand.csv')
data_hand_valid = pd.read_csv(flinks_path + 'valid_hand.csv')

print(str(data_hand_train.shape) + '\n' + str(data_hand_train.columns))

(5543, 6)
Index(['patient', 'study_id', 'img', 'path', 'part', 'result'], dtype='object')


In [0]:
# Iteratively read and process images
def process_img(path, thresholding=True):
    img_name = os.path.join(dpath, path)
        
    image = cv2.imread(img_name, 0)
    
    # thresholding
    if thresholding:
        image = cv2.adaptiveThreshold(image,255,cv2.ADAPTIVE_THRESH_MEAN_C,cv2.THRESH_BINARY,11,2) 
    
    image = cv2.resize(image, (227, 227)) / 255
    return(image.reshape(1, 227 * 227))

In [8]:
# Construct training data (currently with thresholding)

# for randomly choosing select number later
N = data_hand_train.shape[0]
rperm = np.random.permutation(N)

# n = 1000
n = N

img_shp = (227, 227)
P = img_shp[0] * img_shp[1]

Xtrain = np.zeros((n, P))
ytrain = data_hand_train.loc[rperm[:n], 'result']

for i, idx in enumerate(rperm[:n]):
    path = data_hand_train.loc[i, 'path']
    img = process_img(path, thresholding=True)
    
    Xtrain[i, :] = img

Xtrain.shape

(5543, 51529)

In [9]:
Xvalid = np.zeros((len(data_hand_valid), P))
yvalid = data_hand_valid.loc[:, 'result']

for i in range(len(data_hand_valid)):
    path = data_hand_train.loc[i, 'path']
    img = process_img(path, thresholding=True)
    
    Xvalid[i, :] = img

Xvalid.shape

(460, 51529)

### WITHOUT ANY HYPER-PARAMETER TUNING

#### WITH ADAPTIVE THRESHOLDING

In [0]:
# Fit a model without any hyper-parameter defaults
rf_hand = RandomForestClassifier()
rf_hand.fit(Xtrain, ytrain);
yhat_train = rf_hand.predict(Xtrain)

In [51]:
print("Training metrics:")
print(sklearn.metrics.classification_report(y_true=ytrain, y_pred=yhat_train))

Training metrics:
              precision    recall  f1-score   support

    abnormal       0.99      0.98      0.99      1484
      normal       0.99      1.00      1.00      4059

   micro avg       0.99      0.99      0.99      5543
   macro avg       0.99      0.99      0.99      5543
weighted avg       0.99      0.99      0.99      5543



In [53]:
# Predict on validation set
yhat_valid = rf_hand.predict(Xvalid)

# Accuracy on validation set
print("Accuracy on validation set:", rf_hand.score(Xvalid, yvalid))

print("Testing metrics")
print(sklearn.metrics.classification_report(y_true=yvalid, y_pred=yhat_valid))

Accuracy on validation set: 0.508695652173913
Testing metrics
              precision    recall  f1-score   support

    abnormal       0.33      0.19      0.24       189
      normal       0.56      0.73      0.64       271

   micro avg       0.51      0.51      0.51       460
   macro avg       0.45      0.46      0.44       460
weighted avg       0.47      0.51      0.47       460



#### WITHOUT ADAPTIVE THRESHOLIDING

In [0]:
# Fit a model without any hyper-parameter defaults
rf = RandomForestClassifier()
rf.fit(Xtrain, ytrain);
yhat_train = rf.predict(Xtrain)

In [0]:
# Predict on validation set
yhat_valid = rf.predict(Xvalid)

# Accuracy on validation set
print("Accuracy on validation set:", rf.score(Xvalid, yvalid))

print("Testing metrics")
print(sklearn.metrics.classification_report(y_true=yvalid, y_pred=yhat_valid))

Accuracy on validation set: 0.55
Testing metrics
              precision    recall  f1-score   support

    abnormal       0.43      0.28      0.34       189
      normal       0.60      0.74      0.66       271

   micro avg       0.55      0.55      0.55       460
   macro avg       0.51      0.51      0.50       460
weighted avg       0.53      0.55      0.53       460



### WITH PARAMETER TUNING

Without any hyper-parameters, the model performs better without adaptive thresholding.

In [0]:
# Parameters of Random Forests
pprint(rf.get_params(deep=True))

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [16]:
# Give parameter ranges for hyper parameters

# number of trees
n_estimators = np.linspace(start=200, stop=2000, num=10, dtype=int)

# max number of features to consider at each split
# max_features = ['auto', 'sqrt', 'log2', None]
max_features = ['auto', 'sqrt', 'log2']

# max depth of each tree
max_depth = np.linspace(start=10, stop=150, num=15, dtype=int)
max_depth = np.append(max_depth, None)

# min samples at node (non-leaf) to consider for splitting
min_samples_split = [2, 5, 10, 20]

# min samples needed at each leaf node
min_samples_leaf = [2, 5, 10]

# bootstrapping allowed?
bootstrap = [True, False]

# index to grow the trees
criterion = ['gini', 'entropy']

# reuse previous solution or start again?
warm_start = [False, True]

# max leaf nodes (a way to prevent overfitting)
max_leaf_nodes = np.linspace(start=100, stop=1e+5, num=20, dtype=int)
max_leaf_nodes = np.append(max_leaf_nodes, None)

# whether to use out-of-bag samples to score
oob_score = [False, True]


# CREATE DICTIONARY OF PARAMS RANGES
random_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth,
    'min_samples_split' : min_samples_split,
    'min_samples_leaf' : min_samples_leaf,
#     'bootstrap' : bootstrap,
#     'criterion' : criterion,
#     'warm_start' : warm_start,
#     'max_leaf_nodes' : max_leaf_nodes, 
    'oob_score' : oob_score
}

pprint(random_grid)

{'max_depth': array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150,
       None], dtype=object),
 'max_features': ['auto', 'sqrt', 'log2'],
 'min_samples_leaf': [2, 5, 10],
 'min_samples_split': [2, 5, 10, 20],
 'n_estimators': array([ 200,  400,  600,  800, 1000, 1200, 1400, 1600, 1800, 2000]),
 'oob_score': [False, True]}


In [0]:
# Using RandomizedSearchCV
%%time

# classifier
rf_hand = RandomForestClassifier()

rf_hand_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, 
                               n_iter=100, n_jobs=1, cv=2, verbose=2, 
                               random_state=2)

rf_hand_random.fit(Xtrain, ytrain)

Fitting 2 folds for each of 100 candidates, totalling 200 fits
[CV] oob_score=False, n_estimators=1800, min_samples_split=10, min_samples_leaf=5, max_features=auto, max_depth=110 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  oob_score=False, n_estimators=1800, min_samples_split=10, min_samples_leaf=5, max_features=auto, max_depth=110, total=  40.2s
[CV] oob_score=False, n_estimators=1800, min_samples_split=10, min_samples_leaf=5, max_features=auto, max_depth=110 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   40.7s remaining:    0.0s


[CV]  oob_score=False, n_estimators=1800, min_samples_split=10, min_samples_leaf=5, max_features=auto, max_depth=110, total=  39.6s
[CV] oob_score=True, n_estimators=1600, min_samples_split=2, min_samples_leaf=10, max_features=sqrt, max_depth=40 
[CV]  oob_score=True, n_estimators=1600, min_samples_split=2, min_samples_leaf=10, max_features=sqrt, max_depth=40, total=  55.7s
[CV] oob_score=True, n_estimators=1600, min_samples_split=2, min_samples_leaf=10, max_features=sqrt, max_depth=40 
[CV]  oob_score=True, n_estimators=1600, min_samples_split=2, min_samples_leaf=10, max_features=sqrt, max_depth=40, total=  55.8s
[CV] oob_score=True, n_estimators=1800, min_samples_split=20, min_samples_leaf=5, max_features=auto, max_depth=100 
[CV]  oob_score=True, n_estimators=1800, min_samples_split=20, min_samples_leaf=5, max_features=auto, max_depth=100, total= 1.1min
[CV] oob_score=True, n_estimators=1800, min_samples_split=20, min_samples_leaf=5, max_features=auto, max_depth=100 
[CV]  oob_score

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed: 91.8min finished


CPU times: user 1h 30min 3s, sys: 3min 18s, total: 1h 33min 21s
Wall time: 1h 33min 27s


In [0]:
rf_hand_random.best_params_

{'max_depth': 110,
 'max_features': 'auto',
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 1800,
 'oob_score': False}

In [0]:
rf_hand.fit(Xtrain, ytrain)
rf_hand.score(Xvalid, yvalid)

0.5021739130434782

In [0]:
rf_hand_random_best = rf_hand_random.best_estimator_.fit(Xtrain, ytrain)
rf_hand_random_best.score(Xvalid, yvalid)

0.5804347826086956

In [0]:
rf_hand_random_best.oob_score_

AttributeError: ignored

In [0]:
fname = 'rf_hand_random.joblib'
dump(rf_hand_random, fname)

['rf_random_hand.joblib']

In [0]:
  !cp {fname} /content/gdrive/My\ Drive/MURA_files/models/

In [22]:
ls gdrive/My\ Drive/MURA_files/models/

[0m[01;34mmodels[0m/                svm_finger.joblib   svm_humerus.joblib
rf_random_hand.joblib  svm_forearm.joblib  svm_shoulder.joblib
svm_elbow.joblib       svm_hand.joblib     svm_wrist.joblib


In [0]:
rf_hand_random = load('/content/gdrive/My Drive/MURA_files/models/rf_hand_random.joblib')

In [12]:
# Best parameters
rf_hand_random.best_params_

{'max_depth': 110,
 'max_features': 'auto',
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 1800,
 'oob_score': False}

In [17]:
random_grid

{'max_depth': array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150,
        None], dtype=object),
 'max_features': ['auto', 'sqrt', 'log2'],
 'min_samples_leaf': [2, 5, 10],
 'min_samples_split': [2, 5, 10, 20],
 'n_estimators': array([ 200,  400,  600,  800, 1000, 1200, 1400, 1600, 1800, 2000]),
 'oob_score': [False, True]}

In [18]:
# Give parameter ranges for hyper parameters

# number of trees
n_estimators = np.linspace(start=1700, stop=1900, num=3, dtype=int)

# max number of features to consider at each split
# max_features = ['auto', 'sqrt', 'log2', None]
# max_features = ['auto', 'sqrt', 'log2']

# max depth of each tree
max_depth = np.linspace(start=105, stop=115, num=3, dtype=int)
# max_depth = np.append(max_depth, None)

# min samples at node (non-leaf) to consider for splitting
min_samples_split = [8, 10, 12]

# min samples needed at each leaf node
min_samples_leaf = [4, 5, 6]

# bootstrapping allowed?
bootstrap = [True, False]

# index to grow the trees
criterion = ['gini', 'entropy']

# reuse previous solution or start again?
warm_start = [False, True]

# max leaf nodes (a way to prevent overfitting)
max_leaf_nodes = np.linspace(start=100, stop=1e+5, num=20, dtype=int)
max_leaf_nodes = np.append(max_leaf_nodes, None)

# whether to use out-of-bag samples to score
oob_score = [False, True]


# CREATE DICTIONARY OF PARAMS RANGES
param_grid = {
    'n_estimators' : n_estimators,
#     'max_features' : max_features,
    'max_depth' : max_depth,
    'min_samples_split' : min_samples_split,
    'min_samples_leaf' : min_samples_leaf,
#     'bootstrap' : bootstrap,
#     'criterion' : criterion,
#     'warm_start' : warm_start,
#     'max_leaf_nodes' : max_leaf_nodes, 
#     'oob_score' : oob_score
}

pprint(param_grid)

{'max_depth': array([105, 110, 115]),
 'min_samples_leaf': [4, 5, 6],
 'min_samples_split': [8, 10, 12],
 'n_estimators': array([1700, 1800, 1900])}


In [20]:
# Using GridSearchCV to finetune the parameters

%%time

# classifier
rf_hand = RandomForestClassifier()

rf_hand_grid = GridSearchCV(estimator=rf_hand, param_grid=param_grid, 
                            n_jobs=1, cv=2, verbose=3)
n=1000
rf_hand_grid.fit(Xtrain[rperm[:n],:], ytrain[rperm[:n]])

Fitting 2 folds for each of 81 candidates, totalling 162 fits
[CV] max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1700 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1700, score=0.728, total=  35.4s
[CV] max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1700 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.9s remaining:    0.0s


[CV]  max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1700, score=0.728, total=  36.1s
[CV] max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1800 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV]  max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1800, score=0.728, total=  37.1s
[CV] max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1800 
[CV]  max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1800, score=0.728, total=  38.5s
[CV] max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1900 
[CV]  max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1900, score=0.728, total=  39.2s
[CV] max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1900 
[CV]  max_depth=105, min_samples_leaf=4, min_samples_split=8, n_estimators=1900, score=0.728, total=  40.1s
[CV] max_depth=105, min_samples_leaf=4, min_samples_split=10, n_estimators=1700 
[CV]  max_depth=105, min_samples_leaf=4, min_samples_split=10, n_estimators=1700, score=0.728, total=  34.8s
[CV] max_depth=105, min_samples_leaf=4, min_samples_split=10, n_estimators=1700 
[CV]  max_depth=105, min_samples_leaf=4, min_samples_spli

[Parallel(n_jobs=1)]: Done 162 out of 162 | elapsed: 100.0min finished


CPU times: user 1h 41min 18s, sys: 7.51 s, total: 1h 41min 25s
Wall time: 1h 41min 29s


In [21]:
rf_hand_random.best_params_

{'max_depth': 110,
 'max_features': 'auto',
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 1800,
 'oob_score': False}

In [22]:
rf_hand_grid.best_params_

{'max_depth': 105,
 'min_samples_leaf': 4,
 'min_samples_split': 8,
 'n_estimators': 1700}

In [0]:
rf_hand_grid_best = RandomForestClassifier(n_estimators=1700, 
                                           max_depth=105, 
                                           min_samples_leaf=4, 
                                           min_samples_split=8, verbose=3)

In [20]:
rf_hand_grid_best.fit(Xtrain[rperm[:n],:], ytrain[rperm[:n]])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 1700


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


building tree 2 of 1700


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


building tree 3 of 1700
building tree 4 of 1700
building tree 5 of 1700
building tree 6 of 1700
building tree 7 of 1700
building tree 8 of 1700
building tree 9 of 1700
building tree 10 of 1700
building tree 11 of 1700
building tree 12 of 1700
building tree 13 of 1700
building tree 14 of 1700
building tree 15 of 1700
building tree 16 of 1700
building tree 17 of 1700
building tree 18 of 1700
building tree 19 of 1700
building tree 20 of 1700
building tree 21 of 1700
building tree 22 of 1700
building tree 23 of 1700
building tree 24 of 1700
building tree 25 of 1700
building tree 26 of 1700
building tree 27 of 1700
building tree 28 of 1700
building tree 29 of 1700
building tree 30 of 1700
building tree 31 of 1700
building tree 32 of 1700
building tree 33 of 1700
building tree 34 of 1700
building tree 35 of 1700
building tree 36 of 1700
building tree 37 of 1700
building tree 38 of 1700
building tree 39 of 1700
building tree 40 of 1700
building tree 41 of 1700
building tree 42 of 1700
buildin

[Parallel(n_jobs=1)]: Done 1700 out of 1700 | elapsed: 12.4min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=105, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=8,
            min_weight_fraction_leaf=0.0, n_estimators=1700, n_jobs=None,
            oob_score=False, random_state=None, verbose=3,
            warm_start=False)

In [21]:
rf_hand_grid_best.score(Xvalid, yvalid)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 1700 out of 1700 | elapsed:    0.5s finished


0.4282608695652174

In [0]:
fname = 'rf_hand_random_best.joblib'
dump(rf_hand_random_best, fname)
!cp {fname} /content/gdrive/My\ Drive/MURA_files/models/

In [0]:
fname2 = 'rf_hand_grid_best.joblib'
dump(rf_hand_grid_best, fname2)
!cp {fname2} /content/gdrive/My\ Drive/MURA_files/models/

In [18]:
rf_hand_random_best = rf_hand_random.best_estimator_
rf_hand_random_best.fit(Xtrain[rperm[:n],:], ytrain[rperm[:n]])
rf_hand_random_best.score(Xvalid, yvalid)

0.4260869565217391

## Body part - All

In [0]:
# Iteratively read and process images
def process_img(path, thresholding=True):
    img_name = os.path.join(dpath, path)
        
    image = cv2.imread(img_name, 0)
    
    # thresholding
    if thresholding:
        image = cv2.adaptiveThreshold(image,255,cv2.ADAPTIVE_THRESH_MEAN_C,cv2.THRESH_BINARY,11,2) 
    
    image = cv2.resize(image, (227, 227))
    return(image.reshape(1, 227 * 227))

In [12]:
# Give parameter ranges for hyper parameters

# number of trees
n_estimators = np.linspace(start=200, stop=2000, num=10, dtype=int)

# max number of features to consider at each split
# max_features = ['auto', 'sqrt', 'log2', None]
max_features = ['auto', 'sqrt', 'log2']

# max depth of each tree
max_depth = np.linspace(start=80, stop=150, num=7, dtype=int)
# max_depth = np.append(max_depth, None)

# min samples at node (non-leaf) to consider for splitting
min_samples_split = [5, 10, 20]

# min samples needed at each leaf node
min_samples_leaf = [2, 5, 10]

# bootstrapping allowed?
bootstrap = [True, False]

# index to grow the trees
criterion = ['gini', 'entropy']

# reuse previous solution or start again?
warm_start = [False, True]

# max leaf nodes (a way to prevent overfitting)
max_leaf_nodes = np.linspace(start=100, stop=1e+5, num=20, dtype=int)
max_leaf_nodes = np.append(max_leaf_nodes, None)

# whether to use out-of-bag samples to score
oob_score = [False, True]


# CREATE DICTIONARY OF PARAMS RANGES
random_grid = {
    'n_estimators' : n_estimators,
#     'max_features' : max_features,
    'max_depth' : max_depth,
    'min_samples_split' : min_samples_split,
    'min_samples_leaf' : min_samples_leaf,
#     'bootstrap' : bootstrap,
#     'criterion' : criterion,
#     'warm_start' : warm_start,
#     'max_leaf_nodes' : max_leaf_nodes, 
#     'oob_score' : oob_score
}

pprint(random_grid)

{'max_depth': array([ 80,  91, 103, 115, 126, 138, 150]),
 'min_samples_leaf': [2, 5, 10],
 'min_samples_split': [5, 10, 20],
 'n_estimators': array([ 200,  400,  600,  800, 1000, 1200, 1400, 1600, 1800, 2000])}


In [0]:
rf_hand = None
rf_hand_random = None
rf_hand_random_best = None

In [0]:
rf_all_no_tuning = {'hand' : rf_hand}
rf_all_random = {'hand' : rf_hand_random}
rf_all_random_best = {'hand' : rf_hand_random_best}
# rf_all_grid = {'hand' : rf_hand_grid}

In [0]:
# parts = ['hand', 'elbow', 'finger', 'forearm', 'humerus', 'shoulder', 'wrist']
parts = ['wrist']

In [53]:
%%time

for part in parts:
    if part == 'hand':
        continue
    
    data_train = pd.read_csv(flinks_path + 'train_' + part + '.csv')
    data_valid = pd.read_csv(flinks_path + 'valid_' + part + '.csv')

    print(str(data_train.shape) + '\n' + str(data_valid.columns))
    
    
    # ------------------
    # Construct training data (currently with thresholding)

    # for randomly choosing select number later
    N = data_train.shape[0]
    rperm = np.random.permutation(N)

    n = 1000
    # n = N

    img_shp = (227, 227)
    P = img_shp[0] * img_shp[1]

    Xtrain = np.zeros((N, P))
    ytrain = data_train.loc[:, 'result']

    for i, idx in enumerate(rperm[:n]):
        path = data_train.loc[i, 'path']
        img = process_img(path, thresholding=True)

        Xtrain[i, :] = img
    Xtrain_cv = Xtrain[rperm[:n], :]
    ytrain_cv = ytrain[rperm[:n]]
    
    # Construct validation data
    Xvalid = np.zeros((len(data_valid), P))
    yvalid = data_valid.loc[:, 'result']

    for i in range(len(data_valid)):
        path = data_valid.loc[i, 'path']
        img = process_img(path, thresholding=True)

        Xvalid[i, :] = img
        
    
    # -------------------
    
    print("\nFor part,", part.upper())
    
    # Fit without tuning for comparison
    rf_part = RandomForestClassifier()
    rf_part.fit(Xtrain, ytrain)
    rf_all_no_tuning[part] = rf_part
    
    accu_tr = rf_part.score(Xtrain, ytrain)
    
    print("\nWithout tuning,")
    print("Accuracy on training set,", accu_tr)
    fname = 'rf_' + part + '.joblib'
    dump(rf_part, fname)
    
    accu_va = rf_part.score(Xvalid, yvalid)
    print("\nWithout tuning,")
    print("Accuracy on validation set,", accu_va)
    
    
    # Run RandomizedSearchCV
    rf_ = RandomForestClassifier()
    rf_part_random = RandomizedSearchCV(estimator=rf_, 
                                        param_distributions=random_grid, 
                                        n_iter=20, n_jobs=1, cv=3, verbose=3, 
                                        random_state=2)
    rf_part_random.fit(Xtrain_cv, ytrain_cv)
    rf_all_random[part] = rf_part_random
    
    print("Best hyper-parameters for", part.upper())
    print(rf_part_random.best_params_)
    
    fname2 = 'rf_' + part + '_random_best.joblib'
    dump(rf_part_random.best_estimator_, fname2)
    

(9752, 6)
Index(['patient', 'study_id', 'img', 'path', 'part', 'result'], dtype='object')

For part, WRIST

Without tuning,
Accuracy on training set, 0.6843724364232978

Without tuning,
Accuracy on validation set, 0.44613050075872535
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] n_estimators=1200, min_samples_split=10, min_samples_leaf=2, max_depth=126 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=1200, min_samples_split=10, min_samples_leaf=2, max_depth=126, score=0.6976047904191617, total= 4.8min
[CV] n_estimators=1200, min_samples_split=10, min_samples_leaf=2, max_depth=126 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.8min remaining:    0.0s


[CV]  n_estimators=1200, min_samples_split=10, min_samples_leaf=2, max_depth=126, score=0.7155688622754491, total= 6.1min
[CV] n_estimators=1200, min_samples_split=10, min_samples_leaf=2, max_depth=126 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 10.9min remaining:    0.0s


[CV]  n_estimators=1200, min_samples_split=10, min_samples_leaf=2, max_depth=126, score=0.6837349397590361, total= 5.8min
[CV] n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_depth=138 
[CV]  n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_depth=138, score=0.6976047904191617, total= 4.0min
[CV] n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_depth=138 
[CV]  n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_depth=138, score=0.7155688622754491, total= 4.1min
[CV] n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_depth=138 
[CV]  n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_depth=138, score=0.6837349397590361, total= 3.9min
[CV] n_estimators=1600, min_samples_split=10, min_samples_leaf=2, max_depth=91 
[CV]  n_estimators=1600, min_samples_split=10, min_samples_leaf=2, max_depth=91, score=0.6976047904191617, total= 7.9min
[CV] n_estimators=1600, min_samples_split=10, min_samples_leaf=2, max_depth=91 


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 139.8min finished


Best hyper-parameters for WRIST
{'n_estimators': 1200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 126}
CPU times: user 2h 31min 30s, sys: 11.2 s, total: 2h 31min 41s
Wall time: 2h 31min 52s


In [0]:
rf_wrist_random = rf_part_random

In [0]:
rf_finger_random_best = RandomForestClassifier(n_estimators=1200,
                                               min_samples_split=10,
                                               min_samples_leaf=2, 
                                               max_depth=126)

In [16]:
rf_finger_random_best.fit(Xtrain_cv, ytrain_cv)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=126, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
rf_wrist_random_best.score(Xvalid, yvalid)

0.5357917570498916

In [55]:
fname = 'rf_wrist_random.joblib'
dump(rf_wrist_random, fname)

['rf_wrist_random.joblib']

In [0]:
rf_wrist_random_best = rf_wrist_random.best_estimator_

In [58]:
rf_wrist_random_best.fit(Xtrain_cv, ytrain_cv)
rf_wrist_random_best.score(Xtrain_cv, ytrain_cv)

0.699

In [49]:
fname2 = 'rf_wrist_random_best.joblib'
dump(rf_wrist_random_best, fname2)

['rf_shoulder_random_best.joblib']

In [59]:
rf_wrist_random_best.score(Xvalid, yvalid)

0.44764795144157815

In [50]:
rf_elbow_random_best.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=126, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [51]:
rf_elbow_random_best.score(Xvalid, yvalid)

0.5053763440860215

In [0]:
cp *.joblib /content/gdrive/My\ Drive/MURA_files/models/

In [0]:
from sklearn.metrics import cohen_kappa_score
yhat_valid = rf_wrist_random_best.predict(Xvalid)

In [63]:
cohen_kappa_score(yvalid, yhat_valid)

0.0

## Reducing overfitting

### hand

In [0]:
# Iteratively read and process images
def process_img(path, thresholding=True):
    img_name = os.path.join(dpath, path)
        
    image = cv2.imread(img_name, 0)
    
    # thresholding
    if thresholding:
        image = cv2.adaptiveThreshold(image,255,cv2.ADAPTIVE_THRESH_MEAN_C,cv2.THRESH_BINARY,11,2) 
    
    image = cv2.resize(image, (227, 227)) / 255
    return(image.reshape(1, 227 * 227))

In [0]:
fname = 'models/rf_hand_random.joblib'
rf_hand_random = load(fname)

In [23]:
rf_hand_random_best = rf_hand_random.best_estimator_
rf_hand_random_best

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=110, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
data_hand_train = pd.read_csv(flinks_path + 'train_hand.csv')
data_hand_valid = pd.read_csv(flinks_path + 'valid_hand.csv')

print(str(data_hand_train.shape) + '\n' + str(data_hand_train.columns))

(5543, 6)
Index(['patient', 'study_id', 'img', 'path', 'part', 'result'], dtype='object')


In [16]:
# Construct training data (currently with thresholding)

# for randomly choosing select number later
N = data_hand_train.shape[0]
rperm = np.random.permutation(N)

# n = 1000
n = N

img_shp = (227, 227)
P = img_shp[0] * img_shp[1]

Xtrain = np.zeros((n, P))
ytrain = data_hand_train.loc[rperm[:n], 'result']

for i, idx in enumerate(rperm[:n]):
    path = data_hand_train.loc[i, 'path']
    img = process_img(path, thresholding=True)
    
    Xtrain[i, :] = img

Xtrain.shape

(5543, 51529)

In [21]:
Xvalid = np.zeros((len(data_hand_valid), P))
yvalid = data_hand_valid.loc[:, 'result']

for i in range(len(data_hand_valid)):
    path = data_hand_train.loc[i, 'path']
    img = process_img(path, thresholding=True)
    
    Xvalid[i, :] = img

Xvalid.shape

(460, 51529)

In [19]:
rf_hand_random_best.fit(Xtrain[rperm[:n],:], ytrain[rperm[:n]])
rf_hand_random_best.score(Xtrain, ytrain)

0.6061699440736064

In [36]:
rf_hand_random_best.score(Xvalid, yvalid)

0.4260869565217391

In [0]:
hand = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=110, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=15, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [41]:
hand.fit(Xtrain[rperm[:n],:], ytrain[rperm[:n]])
hand.score(Xtrain, ytrain)

0.7266823020025257

In [42]:
hand.score(Xvalid, yvalid)

0.5934782608695652

In [43]:
yhat_valid = hand.predict(Xvalid)
cohen_kappa_score(yvalid, yhat_valid)

0.025291211530616864

### elbow

In [0]:
fname = 'models/rf_elbow_random.joblib'
rf_elbow_random = load(fname)

In [0]:
elbow = rf_elbow_random.best_estimator_

In [59]:
elbow

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=126, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [0]:
elbow = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=126, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [48]:
data_elbow_train = pd.read_csv(flinks_path + 'train_elbow.csv')
data_elbow_valid = pd.read_csv(flinks_path + 'valid_elbow.csv')

print(str(data_elbow_train.shape) + '\n' + str(data_elbow_train.columns))

(4931, 6)
Index(['patient', 'study_id', 'img', 'path', 'part', 'result'], dtype='object')


In [49]:
# Construct training data (currently with thresholding)

# for randomly choosing select number later
N = data_elbow_train.shape[0]
rperm = np.random.permutation(N)

# n = 1000
n = N

img_shp = (227, 227)
P = img_shp[0] * img_shp[1]

Xtrain = np.zeros((n, P))
ytrain = data_elbow_train.loc[rperm[:n], 'result']

for i, idx in enumerate(rperm[:n]):
    path = data_elbow_train.loc[i, 'path']
    img = process_img(path, thresholding=True)
    
    Xtrain[i, :] = img

Xtrain.shape

(4931, 51529)

In [50]:
Xvalid = np.zeros((len(data_elbow_valid), P))
yvalid = data_elbow_valid.loc[:, 'result']

for i in range(len(data_elbow_valid)):
    path = data_elbow_train.loc[i, 'path']
    img = process_img(path, thresholding=True)
    
    Xvalid[i, :] = img

Xvalid.shape

(465, 51529)

In [61]:
elbow.fit(Xtrain[rperm[:n],:], ytrain[rperm[:n]])
elbow.score(Xtrain, ytrain)

0.5151084972622186

In [62]:
elbow.score(Xvalid, yvalid)

0.4924731182795699

In [63]:
yhat_valid = elbow.predict(Xvalid)
cohen_kappa_score(yvalid, yhat_valid)

-0.005129144531965446

### finger

In [0]:
fname = 'models/rf_finger_random_best.joblib'
rf_finger_random_best = load(fname)

In [0]:
finger = rf_finger_random_best

In [68]:
finger

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=126, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [69]:
data_finger_train = pd.read_csv(flinks_path + 'train_finger.csv')
data_finger_valid = pd.read_csv(flinks_path + 'valid_finger.csv')

print(str(data_finger_train.shape) + '\n' + str(data_finger_train.columns))



# Construct training data (currently with thresholding)

# for randomly choosing select number later
N = data_finger_train.shape[0]
rperm = np.random.permutation(N)

# n = 1000
n = N

img_shp = (227, 227)
P = img_shp[0] * img_shp[1]

Xtrain = np.zeros((n, P))
ytrain = data_finger_train.loc[rperm[:n], 'result']

for i, idx in enumerate(rperm[:n]):
    path = data_finger_train.loc[i, 'path']
    img = process_img(path, thresholding=True)
    
    Xtrain[i, :] = img

Xtrain.shape



Xvalid = np.zeros((len(data_finger_valid), P))
yvalid = data_finger_valid.loc[:, 'result']

for i in range(len(data_finger_valid)):
    path = data_finger_train.loc[i, 'path']
    img = process_img(path, thresholding=True)
    
    Xvalid[i, :] = img

Xvalid.shape

(5106, 6)
Index(['patient', 'study_id', 'img', 'path', 'part', 'result'], dtype='object')


(461, 51529)

In [71]:
finger.fit(Xtrain[rperm[:n],:], ytrain[rperm[:n]])
finger.score(Xtrain, ytrain)

0.5374069721895809

In [72]:
finger.score(Xvalid, yvalid)

0.4924078091106291

In [73]:
yhat_valid = finger.predict(Xvalid)
cohen_kappa_score(yvalid, yhat_valid)

-0.08474951229813166

### forearm

In [0]:
fname = 'models/rf_forearm_random.joblib'
rf_forearm_random = load(fname)

In [0]:
forearm = rf_forearm_random.best_estimator_

In [76]:
forearm

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=150, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

(1825, 6)
Index(['patient', 'study_id', 'img', 'path', 'part', 'result'], dtype='object')


In [82]:
data_forearm_train = pd.read_csv(flinks_path + 'train_forearm.csv')
data_forearm_valid = pd.read_csv(flinks_path + 'valid_forearm.csv')

print(str(data_forearm_train.shape) + '\n' + str(data_forearm_train.columns))


# Construct training data (currently with thresholding)

# for randomly choosing select number later
N = data_forearm_train.shape[0]
rperm = np.random.permutation(N)

# n = 1000
n = N

img_shp = (227, 227)
P = img_shp[0] * img_shp[1]

Xtrain = np.zeros((n, P))
ytrain = data_forearm_train.loc[rperm[:n], 'result']

for i, idx in enumerate(rperm[:n]):
    path = data_forearm_train.loc[i, 'path']
    img = process_img(path, thresholding=True)

    Xtrain[i, :] = img

Xtrain.shape



Xvalid = np.zeros((len(data_forearm_valid), P))
yvalid = data_forearm_valid.loc[:, 'result']

for i in range(len(data_forearm_valid)):
    path = data_forearm_train.loc[i, 'path']
    img = process_img(path, thresholding=True)

    Xvalid[i, :] = img

Xvalid.shape


(1825, 6)
Index(['patient', 'study_id', 'img', 'path', 'part', 'result'], dtype='object')


(301, 51529)

In [0]:
forearm = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=150, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [88]:
forearm.fit(Xtrain[rperm[:n],:], ytrain[rperm[:n]])
forearm.score(Xtrain, ytrain)

0.5287671232876713

In [89]:
forearm.score(Xvalid, yvalid)

0.5083056478405316

In [90]:
yhat_valid = forearm.predict(Xvalid)
cohen_kappa_score(yvalid, yhat_valid)

0.01342073791912124

### humerus

In [0]:
fname = 'models/rf_humerus_random.joblib'
rf_humerus_random = load(fname)

In [92]:
humerus = rf_humerus_random.best_estimator_
humerus

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=126, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=1800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [93]:
data_humerus_train = pd.read_csv(flinks_path + 'train_humerus.csv')
data_humerus_valid = pd.read_csv(flinks_path + 'valid_humerus.csv')

print(str(data_humerus_train.shape) + '\n' + str(data_humerus_train.columns))


# Construct training data (currently with thresholding)

# for randomly choosing select number later
N = data_humerus_train.shape[0]
rperm = np.random.permutation(N)

# n = 1000
n = N

img_shp = (227, 227)
P = img_shp[0] * img_shp[1]

Xtrain = np.zeros((n, P))
ytrain = data_humerus_train.loc[rperm[:n], 'result']

for i, idx in enumerate(rperm[:n]):
    path = data_humerus_train.loc[i, 'path']
    img = process_img(path, thresholding=True)

    Xtrain[i, :] = img

Xtrain.shape



Xvalid = np.zeros((len(data_humerus_valid), P))
yvalid = data_humerus_valid.loc[:, 'result']

for i in range(len(data_humerus_valid)):
    path = data_humerus_train.loc[i, 'path']
    img = process_img(path, thresholding=True)

    Xvalid[i, :] = img

Xvalid.shape


(1272, 6)
Index(['patient', 'study_id', 'img', 'path', 'part', 'result'], dtype='object')


(288, 51529)

In [0]:
humerus = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=126, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [102]:
humerus.fit(Xtrain[rperm[:n],:], ytrain[rperm[:n]])
humerus.score(Xtrain, ytrain)

0.4803459119496855

In [103]:
humerus.score(Xvalid, yvalid)

0.4895833333333333

In [104]:
yhat_valid = humerus.predict(Xvalid)
cohen_kappa_score(yvalid, yhat_valid)

0.00619718309859163

### shoulder

In [0]:
fname = 'models/rf_shoulder_random.joblib'
rf_shoulder_random = load(fname)

In [106]:
shoulder = rf_shoulder_random.best_estimator_
shoulder

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=126, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [107]:
data_shoulder_train = pd.read_csv(flinks_path + 'train_shoulder.csv')
data_shoulder_valid = pd.read_csv(flinks_path + 'valid_shoulder.csv')

print(str(data_shoulder_train.shape) + '\n' + str(data_shoulder_train.columns))


# Construct training data (currently with thresholding)

# for randomly choosing select number later
N = data_shoulder_train.shape[0]
rperm = np.random.permutation(N)

# n = 1000
n = N

img_shp = (227, 227)
P = img_shp[0] * img_shp[1]

Xtrain = np.zeros((n, P))
ytrain = data_shoulder_train.loc[rperm[:n], 'result']

for i, idx in enumerate(rperm[:n]):
    path = data_shoulder_train.loc[i, 'path']
    img = process_img(path, thresholding=True)

    Xtrain[i, :] = img

Xtrain.shape



Xvalid = np.zeros((len(data_shoulder_valid), P))
yvalid = data_shoulder_valid.loc[:, 'result']

for i in range(len(data_shoulder_valid)):
    path = data_shoulder_train.loc[i, 'path']
    img = process_img(path, thresholding=True)

    Xvalid[i, :] = img

Xvalid.shape


(8379, 6)
Index(['patient', 'study_id', 'img', 'path', 'part', 'result'], dtype='object')


(563, 51529)

In [0]:
shoulder = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=126, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [109]:
shoulder.fit(Xtrain[rperm[:n],:], ytrain[rperm[:n]])
shoulder.score(Xtrain, ytrain)

0.5041174364482636

In [0]:
shoulder.score(Xvalid, yvalid)

In [110]:
yhat_valid = humerus.predict(Xvalid)
cohen_kappa_score(yvalid, yhat_valid)

-0.02366901655555198

### wrist

In [0]:
fname = 'models/rf_wrist_random.joblib'
rf_wrist_random = load(fname)

In [112]:
wrist = rf_shoulder_random.best_estimator_
wrist

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=126, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [113]:
data_wrist_train = pd.read_csv(flinks_path + 'train_wrist.csv')
data_wrist_valid = pd.read_csv(flinks_path + 'valid_wrist.csv')

print(str(data_wrist_train.shape) + '\n' + str(data_wrist_train.columns))


# Construct training data (currently with thresholding)

# for randomly choosing select number later
N = data_wrist_train.shape[0]
rperm = np.random.permutation(N)

# n = 1000
n = N

img_shp = (227, 227)
P = img_shp[0] * img_shp[1]

Xtrain = np.zeros((n, P))
ytrain = data_wrist_train.loc[rperm[:n], 'result']

for i, idx in enumerate(rperm[:n]):
    path = data_wrist_train.loc[i, 'path']
    img = process_img(path, thresholding=True)

    Xtrain[i, :] = img

Xtrain.shape



Xvalid = np.zeros((len(data_wrist_valid), P))
yvalid = data_wrist_valid.loc[:, 'result']

for i in range(len(data_wrist_valid)):
    path = data_wrist_train.loc[i, 'path']
    img = process_img(path, thresholding=True)

    Xvalid[i, :] = img

Xvalid.shape


(9752, 6)
Index(['patient', 'study_id', 'img', 'path', 'part', 'result'], dtype='object')


(659, 51529)

In [0]:
wrist = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=126, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [115]:
wrist.fit(Xtrain[rperm[:n],:], ytrain[rperm[:n]])
wrist.score(Xtrain, ytrain)

0.5182526661197703

In [116]:
wrist.score(Xvalid, yvalid)

0.4613050075872534

In [117]:
yhat_valid = wrist.predict(Xvalid)
cohen_kappa_score(yvalid, yhat_valid)

0.00846814696769993