In [None]:
# Spearman rank-correlation heatmap of df_corr.
corr = df_corr.corr(method='spearman')

fig = plt.figure()
ax = fig.add_subplot(111)
# Pin the colour scale to the full [-1, 1] range so colours are comparable
# between runs regardless of the values actually observed.
cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)

# Derive tick positions and labels from the matrix that is actually drawn:
# if corr() leaves any column of df_corr out, labelling from df_corr.columns
# would put the wrong name on a row/column.
ticks = np.arange(len(corr.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Rotate through the Axes API (not pyplot's "current axes" state) so the
# rotation is guaranteed to land on these labels.
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticklabels(corr.index)
plt.show()


In [None]:
def render_dist_plots(dataset, columns=None, n_bins=20):
    """Plot each cluster's feature distributions against the whole dataset.

    For every distinct value of ``dataset['Cluster_ID']`` one figure is drawn
    with one row per feature: a normalised histogram (plus density curve) of
    the rows belonging to that cluster, overlaid with a black density curve
    of the full dataset.

    Parameters
    ----------
    dataset : DataFrame
        Must contain a ``'Cluster_ID'`` column and every plotted column.
    columns : iterable of str, optional
        Columns to plot. Defaults to the notebook-level ``numeric_cols``.
    n_bins : int, default 20
        Number of evenly spaced bin edges between a column's minimum and
        maximum (passed straight to ``np.linspace``, i.e. ``n_bins - 1`` bars).

    Returns
    -------
    None
        Figures are shown with ``plt.show()``.
    """
    if columns is None:
        columns = numeric_cols
    columns = list(columns)
    if not columns:
        # nothing to draw; plt.subplots cannot build a zero-row grid
        return

    # Bin edges depend only on the full dataset, so compute them once instead
    # of once per cluster. Series.min()/max() skip NaN, unlike built-in min/max.
    bin_edges = {
        col: np.linspace(dataset[col].min(), dataset[col].max(), n_bins)
        for col in columns
    }

    for cluster in dataset['Cluster_ID'].unique():
        display("Distribution for cluster {}".format(cluster))

        # One row per plotted column. The size is given to the figure that
        # actually holds the axes (4.4in per row == 22in for five rows);
        # squeeze=False keeps a 2-D array even when there is a single column.
        fig, grid = plt.subplots(
            nrows=len(columns), figsize=(6, 4.4 * len(columns)), squeeze=False
        )
        axes = grid[:, 0]
        axes[0].set_title("Cluster {}".format(cluster))

        members = dataset[dataset['Cluster_ID'] == cluster]
        for ax, col in zip(axes, columns):
            # NOTE(review): sns.distplot is deprecated in recent seaborn
            # releases; kept here so the rendered output does not change.
            # distribution of this cluster only
            sns.distplot(members[col], bins=bin_edges[col], ax=ax, norm_hist=True)
            # The column name is used as a right-hand label; the leading
            # spaces push the rotated text away from the axes.
            ax.set_ylabel(f'{" " * int((1.7* len(col))) }{col}', rotation=30)
            ax.yaxis.set_label_position('right')

            # reference density of the whole dataset, drawn in black
            sns.distplot(dataset[col], bins=bin_edges[col], ax=ax, hist=False, color="k")
            # clear the x label last so no automatic label remains
            ax.set_xlabel('')

        plt.show()

In [None]:
def model_validity(model, Y_train, X_train, model_name):
    """Draw residual diagnostics for a fitted model on its training data.

    Three panels are placed on a 2x2 grid (the fourth slot stays empty):
    a normal Q-Q plot of the residuals, a 50-bin histogram of the residuals,
    and a scatter of residuals against fitted values.

    Residuals are computed as ``prediction - target``.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    Y_train : training targets
    X_train : training features
    model_name : str
        Appended to each panel title.
    """
    fitted = model.predict(X_train)
    errors = fitted - Y_train

    canvas = plt.figure(figsize=[20, 16])

    # Panel 1: quantiles of the residuals against a normal distribution.
    # NOTE(review): `stats` is presumably scipy.stats — confirm in the imports cell.
    qq_ax = canvas.add_subplot(2, 2, 1)
    stats.probplot(errors, dist="norm", plot=qq_ax)
    qq_ax.set_title('Q-Q Plot for '+ model_name)

    # Panel 2: shape of the residual distribution.
    hist_ax = canvas.add_subplot(2, 2, 2)
    hist_ax.hist(errors, 50)
    hist_ax.set_title('Histogram of Residuals for '+ model_name)

    # Panel 3: residuals against fitted values.
    scatter_ax = canvas.add_subplot(2, 2, 3)
    scatter_ax.scatter(fitted, errors)
    scatter_ax.set_title('Residuals for '+ model_name)
    scatter_ax.set_xlabel('Fitted')
    scatter_ax.set_ylabel('Residuals')

In [None]:
# 3x3 grid of panels that all share one y-axis scale
fig, axes = plt.subplots(
    nrows=3,
    ncols=3,
    figsize=(8, 8),
    sharey=True,
    constrained_layout=True,
)

# draw one bar chart per column of `data`, filling the grid row by row
for ax, col in zip(axes.flat, data.columns):
    data[col].plot.bar(ax=ax, rot=0)
    ax.set_title(col)

# hide every panel that did not receive a column
for ax in axes.flat[len(data.columns):]:
    ax.set_axis_off()

In [None]:
# Dimensions of the subplot grid
nbr_of_rows, nbr_of_cols = 5, 2

# Row-major (row, col) index pair for every cell of the grid
coords = [divmod(cell, nbr_of_cols) for cell in range(nbr_of_rows * nbr_of_cols)]

In [None]:
# One horizontal bar chart per year, laid out on the pre-sized grid
fig, ax = plt.subplots(nrows=nbr_of_rows, ncols=nbr_of_cols, figsize=(12, 12))

for i, yr in enumerate(range(2010, 2020)):
    r, c = coords[i]  # grid cell assigned to the i-th year
    panel = ax[r, c]

    # ten highest-grossing titles of the year, ascending so the largest
    # bar ends up at the top of the horizontal chart
    released_in_year = df['Year'] == yr
    d = df.loc[released_in_year, ['Name', 'Gross']].sort_values('Gross').tail(10)

    panel.barh(d['Name'], d['Gross'])
    panel.set_title('Top 10 grossing movies in {0}'.format(yr))

fig.tight_layout()

In [None]:
def cramers_v(x, y):
    """Cramér's V association between two categorical variables.

    The statistic is built from the chi-squared value of the ``x`` by ``y``
    contingency table and includes the small-sample corrections applied to
    phi-squared and to the row/column counts (``phi2corr``, ``rcorr``,
    ``kcorr`` below).

    Parameters
    ----------
    x, y : array-like or Series
        Category labels, cross-tabulated with ``pd.crosstab``.

    Returns
    -------
    float
        The corrected Cramér's V, or ``nan`` when it is undefined: fewer than
        two observations, or a corrected row/column count that leaves a
        non-positive denominator (for example when one variable takes a
        single value).
    """
    contingency = pd.crosstab(x, y)
    n = contingency.sum().sum()
    r, k = contingency.shape

    # Every correction term divides by (n - 1).
    if n <= 1:
        return np.nan

    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denominator = min(kcorr - 1, rcorr - 1)
    # Return NaN explicitly instead of relying on a 0/0 division (which also
    # emits a RuntimeWarning) or on the square root of a negative number.
    if denominator <= 0:
        return np.nan

    # NOTE(review): `ss` is presumably scipy.stats — confirm in the imports cell.
    chi2 = ss.chi2_contingency(contingency)[0]
    phi2 = chi2 / n
    # the corrected phi-squared is clipped at zero so the root stays real
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    return np.sqrt(phi2corr / denominator)

In [None]:
def eval_models(model, history):
    """Plot the training curves stored in ``history.history`` on a 2x2 grid.

    Panels, in order: training loss, training accuracy, and the ``val_``
    loss and accuracy (titled "Testing"). Loss panels show the overall,
    segmentation and classification curves; accuracy panels show only the
    segmentation and classification curves.

    Parameters
    ----------
    model : unused; kept so existing calls keep working.
    history : object whose ``history`` attribute maps metric names to
        per-epoch sequences.
    """
    # (panel title, y-axis label, ((history key, legend label), ...))
    panels = (
        ('Training Loss', 'Loss', (
            ('loss', 'Overall Loss'),
            ('segmentation_loss', 'Segmentation Loss'),
            ('classification_loss', 'Classification Loss'),
        )),
        ('Training Accuracy', 'Accuracy', (
            ('segmentation_accuracy', 'Segmentation Accuracy'),
            ('classification_accuracy', 'Classification Accuracy'),
        )),
        ('Testing Loss', 'Loss', (
            ('val_loss', 'Overall Loss'),
            ('val_segmentation_loss', 'Segmentation Loss'),
            ('val_classification_loss', 'Classification Loss'),
        )),
        ('Testing Accuracy', 'Accuracy', (
            ('val_segmentation_accuracy', 'Segmentation Accuracy'),
            ('val_classification_accuracy', 'Classification Accuracy'),
        )),
    )

    fig = plt.figure(figsize=[10, 10])
    for slot, (title, y_label, curves) in enumerate(panels, start=1):
        ax = fig.add_subplot(2, 2, slot)
        for key, legend_label in curves:
            ax.plot(history.history[key], label = legend_label)
        ax.legend()
        ax.set_title(title)
        ax.set_xlabel('Epochs')
        ax.set_ylabel(y_label)

In [None]:
def model_performance(model, x_train_image, y_train_class, x_test_image, y_test_class, model_name, train_time, process_time_df, perf_results_df):
    """Score the classification output on train and test data and record the results.

    For each split the model is run once (timed with ``process_time``), the
    predicted class is the arg-max of the first of the two values returned by
    ``model.predict``, and a confusion-matrix heatmap titled with the weighted
    F1 score is drawn. Timings and F1 scores are written into the two frames
    under the ``model_name`` column; both frames are displayed and a
    classification report for the test split is printed.

    Parameters
    ----------
    model : object whose ``predict(x, verbose=False)`` result unpacks into
        two values (class output first, segmentation output second).
    x_train_image, x_test_image : model inputs for each split.
    y_train_class, y_test_class : true class labels for each split.
    model_name : column name used in both result frames.
    train_time : value stored as the first entry of the timing column.
    process_time_df : frame receiving [train_time, train inference, test inference].
    perf_results_df : frame receiving [train F1, test F1].
    """
    fig = plt.figure(figsize=[14, 8])

    def _score_split(images, labels, panel, title_format):
        # Time the prediction call only, not the metric or plotting work.
        started = process_time()
        class_pred, _seg_pred = model.predict(images, verbose=False)
        finished = process_time()

        # most likely class per sample
        predicted = tf.argmax(class_pred, axis=1)
        f1 = round(f1_score(labels, predicted, average='weighted'), 4)

        # confusion matrix for this split; tick labels are fixed at 0..36
        ax = fig.add_subplot(1, 2, panel)
        sns.heatmap(tf.math.confusion_matrix(labels, predicted),
                    xticklabels=range(37), yticklabels=range(37),
                    annot=True, fmt='g', ax=ax, annot_kws={"fontsize":8}, cbar=False)
        ax.set_title(title_format % f1)
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('Actual label')

        return f1, round(finished - started, 4), predicted

    # left panel: training split, right panel: test split
    train_f1_class, train_inference_time, _ = _score_split(
        x_train_image, y_train_class, 1, 'Training, F1 Score: %f')
    test_f1_class, test_inference_time, test_indexes = _score_split(
        x_test_image, y_test_class, 2, 'Testing, F1 Score: %f')

    # record timings and scores under this model's column
    process_time_df[model_name]= [train_time, train_inference_time, test_inference_time]
    perf_results_df[model_name] = [train_f1_class, test_f1_class]

    display(process_time_df)
    display(perf_results_df)

    # per-class breakdown for the test split
    print(classification_report(y_test_class, test_indexes, digits = 3))

In [None]:
def eval_model(model, X_train, Y_train, X_val, Y_val, X_test, Y_test):
    """Show confusion matrices and weighted F1 for the train/validation/test splits.

    Draws three side-by-side confusion matrices (raw counts, no colour bar),
    each titled with the weighted F1 score of that split rounded to three
    decimals, then prints a classification report for the validation split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, Y_train : training features and labels.
    X_val, Y_val : validation features and labels.
    X_test, Y_test : test features and labels.
    """
    splits = (
        ('Training', X_train, Y_train),
        ('Validation', X_val, Y_val),
        ('Testing', X_test, Y_test),
    )

    fig = plt.figure(figsize=[12, 3])
    predictions = {}
    for position, (split_name, features, labels) in enumerate(splits, start=1):
        ax = fig.add_subplot(1, 3, position)
        conf = ConfusionMatrixDisplay.from_estimator(
            model, features, labels, normalize=None,
            xticks_rotation='horizontal', ax=ax, colorbar=False)

        # Predict once per split and reuse the result for the F1 score (and,
        # for validation, the report) instead of calling predict repeatedly.
        pred = model.predict(features)
        predictions[split_name] = pred
        score = round(f1_score(labels, pred, average='weighted'), 3)
        conf.ax_.set_title(split_name + ' Performance: F1 score ' + str(score))

    print(classification_report(Y_val, predictions['Validation'], digits = 3))

Random Forest

In [None]:
# Baseline forest with hand-picked settings, fitted on the training split
rf = RandomForestClassifier(
    n_estimators=250,
    max_depth=15,
    random_state=24,
).fit(X_train, Y_train)

# confusion matrices and weighted F1 on all three splits
eval_model(rf, X_train, Y_train, X_val, Y_val,  X_test, Y_test)

In [None]:
# Exhaustive grid search over random-forest hyper-parameters. Every candidate
# is fitted on the training split and ranked by weighted F1 on the validation
# split.
num_est = [100, 200, 400, 800]
max_depth = [None, 1, 2, 4, 8, 16]
class_weights = [None, 'balanced', 'balanced_subsample']
# 'log_loss' is left out on purpose: scikit-learn documents it as the same
# Shannon information-gain criterion as 'entropy', so with a fixed
# random_state it only refits forests already tried under 'entropy'.
criterion = ['gini', 'entropy']

# NOTE: despite its name, best_acc holds the best weighted F1 seen so far.
# Starting below every possible score guarantees rf_best_model is set even
# if every candidate scores 0.
best_acc = float('-inf')
rf_best_model = None

for e in num_est:
    for d in max_depth:
        for w in class_weights:
            for c in criterion:

                rf = RandomForestClassifier(n_estimators=e, max_depth=d, random_state = 42, class_weight = w, criterion = c, n_jobs=-1).fit(X_train, Y_train)

                acc = f1_score(Y_val, rf.predict(X_val), average='weighted')
                # strict '>' keeps the earliest candidate on ties
                if (acc > best_acc):
                    best_acc = acc
                    rf_best_model = rf

eval_model(rf_best_model, X_train, Y_train, X_val, Y_val,  X_test, Y_test)

In [None]:
# Print the text representation of the forest selected by the grid search
print(rf_best_model)

In [None]:
# Draw the first tree of the selected forest as a representative example
fig = plt.figure(figsize=(12, 5))
first_tree = rf_best_model.estimators_[0]
# the trailing semicolon suppresses the list of annotation objects plot_tree returns
tree.plot_tree(first_tree, filled=True, fontsize=9.5);