In [None]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.9-py2.py3-none-any.whl (12 kB)
Collecting pytest==5.4.3
  Downloading pytest-5.4.3-py3-none-any.whl (248 kB)
[K     |████████████████████████████████| 248 kB 14.1 MB/s 
[?25hCollecting joblib==1.0.0
  Downloading joblib-1.0.0-py3-none-any.whl (302 kB)
[K     |████████████████████████████████| 302 kB 42.5 MB/s 
[?25hCollecting pandas==1.0.5
  Downloading pandas-1.0.5-cp37-cp37m-manylinux1_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 66 kB/s 
Collecting scikit-learn==0.23.1
  Downloading scikit_learn-0.23.1-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 25.6 MB/s 
[?25hCollecting numpy==1.19.1
  Downloading numpy-1.19.1-cp37-cp37m-manylinux2010_x86_64.whl (14.5 MB)
[K     |████████████████████████████████| 14.5 MB 8.4 kB/s 
[?25hCollecting scipy==1.5.4
  Downloading scipy-1.5.4-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)
[K     |█████████████████████████████

In [None]:
# Mount Google Drive at /content/drive so the project folder can be used as a local path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder the working directory so the relative 'data/...' paths used later resolve.
# NOTE(review): presumably this is also what makes preprocess / evaluator / feature_gen importable;
# the path is absolute and user-specific, so confirm it before running elsewhere.
%cd '/content/drive/MyDrive/popularity_pred'

/content/drive/MyDrive/popularity_pred


In [None]:
import numpy as np
import pandas as pd
from prettytable import PrettyTable
from sklearn.cluster import KMeans, DBSCAN
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score, silhouette_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from preprocess import standard_scaler
from evaluator import spearmanr, reg_eval_metrics
from feature_gen import gen_text_features, gen_date_features

import warnings
warnings.simplefilter("ignore")

# import tensorflow.keras as keras
# import tensorflow as tf
import numpy as np
import time
# import tensorflow.keras.backend as K

# import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVR,SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.impute import SimpleImputer
import xgboost as XGB


In [None]:
def drop_columns(X, cols=['URL', 'Title', 'Description',
                            'Tags', 'Country', 'Unnamed: 0_y',
                            'Unnamed: 0_x', 'Camera','UserId',
                            'Username', 'FlickrId','DatePosted',
                            'DateTaken', 'DateCrawl','Latitude', 'Longitude']
                ):
    """Return a new DataFrame without the columns named in ``cols``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to strip; it is not modified.
    cols : sequence of str
        Column labels to remove. The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns in their original order.

    Raises
    ------
    KeyError
        If any label in ``cols`` is not a column of ``X``.
    """
    # One drop call removes every label at once instead of rebuilding the
    # frame once per column.
    return X.drop(columns=list(cols))

def median_rmse(rmse_values):
    """Return the median of the given error values."""
    middle = np.median(rmse_values)
    return middle

def trmse_median(rmse_values):
    """Summarise error values with and without interquartile truncation.

    The truncated sample keeps only the values strictly greater than the 25th
    percentile and strictly smaller than the 75th percentile.

    Returns
    -------
    tuple
        ``(truncated median, truncated mean, overall median, overall mean)``.
    """
    # np.array copies, so the caller's sequence is left untouched by the sort.
    ordered = np.sort(np.array(rmse_values))

    lower_cut = np.percentile(ordered, 25)
    upper_cut = np.percentile(ordered, 75)

    inside_band = (ordered > lower_cut) & (ordered < upper_cut)
    banded = ordered[inside_band]

    truncated_stats = (median_rmse(banded), banded.mean())
    overall_stats = (median_rmse(ordered), ordered.mean())
    return truncated_stats + overall_stats

In [None]:
def cluster_data(df, num_clusters, start_day='Day01', end_day='Day30'):
    """Cluster the end-normalised day columns of one period with k-means.

    Every column from ``start_day`` through ``end_day`` (a ``.loc`` label
    slice, so both ends are included) is divided row-wise by the row's
    ``end_day`` value. Rows whose ``end_day`` value is 0 are divided by 1
    instead, which avoids a division by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the day columns; it is not modified.
    num_clusters : int
        Number of k-means clusters.
    start_day, end_day : str
        First and last column label of the period.

    Returns
    -------
    kmeans : sklearn.cluster.KMeans
        The fitted model (``random_state=0``).
    labels : numpy.ndarray
        Cluster index assigned to each row.
    log_scale : numpy.ndarray
        ``np.log`` of the raw ``end_day`` values.
    """
    end_values = df[end_day]
    Y_scale = end_values.values

    # Adding the boolean "equals zero" mask turns a 0 divisor into 1 and
    # leaves every other divisor unchanged.
    safe_divisor = end_values + end_values.eq(0.0)

    # A single vectorised division builds a fresh frame. Assigning column by
    # column into a .loc slice of df is chained assignment, whose view/copy
    # behaviour is ambiguous.
    X = df.loc[:, start_day:end_day].div(safe_divisor, axis=0).values

    kmeans = KMeans(n_clusters = num_clusters, init = 'k-means++', max_iter =300, n_init = 10, random_state = 0)
    kmeans.fit(X)
    labels = kmeans.labels_
    silhouette = silhouette_score(X, labels, metric='euclidean')
    print ('wcss: ', kmeans.inertia_, 'silhouette score: ', silhouette)

    # NOTE(review): np.log gives -inf for rows whose end_day value is 0, and
    # overall_eval_metric in this notebook inverts the scale with
    # exp(exp(.)-1)-1, which is the inverse of log(log(y+1)+1) rather than of
    # log(y) - confirm which transform is intended.
    return kmeans, labels, np.log(Y_scale)

In [None]:
# Load the four training tables.
users_df = pd.read_csv('data/users_TRAIN.csv')
image_info_df = pd.read_csv('data/img_info_TRAIN.csv')
headers_df = pd.read_csv('data/headers_TRAIN.csv')
popularity_df = pd.read_csv('data/popularity_TRAIN.csv')

print(*(frame.shape for frame in (users_df, image_info_df, headers_df, popularity_df)))

# headers + image info (per photo), then user statistics, then the daily popularity values.
cdf = headers_df.merge(image_info_df, on='FlickrId')
combined_df = cdf.merge(users_df, on='UserId').merge(popularity_df, on='FlickrId')
print(f"Final data: {combined_df.shape}")

# The user merge multiplies rows (see the shapes printed below); keep one row per URL.
combined_df = combined_df.drop_duplicates(subset='URL')
print(f"Dropped final data: {combined_df.shape}")

# One k-means model, label vector and log-scale target per ten-day period.
num_clusters = 2
(
    (kmeans_period_1, labels_period_1, Y_scale_1),
    (kmeans_period_2, labels_period_2, Y_scale_2),
    (kmeans_period_3, labels_period_3, Y_scale_3),
) = (
    cluster_data(combined_df, num_clusters, first_day, last_day)
    for first_day, last_day in (('Day01', 'Day10'), ('Day11', 'Day20'), ('Day21', 'Day30'))
)

(23046, 11) (20337, 14) (28383, 7) (20337, 32)
Final data: (21950, 61)
Dropped final data: (20337, 61)
wcss:  6340.971359759586 silhouette score:  0.495459947479968
wcss:  2331.087575543659 silhouette score:  0.7845239734200878
wcss:  923.0133033282746 silhouette score:  0.7761967395778281


In [None]:
# Metadata and user columns only; the DayNN popularity columns lie outside this label slice.
X = combined_df.loc[:, 'FlickrId':'GroupsAvgPictures']

# Derived text/date features (helpers come from the feature_gen module).
X = gen_text_features(X, 'Title')
X = gen_date_features(X, 'DatePosted')
X = gen_text_features(X, 'Description')

# NOTE(review): if 'Tags' arrives from read_csv as a plain string, ' '.join inserts a space
# between every character instead of joining tags - confirm the column holds lists.
X['Tags'] = X['Tags'].map(' '.join)
X = gen_text_features(X, 'Tags')

# Keep a handle on the frame that still has the raw text/id columns.
X_df = X

X = drop_columns(X)

# Ratios of the user's mean views to other user/group statistics; the 0.001 offset
# keeps the denominator non-zero when the statistic is 0.
for ratio_name, denominator_col in (
    ('views_by_contact', 'Contacts'),
    ('views_by_num_grps', 'NumGroups'),
    ('views_by_photocount', 'PhotoCount'),
    ('views_by_grpavg', 'GroupsAvgPictures'),
    ('views_by_avg_grp_mem', 'AvgGroupsMemb'),
):
    X[ratio_name] = X['MeanViews'] / (X[denominator_col] + 0.001)

col_names = X.columns
print(col_names)

# From here on X is whatever standard_scaler returns for the raw value matrix;
# X_actual is the copy each cross-validation run starts from.
X = standard_scaler(X.values)
X_actual = X.copy()

Index(['Size', 'NumSets', 'NumGroups', 'AvgGroupsMemb', 'AvgGroupPhotos',
       'Ispro', 'HasStats', 'Contacts', 'PhotoCount', 'MeanViews',
       'GroupsCount', 'GroupsAvgMembers', 'GroupsAvgPictures',
       'Title_word_count', 'Title_num_chars', 'Title_avg_word_len',
       'Title_num_uppercase', 'Title_num_title_case', 'DatePosted_year',
       'DatePosted_day', 'DatePosted_hour', 'DatePosted_day_of_week',
       'Description_word_count', 'Description_num_chars',
       'Description_avg_word_len', 'Description_num_uppercase',
       'Description_num_title_case', 'Tags_word_count', 'Tags_num_chars',
       'Tags_avg_word_len', 'Tags_num_uppercase', 'Tags_num_title_case',
       'views_by_contact', 'views_by_num_grps', 'views_by_photocount',
       'views_by_grpavg', 'views_by_avg_grp_mem'],
      dtype='object')


In [None]:
# Sanity check: (rows, feature columns) of the scaled feature matrix.
X.shape

(20337, 37)

In [None]:
# from lazypredict.Supervised import LazyClassifier,LazyRegressor

def classifier(X_train, y_train,
                X_test, y_test, col_names=None, period=1, run=1):
    """Fit a random forest on cluster labels and map its predictions to centroids.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and test rows.
    y_train, y_test : array-like of int
        Cluster labels; predicted labels are used to index ``cluster_centers_``.
    col_names : sequence of str, optional
        When given, both matrices are wrapped in DataFrames with these column
        names before fitting and predicting.
    period : {1, 2, 3}
        Selects which module-level k-means model (``kmeans_period_1`` /
        ``kmeans_period_2`` / ``kmeans_period_3``) supplies the centroids.
    run : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    y_pred : numpy.ndarray
        Predicted labels for ``X_test``.
    centroid_preds : numpy.ndarray
        Centroids of the predicted clusters, training rows first, then test rows.
    clf : RandomForestClassifier
        The fitted model.

    Raises
    ------
    ValueError
        If ``period`` is not 1, 2 or 3.
    """
    # Resolve the centroid table before fitting, so an unsupported period fails
    # immediately with a clear message instead of after the fit.
    if period == 1:
        centers = kmeans_period_1.cluster_centers_
    elif period == 2:
        centers = kmeans_period_2.cluster_centers_
    elif period == 3:
        centers = kmeans_period_3.cluster_centers_
    else:
        raise ValueError(f"period must be 1, 2 or 3, got {period!r}")

    # NOTE(review): no random_state is passed, so scores can differ between runs.
    clf = RandomForestClassifier()

    if col_names is not None:
        X_train = pd.DataFrame(data=X_train, columns=col_names)
        X_test = pd.DataFrame(data=X_test, columns=col_names)

    clf.fit(X_train,y_train)
    y_train_pred = clf.predict(X_train)
    y_pred = clf.predict(X_test)

    # The training-row centroids come from predictions on the data the model was fitted on.
    temp = np.concatenate((centers[y_train_pred], centers[y_pred]), axis=0)

    fold_score = f1_score(y_train, y_train_pred, average='weighted')
    print ("train data f1 score: ", fold_score)

    fold_score = f1_score(y_test, y_pred, average='weighted')
    print ("classifier f1 score: ", fold_score)

    print ("classifier prediction shape: ", temp.shape)
    return y_pred, temp, clf

In [None]:
def regressor(X_train_folds_reg, y_train_folds_reg,
              X_test_fold_reg, y_test_fold_reg, col_names=None, period=1, run=1):
    """Fit a random-forest regressor on the scale target and report its fit.

    A one-row table with RMSE, MAE, R2 and Spearman correlation is printed for
    the training rows and then for the test rows. ``period`` and ``run`` are
    accepted but not used.

    Returns
    -------
    tuple
        ``(training-row predictions, test-row predictions, fitted regressor)``.
    """
    def _print_metrics(y_true, y_hat):
        # Same table layout for the training and the test report.
        rmse, mae, r2 = reg_eval_metrics(y_true, y_hat)
        report = PrettyTable(['RMSE', 'MAE', 'R2', 'Spearmanr'])
        report.add_row([rmse, mae, r2, spearmanr(y_true, y_hat)])
        print(report)

    reg = RandomForestRegressor( )
    if col_names is not None:
        # Name the columns so the model is fitted and queried on labelled frames.
        X_train_folds_reg = pd.DataFrame(data=X_train_folds_reg, columns=col_names)
        X_test_fold_reg = pd.DataFrame(data=X_test_fold_reg, columns=col_names)
    reg.fit(X_train_folds_reg, y_train_folds_reg)

    train_hat = reg.predict(X_train_folds_reg)
    # Snapshot taken before the metric helpers see the array.
    y_pred_train = train_hat.copy()
    _print_metrics(y_train_folds_reg, train_hat)

    test_hat = reg.predict(X_test_fold_reg)
    _print_metrics(y_test_fold_reg, test_hat)

    return y_pred_train, test_hat, reg

In [None]:
def overall_eval_metric(cluster_org_labels, cluster_centers, gt_scale, final_gt,
                        double_log_trans=True):
    """Score reconstructed sequences against the ground truth, one sample at a time.

    For sample ``i`` the reconstruction is
    ``cluster_centers[label_i] * scale_i``, where ``scale_i`` is ``gt_scale[i]``
    mapped back from log space. NaNs in the reconstruction are replaced with
    ``np.nan_to_num`` before scoring.

    Parameters
    ----------
    cluster_org_labels : sequence
        Cluster index per sample (cast to ``int`` before indexing).
    cluster_centers : numpy.ndarray
        Centroid table indexed by cluster label.
    gt_scale : sequence of float
        Log-space scale per sample (true or predicted).
    final_gt : sequence
        Ground-truth sequence per sample.
    double_log_trans : bool, default True
        ``True`` inverts ``log(log(y + 1) + 1)`` with ``exp(exp(s) - 1) - 1``.
        ``False`` inverts a plain ``log(y)`` with ``exp(s)``.
        NOTE(review): ``cluster_data`` in this notebook returns ``np.log(y)``
        while every caller relies on the default here - confirm which
        transform is intended.

    Returns
    -------
    tuple
        ``(truncated RMSE median, truncated RMSE mean, truncated MAE median,
        truncated MAE mean, RMSE median, RMSE mean)``, each taken over the
        per-sample errors by ``trmse_median``.
    """
    per_sample_rmse, per_sample_mae = [], []

    for i in range(len(gt_scale)):
        # Undo the log transform that was applied to the scale target.
        if double_log_trans:
            scale = np.exp(np.exp(gt_scale[i])-1)-1
        else:
            scale = np.exp(gt_scale[i])

        pred = cluster_centers[int(cluster_org_labels[i])] * scale
        pred = np.nan_to_num(pred)

        # Only RMSE and MAE feed the returned statistics.
        rmse, mae, _ = reg_eval_metrics(final_gt[i], pred)
        per_sample_rmse.append(rmse)
        per_sample_mae.append(mae)

    # Local names deliberately differ from the module-level median_rmse function.
    median_trmse, mean_trmse, overall_median_rmse, overall_mean_rmse = trmse_median(per_sample_rmse)
    median_tmae, mean_tmae, _, _ = trmse_median(per_sample_mae)

    return median_trmse, mean_trmse, median_tmae, mean_tmae, overall_median_rmse, overall_mean_rmse

In [None]:
def get_results(X, Y, Y_scale, kmeans, train_index, test_index, start_day, end_day, col_names=None, scale_pos_weight=1, period=1, run=1):
    """Train the label and scale models for one fold/period and score four cases.

    The classifier predicts the cluster label and the regressor predicts the
    log-space scale. The test rows are then scored with ``overall_eval_metric``
    in four combinations:

    * case A - true labels, true scale
    * case B - predicted labels, true scale
    * case C - true labels, predicted scale
    * case D - predicted labels, predicted scale

    The ground-truth sequences are read from the module-level ``combined_df``
    (columns ``start_day`` through ``end_day``). ``scale_pos_weight`` is
    accepted but not used.

    Returns
    -------
    tuple of 25 items
        Truncated RMSE ``(median, mean)`` for cases A, B, C, D; then truncated
        MAE ``(median, mean)`` for A-D; then untruncated RMSE ``(median, mean)``
        for A-D; and finally ``predictions``: one row per sample in original
        row order, holding the predicted centroid followed by the predicted
        scale.
    """
    X_train_fold, X_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = Y[train_index], Y[test_index]

    # Label model.
    y_pred, class_preds, _ = classifier(X_train_fold, y_train_fold,
                                        X_test_fold, y_test_fold, col_names, period, run)
    print(confusion_matrix(y_test_fold, y_pred))

    # Scale model.
    y_train_reg, y_test_reg = Y_scale[train_index], Y_scale[test_index]
    scale_pred_train, scale_pred, _ = regressor(X_train_fold, y_train_reg,
                                                X_test_fold, y_test_reg, col_names, period, run)

    # Centroid columns plus the scale prediction, training rows first, then test rows.
    centroid_columns = [start_day+"_"+str(i) for i in range(class_preds.shape[1])]
    stacked = pd.DataFrame(data=class_preds, columns=centroid_columns)
    stacked["scale_prediction"] = list(scale_pred_train) + list(scale_pred)

    print ("all_preds shape: ", stacked.shape)

    # Restore the original row order using the fold indices, then drop the helper column.
    stacked["index"] = list(train_index) + list(test_index)
    predictions = (
        stacked.sort_values(by="index", ascending=True)
               .reset_index(drop=True)
               .drop("index", axis=1)
               .values
    )

    gt_ = combined_df.loc[:, start_day:end_day].values[test_index]

    def _score(labels, scales):
        # One evaluation case: a label source combined with a scale source.
        return overall_eval_metric(cluster_org_labels=labels,
                                   cluster_centers=kmeans.cluster_centers_,
                                   gt_scale=scales,
                                   final_gt=gt_)

    case_scores = (
        _score(Y[test_index], y_test_reg),   # case A
        _score(y_pred, y_test_reg),          # case B
        _score(Y[test_index], scale_pred),   # case C
        _score(y_pred, scale_pred),          # case D
    )

    # Each case tuple is (t-RMSE median, t-RMSE mean, t-MAE median, t-MAE mean,
    # RMSE median, RMSE mean). The flat result groups by metric pair first,
    # then by case A-D.
    flattened = [
        scores[position]
        for pair_start in (0, 2, 4)
        for scores in case_scores
        for position in (pair_start, pair_start + 1)
    ]
    return (*flattened, predictions)

In [None]:

# Per-fold accumulators. Naming: p<period><case>_<statistic>, periods 1-3, cases a-d.
# Every name is bound to its own fresh list.

# Untruncated RMSE (mean / median).
p1a_avg_nomean, p1a_avg_nomedian, p2a_avg_nomean, p2a_avg_nomedian, p3a_avg_nomean, p3a_avg_nomedian = ([] for _ in range(6))
p1b_avg_nomean, p1b_avg_nomedian, p2b_avg_nomean, p2b_avg_nomedian, p3b_avg_nomean, p3b_avg_nomedian = ([] for _ in range(6))
p1c_avg_nomean, p1c_avg_nomedian, p2c_avg_nomean, p2c_avg_nomedian, p3c_avg_nomean, p3c_avg_nomedian = ([] for _ in range(6))
p1d_avg_nomean, p1d_avg_nomedian, p2d_avg_nomean, p2d_avg_nomedian, p3d_avg_nomean, p3d_avg_nomedian = ([] for _ in range(6))

# Truncated RMSE (mean / median).
p1a_avg_mean, p1a_avg_median, p2a_avg_mean, p2a_avg_median, p3a_avg_mean, p3a_avg_median = ([] for _ in range(6))
p1b_avg_mean, p1b_avg_median, p2b_avg_mean, p2b_avg_median, p3b_avg_mean, p3b_avg_median = ([] for _ in range(6))
p1c_avg_mean, p1c_avg_median, p2c_avg_mean, p2c_avg_median, p3c_avg_mean, p3c_avg_median = ([] for _ in range(6))
p1d_avg_mean, p1d_avg_median, p2d_avg_mean, p2d_avg_median, p3d_avg_mean, p3d_avg_median = ([] for _ in range(6))

# Truncated MAE (mean / median).
p1a_mae_avg_mean, p1a_mae_avg_median, p2a_mae_avg_mean, p2a_mae_avg_median, p3a_mae_avg_mean, p3a_mae_avg_median = ([] for _ in range(6))
p1b_mae_avg_mean, p1b_mae_avg_median, p2b_mae_avg_mean, p2b_mae_avg_median, p3b_mae_avg_mean, p3b_mae_avg_median = ([] for _ in range(6))
p1c_mae_avg_mean, p1c_mae_avg_median, p2c_mae_avg_mean, p2c_mae_avg_median, p3c_mae_avg_mean, p3c_mae_avg_median = ([] for _ in range(6))
p1d_mae_avg_mean, p1d_mae_avg_median, p2d_mae_avg_mean, p2d_mae_avg_median, p3d_mae_avg_mean, p3d_mae_avg_median = ([] for _ in range(6))

# Per-fold result tables, one per case; each table gets its own copy of the header.
rmse_fields = ["p1_tmean", "p2_tmean", "p3_tmean", "p1_tmedian", "p2_tmedian", "p3_tmedian", "p1_mean", "p2_mean", "p3_mean", "p1_median", "p2_median", "p3_median"]
tablea, tableb, tablec, tabled = (PrettyTable(list(rmse_fields)) for _ in range(4))

mae_fields = ["p1_mae_mean", "p2_mae_mean", "p3_mae_mean", "p1_mae_median", "p2_mae_median", "p3_mae_median"]
tablea_mae, tableb_mae, tablec_mae, tabled_mae = (PrettyTable(list(mae_fields)) for _ in range(4))

# Stratified folds built on the period-1 cluster labels. No separate folds are
# built for periods 2 and 3.
n_splits = 3
skfolds = StratifiedKFold(n_splits=n_splits, random_state=42,shuffle=True)

period_1_train_indices, period_1_test_indices = [], []
for train_index, test_index in skfolds.split(X, labels_period_1):
    period_1_train_indices.append(train_index)
    period_1_test_indices.append(test_index)

In [None]:
# Cross-validation driver: for every fold, run the three periods in sequence and
# collect the metrics of cases A-D into the per-fold tables and accumulator lists.

# n only controls the width of the "&&&&" banner printed before each period.
n = 3

for i in range(n_splits):
    # Every fold starts again from the unaugmented feature matrix.
    X = X_actual.copy()

    print (X.shape)
    print ("&&&&"*n, f"Run {i+1} Period1 ", "&&&&"*n)
    # Period 1 (Day01-Day10): base features only.
    p1a_median, p1a_mean, p1b_median, p1b_mean, p1c_median, p1c_mean, p1d_median, p1d_mean, p1a_mae_median, p1a_mae_mean, p1b_mae_median, p1b_mae_mean, p1c_mae_median, p1c_mae_mean, p1d_mae_median, p1d_mae_mean, p1a_nomedian, p1a_nomean, p1b_nomedian, p1b_nomean, p1c_nomedian, p1c_nomean, p1d_nomedian, p1d_nomean, predictions_day10 \
      = get_results(X, labels_period_1, Y_scale_1, kmeans_period_1, period_1_train_indices[i], period_1_test_indices[i], 'Day01', 'Day10', list(col_names), 1, period=1, run=i+1)

    # Period 2 (Day11-Day20): base features plus the period-1 predictions
    # (predicted centroid columns followed by the predicted scale).
    # The period-1 fold indices are reused for this period.
    # NOTE(review): for training rows these predictions come from models that
    # were fitted on those same rows.
    X = np.hstack([X, predictions_day10])
    print (X.shape)
    print ("&&&&"*n, f"Run {i+1} Period2 ", "&&&&"*n)
    col_names_2 = list(col_names)+[str(i) for i in range(10)]+["predictions_day10"]
    p2a_median, p2a_mean, p2b_median, p2b_mean, p2c_median, p2c_mean, p2d_median, p2d_mean, p2a_mae_median, p2a_mae_mean, p2b_mae_median, p2b_mae_mean, p2c_mae_median, p2c_mae_mean, p2d_mae_median, p2d_mae_mean, p2a_nomedian, p2a_nomean, p2b_nomedian, p2b_nomean, p2c_nomedian, p2c_nomean, p2d_nomedian, p2d_nomean, predictions_day20 \
      = get_results(X, labels_period_2, Y_scale_2, kmeans_period_2, period_1_train_indices[i], period_1_test_indices[i], 'Day11', 'Day20', col_names_2, 1, period=2, run=i+1)

    # Period 3 (Day21-Day30): base features plus the period-2 predictions only;
    # the stack starts from X_actual, so the period-1 predictions are not carried over.
    X = np.hstack([X_actual, predictions_day20])
    print (X.shape)
    print ("&&&&"*n, f"Run {i+1} Period3 ", "&&&&"*n)
    col_names_3 = list(col_names)+[str(i) for i in range(10, 20)]+["predictions_day20"]
    p3a_median, p3a_mean, p3b_median, p3b_mean, p3c_median, p3c_mean, p3d_median, p3d_mean, p3a_mae_median, p3a_mae_mean, p3b_mae_median, p3b_mae_mean, p3c_mae_median, p3c_mae_mean, p3d_mae_median, p3d_mae_mean, p3a_nomedian, p3a_nomean, p3b_nomedian, p3b_nomean, p3c_nomedian, p3c_nomean, p3d_nomedian, p3d_nomean, predictions_day30 \
      = get_results(X, labels_period_3, Y_scale_3, kmeans_period_3, period_1_train_indices[i], period_1_test_indices[i], 'Day21', 'Day30', col_names_3, 1, period=3, run=i+1)


    # One row per fold: truncated mean, truncated median, untruncated mean,
    # untruncated median of the RMSE, each for periods 1-3.
    tablea.add_row([p1a_mean, p2a_mean, p3a_mean, p1a_median, p2a_median, p3a_median, p1a_nomean, p2a_nomean, p3a_nomean, p1a_nomedian, p2a_nomedian, p3a_nomedian])
    tableb.add_row([p1b_mean, p2b_mean, p3b_mean, p1b_median, p2b_median, p3b_median, p1b_nomean, p2b_nomean, p3b_nomean, p1b_nomedian, p2b_nomedian, p3b_nomedian])
    tablec.add_row([p1c_mean, p2c_mean, p3c_mean, p1c_median, p2c_median, p3c_median, p1c_nomean, p2c_nomean, p3c_nomean, p1c_nomedian, p2c_nomedian, p3c_nomedian])
    tabled.add_row([p1d_mean, p2d_mean, p3d_mean, p1d_median, p2d_median, p3d_median, p1d_nomean, p2d_nomean, p3d_nomean, p1d_nomedian, p2d_nomedian, p3d_nomedian])

    # Untruncated RMSE accumulators (read by the later summary cells).
    p1a_avg_nomean.append(p1a_nomean)
    p1a_avg_nomedian.append(p1a_nomedian)
    p2a_avg_nomean.append(p2a_nomean)
    p2a_avg_nomedian.append(p2a_nomedian)
    p3a_avg_nomean.append(p3a_nomean)
    p3a_avg_nomedian.append(p3a_nomedian)

    p1b_avg_nomean.append(p1b_nomean)
    p1b_avg_nomedian.append(p1b_nomedian)
    p2b_avg_nomean.append(p2b_nomean)
    p2b_avg_nomedian.append(p2b_nomedian)
    p3b_avg_nomean.append(p3b_nomean)
    p3b_avg_nomedian.append(p3b_nomedian)

    p1c_avg_nomean.append(p1c_nomean)
    p1c_avg_nomedian.append(p1c_nomedian)
    p2c_avg_nomean.append(p2c_nomean)
    p2c_avg_nomedian.append(p2c_nomedian)
    p3c_avg_nomean.append(p3c_nomean)
    p3c_avg_nomedian.append(p3c_nomedian)

    p1d_avg_nomean.append(p1d_nomean)
    p1d_avg_nomedian.append(p1d_nomedian)
    p2d_avg_nomean.append(p2d_nomean)
    p2d_avg_nomedian.append(p2d_nomedian)
    p3d_avg_nomean.append(p3d_nomean)
    p3d_avg_nomedian.append(p3d_nomedian)

    # Truncated RMSE accumulators.

    p1a_avg_mean.append(p1a_mean)
    p1a_avg_median.append(p1a_median)
    p2a_avg_mean.append(p2a_mean)
    p2a_avg_median.append(p2a_median)
    p3a_avg_mean.append(p3a_mean)
    p3a_avg_median.append(p3a_median)

    p1b_avg_mean.append(p1b_mean)
    p1b_avg_median.append(p1b_median)
    p2b_avg_mean.append(p2b_mean)
    p2b_avg_median.append(p2b_median)
    p3b_avg_mean.append(p3b_mean)
    p3b_avg_median.append(p3b_median)

    p1c_avg_mean.append(p1c_mean)
    p1c_avg_median.append(p1c_median)
    p2c_avg_mean.append(p2c_mean)
    p2c_avg_median.append(p2c_median)
    p3c_avg_mean.append(p3c_mean)
    p3c_avg_median.append(p3c_median)

    p1d_avg_mean.append(p1d_mean)
    p1d_avg_median.append(p1d_median)
    p2d_avg_mean.append(p2d_mean)
    p2d_avg_median.append(p2d_median)
    p3d_avg_mean.append(p3d_mean)
    p3d_avg_median.append(p3d_median)

    # Truncated MAE: per-fold table rows, then the accumulators.

    tablea_mae.add_row([p1a_mae_mean, p2a_mae_mean, p3a_mae_mean, p1a_mae_median, p2a_mae_median, p3a_mae_median])
    tableb_mae.add_row([p1b_mae_mean, p2b_mae_mean, p3b_mae_mean, p1b_mae_median, p2b_mae_median, p3b_mae_median])
    tablec_mae.add_row([p1c_mae_mean, p2c_mae_mean, p3c_mae_mean, p1c_mae_median, p2c_mae_median, p3c_mae_median])
    tabled_mae.add_row([p1d_mae_mean, p2d_mae_mean, p3d_mae_mean, p1d_mae_median, p2d_mae_median, p3d_mae_median])


    p1a_mae_avg_mean.append(p1a_mae_mean)
    p1a_mae_avg_median.append(p1a_mae_median)
    p2a_mae_avg_mean.append(p2a_mae_mean)
    p2a_mae_avg_median.append(p2a_mae_median)
    p3a_mae_avg_mean.append(p3a_mae_mean)
    p3a_mae_avg_median.append(p3a_mae_median)

    p1b_mae_avg_mean.append(p1b_mae_mean)
    p1b_mae_avg_median.append(p1b_mae_median)
    p2b_mae_avg_mean.append(p2b_mae_mean)
    p2b_mae_avg_median.append(p2b_mae_median)
    p3b_mae_avg_mean.append(p3b_mae_mean)
    p3b_mae_avg_median.append(p3b_mae_median)

    p1c_mae_avg_mean.append(p1c_mae_mean)
    p1c_mae_avg_median.append(p1c_mae_median)
    p2c_mae_avg_mean.append(p2c_mae_mean)
    p2c_mae_avg_median.append(p2c_mae_median)
    p3c_mae_avg_mean.append(p3c_mae_mean)
    p3c_mae_avg_median.append(p3c_mae_median)

    p1d_mae_avg_mean.append(p1d_mae_mean)
    p1d_mae_avg_median.append(p1d_mae_median)
    p2d_mae_avg_mean.append(p2d_mae_mean)
    p2d_mae_avg_median.append(p2d_mae_median)
    p3d_mae_avg_mean.append(p3d_mae_mean)
    p3d_mae_avg_median.append(p3d_mae_median)

    print("---"*25)

(20337, 37)
&&&&&&&&&&&& Run 1 Period1  &&&&&&&&&&&&
train data f1 score:  0.9957171427078567
classifier f1 score:  0.749312761021064
classifier prediction shape:  (20337, 10)
[[4990  252]
 [1190  347]]
+---------------------+-------------------+--------------------+--------------------+
|         RMSE        |        MAE        |         R2         |     Spearmanr      |
+---------------------+-------------------+--------------------+--------------------+
| 0.10940285146662022 | 0.078183718187018 | 0.9469568342769851 | 0.9810614150474051 |
+---------------------+-------------------+--------------------+--------------------+
+--------------------+---------------------+-------------------+--------------------+
|        RMSE        |         MAE         |         R2        |     Spearmanr      |
+--------------------+---------------------+-------------------+--------------------+
| 0.2905131153886194 | 0.20929828832416383 | 0.631868400123484 | 0.8022623359335217 |
+--------------------+-

In [None]:
# Per-fold RMSE tables for cases A-D. The *_mae tables are filled by the
# cross-validation loop as well but are not printed here.
for case_table in (tablea, tableb, tablec, tabled):
    print(case_table)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|      p1_tmean      |      p2_tmean      |      p3_tmean      |     p1_tmedian     |     p2_tmedian     |     p3_tmedian     |      p1_mean       |      p2_mean      |      p3_mean       |     p1_median      |     p2_median      |     p3_median      |
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
| 1.660460409420807  | 1.062633150676596  | 0.7257461697209517 | 1.2880207451235477 | 0.8572282825146499 | 0.5693019621007288 | 20.971710763832665 | 10.43565673992376 | 7.2344572986782545 | 1.2880207451235477 | 0.8210461837930564 | 0.5647367

In [None]:
# A bare PrettyTable expression displays only the object repr (see the output below);
# print(tabled) renders the table itself.
tabled

<prettytable.prettytable.PrettyTable at 0x7f7ba8c65ed0>

In [None]:
# Render the case-D per-fold table.
print(tabled)

SyntaxError: ignored

In [None]:
# Cross-fold summary of the truncated RMSE, one single-row table per case.
# NOTE(review): the value after "±" is the standard deviation of the three
# per-period standard deviations, not a pooled deviation - confirm this is the
# intended spread measure.
summary_fields = ["cluster", "p1_avg_mean", "p2_avg_mean", "p3_avg_mean", "p1_median", "p2_median", "p3_median", "Final_mean±std", "Final_median±std"]

summary_cases = (
    ("CASE A", (p1a_avg_mean, p2a_avg_mean, p3a_avg_mean), (p1a_avg_median, p2a_avg_median, p3a_avg_median)),
    ("CASE B", (p1b_avg_mean, p2b_avg_mean, p3b_avg_mean), (p1b_avg_median, p2b_avg_median, p3b_avg_median)),
    ("CASE C", (p1c_avg_mean, p2c_avg_mean, p3c_avg_mean), (p1c_avg_median, p2c_avg_median, p3c_avg_median)),
    ("CASE D", (p1d_avg_mean, p2d_avg_mean, p3d_avg_mean), (p1d_avg_median, p2d_avg_median, p3d_avg_median)),
)

for case_title, (p1_avg_mean, p2_avg_mean, p3_avg_mean), (p1_avg_median, p2_avg_median, p3_avg_median) in summary_cases:
    print(case_title)
    table = PrettyTable(list(summary_fields))

    # Fold averages per period.
    period_means = [np.mean(p1_avg_mean), np.mean(p2_avg_mean), np.mean(p3_avg_mean)]
    period_medians = [np.mean(p1_avg_median), np.mean(p2_avg_median), np.mean(p3_avg_median)]

    f_mean = np.mean(period_means)
    f_mean_std = np.std([np.std(p1_avg_mean), np.std(p2_avg_mean), np.std(p3_avg_mean)])
    f_median = np.mean(period_medians)
    f_med_std = np.std([np.std(p1_avg_median), np.std(p2_avg_median), np.std(p3_avg_median)])

    # Only the first two period means are rounded; the other cells keep full precision.
    table.add_row([num_clusters, round(period_means[0], 3), round(period_means[1], 3), period_means[2],
                   period_medians[0], period_medians[1], period_medians[2],
                   str(round(f_mean, 3))+str("±")+str(round(f_mean_std, 3)),
                   str(round(f_median, 3))+str("±")+str(round(f_med_std, 3))])
    print(table)

CASE A
+---------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------------+------------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean     |     p1_median      |     p2_median      |     p3_median      | Final_mean±std | Final_median±std |
+---------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------------+------------------+
|    2    |    1.625    |    1.038    | 0.7117878274987613 | 1.2495826193445552 | 0.8302783431923707 | 0.5625067614334752 |  1.125±0.009   |   0.881±0.011    |
+---------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------------+------------------+
CASE B
+---------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------------+------------------+
| cluster | p1_avg_mean | 

In [None]:
# Summary tables for the "*_avg_nomean" / "*_avg_nomedian" results of cases A-D.
# The four cases share exactly the same computation, so they are driven from
# one list instead of four copy-pasted stanzas.
# Each entry: (label, (p1, p2, p3) "nomean" results, (p1, p2, p3) "nomedian" results).
nomean_cases = [
    ("CASE A",
     (p1a_avg_nomean, p2a_avg_nomean, p3a_avg_nomean),
     (p1a_avg_nomedian, p2a_avg_nomedian, p3a_avg_nomedian)),
    ("CASE B",
     (p1b_avg_nomean, p2b_avg_nomean, p3b_avg_nomean),
     (p1b_avg_nomedian, p2b_avg_nomedian, p3b_avg_nomedian)),
    ("CASE C",
     (p1c_avg_nomean, p2c_avg_nomean, p3c_avg_nomean),
     (p1c_avg_nomedian, p2c_avg_nomedian, p3c_avg_nomedian)),
    ("CASE D",
     (p1d_avg_nomean, p2d_avg_nomean, p3d_avg_nomean),
     (p1d_avg_nomedian, p2d_avg_nomedian, p3d_avg_nomedian)),
]

for case_label, case_means, case_medians in nomean_cases:
    print(case_label)
    # The last two columns hold "value±std" strings, so the headers say so
    # (they previously read "Final_mean" / "Final_median").
    table = PrettyTable(["cluster", "p1_avg_mean", "p2_avg_mean", "p3_avg_mean", "p1_median", "p2_median", "p3_median", "Final_mean±std", "Final_median±std"])

    # Same working names as before, so anything reading them after this cell
    # still sees the CASE D values once the loop finishes.
    p1_avg_mean, p2_avg_mean, p3_avg_mean = case_means
    p1_avg_median, p2_avg_median, p3_avg_median = case_medians

    # "Final" figures: the average of the three per-series averages.
    f_mean = np.mean([np.mean(p1_avg_mean), np.mean(p2_avg_mean), np.mean(p3_avg_mean)])
    f_median = np.mean([np.mean(p1_avg_median), np.mean(p2_avg_median), np.mean(p3_avg_median)])

    # NOTE(review): the "±" term is np.std taken over the three per-series
    # np.std values (spread of the spreads), not the standard deviation of
    # f_mean / f_median themselves -- confirm this is the intended statistic.
    f_mean_std = np.std([np.std(p1_avg_mean), np.std(p2_avg_mean), np.std(p3_avg_mean)])
    f_med_std = np.std([np.std(p1_avg_median), np.std(p2_avg_median), np.std(p3_avg_median)])

    # Every numeric column is rounded to 3 decimals; previously only the
    # p1/p2 means were, so rows mixed 3-digit and full-precision floats.
    table.add_row([num_clusters,
                   round(np.mean(p1_avg_mean), 3),
                   round(np.mean(p2_avg_mean), 3),
                   round(np.mean(p3_avg_mean), 3),
                   round(np.mean(p1_avg_median), 3),
                   round(np.mean(p2_avg_median), 3),
                   round(np.mean(p3_avg_median), 3),
                   str(round(f_mean, 3)) + "±" + str(round(f_mean_std, 3)),
                   str(round(f_median, 3)) + "±" + str(round(f_med_std, 3))])
    print(table)

CASE A
+---------+-------------+-------------+-------------------+--------------------+--------------------+--------------------+--------------+--------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean    |     p1_median      |     p2_median      |     p3_median      |  Final_mean  | Final_median |
+---------+-------------+-------------+-------------------+--------------------+--------------------+--------------------+--------------+--------------+
|    2    |    17.707   |    9.699    | 6.923957754199752 | 1.2494746646847312 | 0.8002180554598306 | 0.5561165970833352 | 11.443±1.252 |  0.869±0.01  |
+---------+-------------+-------------+-------------------+--------------------+--------------------+--------------------+--------------+--------------+
CASE B
+---------+-------------+-------------+-------------------+--------------------+--------------------+--------------------+--------------+--------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean    |     p1_m

In [None]:
# Unnormalized scale - RF

# normal


# truncated



In [None]:
# XGBoost

# normal (non-truncated; presumably printed from the *_avg_nomean / *_avg_nomedian results - verify)
CASE A
+---------+-------------+-------------+-------------------+--------------------+-------------------+--------------------+--------------+--------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean    |     p1_median      |     p2_median     |     p3_median      |  Final_mean  | Final_median |
+---------+-------------+-------------+-------------------+--------------------+-------------------+--------------------+--------------+--------------+
|    2    |    17.711   |    9.698    | 6.923980810191025 | 1.2480616854295572 | 0.795913512013417 | 0.5547816659748691 | 11.444±0.242 |  0.866±0.02  |
+---------+-------------+-------------+-------------------+--------------------+-------------------+--------------------+--------------+--------------+
CASE B
+---------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean     |     p1_median      |     p2_median      |     p3_median      |  Final_mean  | Final_median |
+---------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+
|    2    |    20.691   |    9.968    | 7.2683325382455735 | 1.4829643704102786 | 0.8440616826826486 | 0.6121775932487372 | 12.642±0.257 |  0.98±0.01   |
+---------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------+
CASE C
+---------+-------------+-------------+--------------------+-------------------+-------------------+-------------------+---------------+--------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean     |     p1_median     |     p2_median     |     p3_median     |   Final_mean  | Final_median |
+---------+-------------+-------------+--------------------+-------------------+-------------------+-------------------+---------------+--------------+
|    2    |    94.906   |   126.936   | 149.37965827457265 | 5.178952993830566 | 7.674117156600847 | 9.117903387742587 | 123.741±0.634 | 7.324±0.054  |
+---------+-------------+-------------+--------------------+-------------------+-------------------+-------------------+---------------+--------------+
CASE D
+---------+-------------+-------------+--------------------+--------------------+-------------------+-------------------+---------------+--------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean     |     p1_median      |     p2_median     |     p3_median     |   Final_mean  | Final_median |
+---------+-------------+-------------+--------------------+--------------------+-------------------+-------------------+---------------+--------------+
|    2    |    94.097   |   127.119   | 149.70016344616445 | 5.5191778567136565 | 7.891798551428683 | 9.225861417164055 | 123.638±0.707 | 7.546±0.053  |
+---------+-------------+-------------+--------------------+--------------------+-------------------+-------------------+---------------+--------------+

# truncated (presumably printed from the *_avg_mean / *_avg_median results - verify)
CASE A
+---------+-------------+-------------+-------------------+--------------------+--------------------+--------------------+----------------+------------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean    |     p1_median      |     p2_median      |     p3_median      | Final_mean±std | Final_median±std |
+---------+-------------+-------------+-------------------+--------------------+--------------------+--------------------+----------------+------------------+
|    2    |    1.627    |    1.038    | 0.719108581616437 | 1.2483689095416288 | 0.8280252305243709 | 0.5696949965551511 |  1.128±0.015   |   0.882±0.022    |
+---------+-------------+-------------+-------------------+--------------------+--------------------+--------------------+----------------+------------------+
CASE B
+---------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------------+------------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean     |     p1_median      |     p2_median      |     p3_median      | Final_mean±std | Final_median±std |
+---------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------------+------------------+
|    2    |    1.957    |    1.086    | 0.7779980942559245 | 1.4937434764281947 | 0.8559582926306614 | 0.6200784780395261 |  1.274±0.023   |    0.99±0.018    |
+---------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+----------------+------------------+
CASE C
+---------+-------------+-------------+-------------------+-------------------+-------------------+-------------------+----------------+------------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean    |     p1_median     |     p2_median     |     p3_median     | Final_mean±std | Final_median±std |
+---------+-------------+-------------+-------------------+-------------------+-------------------+-------------------+----------------+------------------+
|    2    |    7.414    |    10.501   | 12.45058821795022 | 5.178952993830566 | 7.674117156600847 | 9.117903387742587 |  10.122±0.049  |   7.324±0.054    |
+---------+-------------+-------------+-------------------+-------------------+-------------------+-------------------+----------------+------------------+
CASE D
+---------+-------------+-------------+--------------------+--------------------+-------------------+-------------------+----------------+------------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean     |     p1_median      |     p2_median     |     p3_median     | Final_mean±std | Final_median±std |
+---------+-------------+-------------+--------------------+--------------------+-------------------+-------------------+----------------+------------------+
|    2    |    7.769    |    10.702   | 12.586511469424968 | 5.5191778567136565 | 7.891798551428683 | 9.225861417164055 |  10.353±0.056  |   7.546±0.053    |
+---------+-------------+-------------+--------------------+--------------------+-------------------+-------------------+----------------+------------------+

In [None]:
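# Random Forest
# NOTE(review): both CASE A rows below are digit-for-digit identical to the XGBoost CASE A rows - confirm CASE A is model-independent and not a copy-paste leftover.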

# normal (non-truncated; presumably printed from the *_avg_nomean / *_avg_nomedian results - verify)
CASE A
+---------+-------------+-------------+-------------------+--------------------+-------------------+--------------------+--------------+--------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean    |     p1_median      |     p2_median     |     p3_median      |  Final_mean  | Final_median |
+---------+-------------+-------------+-------------------+--------------------+-------------------+--------------------+--------------+--------------+
|    2    |    17.711   |    9.698    | 6.923980810191025 | 1.2480616854295572 | 0.795913512013417 | 0.5547816659748691 | 11.444±0.242 |  0.866±0.02  |
+---------+-------------+-------------+-------------------+--------------------+-------------------+--------------------+--------------+--------------+
CASE B
+---------+-------------+-------------+-------------------+--------------------+-------------------+--------------------+--------------+--------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean    |     p1_median      |     p2_median     |     p3_median      |  Final_mean  | Final_median |
+---------+-------------+-------------+-------------------+--------------------+-------------------+--------------------+--------------+--------------+
|    2    |    20.157   |    9.953    | 7.246312790914839 | 1.4753201216072822 | 0.838339090101958 | 0.6081042314470179 | 12.452±0.286 | 0.974±0.013  |
+---------+-------------+-------------+-------------------+--------------------+-------------------+--------------------+--------------+--------------+
CASE C
+---------+-------------+-------------+--------------------+-------------------+-------------------+-------------------+--------------+--------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean     |     p1_median     |     p2_median     |     p3_median     |  Final_mean  | Final_median |
+---------+-------------+-------------+--------------------+-------------------+-------------------+-------------------+--------------+--------------+
|    2    |    86.659   |   113.489   | 128.04516649309775 | 4.852922333792005 | 6.704761462627094 | 7.528388340239249 | 109.398±1.38 | 6.362±0.038  |
+---------+-------------+-------------+--------------------+-------------------+-------------------+-------------------+--------------+--------------+
CASE D
+---------+-------------+-------------+--------------------+--------------------+-------------------+-------------------+---------------+--------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean     |     p1_median      |     p2_median     |     p3_median     |   Final_mean  | Final_median |
+---------+-------------+-------------+--------------------+--------------------+-------------------+-------------------+---------------+--------------+
|    2    |    86.38    |   113.624   | 128.16048315756618 | 5.1469700958863305 | 6.869501277404052 | 7.641298147864784 | 109.388±1.438 |  6.553±0.02  |
+---------+-------------+-------------+--------------------+--------------------+-------------------+-------------------+---------------+--------------+

# truncated (presumably printed from the *_avg_mean / *_avg_median results - verify)
CASE A
+---------+-------------+-------------+-------------------+--------------------+--------------------+--------------------+----------------+------------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean    |     p1_median      |     p2_median      |     p3_median      | Final_mean±std | Final_median±std |
+---------+-------------+-------------+-------------------+--------------------+--------------------+--------------------+----------------+------------------+
|    2    |    1.627    |    1.038    | 0.719108581616437 | 1.2483689095416288 | 0.8280252305243709 | 0.5696949965551511 |  1.128±0.015   |   0.882±0.022    |
+---------+-------------+-------------+-------------------+--------------------+--------------------+--------------------+----------------+------------------+
CASE B
+---------+-------------+-------------+--------------------+--------------------+--------------------+-------------------+----------------+------------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean     |     p1_median      |     p2_median      |     p3_median     | Final_mean±std | Final_median±std |
+---------+-------------+-------------+--------------------+--------------------+--------------------+-------------------+----------------+------------------+
|    2    |    1.934    |    1.078    | 0.7715926403302976 | 1.4823975481299596 | 0.8497340535955633 | 0.618906088187305 |  1.261±0.021   |   0.984±0.017    |
+---------+-------------+-------------+--------------------+--------------------+--------------------+-------------------+----------------+------------------+
CASE C
+---------+-------------+-------------+--------------------+-------------------+-------------------+-------------------+----------------+------------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean     |     p1_median     |     p2_median     |     p3_median     | Final_mean±std | Final_median±std |
+---------+-------------+-------------+--------------------+-------------------+-------------------+-------------------+----------------+------------------+
|    2    |    6.971    |    9.453    | 10.709199048520878 | 4.852922333792005 | 6.706334088197845 | 7.528388340239249 |  9.044±0.083   |   6.363±0.038    |
+---------+-------------+-------------+--------------------+-------------------+-------------------+-------------------+----------------+------------------+
CASE D
+---------+-------------+-------------+-------------------+--------------------+-------------------+-------------------+----------------+------------------+
| cluster | p1_avg_mean | p2_avg_mean |    p3_avg_mean    |     p1_median      |     p2_median     |     p3_median     | Final_mean±std | Final_median±std |
+---------+-------------+-------------+-------------------+--------------------+-------------------+-------------------+----------------+------------------+
|    2    |    7.296    |    9.609    | 10.81682702526126 | 5.1469700958863305 | 6.869501277404052 | 7.641298147864784 |   9.24±0.089   |    6.553±0.02    |
+---------+-------------+-------------+-------------------+--------------------+-------------------+-------------------+----------------+------------------+