<a href="https://colab.research.google.com/github/gladcolor/SVM_DNN_testing/blob/master/CSCE822_HW2_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages

In [157]:
import pandas as pd

import numpy as np

from sklearn.impute import SimpleImputer
# import category_encoders as ce
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC
import sklearn.metrics as metrics
from sklearn.model_selection import cross_validate

# Define functions

In [11]:
def print_str_unique(df):
    for col in df.columns:
        if original_data.dtypes[col] == np.object:            
            unique_cnt = len(df[col].unique())
            print(f'Column {col.rjust(13)} has {unique_cnt:5} unique values.')

def count_column_nan(df):
    row_cnt = len(df)
    for col in df.columns:
        nan_cnt = df[col].isna().sum()
        percent_str = f'({(nan_cnt / row_cnt * 100):3.1f}%)'.rjust(7)
        print(f'Column {str(col).rjust(13)} has {nan_cnt:4} {percent_str} nan values.')       

def impute_df(df, strategy="most_frequent"):
    
    numeric_cols = ['BuildingArea', 'YearBuilt', 'Car']
    nominal_cols = ['CouncilArea']

    my_imputer = SimpleImputer(strategy="most_frequent")
    council_area_with_imputed_values = my_imputer.fit_transform(df[nominal_cols])
    imputed_df = df.copy()
    imputed_df.loc[:, nominal_cols] = council_area_with_imputed_values


    if strategy == "most_frequent":
        my_imputer = SimpleImputer(strategy="most_frequent")
        data_with_imputed_values = my_imputer.fit_transform(df)        
        imputed_df.loc[:, :] = data_with_imputed_values

    if strategy == "mean":
        my_imputer = SimpleImputer(strategy="mean")
        area_year_with_imputed_values = my_imputer.fit_transform(df[numeric_cols])
        imputed_df.loc[:, numeric_cols] = area_year_with_imputed_values

    if strategy == "median":
        my_imputer = SimpleImputer(strategy="median")
        area_year_with_imputed_values = my_imputer.fit_transform(df[numeric_cols])
        imputed_df.loc[:, numeric_cols] = area_year_with_imputed_values

    return imputed_df

def encode_dates(imputed_df):
    imputed_df['Date'] = pd.to_datetime(imputed_df['Date']) 
    imputed_df['Ori_Date'] = pd.to_datetime('1970-01-01', format='YY-m-d', errors='ignore')
    imputed_df['Ori_Date'] = pd.to_datetime(imputed_df['Ori_Date'])
    imputed_df['delta_days'] = imputed_df['Date'] - imputed_df['Ori_Date']
    imputed_df['delta_days'] = imputed_df['delta_days'].dt.days
    imputed_df = imputed_df.drop(columns=['Date', 'Ori_Date'])
     
    return imputed_df



def encoder_nominals(imputed_df, encode_method='one_hot'):
    # print(f'Encode methods: {ENCODING_METHODS_DICT.keys()} \n')
    
    ce_encoder = ENCODING_METHODS_DICT[encode_method](cols = ENCODING_COLUMNS)

    y = imputed_df['Price_class'].copy()
    
    for drop_column in DROPPED_COLUMNS:
        try:
            imputed_df = imputed_df.drop(columns=drop_column).copy()
        except:
            pass
            # print(f'Columns: {drop_column} have already dropped before.')

    encoded_df = ce_encoder.fit_transform(imputed_df, y=y) 

    return encoded_df

def assign_price_class(imputed_df):
    row_cnt = len(imputed_df)
    price_class_cnt = 5
    class_step = int(row_cnt / price_class_cnt)
    price_bins = list(range(class_step, row_cnt,  class_step))

    imputed_df.loc[0:price_bins[0], 'Price_class'] = '0' # 'bottom_value'
    imputed_df.loc[price_bins[0]:price_bins[1], 'Price_class'] = '1' # 'low_value'
    imputed_df.loc[price_bins[1]:price_bins[2], 'Price_class'] = '2' # 'medium_value'
    imputed_df.loc[price_bins[2]:price_bins[3], 'Price_class'] = '3' # 'high_value'
    imputed_df.loc[price_bins[3]:row_cnt, 'Price_class'] = '4'  #  'top_value'

    imputed_df['Price_class'] = imputed_df['Price_class'].astype(int)

    # gb = imputed_df.groupby('Price_class')['Price_class'].count().to_frame()
    # gb.columns = ['Count']
    # custom_dict = {'bottom_value': 0, 'low_value': 1, 'medium_value': 2, 'high_value': 3, 'top_value': 4}
    # gb.sort_index(key=lambda x: x.map(custom_dict))
    # print("Price class counts:")
    return imputed_df    


def split_data(encoded_df):
    X = encoded_df.drop(columns=['Price_class'])
    y = encoded_df['Price_class']

    train_ratio = 0.75
    validation_ratio = 0.10
    test_ratio = 0.15

    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size= (1 - train_ratio), random_state = 0)

    xVal, xTest, yVal, yTest = train_test_split(xTest, yTest, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 0) 

    
    return xTrain, yTrain, xVal, yVal, xTest, yTest   

def standardize_data(encoded_df, class_col='Price_class'):
    labels = encoded_df[class_col].copy()
    data_df = encoded_df.drop(columns=[class_col])
    scaler = preprocessing.StandardScaler()
    scaler.fit(data_df) 
    data_df.iloc[:, :] = scaler.transform(data_df)
    data_df.loc[:, class_col] = labels
    return data_df
           
def sample_train_dataset(positive_count=len(train_1), negative_count=len(train_1)):
    positive_count = len(train_1)
    negative_count = len(train_1)

    balanced_train = np.concatenate((train_1.sample(positive_count, replace=True), train_0.sample(negative_count, replace=True)), axis=0)
    balanced_train_label = np.concatenate(([1] * positive_count, [0] * negative_count), axis=0)

    print(f"Positive sample counts in the training set: {positive_count}")
    print(f"Negative sample counts in the training set: {negative_count}")

    return balanced_train, balanced_train_label

# Load and understand the data

## Load data

In [6]:
test_csv = r'https://github.com/gladcolor/SVM_DNN_testing/raw/master/test10000.zip'
test_label_csv = r'https://github.com/gladcolor/SVM_DNN_testing/raw/master/test10000_label.zip'
train_csv = r'https://github.com/gladcolor/SVM_DNN_testing/raw/master/train10000.zip'
train_label_csv = r'https://github.com/gladcolor/SVM_DNN_testing/raw/master/train10000_Label.zip'

train_df = pd.read_csv(train_csv, header=None)
train_label_df = pd.read_csv(train_label_csv, header=None)
test_df = pd.read_csv(test_csv, header=None)
test_label_df = pd.read_csv(test_label_csv, header=None)

print("Training sets samples:")
train_df.sample(4)


Training sets samples:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333
1144,1.0,46,999000.0,46,196000.0,1.0,3.0,999000.0,6.0,1.0,30,3,10.0,45,0.0,70,449,482,356,6,650,310,580,260,1513,4,55,999000.0,999000.0,28,27,1379,11,3,5,19,12,1973,1973,364,...,88,68,30,12,52,9,5,21,15,14,8,70,49,5,5,67,7,46,0,1,0,1,2,0,0,5,11,32,6,82,51,12,85,15,6,3,17,20,252,150
6779,999000.0,46,4.0,46,196000.0,2.0,999000.0,2.0,2.0,999000.0,25,1,999000.0,25,999000.0,170,439,483,349,5,501,300,436,240,1495,5,46,9849.0,9.0,27,31,1385,11,3,3,19,11,1962,1969,339,...,95,47,60,11,39,8,5,24,22,28,14,42,30,8,8,63,9,29,1,4,0,1,4,0,0,7,33,38,8,60,37,12,66,10,6,2,18,14,184,96
4812,999000.0,46,999000.0,46,196000.0,999000.0,999000.0,999000.0,999000.0,999000.0,50,2,999000.0,115,999000.0,170,402,438,333,7,671,300,575,230,1083,4,47,2077.0,0.0,35,30,1179,12,3,4,14,11,1971,1971,335,...,95,93,54,11,35,7,6,32,24,26,11,36,30,6,3,69,13,40,0,4,0,1,14,1,0,8,24,31,7,40,28,7,48,5,4,3,18,15,252,154
1510,1.0,72,1.0,46,196000.0,1.0,4.0,999000.0,8.0,1.0,50,4,10.0,14,1.0,240,484,512,383,3,606,310,537,260,1046,4,61,4343.0,0.0,25,42,966,8,2,13,20,14,1958,1958,408,...,81,18,16,8,69,7,5,19,14,14,7,89,67,4,10,71,2,56,0,0,0,1,0,0,0,3,10,26,23,95,57,19,96,20,9,1,65,25,436,275


In [10]:
print("County nan data:")

print(f"Train data have {train_df.isna().sum().sum()} nan values.")
print(f"Test data have {test_df.isna().sum().sum()} nan values.")

train_1 = train_df[train_label_df[0] == 1]
train_0 = train_df[train_label_df[0] == 0]

print()

print(f"Positive sample counts: {len(train_1)}")
print(f"Negative sample counts: {len(train_0)}")

County nan data:
Train data have 0 nan values.
Test data have 0 nan values.

Positive sample counts: 909
Negative sample counts: 9091


# Train 10 SVM models in an ensemble learning manner



*italicized text*## Train 10 models with cross validation

In [None]:
# train 10 models
from sklearn.model_selection import cross_val_score

MODEL_CNT = int(len(train_0)/len(train_1))
sample_cnt_per_portion = int(len(train_0)/MODEL_CNT)

positive_count = len(train_1)
negative_count = len(train_1)

print("Model count: ", MODEL_CNT)
print("sample_cnt_per_portion:", sample_cnt_per_portion)

trained_model_list = []
scores_precision_list = []
scores_recall_list = []
scores_roc_auc_list =[]
scores_mcc_list = []

USE_CROSS_VALIDATION = True   # 

for i in range(MODEL_CNT):
    print(f"Training # {i + 1} model...")
    start_row = i * sample_cnt_per_portion
    end_row = start_row + sample_cnt_per_portion
    # print(start_row, end_row)

    # create a balancd training set
    balanced_train = np.concatenate((train_1, train_0[start_row:end_row]), axis=0)
    balanced_train_label = np.concatenate(([1] * positive_count, [0] * negative_count), axis=0)
    
    

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto',
                                              kernel='rbf',
                                              verbose=True, probability=True))


    # actually train the model 200 times.
    score_recall = cross_val_score(clf, balanced_train, balanced_train_label, cv=10, scoring='recall')
    score_precision = cross_val_score(clf, balanced_train, balanced_train_label, cv=10, scoring='precision')
    score_roc_auc = cross_val_score(clf, balanced_train, balanced_train_label, cv=10, scoring='roc_auc')


    print("\n%0.2f precision with a standard deviation of %0.2f" % (score_precision.mean(), score_precision.std()))
    print("\n%0.2f recall with a standard deviation of %0.2f" % (score_recall.mean(), score_recall.std()))
    print("\n%0.2f AUC with a standard deviation of %0.2f" % (score_roc_auc.mean(), score_roc_auc.std()))

    trained_model_list.append(clf)
    scores_list.append(scores)

    scores_precision_list.append(score_precision)
    scores_recall_list.append(score_recall)
    scores_roc_auc_list.append(score_roc_auc)


## Train 10 models witouth cross validation

In [150]:
# train 10 models
from sklearn.model_selection import cross_val_score

MODEL_CNT = int(len(train_0)/len(train_1))
sample_cnt_per_portion = int(len(train_0)/MODEL_CNT)

positive_count = len(train_1)
negative_count = len(train_1)

print("Model count: ", MODEL_CNT)
print("sample_cnt_per_portion:", sample_cnt_per_portion)
trained_model_list = []

score_roc_auc_list = []
score_precision_list = []
score_recall_list = []

for i in range(MODEL_CNT):
    print(f"Training # {i + 1} model...")
    start_row = i * sample_cnt_per_portion
    end_row = start_row + sample_cnt_per_portion
    # print(start_row, end_row)

     # create a balancd training set
    train_1 = train_df[train_label_df[0] == 1].sample(frac=1)  # shuffle

    balanced_train = np.concatenate((train_1, train_0[start_row:end_row]), axis=0)
    balanced_train_label = np.concatenate(([1] * positive_count, [0] * negative_count), axis=0)

    # Use SVM
    # clf = make_pipeline(StandardScaler(), SVC(gamma='auto',
    #                                           kernel='rbf',
    #                                           verbose=True, probability=True))
    
    # Use random forest
    clf = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=500))

    clf.fit(balanced_train, balanced_train_label)

    trained_model_list.append(clf)


Model count:  10
sample_cnt_per_portion: 909
Training # 1 model...
Training # 2 model...
Training # 3 model...
Training # 4 model...
Training # 5 model...
Training # 6 model...
Training # 7 model...
Training # 8 model...
Training # 9 model...
Training # 10 model...


## Evaluate the trained 10 models

In [152]:
print("Evaluating...")
test_pred_list = []
for idx, clf in enumerate(trained_model_list):
    print(f"Testing # {idx + 1} model...")
    test_pred = clf.predict_proba(test_df)
    test_pred_list.append(test_pred)


Evaluating...
Testing # 1 model...
Testing # 2 model...
Testing # 3 model...
Testing # 4 model...
Testing # 5 model...
Testing # 6 model...
Testing # 7 model...
Testing # 8 model...
Testing # 9 model...
Testing # 10 model...


## Print out assessment

In [153]:
test_pred_all = np.array(test_pred_list)

pred_label = np.where(test_pred_all[:, :, 1].mean(axis=0)>0.5, 1, 0).astype(int)

precision_score = metrics.precision_score(np.array(test_label_df), pred_label, average='macro')
precision_score = metrics.precision_score(np.array(test_label_df), pred_label, average='macro')

recall_score = metrics.recall_score(np.array(test_label_df), pred_label, average='macro')
mcc_score = metrics.matthews_corrcoef(np.array(test_label_df), pred_label)
roc_auc_score = metrics.roc_auc_score(test_label_df, test_pred_all[:, :, 1].mean(axis=0), average=None)

print("precision_score:", precision_score)
print("recall_score:", recall_score)
print("mcc_score:", mcc_score)
print("roc_auc_score:", roc_auc_score)

precision_score: 0.5383174018953414
recall_score: 0.6096338049093124
mcc_score: 0.12962843151138675
roc_auc_score: 0.6366664750885578


## Save the predicted label

In [17]:
pred_label = np.where(test_pred_all[:, :, 1].mean(axis=0)>0.5, 1, 0).astype(int)
np.savetxt('predict.csv', pred_label, fmt='%d' )

##Train 10 models with cross validation
Will take about 30 min to train 300 models.

In [None]:
# train 10 models
from sklearn.model_selection import cross_val_score

MODEL_CNT = int(len(train_0)/len(train_1))
sample_cnt_per_portion = int(len(train_0)/MODEL_CNT)

positive_count = len(train_1)
negative_count = len(train_1)

print("Model count: ", MODEL_CNT)
print("sample_cnt_per_portion:", sample_cnt_per_portion)

trained_model_list = []
scores_precision_list = []
scores_recall_list = []
scores_roc_auc_list =[]
scores_mcc_list = []
score_accuracy_list = []

USE_CROSS_VALIDATION = True   # 

def simple_matthews_corrcoef(true_np, predict_np):
    cm = metrics.confusion_matrix(true_np, predict_np)
    assert(cm.shape == (2, 2)), 'Support binary classification (2-class) only!'
    TP = cm[1, 1]
    TN = cm[0, 0]
    FP = cm[0, 1]
    FN = cm[1, 0]

    MCC = (TP * TN - FP * FN) / (np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)))

    return MCC

for i in range(MODEL_CNT):
    print(f"Training # {i + 1} model...")
    start_row = i * sample_cnt_per_portion
    end_row = start_row + sample_cnt_per_portion
    # print(start_row, end_row)

    # create a balancd training set
    balanced_train = np.concatenate((train_1, train_0[start_row:end_row]), axis=0)
    balanced_train_label = np.concatenate(([1] * positive_count, [0] * negative_count), axis=0)
    
    

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto',
                                              kernel='rbf',
                                              verbose=True, probability=True))


    # calcuate precision, recall, AUC. actually train the model 300 times.
    # score_recall = cross_val_score(clf, balanced_train, balanced_train_label, cv=10, scoring='recall')
    # score_precision = cross_val_score(clf, balanced_train, balanced_train_label, cv=10, scoring='precision')
    # score_roc_auc = cross_val_score(clf, balanced_train, balanced_train_label, cv=10, scoring='roc_auc')


    # print("\n%0.2f precision with a standard deviation of %0.2f" % (score_precision.mean(), score_precision.std()))
    # print("\n%0.2f recall with a standard deviation of %0.2f" % (score_recall.mean(), score_recall.std()))
    # print("\n%0.2f AUC with a standard deviation of %0.2f" % (score_roc_auc.mean(), score_roc_auc.std()))

    # trained_model_list.append(clf)
    # scores_list.append(scores)

    # scores_precision_list.append(score_precision)
    # scores_recall_list.append(score_recall)
    # scores_roc_auc_list.append(score_roc_auc)

    # To calculate MCC only, actally train 100 models.
    score_accuracy = cross_validate(clf, balanced_train, balanced_train_label, cv=10, return_estimator=True)
    score_accuracy_list.append(score_accuracy)



Model count:  10
sample_cnt_per_portion: 909
Training # 1 model...
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]Training # 2 model...
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]Training # 3 model...
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]Training # 4 model...
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]Training # 5 model...
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]Training # 6 model...
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]Training # 7 model...
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]Training # 8 model...
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [None]:
clf.predict()

# calculate MCC
mcc_score = simple_matthews_corrcoef())
scores_mcc_list.append(mcc_score)

# Program to calculate model performance from two label files: Precision, Recall, MCC

## Load data

In [91]:
label_csv = r'https://github.com/gladcolor/SVM_DNN_testing/raw/master/test10000_label.zip'
predict_csv = r'https://raw.githubusercontent.com/gladcolor/SVM_DNN_testing/master/predict.csv'

label_df = pd.read_csv(label_csv)
predict_df = pd.read_csv(predict_csv)


true_np = np.array(label_df)
predict_np = np.array(predict_df)

In [154]:
class simple_metrics():

    @staticmethod
    def _load_data(true_csv, predict_csv):
        true_csv = r'https://github.com/gladcolor/SVM_DNN_testing/raw/master/test10000_label.zip'
        predict_csv = r'https://raw.githubusercontent.com/gladcolor/SVM_DNN_testing/master/predict.csv'

        true_df = pd.read_csv(true_csv)
        predict_df = pd.read_csv(predict_csv)

        true_np = np.array(true_df)
        predict_np = np.array(predict_df)

        return true_np, predict_np

    @staticmethod
    def get_confusion_matrix(true_np, predict_np): # inputs: should be integer numpy array (1D), using the same class index schema.
        true_unique = np.unique(true_np)
        predict_unique = np.unique(predict_np)

        cm = np.zeros((len(true_unique), len(true_unique)), dtype=int)   # cm: confusion_matrix, row is actual, column is predicted

        for true_, pred in zip(true_np[:].flatten(), predict_np[:].flatten()):
            cm[true_, pred] += 1
            # print(true_, pred)
        return cm

    @staticmethod
    def precision_recall_score(true_csv, predict_csv):  # CSV file has one column only withoud header.
        true_np, predict_np = simple_metrics._load_data(true_csv, predict_csv)
        confusion_matrix = simple_metrics.get_confusion_matrix(true_np, predict_np)
        class_cnt = confusion_matrix.shape[0]
        precisions = np.zeros((class_cnt))
        recalls = np.zeros((class_cnt))

 

        # compute recall, precision
        for c in range(class_cnt):
            TP = confusion_matrix[c, c]
            TP_FP = confusion_matrix[c, :].sum()
            TP_FN = confusion_matrix[:, c].sum()
            recalls[c] = TP / TP_FP
            precisions[c] = TP / TP_FN

        return precisions, recalls

    @staticmethod
    def matthews_corrcoef(true_csv, predict_csv): # CSV file has one column only withoud header.
        MCC = 0
        # compute MCC, current for binary classification only
        
        cm = simple_metrics.get_confusion_matrix(true_np, predict_np)
        assert(cm.shape == (2, 2)), 'Support binary classification (2-class) only!'
        TP = cm[1, 1]
        TN = cm[0, 0]
        FP = cm[0, 1]
        FN = cm[1, 0]

        MCC = (TP * TN - FP * FN) / (np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)))
        # https://en.wikipedia.org/wiki/Matthews_correlation_coefficient
        return MCC

class_precision, class_recall = simple_metrics.precision_recall_score(label_csv, predict_csv)
MCC =  simple_metrics.matthews_corrcoef(label_csv, predict_csv)

print("My results:")
print('class_precision:', class_precision.round(4))
print('class_recall：', class_recall.round(4))
print('Matthews_corrcoef: %.4f' % MCC)


print()
print("sklearn results:")
rpt = metrics.classification_report(true_np, predict_np, digits=4)
print(rpt)
print()
print("sklearn matthews_corrcoef: %.4f" % metrics.matthews_corrcoef(true_np, predict_np))

My results:
class_precision: [0.9392 0.1409]
class_recall： [0.6082 0.6198]
Matthews_corrcoef: 0.1351

sklearn results:
              precision    recall  f1-score   support

           0     0.9392    0.6082    0.7383      9060
           1     0.1409    0.6198    0.2295       939

    accuracy                         0.6093      9999
   macro avg     0.5400    0.6140    0.4839      9999
weighted avg     0.8642    0.6093    0.6905      9999


sklearn matthews_corrcoef: 0.1351


In [None]:
print("sklearn results:")
rpt = metrics.classification_report(true_np.flatten(), predict_np.flatten())
print(rpt)

In [92]:

metrics.recall_score(true_np.flatten(), predict_np.flatten())

0.6198083067092651

In [93]:
metrics.precision_score(true_np.flatten(), predict_np.flatten())

0.14085188770571153

In [99]:
rpt = metrics.classification_report(true_np.flatten(), predict_np.flatten())


              precision    recall  f1-score   support

           0       0.94      0.61      0.74      9060
           1       0.14      0.62      0.23       939

    accuracy                           0.61      9999
   macro avg       0.54      0.61      0.48      9999
weighted avg       0.86      0.61      0.69      9999



## Train 10 models with cross validation

Will take about 30 min to train 300 models.

precision_score: 0.5401121274658705
recall_score: 0.6141945392246775
mcc_score: 0.13536005191025305
roc_auc_score: 0.650464095030264


In [None]:
test_pred_all[:, :, 1].mean(axis=0)

array([0.48866811, 0.39856657, 0.48200092, ..., 0.35131929, 0.36412371,
       0.56005731])

In [None]:
np.where(test_pred_all[:, :, 1].mean(axis=0)>0.5, 1, 0).sum()

4118

In [None]:
np.argmax(test_pred_all, axis=1)

array([[3370, 7993],
       [3370, 1967],
       [6781, 9864],
       [3370, 8516],
       [2204, 7993],
       [5448, 5553],
       [1300, 8105],
       [   4, 7510],
       [4460, 5553],
       [8481, 9035]])

In [None]:
recall_score = metrics.recall_score(test_label_df, np.where(test_pred_all[:, :, 1].mean(axis=0)>0.5, 1, 0), average=None)
recall_score

array([0.60964573, 0.61874334])

In [None]:
 metrics.recall_score(np.array(test_label_df), np.where(test_pred_all[:, :, 1].mean(axis=0)>0.5, 1, 0), average='macro')

0.6141945392246775

In [None]:
np.array(test_label_df)

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [None]:
scores_precision_np = np.array(scores_precision_list)
scores_recall_np = np.array(scores_recall_list)
scores_roc_auc_np = np.array(scores_roc_auc_list)

print("mean of precision %.4f, recall: %.4f, roc_auc: %.4f" % (scores_precision_np.mean(), scores_recall_np.mean(), scores_roc_auc_np.mean()))



mean of precision nan, recall: nan, roc_auc: 0.6182


  """
  ret = ret.dtype.type(ret / rcount)


In [None]:
sorted(metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

In [None]:
print("Evaluating...")

test_pred_list = []
for clf in trained_model_list:
    test_pred = clf.predict_proba(test_df)
    test_pred_list.append(test_pred)

In [None]:
from sklearn.svm import SVC

balanced_train, balanced_train_label = sample_train_dataset(positive_count=len(train_1), negative_count=len(train_1))


clf = make_pipeline(StandardScaler(), SVC(gamma='auto', verbose=True, probability=True))
clf.fit(balanced_train, balanced_train_label)

from sklearn import metrics

test_pred = clf.predict_proba(test_df)
 

# metrics.f1_score(test_pred, balanced_train_label)
metrics.roc_auc_score(test_label_df, test_pred[:, 1], average=None)


Positive sample counts in the training set: 909
Negative sample counts in the training set: 909
[LibSVM]

0.6284599388430963

In [None]:
# train 10 models
from sklearn.model_selection import cross_val_score

MODEL_CNT = int(len(train_0)/len(train_1))
sample_cnt_per_portion = int(len(train_0)/MODEL_CNT)

positive_count = len(train_1)
negative_count = len(train_1)

print("Model count: ", MODEL_CNT)
print("sample_cnt_per_portion:", sample_cnt_per_portion)
trained_model_list = []
scores_list = []
for i in range(MODEL_CNT):
    print(f"Training # {i + 1} model...")
    start_row = i * sample_cnt_per_portion
    end_row = start_row + sample_cnt_per_portion
    print(start_row, end_row)
    balanced_train = np.concatenate((train_1, train_0[start_row:end_row]), axis=0)
    balanced_train_label = np.concatenate(([1] * positive_count, [0] * negative_count), axis=0)


    
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto',
                                              kernel='rbf',
                                              verbose=True, probability=True))
    # clf.fit(balanced_train, balanced_train_label)

    scores = cross_val_score(clf, balanced_train, balanced_train_label, cv=10)

    print("\n%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

    trained_model_list.append(clf)
    scores_list.append(scores)

    # test_pred = clf.predict_proba(test_df)
    # auc_score = metrics.roc_auc_score(test_label_df, test_pred[:, 1], average=None)
    # print("auc_score:", auc_score, '\n')

print("Evaluating...")

test_pred_list = []
for clf in trained_model_list:
    test_pred = clf.predict_proba(test_df)
    test_pred_list.append(test_pred)

test_pred_all = np.array(test_pred_list)


auc_score = metrics.roc_auc_score(test_label_df, test_pred_all[:, :, 1].mean(axis=0), average=None)
print("auc_score:", auc_score)

In [None]:
test_pred_list = []
for clf in trained_model_list:
    test_pred = clf.predict_proba(test_df)
    test_pred_list.append(test_pred)

test_pred_np = np.array(test_pred_list)
test_pred_np

In [None]:
test_pred_np = np.array(test_pred_list)
test_pred_np.shape

(10, 10000, 2)

In [None]:
test_pred_np[:, :, 1].mean(axis=0).shape

(10000,)

In [None]:
auc_score = metrics.roc_auc_score(test_label_df, test_pred_np[:, :, 1].mean(axis=0), average=None)
print("auc_score:", auc_score)

auc_score: 0.6503078942286684


In [None]:
import  joblib
% cd /content/drive/MyDrive/USC_courses/CSCE822/HW2


/content/drive/MyDrive/USC_courses/CSCE822/HW2


In [None]:
joblib.dump(clf, 'SVM_oversampling.joblib') 

['SVM_oversampling.joblib']

# DNN for thermal regression

## Load packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline



## Load data

In [None]:
data_csv = r'https://raw.githubusercontent.com/gladcolor/SVM_DNN_testing/master/themal_dataset.csv'

data_df = pd.read_csv(data_csv)
features_df = data_df.iloc[:, 1:21]
y_label = data_df.iloc[:, 22]
y_label = np.array(y_label)
y_label

## Standardize features

In [None]:
from sklearn import preprocessing
input_features = preprocessing.StandardScaler().fit_transform(features_df)

print("Feature shape:", input_features.shape)


Feature shape: (370, 20)


## create k-fold dataset



In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


class FeatureDataset(Dataset):
    def __init__(self, features_np, labels_np):
        features_np = preprocessing.StandardScaler().fit_transform(features_np)
        self.features = torch.from_numpy(features_np)
        self.labels = torch.from_numpy(labels_np)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        features = self.features[index, 1:21]        
        label = self.labels[index]
        
        return features, label

feature_dataset = FeatureDataset(features_np=input_features, labels_np=y_label)

train_dataloader = DataLoader(feature_dataset, batch_size=1, shuffle=True)

# list(train_dataloader)

In [None]:
from sklearn.model_selection import KFold

fold_k = 10
k_fold_spliter = KFold(n_splits=fold_k, random_state=None, shuffle=True)

def train_a_model():

    pass

for  idx, (train_index, test_index) in enumerate(k_fold_spliter.split(input_features)):
    # print(idx,  train_index, test_index)
    print(f"Processing {idx} fold.")
    # print(input_features[train_index].shape)
    # print(y_label[test_index].shape)
    feature_dataset = FeatureDataset(features_np=input_features[train_index], labels_np=y_label[train_index])

    train_dataloader = DataLoader(feature_dataset, batch_size=1, shuffle=True)

    print(list(train_dataloader))


# New Section

## Create neural network

In [None]:
x = torch.tensor(input_features, dtype = float).to('cuda')
y = torch.tensor(y_label, dtype = float).to('cuda')


In [None]:
## 权重参数初始化 ([348,14] [14, 128] [128, 1])
# 将当前输入的特征 (这里有14个特征) 转换为隐藏的特征 (这里设计为 128 个神经元来表示)
weights = torch.randn((20, 128), dtype = float, requires_grad = True) 
# 偏置参数的 shape 与结果一致 (上面输出 128 个隐藏的特征), 故这里设置为 128, 即对这 128 个隐藏的特征都进行微调
biases = torch.randn(128, dtype = float, requires_grad = True) 
# 因为我们做的是回归任务, 需要得到一个实际的值, 即将这 128 个特征转换为一个值
weights2 = torch.randn((128, 1), dtype = float, requires_grad = True) 
# 同上, 取 1
biases2 = torch.randn(1, dtype = float, requires_grad = True) 

learning_rate = 0.0001 
losses = []

for i in range(100000):
    # 计算隐层
    hidden = x.mm(weights) + biases
    # 加入激活函数
    hidden = torch.relu(hidden)
    # 得到预测结果
    predictions = hidden.mm(weights2) + biases2
    # 计算损失值 (均方误差)
    loss = torch.mean((predictions - y) ** 2) 
    losses.append(loss.data.numpy())

    # 打印损失值
    if i % 1000 == 0:
        print('loss:', loss)
    #返向传播计算
    loss.backward()

    # 更新参数 (梯度下降)
    weights.data
    weights.data.add_(- learning_rate * weights.grad.data)  
    biases.data.add_(- learning_rate * biases.grad.data)
    weights2.data.add_(- learning_rate * weights2.grad.data)
    biases2.data.add_(- learning_rate * biases2.grad.data)

    # 每次迭代都得记得将梯度清空, 防止累加
    weights.grad.data.zero_()
    biases.grad.data.zero_()
    weights2.grad.data.zero_()
    biases2.grad.data.zero_()

loss: tensor(5161.3208, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2414.9358, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2406.8189, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2403.7416, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2402.1950, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2401.3247, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2400.7961, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2400.4360, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2400.1695, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2399.9628, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2399.7967, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2399.6643, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2399.5532, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2399.4600, dtype=torch.float64, grad_fn=<MeanBackward0>)
loss: tensor(2399.38

KeyboardInterrupt: ignored

In [None]:
input_size = input_features.shape[1] # 20
hidden_size = 256
output_size = 1
batch_size = 8
my_nn = torch.nn.Sequential(
    # 第一层: 全连接层
    torch.nn.Linear(input_size, hidden_size),
    torch.nn.Linear(hidden_size, hidden_size),
    # 激活函数
    torch.nn.ReLU(),
    # 第二层: 全连接层
    torch.nn.Linear(hidden_size, output_size),
)
# 损失函数: 均方误差
cost = torch.nn.MSELoss(reduction='mean')
# 优化器 (动态调整学习率)
optimizer = torch.optim.Adam(my_nn.parameters(), lr = 0.001)


# 训练网络
losses = []
for i in range(10000):
    batch_loss = []
    # MINI-Batch 方法来进行训练
    for start in range(0, len(input_features), batch_size):
        end = start + batch_size if start + batch_size < len(input_features) else len(input_features)
        # 一个 batch 输入数据
        xx = torch.tensor(input_features[start:end], dtype = torch.float, requires_grad = True)
        # 一个 batch 的期望值
        yy = torch.tensor(y[start:end], dtype = torch.float, requires_grad = True)
        # 前向传播
        prediction = my_nn(xx).to('cuda')
        # 计算损失值
        loss = cost(prediction, yy)
        # 优化并对梯度做清零
        optimizer.zero_grad()
        # 反向传播
        loss.backward(retain_graph=True)
        # 更新参数
        optimizer.step()
        batch_loss.append(loss.cpu().data.numpy())

    # 打印损失
    if i % 500==0:
        losses.append(np.mean(batch_loss))
        print(i, np.mean(batch_loss))

0 2868.5024
500 1440.0936
1000 1292.2098
1500 1255.4539
2000 1234.3962
2500 1242.9777
3000 1239.6791
3500 1226.8606
4000 1236.2551
4500 1225.0917
5000 1226.589
5500 1224.7754
6000 1230.9584
6500 1222.3491
7000 1225.3822
7500 1226.4371
8000 1243.1149
8500 1227.5492
9000 1222.9178
9500 1224.8018


In [None]:
balanced_train_label.shape

(1818,)

In [None]:
## Standardize data
scaler = preprocessing.StandardScaler()

scaler.fit(train_df) 
train_np = scaler.transform(train_df)

scaler.fit(test_df) 
test_np = scaler.transform(test_df)

print("Train data after stardardizing: ")
train_np

Train data after stardardizing: 


array([[ 0.95865399, -0.07246637, -0.56780365, ...,  0.72306558,
        -0.18483915, -0.66374284],
       [-1.04312861,  0.35965273, -0.56780598, ...,  1.05977189,
        -0.86638465, -0.27401319],
       [ 0.95865399,  0.35965273, -0.56780598, ...,  0.38635927,
         0.48066999, -0.72105603],
       ...,
       [-1.04312861,  0.79177183, -0.56780598, ...,  1.28424277,
         0.43256089, -0.6178923 ],
       [ 0.95865399, -0.28852593, -0.56780598, ..., -0.96046598,
         0.40850635, -0.28547583],
       [ 0.95865399, -0.07246637,  1.76117084, ..., -0.28705335,
        -0.4334028 ,  0.17302965]])

n

In [None]:
from sklearn.svm import SVC

clf = make_pipeline(StandardScaler(), SVC(gamma='auto', verbose=True))
clf.fit(balanced_train, balanced_train_label)

[LibSVM]

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=True))],
         verbose=False)

0.1716793125514215

(1818,)

[LibSVM]

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=True))],
         verbose=False)

In [None]:
test_pred.shape

(10000,)

In [None]:
(np.isnan(train_df)).sum()

0

0.2203597710547833

In [None]:
test_np = np.array(test_df)
test_np

train_np = np.array(train_df)
train_np

train_label_np = np.array(train_label_df)
train_label_np

test_label_np = np.array(test_label_df)
test_label_np

## Types of data columns

In [None]:
print("Column data types: \n")
original_data.dtypes

Column data types: 



Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

## Unique values of nominal columns

In [None]:
print_str_unique(original_data)

Column        Suburb has   314 unique values.
Column       Address has 13378 unique values.
Column          Type has     3 unique values.
Column        Method has     5 unique values.
Column       SellerG has   268 unique values.
Column          Date has    58 unique values.
Column   CouncilArea has    34 unique values.
Column    Regionname has     8 unique values.


## Counts of missing values

In [None]:
print("Before imputing:")
count_column_nan(original_data)

Before imputing:
Column        Suburb has    0  (0.0%) nan values.
Column       Address has    0  (0.0%) nan values.
Column         Rooms has    0  (0.0%) nan values.
Column          Type has    0  (0.0%) nan values.
Column         Price has    0  (0.0%) nan values.
Column        Method has    0  (0.0%) nan values.
Column       SellerG has    0  (0.0%) nan values.
Column          Date has    0  (0.0%) nan values.
Column      Distance has    0  (0.0%) nan values.
Column      Postcode has    0  (0.0%) nan values.
Column      Bedroom2 has    0  (0.0%) nan values.
Column      Bathroom has    0  (0.0%) nan values.
Column           Car has   62  (0.5%) nan values.
Column      Landsize has    0  (0.0%) nan values.
Column  BuildingArea has 6450 (47.5%) nan values.
Column     YearBuilt has 5375 (39.6%) nan values.
Column   CouncilArea has 1369 (10.1%) nan values.
Column     Lattitude has    0  (0.0%) nan values.
Column    Longtitude has    0  (0.0%) nan values.
Column    Regionname has    0  (0

# Imputate missing values

In [None]:
IMPUTE_STRETEGY = ['most_frequent', 'mean', 'median']


imputed_df = impute_df(df=original_data, strategy='most_frequent')
print("After imputing:")
count_column_nan(imputed_df)

After imputing:
Column        Suburb has    0  (0.0%) nan values.
Column       Address has    0  (0.0%) nan values.
Column         Rooms has    0  (0.0%) nan values.
Column          Type has    0  (0.0%) nan values.
Column         Price has    0  (0.0%) nan values.
Column        Method has    0  (0.0%) nan values.
Column       SellerG has    0  (0.0%) nan values.
Column          Date has    0  (0.0%) nan values.
Column      Distance has    0  (0.0%) nan values.
Column      Postcode has    0  (0.0%) nan values.
Column      Bedroom2 has    0  (0.0%) nan values.
Column      Bathroom has    0  (0.0%) nan values.
Column           Car has    0  (0.0%) nan values.
Column      Landsize has    0  (0.0%) nan values.
Column  BuildingArea has    0  (0.0%) nan values.
Column     YearBuilt has    0  (0.0%) nan values.
Column   CouncilArea has    0  (0.0%) nan values.
Column     Lattitude has    0  (0.0%) nan values.
Column    Longtitude has    0  (0.0%) nan values.
Column    Regionname has    0  (0.

# Generate price classes

In [None]:
print("Price column description: \n")
imputed_df['Price'].describe()

Price column description: 



count    1.358000e+04
mean     1.075684e+06
std      6.393107e+05
min      8.500000e+04
25%      6.500000e+05
50%      9.030000e+05
75%      1.330000e+06
max      9.000000e+06
Name: Price, dtype: float64

In [None]:
imputed_df = assign_price_class(imputed_df)
gb = imputed_df.groupby('Price_class')['Price_class'].count().to_frame()
gb

Unnamed: 0_level_0,Price_class
Price_class,Unnamed: 1_level_1
0,2716
1,2716
2,2716
3,2716
4,2716


# Encode nominal columns (i.e., features)

## encode dates

In [None]:
from datetime import datetime


imputed_df = encode_dates(imputed_df)
imputed_df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Price_class,delta_days
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,2.5,3067.0,2.0,1.0,1.0,202.0,120.0,1970.0,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0,0,16872
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0,0,16893
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0,0,17259
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,2.5,3067.0,3.0,2.0,1.0,94.0,120.0,1970.0,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0,0,17259
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0,0,16897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,16.7,3150.0,4.0,2.0,2.0,652.0,120.0,1981.0,Moreland,-37.90562,145.16761,South-Eastern Metropolitan,7392.0,4,17404
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,Moreland,-37.85927,144.87904,Western Metropolitan,6380.0,4,17404
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,6.8,3016.0,3.0,2.0,4.0,436.0,120.0,1997.0,Moreland,-37.85274,144.88738,Western Metropolitan,6380.0,4,17404
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,Moreland,-37.85908,144.89299,Western Metropolitan,6380.0,4,17404


Print the unique value counts for each column.

No need to encode addresses.

In [None]:
print("Unique value count for each nominal column: \n")
print_str_unique(original_data)

Unique value count for each nominal column: 

Column        Suburb has   314 unique values.
Column       Address has 13378 unique values.
Column          Type has     3 unique values.
Column        Method has     5 unique values.
Column       SellerG has   268 unique values.
Column          Date has    58 unique values.
Column   CouncilArea has    34 unique values.
Column    Regionname has     8 unique values.


## encode nominal values

In [None]:
import category_encoders as ce

ENCODING_COLUMNS = ['CouncilArea', 'Regionname', 'Type', 'Method']

DROPPED_COLUMNS = ['Suburb', 'SellerG', 'Address', 'Date', 'Price']

# DROPPED_COLUMNS = ['Suburb', 'SellerG', 'Address', 'delta_days']


encoded_df = encoder_nominals(imputed_df, encode_method='target')#.drop(columns=['Price'])

encoded_df

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Rooms,Type,Method,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Price_class,delta_days
0,2,2.097682,1.993017,2.5,3067.0,2.0,1.0,1.0,202.0,120.0,1970.0,1.275116,-37.79960,144.99840,1.913882,4019.0,0,16872
1,2,2.097682,1.993017,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,1.275116,-37.80790,144.99340,1.913882,4019.0,0,16893
2,3,2.097682,2.089254,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,1.275116,-37.80930,144.99440,1.913882,4019.0,0,17259
3,3,2.097682,1.890026,2.5,3067.0,3.0,2.0,1.0,94.0,120.0,1970.0,1.275116,-37.79690,144.99690,1.913882,4019.0,0,17259
4,4,2.097682,2.010842,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,1.275116,-37.80720,144.99410,1.913882,4019.0,0,16897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,2.097682,1.993017,16.7,3150.0,4.0,2.0,2.0,652.0,120.0,1981.0,2.810427,-37.90562,145.16761,3.397778,7392.0,4,17404
13576,3,2.097682,2.089254,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,2.810427,-37.85927,144.87904,2.083786,6380.0,4,17404
13577,3,2.097682,1.993017,6.8,3016.0,3.0,2.0,4.0,436.0,120.0,1997.0,2.810427,-37.85274,144.88738,2.083786,6380.0,4,17404
13578,4,2.097682,1.890026,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,2.810427,-37.85908,144.89299,2.083786,6380.0,4,17404


# Split train/test set.

In [None]:

encoded_df = standardize_data(encoded_df, class_col='Price_class')

xTrain, yTrain, xVal, yVal, xTest, yTest = split_data(encoded_df)

print(f'Sample counts: xTrain: {len(xTrain)}, xVal: {len(xVal)}, x_test: {len(xTest)}')

Sample counts: xTrain: 10185, xVal: 1358, x_test: 2037


# K nearest neighbors

In [None]:
xTrain

Unnamed: 0,Rooms,Type,Method,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,delta_days
664,0.064876,0.659632,-0.087583,-0.159798,-0.014358,0.088284,0.673367,0.403998,-0.047717,0.102509,1.451197,-0.879128,0.310418,0.945839,-0.804755,0.080984,-1.344403
3270,-0.981463,0.659632,-0.087583,0.061723,-0.268015,-0.947035,-0.772376,0.403998,0.006912,-0.144742,-0.405266,0.148444,0.828984,0.513742,1.283410,-1.029462,-1.311000
3873,-0.981463,0.659632,-0.087583,0.181004,0.437813,-0.947035,-0.772376,-0.636847,-0.052729,-0.042783,0.110418,-0.295337,-0.731763,0.461775,-0.804755,0.307550,-0.832225
13170,0.064876,0.659632,-0.087583,1.612373,-0.323158,0.088284,-0.772376,-0.636847,-0.009376,-0.042783,0.110418,1.230403,2.153285,0.544441,-0.217939,0.792885,1.377935
1730,1.111216,0.659632,-0.087583,0.215084,0.636327,0.088284,0.673367,0.403998,0.032222,0.255448,0.557344,-1.360186,-1.058548,0.507005,-0.804755,0.083953,-0.565001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,0.064876,0.659632,1.119472,-0.841402,-0.543729,0.088284,-0.772376,0.403998,-0.086810,-0.042783,0.110418,1.230403,0.406939,-0.359595,-0.217939,1.019450,1.377935
3264,0.064876,0.659632,-0.087583,0.061723,-0.268015,0.088284,-0.772376,-0.636847,0.047509,-0.091213,-0.577161,0.148444,0.852957,0.508930,1.283410,-1.029462,0.208833
9845,1.111216,0.659632,-1.379354,-0.585801,-0.521672,1.123604,0.673367,0.403998,-0.029424,0.301330,1.210544,1.230403,0.927146,-0.218033,-0.217939,0.856378,0.871324
10799,0.064876,0.659632,-0.087583,0.317325,-0.356244,0.088284,-0.772376,-0.636847,0.011924,-0.042783,0.110418,-0.407697,1.118297,0.297693,-0.217939,3.242170,1.116279


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

k_range = range(3, 11)
scores_dict = {}
scores_list = []



for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)

    knn.fit(xTrain, yTrain)
    y_pred = knn.predict(xTest)
    score = metrics.accuracy_score(yTest, y_pred)
    scores_dict[f'k={k}'] = round(score, 5)
    scores_list.append(score)

scores_dict

{'k=10': 0.59548,
 'k=3': 0.58714,
 'k=4': 0.59352,
 'k=5': 0.60677,
 'k=6': 0.60088,
 'k=7': 0.59647,
 'k=8': 0.58567,
 'k=9': 0.59352}

# Evaluation

In [None]:
DROPPED_COLUMNS = ['SellerG',   'Address', 'Date', 'Price'] # , 'BuildingArea', 'YearBuilt'
ENCODING_COLUMNS = ['Suburb', 'Method', 'CouncilArea', 'Regionname', 'Type']

pd.options.display.float_format = '{:,.4f}'.format


IMPUTE_STRETEGY = ['mean', 'most_frequent',  'median']
k_range = list(range(3, 11)) + [20]

data_csv = r'https://github.com/gladcolor/Housing_RandomForest/raw/master/melb_data.csv'
original_data = pd.read_csv(data_csv).drop(columns=DROPPED_COLUMNS)



results_df = pd.DataFrame(columns=['Impute_strategy', 'Nominal_encoding', 
                                   'Radom_forest_50', 
                                   'Radom_forest_100', 
                                   'Radom_forest_200'] + 
                                   [f'KNN_{k}' for k in k_range]
                                   )

ENCODING_METHODS_DICT = {'one_hot': ce.OneHotEncoder,
                         'hasing':ce.HashingEncoder, 
                         'leave_one_out':ce.LeaveOneOutEncoder,  # Accuracy 1.0. Need to dig it.
                         'binary':ce.BinaryEncoder, 
                         'target':ce.TargetEncoder,
                        #  'baseN':ce.BaseNEncoder, # the same results as one-hot and binary when base = 1 or 2.
                         
                         }

def get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=50):
    clf = RandomForestClassifier(n_estimators=n_estimators)
    clf.fit(xTrain, yTrain)
    yPred = clf.predict(xTest)
    accuracy = metrics.accuracy_score(yTest, yPred)
    return accuracy


for impute_strategy in IMPUTE_STRETEGY:
    imputed_df = impute_df(df=original_data, strategy=impute_strategy)
    imputed_df = assign_price_class(imputed_df)
    
    for ce_encoder_name in ENCODING_METHODS_DICT.keys():   

        encoded_df = encoder_nominals(imputed_df, encode_method=ce_encoder_name)

        encoded_df = standardize_data(encoded_df, class_col='Price_class')
 
        xTrain, yTrain, xVal, yVal, xTest, yTest = split_data(encoded_df)
 
        current_row = len(results_df)

        # conduct Random forest        
        RF_score_50 = get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=50) 
        RF_score_100 = get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=100) 
        RF_score_200 = get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=200) 
        print(f"Computed Random Forest, {impute_strategy}, {ce_encoder_name}, score: {RF_score_50:.5f}, {RF_score_100:.5f}, {RF_score_200:.5f}")

        # conduct KNN     
        for k in k_range:
            knn = KNeighborsClassifier(n_neighbors = k)
            knn.fit(xTrain, yTrain)
            y_pred = knn.predict(xTest)
            score = metrics.accuracy_score(yTest, y_pred)
            
            # record the accuracy
            column_name = f'KNN_{k}'
            print(f"Computed  {column_name}, {impute_strategy}, {ce_encoder_name}, score: {score:.5f}")
            

            results_df.loc[current_row, column_name] = round(score, 5)
            results_df.loc[current_row, 'Nominal_encoding'] = ce_encoder_name
            results_df.loc[current_row, 'Impute_strategy'] = impute_strategy
            results_df.loc[current_row, 'Radom_forest_50'] = round(RF_score_50, 5)
            results_df.loc[current_row, 'Radom_forest_100'] = round(RF_score_100, 5)
            results_df.loc[current_row, 'Radom_forest_200'] = round(RF_score_200, 5)
            # print(results_df.head(10))

results_df.to_csv("results.csv")
results_df

  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, one_hot, score: 0.71870, 0.71625, 0.71674
Computed  KNN_3, mean, one_hot, score: 0.65734
Computed  KNN_4, mean, one_hot, score: 0.67108
Computed  KNN_5, mean, one_hot, score: 0.67108
Computed  KNN_6, mean, one_hot, score: 0.66814
Computed  KNN_7, mean, one_hot, score: 0.66618
Computed  KNN_8, mean, one_hot, score: 0.67305
Computed  KNN_9, mean, one_hot, score: 0.66961
Computed  KNN_10, mean, one_hot, score: 0.67207
Computed  KNN_20, mean, one_hot, score: 0.64948


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, hasing, score: 0.73834, 0.73883, 0.73490
Computed  KNN_3, mean, hasing, score: 0.56063
Computed  KNN_4, mean, hasing, score: 0.55572
Computed  KNN_5, mean, hasing, score: 0.57339
Computed  KNN_6, mean, hasing, score: 0.57094
Computed  KNN_7, mean, hasing, score: 0.57732
Computed  KNN_8, mean, hasing, score: 0.57634
Computed  KNN_9, mean, hasing, score: 0.57388
Computed  KNN_10, mean, hasing, score: 0.57192
Computed  KNN_20, mean, hasing, score: 0.57192


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, leave_one_out, score: 0.99951, 1.00000, 0.99951
Computed  KNN_3, mean, leave_one_out, score: 0.59254
Computed  KNN_4, mean, leave_one_out, score: 0.60285
Computed  KNN_5, mean, leave_one_out, score: 0.61463
Computed  KNN_6, mean, leave_one_out, score: 0.60432
Computed  KNN_7, mean, leave_one_out, score: 0.61267
Computed  KNN_8, mean, leave_one_out, score: 0.60874
Computed  KNN_9, mean, leave_one_out, score: 0.61217
Computed  KNN_10, mean, leave_one_out, score: 0.60776
Computed  KNN_20, mean, leave_one_out, score: 0.59254


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, binary, score: 0.74521, 0.75012, 0.74865
Computed  KNN_3, mean, binary, score: 0.63770
Computed  KNN_4, mean, binary, score: 0.64359
Computed  KNN_5, mean, binary, score: 0.64507
Computed  KNN_6, mean, binary, score: 0.64899
Computed  KNN_7, mean, binary, score: 0.65390
Computed  KNN_8, mean, binary, score: 0.66323
Computed  KNN_9, mean, binary, score: 0.65636
Computed  KNN_10, mean, binary, score: 0.65930
Computed  KNN_20, mean, binary, score: 0.65488


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, target, score: 0.75503, 0.75749, 0.75945
Computed  KNN_3, mean, target, score: 0.59352
Computed  KNN_4, mean, target, score: 0.60628
Computed  KNN_5, mean, target, score: 0.61561
Computed  KNN_6, mean, target, score: 0.60579
Computed  KNN_7, mean, target, score: 0.61954
Computed  KNN_8, mean, target, score: 0.61267
Computed  KNN_9, mean, target, score: 0.61512
Computed  KNN_10, mean, target, score: 0.61168
Computed  KNN_20, mean, target, score: 0.59352


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, one_hot, score: 0.71379, 0.71625, 0.71870
Computed  KNN_3, most_frequent, one_hot, score: 0.66176
Computed  KNN_4, most_frequent, one_hot, score: 0.67354
Computed  KNN_5, most_frequent, one_hot, score: 0.67501
Computed  KNN_6, most_frequent, one_hot, score: 0.66716
Computed  KNN_7, most_frequent, one_hot, score: 0.66618
Computed  KNN_8, most_frequent, one_hot, score: 0.67108
Computed  KNN_9, most_frequent, one_hot, score: 0.66618
Computed  KNN_10, most_frequent, one_hot, score: 0.67108
Computed  KNN_20, most_frequent, one_hot, score: 0.64850


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, hasing, score: 0.74325, 0.74521, 0.73981
Computed  KNN_3, most_frequent, hasing, score: 0.55817
Computed  KNN_4, most_frequent, hasing, score: 0.55817
Computed  KNN_5, most_frequent, hasing, score: 0.57732
Computed  KNN_6, most_frequent, hasing, score: 0.56848
Computed  KNN_7, most_frequent, hasing, score: 0.56996
Computed  KNN_8, most_frequent, hasing, score: 0.57192
Computed  KNN_9, most_frequent, hasing, score: 0.57339
Computed  KNN_10, most_frequent, hasing, score: 0.56652
Computed  KNN_20, most_frequent, hasing, score: 0.57094


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, leave_one_out, score: 0.99951, 0.99951, 0.99951
Computed  KNN_3, most_frequent, leave_one_out, score: 0.59499
Computed  KNN_4, most_frequent, leave_one_out, score: 0.60530
Computed  KNN_5, most_frequent, leave_one_out, score: 0.61954
Computed  KNN_6, most_frequent, leave_one_out, score: 0.61463
Computed  KNN_7, most_frequent, leave_one_out, score: 0.61267
Computed  KNN_8, most_frequent, leave_one_out, score: 0.60825
Computed  KNN_9, most_frequent, leave_one_out, score: 0.61119
Computed  KNN_10, most_frequent, leave_one_out, score: 0.60383
Computed  KNN_20, most_frequent, leave_one_out, score: 0.59107


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, binary, score: 0.74227, 0.74472, 0.75061
Computed  KNN_3, most_frequent, binary, score: 0.64016
Computed  KNN_4, most_frequent, binary, score: 0.64752
Computed  KNN_5, most_frequent, binary, score: 0.64850
Computed  KNN_6, most_frequent, binary, score: 0.64850
Computed  KNN_7, most_frequent, binary, score: 0.65439
Computed  KNN_8, most_frequent, binary, score: 0.66078
Computed  KNN_9, most_frequent, binary, score: 0.65390
Computed  KNN_10, most_frequent, binary, score: 0.66421
Computed  KNN_20, most_frequent, binary, score: 0.65439


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, target, score: 0.75503, 0.76092, 0.76043
Computed  KNN_3, most_frequent, target, score: 0.59745
Computed  KNN_4, most_frequent, target, score: 0.60481
Computed  KNN_5, most_frequent, target, score: 0.61905
Computed  KNN_6, most_frequent, target, score: 0.61561
Computed  KNN_7, most_frequent, target, score: 0.61561
Computed  KNN_8, most_frequent, target, score: 0.61365
Computed  KNN_9, most_frequent, target, score: 0.61807
Computed  KNN_10, most_frequent, target, score: 0.60923
Computed  KNN_20, most_frequent, target, score: 0.59450


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, one_hot, score: 0.71821, 0.71969, 0.72214
Computed  KNN_3, median, one_hot, score: 0.66127
Computed  KNN_4, median, one_hot, score: 0.67354
Computed  KNN_5, median, one_hot, score: 0.67403
Computed  KNN_6, median, one_hot, score: 0.66716
Computed  KNN_7, median, one_hot, score: 0.66568
Computed  KNN_8, median, one_hot, score: 0.67108
Computed  KNN_9, median, one_hot, score: 0.66618
Computed  KNN_10, median, one_hot, score: 0.67108
Computed  KNN_20, median, one_hot, score: 0.64850


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, hasing, score: 0.73490, 0.74718, 0.73883
Computed  KNN_3, median, hasing, score: 0.55817
Computed  KNN_4, median, hasing, score: 0.55817
Computed  KNN_5, median, hasing, score: 0.57634
Computed  KNN_6, median, hasing, score: 0.56750
Computed  KNN_7, median, hasing, score: 0.56996
Computed  KNN_8, median, hasing, score: 0.57192
Computed  KNN_9, median, hasing, score: 0.57290
Computed  KNN_10, median, hasing, score: 0.56652
Computed  KNN_20, median, hasing, score: 0.57094


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, leave_one_out, score: 1.00000, 1.00000, 0.99951
Computed  KNN_3, median, leave_one_out, score: 0.59597
Computed  KNN_4, median, leave_one_out, score: 0.60530
Computed  KNN_5, median, leave_one_out, score: 0.61905
Computed  KNN_6, median, leave_one_out, score: 0.61512
Computed  KNN_7, median, leave_one_out, score: 0.61119
Computed  KNN_8, median, leave_one_out, score: 0.60972
Computed  KNN_9, median, leave_one_out, score: 0.61168
Computed  KNN_10, median, leave_one_out, score: 0.60383
Computed  KNN_20, median, leave_one_out, score: 0.59107


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, binary, score: 0.74423, 0.75061, 0.74963
Computed  KNN_3, median, binary, score: 0.63967
Computed  KNN_4, median, binary, score: 0.64703
Computed  KNN_5, median, binary, score: 0.64752
Computed  KNN_6, median, binary, score: 0.64850
Computed  KNN_7, median, binary, score: 0.65390
Computed  KNN_8, median, binary, score: 0.66078
Computed  KNN_9, median, binary, score: 0.65390
Computed  KNN_10, median, binary, score: 0.66421
Computed  KNN_20, median, binary, score: 0.65439


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, target, score: 0.75896, 0.75994, 0.75945
Computed  KNN_3, median, target, score: 0.59794
Computed  KNN_4, median, target, score: 0.60481
Computed  KNN_5, median, target, score: 0.61856
Computed  KNN_6, median, target, score: 0.61561
Computed  KNN_7, median, target, score: 0.61561
Computed  KNN_8, median, target, score: 0.61414
Computed  KNN_9, median, target, score: 0.61807
Computed  KNN_10, median, target, score: 0.60874
Computed  KNN_20, median, target, score: 0.59450


Unnamed: 0,Impute_strategy,Nominal_encoding,Radom_forest_50,Radom_forest_100,Radom_forest_200,KNN_3,KNN_4,KNN_5,KNN_6,KNN_7,KNN_8,KNN_9,KNN_10,KNN_20
0,mean,one_hot,0.7187,0.7163,0.7167,0.6573,0.6711,0.6711,0.6681,0.6662,0.6731,0.6696,0.6721,0.6495
1,mean,hasing,0.7383,0.7388,0.7349,0.5606,0.5557,0.5734,0.5709,0.5773,0.5763,0.5739,0.5719,0.5719
2,mean,leave_one_out,0.9995,1.0,0.9995,0.5925,0.6028,0.6146,0.6043,0.6127,0.6087,0.6122,0.6078,0.5925
3,mean,binary,0.7452,0.7501,0.7487,0.6377,0.6436,0.6451,0.649,0.6539,0.6632,0.6564,0.6593,0.6549
4,mean,target,0.755,0.7575,0.7594,0.5935,0.6063,0.6156,0.6058,0.6195,0.6127,0.6151,0.6117,0.5935
5,most_frequent,one_hot,0.7138,0.7163,0.7187,0.6618,0.6735,0.675,0.6672,0.6662,0.6711,0.6662,0.6711,0.6485
6,most_frequent,hasing,0.7432,0.7452,0.7398,0.5582,0.5582,0.5773,0.5685,0.57,0.5719,0.5734,0.5665,0.5709
7,most_frequent,leave_one_out,0.9995,0.9995,0.9995,0.595,0.6053,0.6195,0.6146,0.6127,0.6082,0.6112,0.6038,0.5911
8,most_frequent,binary,0.7423,0.7447,0.7506,0.6402,0.6475,0.6485,0.6485,0.6544,0.6608,0.6539,0.6642,0.6544
9,most_frequent,target,0.755,0.7609,0.7604,0.5975,0.6048,0.619,0.6156,0.6156,0.6137,0.6181,0.6092,0.5945


# Problem 1 validation

In [None]:
from sklearn import metrics
import numpy as np
 

classes = ['politics', 'business', 'tech', 'entertainment', 'sport']

y_true = [0] * 141 + [1] * 167 + [2] * 133 + [3] * 128 + [4] * 166


# row: actual, col: predict
confusion_matrix = np.array([[140, 1, 0, 0, 0],
                             [4, 160, 2, 0, 1],
                             [1, 3, 128, 0, 1],
                             [0, 0, 1, 127, 0],
                             [0, 1, 0, 0, 165],])

row_cnt, col_cnt = confusion_matrix.shape
y_true = []
y_pred = []
for row in range(row_cnt):
    y_true += [row] *  confusion_matrix[row, :].sum()
    for col in range(col_cnt):
        y_pred += [col] *  confusion_matrix[row, col] 

print("Confusion matix:")
print(metrics.confusion_matrix(y_true, y_pred))

print(metrics.classification_report(y_true, y_pred, digits=4))

Confusion matix:
[[140   1   0   0   0]
 [  4 160   2   0   1]
 [  1   3 128   0   1]
 [  0   0   1 127   0]
 [  0   1   0   0 165]]
              precision    recall  f1-score   support

           0     0.9655    0.9929    0.9790       141
           1     0.9697    0.9581    0.9639       167
           2     0.9771    0.9624    0.9697       133
           3     1.0000    0.9922    0.9961       128
           4     0.9880    0.9940    0.9910       166

    accuracy                         0.9796       735
   macro avg     0.9801    0.9799    0.9799       735
weighted avg     0.9797    0.9796    0.9796       735

