In [1]:
from xgb_results import xgb_results_regression
from split import split_data
from pprint import pprint
from utils import *
from get_data import get_uci_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
x,y, description = get_uci_dataset(2) #Dataset 1 has 2 colums we need to predict...
pprint(description)

{'categorical features': [],
 'dataset name': 'AIDS Clinical Trials Group Study 175',
 'number of features': 23,
 'number of test examples': 2139,
 'number of training examples': 2139,
 'variables information':        name     role        type         demographic  \
0    pidnum       ID     Integer                None   
1       cid   Target      Binary                None   
2      time  Feature     Integer                None   
3       trt  Feature     Integer                None   
4       age  Feature     Integer                 Age   
5      wtkg  Feature  Continuous                None   
6      hemo  Feature      Binary                None   
7      homo  Feature      Binary  Sexual Orientation   
8     drugs  Feature      Binary                None   
9    karnof  Feature     Integer                None   
10   oprior  Feature      Binary                None   
11      z30  Feature      Binary                None   
12   zprior  Feature      Binary                None   
13  p

In [3]:
x_train, x_test, y_train, y_test = split_data(x, y)
x_train, x_test, y_train, y_test = convert_to_numpy_dataset(x_train, x_test, y_train, y_test)

Number of features:  23
Number of training examples:  1711
Number of test examples:  428


In [4]:
print(y_test)

[0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1
 1 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 0
 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0
 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 1 1
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0
 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0
 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0
 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0]


In [5]:
results = xgb_results_regression(x_train, x_test, y_train, y_test)
pprint(results)

metrics = ['MSE', 'R^2 Score', 'Pearson', 'Spearman']

{'Testing Metrics': {'MSE': 0.09032473560251866,
                     'Pearson': 0.7116221023900426,
                     'R^2 Score': 0.49901455274133955,
                     'Spearman': 0.607362871658546},
 'Training Metrics': {'MSE': 0.0004781508511815307,
                      'Pearson': 0.9988596827205314,
                      'R^2 Score': 0.9974183972225262,
                      'Spearman': 0.7454155672704781}}


In [6]:
guesses = get_baseline_guesses(np.concatenate((y_train, y_test)))
baseline_results = get_baseline_results(y_test, guesses)
pprint(baseline_results)

{'mean': {'MSE': 0.18035174548510868, 'R^2': -0.00031956111497111195},
 'median': {'MSE': 0.23598130841121495, 'R^2': -0.308868501529052}}


In [7]:
# sample rates are from 0 to 1 in increments of 0.01
sample_rates = np.arange(0.01, 1.01, 0.01)
# create a dictionary for each metric
mse_dict = {}
r2_dict = {}
pearson_dict = {}
spearman_dict = {}

for index, sample_rate in enumerate(sample_rates):
    if index % 10 == 0:
        print('Sample Rate: ', sample_rate)
    for i in range (10):
        x_train_sample, y_train_sample = sample_data(x_train, y_train, sample_rate, seed = i)
        results = xgb_results_regression(x_train_sample, x_test, y_train_sample, y_test)
        test_results = results['Testing Metrics']
        if sample_rate not in mse_dict:
            mse_dict[sample_rate] = [test_results['MSE']]
        else:
            mse_dict[sample_rate].append(test_results['MSE'])

    mse_dict[sample_rate] = np.mean(mse_dict[sample_rate])



# graph mse results
plt.plot(list(mse_dict.keys()), list(mse_dict.values()))
plt.xlabel('Sample Rate')
plt.ylabel('MSE')
plt.title('MSE vs Sample Rate')
plt.show()



Sample Rate:  0.01
Sample Rate:  0.11
Sample Rate:  0.21000000000000002
Sample Rate:  0.31
Sample Rate:  0.41000000000000003
Sample Rate:  0.51
Sample Rate:  0.61
Sample Rate:  0.7100000000000001
Sample Rate:  0.81
