In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score

In [3]:
####IMPLEMENT LDA: Linear Discriminant Analysis####

In [4]:
#data cleaning code
#Read the data from the csv file
data_df = pd.read_csv('Breast_Cancer.csv')

data = data_df.to_numpy()
#Determine how many identifiable characteristics there are for each patient
print('Each row, aka each patient, has a total of {} characteristics'.format(len(data[0])))
print('There are {} total records of patients'.format(data.shape[0]))

######visualize the entire dataset####
# data_df
print('length of one column (ensuring its 16): {}'.format(len(data[0])))

print('Each patient has {} characteristics'.format(data.shape[1]))
cols = ['age', 'race', 'marital status', 'T Stage', 'N Stage', '6th Stage', 
        'differentiation', 'grade', 'A Stage', 'Tumor Size', 'Estrogen Status', 
        'Progestrone Status', 'Regional Node Examined', 'Regional Node Posititive', 
        'Survival Months', 'Status' ]
assert(len(cols) == 16)
#there are 11 columns with strng values

#GOAL : map these string values to integers; if three options, map to {1, 2, 3}
#option: create dictionaries for each string column and list out their categories
race = {'White': 1, 'Black': 2, 'Other': 3}
mS = {'Married': 1, 'Single': 2, 'Divorced': 3, 'Widowed': 4,'Seperated':5}

tStage = {'T1':1, 'T2':2, 'T3':3, 'T4':4}
nStage = {'N1': 1, 'N2': 2, 'N3': 3}

#FIVE VALUES for sixth stage
sixthStage = {'IIA':1, 'IIB':2, 'IIIA':3, 'IIIB':4, 'IIIC': 5}

grade = {'1': 1, '2': 2, '3': 3, 'anaplastic; Grade IV':4}

#four values for differentiated
differentiated = {'Undifferentiated':1, 'Poorly differentiated':2, 'Moderately differentiated':3,  'Well differentiated':4, }

#two values for A stage: reginal and distant
aStage = {'Regional': 1, 'Distant': 2}

#estrogen status and progestrogen status have two values: postive or negative
eStatus = {'Positive': 1, 'Negative': 2}
pStatus = {'Positive': 1, 'Negative': 2}

#status has two values; dead or alive
status = {'Dead':1, 'Alive':2}

###TEST THE MAPPING###

#create indices for each string column
race_index = 1
maritalStatus_index = 2

tStage_index = 3
nStage_index = 4

sixthStage_index = 5
differentiated_index = 6

aStage_index = 8
grade_index = 7

eStatus_index = 10
pStatus_index = 11
status_index = 15

#Test the before and after on one patient record
print('Before the mappinng, the third patient record is:\n {}\n'.format(data[2]))

#####use numpy vectorized mapping, applies to the entire dataset!!!!###
###ONLY one value is shown here to emphasize the change###

data[:, race_index] = np.vectorize(race.get)(data[:, race_index])

#this following for loop is used instead of vectors to fix the errors of None type being passed
#Basically there is an error when the numpy vectorization is used

for i in range(data.shape[0]):
    data[i, maritalStatus_index] = mS.get(data[i, maritalStatus_index], -1)

data[:, tStage_index] = np.vectorize(tStage.get)(data[:, tStage_index])

data[:, nStage_index] = np.vectorize(nStage.get)(data[:, nStage_index])

data[:, sixthStage_index] = np.vectorize(sixthStage.get)(data[:, sixthStage_index])

data[:, differentiated_index] = np.vectorize(differentiated.get)(data[:, differentiated_index])

data[:, aStage_index] = np.vectorize(aStage.get)(data[:, aStage_index])

for i in range(data.shape[0]):
    data[i, grade_index] = grade.get(data[i, grade_index], -1)

data[:, eStatus_index] = np.vectorize(eStatus.get)(data[:, eStatus_index])

data[:, pStatus_index] = np.vectorize(pStatus.get)(data[:, pStatus_index])

data[:, status_index] = np.vectorize(status.get)(data[:, status_index])

print('After the mapping, the third patient record is:\n {}\n'.format(data[2]))

####To test this cell you have to re-run all cells because the data array is overwritten!!!###



Each row, aka each patient, has a total of 16 characteristics
There are 4024 total records of patients
length of one column (ensuring its 16): 16
Each patient has 16 characteristics
Before the mappinng, the third patient record is:
 [58 'White' 'Divorced' 'T3' 'N3' 'IIIC' 'Moderately differentiated' '2'
 'Regional' 63 'Positive' 'Positive' 14 7 75 'Alive']

After the mapping, the third patient record is:
 [58 1 3 3 3 5 3 2 1 63 1 1 14 7 75 2]



In [5]:
#GOAL: Break the dataset into training and testing
N_total = data.shape[0]

N_training = int(0.8 * N_total)
print('There are {} records that are used for training'.format(N_training))
N_testing = N_total - N_training
print('There are {} records that are used for testing'.format(N_testing))

There are 3219 records that are used for training
There are 805 records that are used for testing


In [6]:


# Check for strings in the matrix
if np.any(np.vectorize(lambda x: isinstance(x, str))(data)):
    print('Matrix contains strings!')
else:
    print('Matrix does not contain strings!')


Matrix does not contain strings!


In [7]:
######ENSURE DATA IS NOT STRINGS####
if np.any([isinstance(x, str) for x in data]):
    print('Matrix contains strings!')
else: 
    print('Matrix does not contain strings!')

Matrix does not contain strings!


In [8]:
data_training = data[:N_training]
data_testing = data[N_training:]
print('There are {} total records of patients in the training set'.format(data_training.shape[0]))
print('There are {} total records of patients in the testing set'.format(data_testing.shape[0]))
print(len(data_training[0]))

There are 3219 total records of patients in the training set
There are 805 total records of patients in the testing set
16


In [9]:
#seperate the target variable from the dataset

#target variable is the value at index 14

#input matrix: X
#target vector: y

#####DEBUGGING######
# print(data_training[0][:14])
# print(data_training[0][15:])

# print('sum', np.concatenate((data_training[0][:14] ,data_training[0][15:])))

X_train = [np.concatenate((row[:14], row[15:])) for row in data_training]
X_train = np.array(X_train)
# print(X_train[0])

y_train = [row[14] for row in data_training]
y_train = np.array(y_train)

X_test = [np.concatenate((row[:14], row[15:])) for row in data_testing]
X_test = np.array(X_test)

y_test = [row[14] for row in data_testing]
y_test = np.array(y_test)

In [10]:
print('Length of X_train: {}'.format(X_train.shape))
print('Length of y_train: {}'.format(y_train.shape))
print('Length of X_test: {}'.format(X_test.shape))
print('Length of y_test: {}'.format(y_test.shape))

#the length should be 15, 16 - 1
assert(len(X_train[0]) == 15)


Length of X_train: (3219, 15)
Length of y_train: (3219,)
Length of X_test: (805, 15)
Length of y_test: (805,)


In [11]:
#TRAIN THE MODEL
rfr = RandomForestRegressor(n_estimators = 50, random_state=19)
rfr.fit(X_train, y_train)

In [12]:
#PREDICT
y_predict = rfr.predict(X_test)
print(y_predict.shape)
print('y test:', y_test.shape)

(805,)
y test: (805,)


In [13]:
#EVALUATE MODEL
# accuracy = accuracy_score(y_predict, y_test)
# print('The model accuracy for this data set is:\n'.format(accuracy))

# Evaluate The Model:

-RMSE allows us to find the 

-The accuracy is found by : 1 - RMSE

In [14]:
def RMSE(y_pred, y_t):
    # print('y_pred: {}'.format(y_pred))
    assert(len(y_pred) == len(y_t))
    
    return np.sqrt(np.mean((y_t - y_pred)**2))

In [15]:
score = RMSE(y_predict, y_test)
print('The RMSE score measures the deviations of the predictions from the true value: {}'.format(score))

The RMSE score measures the deviations of the predictions from the true value: 19.989414194588573


In [16]:
###NORMALIZE RMSE####
def normalize_rmse(rmse, ymin, ymax):
    return (rmse - ymin)/(ymax - ymin)

In [17]:
error = normalize_rmse(score, np.min(y_train), np.max(y_train))
print('The normalized RMSE value is: {}'.format(error))
accuracy = np.round(1 - error, 6)
print('The accuracy of this model is: {}'.format(accuracy))

The normalized RMSE value is: 0.17914541693008088
The accuracy of this model is: 0.820855
