In [1]:
import pandas as pd
import numpy as np

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

In [2]:
#data cleaning code



#Read the data from the csv file
data_df = pd.read_csv('Breast_Cancer.csv')

data = data_df.to_numpy()
#Determine how many identifiable characteristics there are for each patient
print('Each row, aka each patient, has a total of {} characteristics'.format(len(data[0])))
print('There are {} total records of patients'.format(data.shape[0]))

######visualize the entire dataset####
# data_df
print('length of one column (ensuring its 16): {}'.format(len(data[0])))

print('Each patient has {} characteristics'.format(data.shape[1]))
cols = ['age', 'race', 'marital status', 'T Stage', 'N Stage', '6th Stage', 
        'differentiation', 'grade', 'A Stage', 'Tumor Size', 'Estrogen Status', 
        'Progestrone Status', 'Regional Node Examined', 'Regional Node Posititive', 
        'Survival Months', 'Status' ]
assert(len(cols) == 16)
#there are 11 columns with strng values

#GOAL : map these string values to integers; if three options, map to {1, 2, 3}
#option: create dictionaries for each string column and list out their categories
race = {'White': 1, 'Black': 2, 'Other': 3}
mS = {'Married': 1, 'Single': 2, 'Divorced': 3, 'Widowed': 4,'Seperated':5}

tStage = {'T1':1, 'T2':2, 'T3':3, 'T4':4}
nStage = {'N1': 1, 'N2': 2, 'N3': 3}

#FIVE VALUES for sixth stage
sixthStage = {'IIA':1, 'IIB':2, 'IIIA':3, 'IIIB':4, 'IIIC': 5}

grade = {'1': 1, '2': 2, '3': 3, 'anaplastic; Grade IV':4}

#four values for differentiated
differentiated = {'Undifferentiated':1, 'Poorly differentiated':2, 'Moderately differentiated':3,  'Well differentiated':4, }

#two values for A stage: reginal and distant
aStage = {'Regional': 1, 'Distant': 2}

#estrogen status and progestrogen status have two values: postive or negative
eStatus = {'Positive': 1, 'Negative': 2}
pStatus = {'Positive': 1, 'Negative': 2}

#status has two values; dead or alive
status = {'Dead':1, 'Alive':2}

###TEST THE MAPPING###

#create indices for each string column
race_index = 1
maritalStatus_index = 2

tStage_index = 3
nStage_index = 4

sixthStage_index = 5
differentiated_index = 6

aStage_index = 8
grade_index = 7

eStatus_index = 10
pStatus_index = 11
status_index = 15

#Test the before and after on one patient record
print('Before the mappinng, the third patient record is:\n {}\n'.format(data[2]))

#####use numpy vectorized mapping, applies to the entire dataset!!!!###
###ONLY one value is shown here to emphasize the change###

data[:, race_index] = np.vectorize(race.get)(data[:, race_index])

#this following for loop is used instead of vectors to fix the errors of None type being passed
#Basically there is an error when the numpy vectorization is used

for i in range(data.shape[0]):
    data[i, maritalStatus_index] = mS.get(data[i, maritalStatus_index], -1)

data[:, tStage_index] = np.vectorize(tStage.get)(data[:, tStage_index])

data[:, nStage_index] = np.vectorize(nStage.get)(data[:, nStage_index])

data[:, sixthStage_index] = np.vectorize(sixthStage.get)(data[:, sixthStage_index])

data[:, differentiated_index] = np.vectorize(differentiated.get)(data[:, differentiated_index])

data[:, aStage_index] = np.vectorize(aStage.get)(data[:, aStage_index])

for i in range(data.shape[0]):
    data[i, grade_index] = grade.get(data[i, grade_index], -1)

data[:, eStatus_index] = np.vectorize(eStatus.get)(data[:, eStatus_index])

data[:, pStatus_index] = np.vectorize(pStatus.get)(data[:, pStatus_index])

data[:, status_index] = np.vectorize(status.get)(data[:, status_index])

print('After the mapping, the third patient record is:\n {}\n'.format(data[2]))

####To test this cell you have to re-run all cells because the data array is overwritten!!!###



Each row, aka each patient, has a total of 16 characteristics
There are 4024 total records of patients
length of one column (ensuring its 16): 16
Each patient has 16 characteristics
Before the mappinng, the third patient record is:
 [58 'White' 'Divorced' 'T3' 'N3' 'IIIC' 'Moderately differentiated' '2'
 'Regional' 63 'Positive' 'Positive' 14 7 75 'Alive']

After the mapping, the third patient record is:
 [58 1 3 3 3 5 3 2 1 63 1 1 14 7 75 2]



In [3]:
#GOAL: Break the dataset into training and testing
N_total = data.shape[0]

N_training = int(0.8 * N_total)
print('There are {} records that are used for training'.format(N_training))
N_testing = N_total - N_training
print('There are {} records that are used for testing'.format(N_testing))

There are 3219 records that are used for training
There are 805 records that are used for testing


In [4]:
data_training = data[:N_training]
data_testing = data[N_training:]
print('There are {} total records of patients in the training set'.format(data_training.shape[0]))
print('There are {} total records of patients in the testing set'.format(data_testing.shape[0]))

There are 3219 total records of patients in the training set
There are 805 total records of patients in the testing set


In [5]:
#seperate the target variable from the dataset

#target variable is the value at index 14

#input matrix: X
#target vector: y
X_train = [row[:14] + row[15:] for row in data_training]
y_train = [row[14] for row in data_training]

X_test = [row[:14] + row[15:] for row in data_testing]
y_test = [row[14] for row in data_testing]

In [12]:
print('Length of X_train: {}'.format(X_train.shape))
print('Length of y_train: {}'.format(y_train.shape))
print('Length of X_test: {}'.format(X_test.shape))
print('Length of y_test: {}'.format(y_test.shape))


AttributeError: 'list' object has no attribute 'shape'

In [7]:
#TRAIN THE MODEL
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

In [8]:
#PREDICT
y_predict = lda.predict(X_test)
print(len(y_predict))

805


In [9]:
#EVALUATE MODEL
accuracy = accuracy_score(y_test, y_predict)
print('The model accuracy for this data set is:\n'.format(accuracy))

The model accuracy for this data set is:



In [10]:
def RMSE(y_pred, y_t):
    diff = y_t - y_pred
    N = len(y_t)
    return np.sqrt(diff / N)

In [11]:
score = RMSE(y_predict, y_test)
score

  return np.sqrt(diff / N)


array([       nan,        nan,        nan, 0.12707898,        nan,
              nan,        nan,        nan, 0.12209354, 0.15762208,
              nan, 0.307262  , 0.20551405,        nan, 0.13187609,
       0.        ,        nan, 0.08633317,        nan, 0.07049074,
              nan, 0.07049074, 0.21438921,        nan, 0.21438921,
       0.12209354, 0.12209354, 0.10573611, 0.12707898, 0.30927686,
       0.2660967 , 0.08633317,        nan,        nan, 0.11689566,
       0.13650473, 0.21438921,        nan, 0.25415797, 0.17971682,
       0.11145564, 0.17266634,        nan,        nan,        nan,
              nan, 0.20551405,        nan, 0.14953344,        nan,
       0.07881104, 0.11689566, 0.19304684, 0.09968896,        nan,
       0.20851441,        nan,        nan, 0.06104677, 0.11145564,
       0.17622684,        nan, 0.24418708, 0.16531543,        nan,
       0.34352936, 0.25659016,        nan, 0.27975144,        nan,
              nan,        nan, 0.18314031,        nan, 0.07881