In [None]:
###############################################################################################################################

############    In this exercise, we will be predicting whether or not a patient has breast cancer.        ####################
############   The data is from the University of Wisconsin and contains attributes, or characteristics,   ####################
############   about groups of cells that doctors had to determine were cancerous or not.                  ####################
############   We will be using SUPERVISED machine learning techniques to label each group of cells either ####################
############   cancer ("malignant") or not cancer ("benign"). This is called a classification problem.     ####################

######   The data can be found here: http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28diagnostic%29   #######
######   The repository that provides the data is cited below:
# Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.#
# Template for this exercise can be found here: https://towardsdatascience.com/building-a-simple-machine-learning-model-on-breast-cancer-data-eca4b3b99fa3 #

In [None]:

# First, we import the libraries we will be needing to run our code:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd



# Next, we read the dataset into pandas. Pandas is where your data lives in this notebook. This "house" for your data
# looks like a table and is called a "DataFrame".
# We want to work on that cancer dataset from the University of Wisconsin. This data is stored in a CSV file. 
# Pandas will extract the data from that CSV into a DataFrame (like a table).


# Pandas is denoted here by "pd".

# Read the CSV file into pandas; afterwards 'dataset' is a pandas DataFrame holding our data.
dataset = pd.read_csv(filepath_or_buffer='wisconsin_breastcancer_dataset_UseThis.csv')


# Third, what does our data look like? The command '.head()' lets us see a specified number of rows.
# Tell '.head' how many rows you want to see by putting a number inside the parentheses (), like this: .head(6)

# NOTE: this line will not run as written. Replace the whole placeholder, including both "#" marks,
# with a number; a leftover "#" turns the closing parenthesis into a comment.
dataset.head(#enter how many rows you want to see here, make sure it's at least 5#)

In [None]:
# Notice that you can't see all the column names above? 
# Type the command "list()" to see all the column names in the dataset.

# Now you try:

# NOTE: this line will not run as written. Replace the whole placeholder, including both "#" marks,
# with the name of your DataFrame; a leftover "#" turns the closing parenthesis into a comment.
list(#insert name of your pandas dataframe#)

In [None]:
# First thing we want to do after loading the data into pandas is to make sure ALL the data was loaded. 
# To do this, we check the size of the data (how many rows and columns) and make sure this matches the number of 
# rows and columns in our csv. You don't have to check the size of the csv yourself, below we've provided the numbers 
# that you should see. 

# dataset.shape is a (rows, columns) pair; print it next to a short label.
print(f"Cancer data set dimensions : {dataset.shape}")
#You should see -> Cancer data set dimensions : (569, 32)


# Do you know what these 2 numbers mean? 
# Type your answer here:                                                              #

# If you don't know, no worries. :) You can take a guess and we'll go over it at the end.

In [None]:
# Now we need to make sure all the data is the right type. This is important because if a column comes in as a string but 
# we really need it to be an integer, it can mess up our machine learning model and we'll get an error.

# This command shows us the data types of each column
dataset.dtypes

In [None]:
# ‘Diagnosis’ is the column which we are going to predict.
# It says if the group of cells is M = malignant or B = benign. 
# So how many people have malignant cells and how many don't?


# Select the target column by its label, then count how many rows carry each value.
dataset.loc[:, 'Diagnosis'].value_counts()

In [None]:
# Look above - We can identify that out of the 569 persons, 357 are labeled as B (benign) and 212 as M (malignant). 

In [None]:
# For what percent of the data is the cancer malignant?

# Enter code to calculate the percent malignant cancer out of all the data here: type answer here                         #

In [None]:
# Cleaning the data:

# Machine learning packages need you to do something about the null values before you can run them. 
# Let's check and see if there are nulls in our data. 

# Count the missing values in each column.
# isnull() and isna() are two names for the same pandas method, and a notebook cell only displays
# the result of its last line, so one call is enough to show the per-column totals.
dataset.isna().sum()

# Doesn't look like there are any nulls! If there were, we would need to either delete that row or fill in that value, likely 
# with the mean of the column (if numeric). Ask Emily or Shweta to talk more about imputation, another way to fill in nulls.

In [None]:
# Now we need to separate our data into the target (what we want to predict, "malignant" or "benign" - the "Diagnosis" column)
# and the data, or attributes that are going to help us predict this target (all the other columns besides the "Diagnosis" column).

# Column 1 holds the target and every column from position 2 onward is used as an attribute.
# Column 0 is left out of both (presumably a patient ID -- confirm against the CSV).
# The slice is written as "2:" because the end of an iloc slice is not included: "2:31" would stop
# at column 30 and silently drop the last of the 32 columns we saw in dataset.shape.
X = dataset.iloc[:, 2:].values # all the other columns, or attributes
Y = dataset.iloc[:, 1].values # the target column for which we're trying to predict the classification

In [None]:
# Let's see what X looks like. Run this to find out.

X

In [None]:
# Let's see what Y looks like. Run this to find out.

Y

In [None]:
#Encoding categorical data values

from sklearn.preprocessing import LabelEncoder

# Emily will explain this part. Let her know when you've reached here.
# The code below is complete, you just need to run it.

# Step 1: let the encoder learn which distinct labels appear in Y.
labelencoder_Y = LabelEncoder().fit(Y)
# Step 2: swap every label in Y for the integer code the encoder assigned to it.
Y = labelencoder_Y.transform(Y)

In [None]:
Y
# 1 means the diagnosis is malignant and 0 means benign. 

In [None]:
# Split the dataset into the Training set and Test set.

from sklearn.model_selection import train_test_split

# Emily will explain this part. Let her know when you've reached here.
# The code below is almost complete, you just need to put in the test size, in the form of a decimal (ex. "0.25").

# train_test_split hands back four pieces: the attributes and the target, each divided into a
# training part and a test part. random_state = 0 is passed so the split comes out the same on every run.
# NOTE: this line will not run as written. Replace the whole placeholder, including both "#" marks,
# with your decimal; a leftover "#" turns the rest of the line into a comment.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = #enter test size here#, random_state = 0)

In [None]:
# Run to see the training data for the attributes.

X_train

In [None]:
# Run to see the test data for the attributes.

X_test

In [None]:
# Run to see the training data for the target, aka the label of "malignant" or "benign" that has been encoded as 1 or 0.

Y_train

In [None]:
# Run to see the test data for the target, aka the label of "malignant" or "benign" that has been encoded as 1 or 0.

Y_test

In [None]:
#Feature Scaling

from sklearn.preprocessing import StandardScaler

# Emily will explain this part. Let her know when you've reached here.
# The code below is almost complete, you just need to fill in the data for X (train & test).

# fit_transform() learns the scaling from the data it is given and applies it in one step;
# transform() then reuses that already-learned scaling on the test data instead of learning a new one.
# NOTE: these lines will not run as written. Replace each placeholder, including both "#" marks;
# a leftover "#" turns the closing parenthesis into a comment.
sc = StandardScaler()
X_train = sc.fit_transform(# enter numerical training data here #)
X_test = sc.transform(# enter numerical test data here #)

In [None]:
# Applying the Logistic Regression algorithm to the training set

from sklearn.linear_model import LogisticRegression

# Emily will explain this part. Let her know when you've reached here.
# The code below is almost complete, you just need to fill in the training data.


# Create the model, train it on the training set, then have it predict a label for every row of
# the test attributes. random_state = 0 is passed so repeated runs give the same result.
# NOTE: the fit() line will not run as written. Replace both placeholders, including their "#" marks;
# a leftover "#" turns the rest of the line into a comment.
classifier_logr = LogisticRegression(random_state = 0)
classifier_logr.fit(#enter numerical training data here# , #enter target training data here#)

Y_pred_logr = classifier_logr.predict(X_test)

In [None]:
# Do you guys remember what Shweta was saying about the confusion matrix, back when we were doing the 'dog or cookie?' example?
# Raise your hand if you want to explain the concept again, just to give us all a refresher. 
# No worries if you don't totally remember, we're going to go over it again here - just let Emily know you've reached this part.


# Now that we've gone over confusion matrices again, run the below code and type out in comments what each of the numbers mean.
from sklearn.metrics import confusion_matrix


# Compare the real test labels with the labels the logistic regression model predicted.
cm_logr = confusion_matrix(y_true=Y_test, y_pred=Y_pred_logr)

cm_logr

In [None]:
# What do each of the 4 numbers in the confusion matrix above represent? Write your answers below.
#
#
#
#

In [None]:
# Using the KNeighborsClassifier class from sklearn's neighbors module to apply the Nearest Neighbors algorithm

from sklearn.neighbors import KNeighborsClassifier


# Emily will explain this part. Let her know when you've reached here.
# The code below is almost complete, you just need to fill in the training data.


# Create the model, train it on the training set, then have it predict a label for every row of
# the test attributes. n_neighbors = 5 means each prediction looks at the 5 closest training rows;
# metric = 'minkowski' with p = 2 measures "closest" with ordinary straight-line (Euclidean) distance.
# NOTE: the fit() line will not run as written. Replace both placeholders, including their "#" marks;
# a leftover "#" turns the rest of the line into a comment.
classifier_knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_knn.fit(#enter numerical training data here# , #enter target training data here#)

Y_pred_knn = classifier_knn.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix

# Now that we've gone over confusion matrices again, run the below code and type out in comments what each of the numbers mean.

# Compare the real test labels with the labels the nearest-neighbor model predicted.
cm_knn = confusion_matrix(y_true=Y_test, y_pred=Y_pred_knn)

cm_knn

In [None]:
# What do each of the 4 numbers in the confusion matrix above represent? Write your answers below.
#
#
#
#

In [None]:
# Using the DecisionTreeClassifier class from sklearn's tree module to apply the Decision Tree algorithm

from sklearn.tree import DecisionTreeClassifier


# Emily will explain this part. Let her know when you've reached here.
# The code below is almost complete, you just need to fill in the training data.


# Create the model, train it on the training set, then have it predict a label for every row of
# the test attributes. criterion = 'entropy' picks the measure the tree uses to judge each split,
# and random_state = 0 is passed so repeated runs give the same result.
# NOTE: the fit() line will not run as written. Replace both placeholders, including their "#" marks;
# a leftover "#" turns the rest of the line into a comment.
classifier_dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_dt.fit(#enter numerical training data here# , #enter target training data here#)

Y_pred_dt = classifier_dt.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix

# Now that we've gone over confusion matrices again, run the below code and type out in comments what each of the numbers mean.

# Compare the real test labels with the labels the decision tree model predicted.
cm_dt = confusion_matrix(y_true=Y_test, y_pred=Y_pred_dt)

cm_dt

In [None]:
# What do each of the 4 numbers in the confusion matrix above represent? Write your answers below.
#
#
#
#