In [None]:
# Import our dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Raw GitHub URL of the CSV file that the notebook loads
url = 'https://raw.githubusercontent.com/dramicos/Breast-Cancer-Detection/main/Resources/data.csv'

In [None]:
# Read the CSV into a DataFrame and preview the first rows
df = pd.read_csv(url)
df.head()

In [None]:
# Encode the diagnosis label as a binary target:
# "M" becomes 1 and every other value becomes 0

# Transform diagnosis
def diagnosis_to_replace(diagnosis):
    """Encode a single diagnosis label: 1 for "M", 0 for any other value."""
    return 1 if diagnosis == "M" else 0
    

# Encode the label column only while it still holds the raw labels.
# Re-running this cell on the already-encoded 0/1 column would otherwise
# send every row through the "else" branch and turn the whole column into 0.
if not pd.api.types.is_numeric_dtype(df["diagnosis"]):
    df["diagnosis"] = df["diagnosis"].apply(diagnosis_to_replace)
df.head()

In [None]:
# Labels for the sections of our pie chart
labels = ["Malignant", "Benign"]

# The values of each section of the pie chart, counted from the encoded
# diagnosis column (1 = malignant, 0 = benign) instead of being hard-coded,
# so the chart always matches the data that was actually loaded
sizes = [int((df["diagnosis"] == 1).sum()), int((df["diagnosis"] == 0).sum())]

# The colors of each section of the pie chart
colors = ["lightcoral", "lightskyblue"]

# Tells matplotlib to pull the first ("Malignant") section away from the other
explode = (0.1, 0)

In [None]:
# Creates the pie chart based upon the values above
# Automatically finds the percentages of each part of the pie chart
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)

In [None]:
# Save piechart as Image
plt.savefig('piechart_mal_vs_be.png')

In [None]:
#Locate the value 1 in column diagnosis
malignant = df.loc[(df["diagnosis"] == 1)]
malignant

In [None]:
# Split our preprocessed data into our features and target arrays also drop the id as that is not useful
malignant_X = malignant.drop(["diagnosis", "id"], axis='columns')
malignant_y = malignant["diagnosis"]
malignant_X 

In [None]:
malignant_X.hist(figsize=(15,15), color = "lightcoral")
plt.show()

In [None]:
malignant.describe()

In [None]:
 # Set x axis and tick locations
# x_axis = np.arange(len(malignant))
# tick_locations = [value+0.4 for value in x_axis]

In [None]:
#Locate the value 0 in column diagnosis
benign = df.loc[(df["diagnosis"] == 0)]
benign

In [None]:
# Split our preprocessed data into our features and target arrays also drop the id as that is not useful
benign_X = benign.drop(["diagnosis", "id"], axis='columns')
benign_y = benign["diagnosis"]
benign_X 

In [None]:
benign_X.hist(figsize=(15,15), color = "lightskyblue")
plt.show()

In [None]:
benign.describe()

In [None]:
# Split our preprocessed data into our features and target arrays also drop the id as that is not useful
X = df.drop(["diagnosis", "id"], axis='columns')
y = df["diagnosis"]
X

In [None]:
X.hist(figsize=(15,15))
plt.show()

In [None]:
# Save histogram as Image
plt.savefig('histogram.png')

In [None]:
 # Create a variable for y
# mean = "radius_mean"
# se = "radius_se"
# worst = "radius_worst"

In [None]:
# Set type of comparison
# columns_to_compare = "radius"

In [None]:

# radius_mean = benign.loc[mean,
#                          [f"radius{columns_to_compare}",
#                           f"radius{columns_to_compare}", 
#                           f"radius{columns_to_compare}"]]

# radius_se = benign.loc[se,
#                          [f"radius{columns_to_compare}",
#                           f"radius{columns_to_compare}", 
#                           f"radius{columns_to_compare}"]]

# radius_worst = benign.loc[worst,
#                          [f"radius{columns_to_compare}",
#                           f"radius{columns_to_compare}", 
#                           f"radius{columns_to_compare}"]]

In [None]:
#Locate the value 0 in column diagnosis
# benign_mul_plot = benign.loc[(benign["diagnosis"] == 0)]
# benign

In [None]:
# Create plot multiple columns 
# benign_multi_plot = benign.plot(kind="bar", figsize=(20,5))

# # PandasPlot.set_xticklabels() can be used to set the tick labels as well
# benign_multi_plot.set_xticklabels(benign["radius_mean"], rotation=45)

# plt.show()
# plt.tight_layout()

In [None]:
benign = df.loc[df['diagnosis']==0]
benign.drop(columns=['id','diagnosis'], inplace=True)

In [None]:
benign.shape

In [None]:
malign = df.loc[df['diagnosis']==1]
malign.drop(columns=['id','diagnosis'], inplace=True)

In [None]:
malign.shape

In [None]:
# Undersample the benign class: drop as many randomly chosen benign rows as
# the benign class exceeds the malignant class, then renumber the index.
# random_state pins the sample so the balanced frame (and every score derived
# from it) is the same on each run.
n_extra_benign = benign.shape[0] - malign.shape[0]
balanced_df = (
    df.drop(df.loc[df['diagnosis'] == 0].sample(n=n_extra_benign, random_state=78).index)
    .reset_index(drop=True)
)

In [None]:
# Rows / columns left after undersampling
balanced_df.shape

In [None]:
balanced_df.head()

In [None]:
# Non-null count of every column per diagnosis value (shows the class sizes)
balanced_df.groupby('diagnosis').count()

In [None]:
# Features and target of the balanced frame; the label and id are removed from the features
b_X = balanced_df.drop(["diagnosis", "id"], axis='columns')
b_y = balanced_df['diagnosis']

In [None]:
# Names of the feature columns that remain
b_X.columns

In [None]:
# balanced_df.groupby('radius_mean').mean()

In [None]:
# from sklearn.datasets import make_blobs

# X, y = make_blobs(centers=2, random_state=42)

# print(f"Labels: {y[:32]}")
# print(f"Data: {X[:32]}")

In [None]:
 # Visualizing both classes
# plt.scatter(X[:, 0], X[:, 1], c=y)

In [None]:
# compare the 2 datasets balanced and normal
# Stratified split of the full dataset (X, y), seeded with random_state=1;
# no test_size is given, so train_test_split's default is used
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Stratified split of the balanced dataset with 31% of the rows held out for testing
b_X_train, b_X_test, b_y_train, b_y_test = train_test_split(b_X, b_y, random_state=2, stratify=b_y, test_size=0.31)

In [None]:
# Create a MinMaxScaler instance since all values are positive to try for better results
scaler = MinMaxScaler()

# Fit the MinMaxScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit the MinMaxScaler for balanced
b_X_scaler = scaler.fit(b_X_train)

# Scale the data
b_X_train_scaled = b_X_scaler.transform(b_X_train)
b_X_test_scaled = b_X_scaler.transform(b_X_test)

In [None]:
#  Trial one: Logistic Regression Algorithm raw
#  (fitted on the unscaled features of the full-dataset split)
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=10000, random_state=78).fit(X_train, y_train)

# Mean accuracy on the training rows and on the held-out rows
print(f"Training Data Score: {lr.score(X_train, y_train)}")
print(f"Testing Data Score: {lr.score(X_test, y_test)}")

In [None]:
 # Generate a new data point (the red circle)
# import numpy as np
# new_data = np.array([[-2, 6]])
# plt.scatter(X[:, 0], X[:, 1], c=y)
# plt.scatter(new_data[0, 0], new_data[0, 1], c="r", marker="o", s=100)

In [None]:
import seaborn as sns

#define the predictor variable and the response variable:'radius_mean'
x = balanced_df['diagnosis']
y = balanced_df['radius_mean']

#plot logistic regression curve
sns.regplot(x=x, y=y, data=balanced_df, logistic=True, ci=None)

In [None]:
#define the predictor variable and the response variable: 'radius_se'
x = balanced_df['diagnosis']
y = balanced_df['radius_se']

#plot logistic regression curve
sns.regplot(x=x, y=y, data=balanced_df, logistic=True, ci=None)

In [None]:
#define the predictor variable and the response variable: 'radius_worst'
x = balanced_df['diagnosis']
y = balanced_df['radius_worst']

#plot logistic regression curve
sns.regplot(x=x, y=y, data=balanced_df, logistic=True, ci=None)

In [None]:
#define the predictor variable and the response variable:'texture_mean'
x = balanced_df['diagnosis']
y = balanced_df['texture_mean']

#plot logistic regression curve
sns.regplot(x=x, y=y, data=balanced_df, logistic=True, ci=None, color = 'red')

In [None]:
#define the predictor variable and the response variable: 'texture_se'
x = balanced_df['diagnosis']
y = balanced_df['texture_se']

#plot logistic regression curve
sns.regplot(x=x, y=y, data=balanced_df, logistic=True, ci=None, color = 'red')

In [None]:
#define the predictor variable and the response variable: 'texture_worst'
x = balanced_df['diagnosis']
y = balanced_df['texture_worst']

#plot logistic regression curve
sns.regplot(x=x, y=y, data=balanced_df, logistic=True, ci=None, color = 'red')

In [None]:
#  Trial two: Logistic Regression Algorithm balanced
from sklearn.linear_model import LogisticRegression

b_lr = LogisticRegression(random_state = 78, max_iter=10000)
b_lr.fit(b_X_train, b_y_train)

# Score the model fitted in this cell (b_lr); scoring `lr` here would report
# the model that was trained on the unbalanced split instead
print(f"Training Data Score: {b_lr.score(b_X_train, b_y_train)}")
print(f"Testing Data Score: {b_lr.score(b_X_test, b_y_test)}")

In [None]:
#  Random Forest Classifier raw
#  (full-dataset split, MinMax-scaled features)
from sklearn.ensemble import RandomForestClassifier

# 250 trees with a fixed seed; fit() returns the fitted estimator
rfc = RandomForestClassifier(random_state=78, n_estimators=250).fit(X_train_scaled, y_train)

# Mean accuracy on the training rows and on the held-out rows
print(f'Training Score: {rfc.score(X_train_scaled, y_train)}')
print(f'Testing Score: {rfc.score(X_test_scaled, y_test)}')

In [None]:
from sklearn.utils import resample

In [None]:
# Bagging experiment on the full-dataset split: fit 50 forests on bootstrap
# resamples of the training rows and record the accuracy of their median
# vote after each forest is added.
rfcs = []
bootstrap_preds = []
scores = []
for i in range(50):

    # Sample the data for each new forest
    X_train_scaled_bootstrap, y_train_bootstrap = resample(X_train_scaled, y_train, random_state=i)

    # Fit under its own name so that `rfc`, the 250-tree forest trained
    # earlier, is not overwritten by the last bootstrap forest
    bootstrap_rfc = RandomForestClassifier(random_state=i+200).fit(X_train_scaled_bootstrap, y_train_bootstrap)
    rfcs.append(bootstrap_rfc)

    # Each forest predicts the test rows once; the predictions are cached
    # instead of re-predicting with every forest on every iteration
    bootstrap_preds.append(bootstrap_rfc.predict(X_test_scaled))

    # Take the median prediction of all the created classifiers (one column per test row)
    y_pred = pd.DataFrame(bootstrap_preds).median().round()
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

# Accuracy of the ensemble as a function of the number of forests
plt.plot(scores)
plt.show()
print(f'score: {score}')

In [None]:
#  Random Forest Classifier balanced
from sklearn.ensemble import RandomForestClassifier

b_rfc = RandomForestClassifier(random_state=78, n_estimators=100).fit(b_X_train_scaled, b_y_train)

# Score the forest fitted in this cell (b_rfc); `rfc` is a different model
# that was trained on the unbalanced split
print(f'Training Score: {b_rfc.score(b_X_train_scaled, b_y_train)}')
print(f'Testing Score: {b_rfc.score(b_X_test_scaled, b_y_test)}')

In [None]:
# Bagging experiment on the balanced split: fit 50 forests on bootstrap
# resamples of the balanced training rows and record the accuracy of their
# median vote on the balanced test rows after each forest is added.
b_rfcs = []
b_bootstrap_preds = []
scores = []
for i in range(50):

    # Sample the balanced training data for each new forest
    b_X_train_scaled_bootstrap, b_y_train_bootstrap = resample(b_X_train_scaled, b_y_train, random_state=i)

    # Fit under its own name so that `b_rfc`, the balanced forest trained
    # earlier, is not overwritten by the last bootstrap forest
    b_bootstrap_rfc = RandomForestClassifier(random_state=i+200).fit(b_X_train_scaled_bootstrap, b_y_train_bootstrap)
    b_rfcs.append(b_bootstrap_rfc)

    # Each forest predicts the balanced test rows once; predictions are cached
    b_bootstrap_preds.append(b_bootstrap_rfc.predict(b_X_test_scaled))

    # Take the median prediction of all the created classifiers (one column per test row)
    b_median_pred = pd.DataFrame(b_bootstrap_preds).median().round()
    score = accuracy_score(b_y_test, b_median_pred)
    scores.append(score)

# Accuracy of the ensemble as a function of the number of forests
plt.plot(scores)
plt.show()
print(f'score: {score}')

In [None]:
y_pred = rfc.predict(X_test_scaled)

In [None]:
b_y_pred = rfc.predict(b_X_test_scaled)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
b_cm = confusion_matrix(b_y_test, b_y_pred)
b_cm

In [None]:
btn, bfp, bfn, btp = confusion_matrix(b_y_test, b_y_pred).ravel()

In [None]:
bad = bfn/(btn + bfp + bfn + btn)
print(bad)

In [None]:
threshold = 0.5

In [None]:
pred_proba = b_rfc.predict_proba(b_X_test)
predicted = (pred_proba [:,1] >= threshold).astype('int')

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy = accuracy_score(b_y_test, predicted)
print(round(accuracy,4)*100, "%")

In [None]:
# We decided to drop a few more benign rows to bias toward malignant

nb_df = df.drop(   df.loc[df['diagnosis'] == 0].sample(n=(benign.shape[0] - malign.shape[0]+50)).index ).reset_index(drop=True)

In [None]:
nb_df.shape

In [None]:
nb_df.head()

In [None]:
nb_X = balanced_df.drop(["diagnosis", "id"], axis='columns')
nb_y = balanced_df['diagnosis']

In [None]:
# We tried varying the test size and got the best results at 31% test size
nb_X_train, nb_X_test, nb_y_train, nb_y_test = train_test_split(nb_X, nb_y, random_state=74, stratify=nb_y, test_size=0.31)

In [None]:
nb_X_scaler = scaler.fit(nb_X_train)

# Scale the data
nb_X_train_scaled = nb_X_scaler.transform(nb_X_train)
nb_X_test_scaled = nb_X_scaler.transform(nb_X_test)

In [None]:
nb_rfc = RandomForestClassifier(random_state=78, n_estimators=100).fit(nb_X_train_scaled, nb_y_train)

print(f'Training Score: {rfc.score(nb_X_train_scaled, nb_y_train)}')
print(f'Testing Score: {rfc.score(nb_X_test_scaled, nb_y_test)}')

In [None]:
nb_y_pred = rfc.predict(nb_X_test_scaled)

In [None]:
nb_cm = confusion_matrix(nb_y_test, nb_y_pred)
nb_cm

In [None]:
btn, bfp, bfn, btp = confusion_matrix(nb_y_test, nb_y_pred).ravel()

In [None]:
bad = bfn/(btn + bfp + bfn + btn)
print(bad)

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
tsne = TSNE(learning_rate=35)

In [None]:
tsne_features = tsne.fit_transform(nb_X_train_scaled)

In [None]:
tsne_features.shape

In [None]:
# Plot the results
plt.scatter(tsne_features[:,0], tsne_features[:,1])
plt.show()

In [None]:
# Save Plot as Image
plt.savefig('tsne_plot_image.jpeg')

In [None]:
plt.savefig('tsne_plot.png')

In [None]:
pca = PCA(n_components=10)

In [None]:
df_pca = pca.fit_transform(nb_X_train_scaled)

In [None]:
df_pca = pd.DataFrame(
    data=df_pca, columns=["pc 1", "pc 2","pc 3","pc 4","pc 5","pc 6","pc 7","pc 8","pc 9","pc 10"])
df_pca.head()

In [None]:
pca.explained_variance_ratio_

In [None]:
sum = 0

for i in pca.explained_variance_ratio_:
  sum += i

In [None]:
print(sum)

### Analysis
According to the explained variance, the 10 principal components retain about 96% of the information in the original dataset. Next, we check how much of the variance is retained when the number of principal components is reduced to 3.

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Initialize PCA model for 3 principal components
# (this rebinds `pca`; the previously fitted model is no longer reachable by that name)
pca = PCA(n_components=3, random_state=5)

# Get 3 principal components for the breast cancer data.
# NOTE(review): the input is df_pca, i.e. the DataFrame produced by the
# previous PCA step, not the scaled features - so explained_variance_ratio_
# of this model is relative to that input. Confirm this is intended.
breast_pca = pca.fit_transform(df_pca)

In [None]:
# Transform PCA data to a DataFrame
# (this rebinds `df_pca` to the 3-column frame, so the cell above cannot be
# re-run on its own afterwards without re-creating its input first)
df_pca = pd.DataFrame(
    data=breast_pca,
    columns=["principal component 1", "principal component 2", "principal component 3"],
)
df_pca.head()

In [None]:
# Initialize the K-Means model (3 clusters, fixed seed)
model = KMeans(n_clusters=3, random_state=5)

In [None]:
# Fit the model
model.fit(df_pca)

In [None]:
# Prediction Clusters: cluster index assigned to each row of df_pca
predictions = model.predict(df_pca)
print(predictions)

In [None]:
# Add the predicted class columns
# NOTE: from here on df_pca carries a "class" column next to the components,
# so the fit / predict cells above see an extra column if they are re-run
df_pca["class"] = model.labels_
df_pca.head()

In [None]:
# Fetch the explained variance
pca.explained_variance_ratio_

In [None]:
# Plot the 3 principal components
# Points are coloured and given a marker symbol by their K-Means cluster;
# component 3 is drawn on the x axis and component 1 on the z axis
import plotly.express as px
fig = px.scatter_3d(
    df_pca,
    x="principal component 3",
    y="principal component 2",
    z="principal component 1",
    color ="class",
    symbol="class",
    width=800)
# Place the legend at the top-left corner of the plot area
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
# Save 3D as Image
# The 3D scatter is a Plotly figure, so it has to be exported through Plotly;
# plt.savefig() only writes Matplotlib's current (here empty) figure.
# NOTE: static image export in Plotly requires the optional `kaleido` package.
fig.write_image('three_d_principle_plot.png')

In [None]:
plt.scatter(x=df_pca["principal component 1"], 
            y=df_pca["principal component 2"],
           c=df_pca["principal component 3"])
plt.xlabel("principal component 1")
plt.ylabel("principal component 2")
plt.show()

In [None]:
# Save Plot as Image
plt.savefig('three_principle_plot_a.png')

### Analysis
With three principal components we have approximately 99% of the information in the original dataset.

### Perform a Cluster Analysis with K-means
Finding the best value for k using the Elbow Curve

In [None]:
inertia = []
# Same as K = List(range(1, 11))
k= [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
# Looking for the best k
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_pca)
    inertia.append(km.inertia_)

In [None]:
# Defrine a DataFrame to plot the Elobow Curce using hvPlot
elbow_data={"k":k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

In [None]:
plt.plot(df_elbow['k'], df_elbow['inertia'])
plt.xticks(range(1,11))
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Save Elbow Curve as Image
plt.savefig('kmeans_elb_curve.png')