In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as py
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, mutual_info_classif, mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [4]:
# Location of the dataset CSV (relative to the notebook's working directory).
path = "data.csv"

# Display settings: no cap on the number of columns shown, and no wrapping of
# wide frame reprs across multiple lines.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

# Read the raw table; later cells derive everything from `data`.
data = pd.read_csv(path)

# Preview the first rows.
display(data.head())

FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'

In [None]:
# Finding missing values
# Count the NaN entries per column. The previous version plotted
# `len(data['id']) - data.isnull().sum()`, i.e. the number of *present* values,
# under a "Missing Values" title.
null_feat = data.isnull().sum().to_frame(name='Count')

trace = go.Bar(
    x=null_feat.index,
    y=null_feat['Count'],
    opacity=0.8,
    marker=dict(color='lightgrey', line=dict(color='#000000', width=1.5)),
)

layout = dict(title="Missing Values")

fig = dict(data=[trace], layout=layout)
py.iplot(fig)

# Finding duplicate observations
# Rows that are exact repeats of an earlier row; shown as the cell output.
duplicate = data[data.duplicated()]
duplicate

In [None]:
# Remove unnecessary variables
data = data.drop(columns=['id', 'Unnamed: 32'])

# Keep the original string labels ('M' / 'B') as an array before encoding.
y = np.array(data['diagnosis'].tolist())

# Replace diagnosis values (M = 1, B = 0).
# Assign the encoded column back explicitly: calling
# `data.diagnosis.replace(..., inplace=True)` acts on a temporary Series, which
# raises a FutureWarning on pandas 2.x and does not update `data` at all under
# Copy-on-Write. Any label other than 'M' or 'B' becomes NaN here, which makes
# unexpected values visible instead of leaving them mixed in as strings.
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

# Independent copy of the encoded frame.
data2 = data.copy()

display(data.head())
display(data.describe())

In [None]:
# EDA
# Split by encoded diagnosis (0 = benign, anything else = malignant).
M = data[data['diagnosis'] != 0]
B = data[data['diagnosis'] == 0]
counts = [len(M), len(B)]

# Bar Plot
plt.figure(figsize=(15, 7))
plt.barh(['Malignant', 'Benign'], counts, color=['darkred', 'wheat'], edgecolor='#000000', alpha=0.8)
plt.title('Count of diagnosis variable')
plt.show()

# Pie Chart
# value_counts() orders its result by frequency, so hard-coded labels only line
# up when benign happens to be the majority class. Reindexing pins the slice
# order to [0, 1] so it always matches the ['benign', 'malignant'] labels.
diagnosis_counts = data['diagnosis'].value_counts().reindex([0, 1], fill_value=0)
plt.figure(figsize=(7, 7))
plt.pie(
    diagnosis_counts,
    labels=['benign', 'malignant'],
    colors=['wheat', 'darkred'],
    autopct='%1.1f%%',
    startangle=90,
    wedgeprops=dict(width=1, edgecolor='#000000'),
)
plt.axis('equal')
plt.title('Distribution of diagnosis variable')
plt.show()

In [None]:
#Distribution Plots
def distributionPlots(variable, bin=None):
    """Draw overlaid density curves of one feature for malignant vs. benign rows.

    Reads the notebook-level frames `M` (malignant rows) and `B` (benign rows),
    so those must be defined before this is called.

    Parameters
    ----------
    variable : str
        Column name to plot; must exist in both `M` and `B`.
    bin : optional
        Not used by the KDE plot. Kept (now with a default) only so existing
        two-argument calls continue to work.
    """
    mal = M[variable]
    ben = B[variable]

    # Draw on an explicit Axes instead of relying on pyplot's current-figure state.
    fig, ax = plt.subplots(figsize=(10, 6))

    sns.kdeplot(mal, color='darkred', label='Malignant', fill=True, linewidth=2, ax=ax)
    sns.kdeplot(ben, color='wheat', label='Benign', fill=True, linewidth=2, ax=ax)

    ax.set_xlabel(variable)
    ax.set_ylabel('Density')
    ax.set_title('Density Plot for ' + variable)

    ax.legend()
    plt.show()

In [None]:
# Mean Features
# Each entry is (column name, bin argument forwarded to distributionPlots).
mean_feature_args = [
    ('radius_mean', .5),
    ('texture_mean', .5),
    ('perimeter_mean', 5),
    ('area_mean', 10),
    ('smoothness_mean', .5),
    ('compactness_mean', .5),
    ('concavity_mean', .5),
    ('concave points_mean', .5),
    ('symmetry_mean', .5),
    ('fractal_dimension_mean', .5),
]
for feature_name, feature_bin in mean_feature_args:
    distributionPlots(feature_name, feature_bin)

In [None]:
# Standard Error Features
# Each entry is (column name, bin argument forwarded to distributionPlots).
se_feature_args = [
    ('radius_se', .1),
    ('texture_se', .1),
    ('perimeter_se', .5),
    ('area_se', 5),
    ('smoothness_se', .5),
    ('compactness_se', .5),
    ('concavity_se', .5),
    ('concave points_se', .5),
    ('symmetry_se', .5),
    ('fractal_dimension_se', .5),
]
for feature_name, feature_bin in se_feature_args:
    distributionPlots(feature_name, feature_bin)

In [None]:
# Worst Features
# Each entry is (column name, bin argument forwarded to distributionPlots).
worst_feature_args = [
    ('radius_worst', .5),
    ('texture_worst', .5),
    ('perimeter_worst', 5),
    ('area_worst', 10),
    ('smoothness_worst', .5),
    ('compactness_worst', .5),
    ('concavity_worst', .5),
    ('concave points_worst', .5),
    ('symmetry_worst', .5),
    ('fractal_dimension_worst', .5),
]
for feature_name, feature_bin in worst_feature_args:
    distributionPlots(feature_name, feature_bin)

In [None]:
# Correlation Matrix
# `predictors` is every column except the diagnosis target; `correlation`
# covers the full frame including the target.
predictors = data.drop(columns=['diagnosis'])
correlation = data.corr()

# Annotated heatmap of the predictor-to-predictor correlations.
predictor_correlation = predictors.corr()
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(predictor_correlation, annot=True, linewidths=.5, fmt='.1f', ax=ax)

In [None]:
#Scatter Plots
def scatterPlots(data, x1, y1, x2, y2, palette, edgecolor, title):
    """Draw two side-by-side scatter plots of feature pairs, coloured by diagnosis.

    Parameters
    ----------
    data : DataFrame
        Must contain a 'diagnosis' column plus the four feature columns.
    x1, y1 : str
        Column names for the left panel.
    x2, y2 : str
        Column names for the right panel.
    palette : dict or list
        Passed to seaborn as the hue palette.
    edgecolor : str
        Marker edge colour.
    title : str
        Figure-level title.
    """
    # The notebook encodes diagnosis as M = 1, B = 0, so 0 is benign and 1 is
    # malignant (the previous mapping had these two swapped).
    diagnosis_labels = {0: 'Benign', 1: 'Malignant'}

    def _readable(label):
        # Legend entries arrive as strings such as '0' / '1'; anything that is
        # not a known class code (e.g. a title row) is passed through unchanged.
        try:
            return diagnosis_labels[int(float(label))]
        except (TypeError, ValueError, KeyError):
            return label

    def _draw_panel(position, x, y):
        ax = fig.add_subplot(position)
        sns.scatterplot(x=data[x], y=data[y], hue="diagnosis", data=data,
                        palette=palette, edgecolor=edgecolor, ax=ax)
        # Rename using the handles seaborn created for each hue level. Passing
        # only `labels=` to legend() pairs them with the Axes' artists in order,
        # which does not correspond to the hue levels.
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, [_readable(label) for label in labels], title='Diagnosis')
        ax.set_title(f'{x} vs {y}')

    # Plot +
    fig = plt.figure(figsize=(14, 12))

    _draw_panel(221, x1, y1)
    _draw_panel(222, x2, y2)

    fig.suptitle(title, fontsize=20)
    plt.tight_layout()
    #plt.savefig(title.lower().replace(' ', '_') + '.png')
    plt.show()

In [None]:
# Features with Positive Correlation
# Each tuple is (left x, left y, right x, right y) for one two-panel figure.
positive_pairs = [
    ('perimeter_mean', 'radius_worst', 'area_mean', 'radius_worst'),
    ('texture_mean', 'texture_worst', 'area_worst', 'radius_worst'),
]
for left_x, left_y, right_x, right_y in positive_pairs:
    scatterPlots(data, left_x, left_y, right_x, right_y,
                 {0: 'wheat', 1: 'darkred'}, 'grey', 'Features with Positive Correlation')

In [None]:
# Features with No Correlation
# Each tuple is (left x, left y, right x, right y) for one two-panel figure.
uncorrelated_pairs = [
    ('smoothness_mean', 'texture_mean', 'radius_mean', 'fractal_dimension_worst'),
    ('texture_mean', 'symmetry_mean', 'texture_mean', 'symmetry_se'),
]
for left_x, left_y, right_x, right_y in uncorrelated_pairs:
    scatterPlots(data, left_x, left_y, right_x, right_y,
                 {0: 'wheat', 1: 'darkred'}, 'grey', 'Features with No Correlation')

In [None]:
# Features with Negative Correlation
# Each tuple is (left x, left y, right x, right y) for one two-panel figure.
negative_pairs = [
    ('area_mean', 'fractal_dimension_mean', 'radius_mean', 'fractal_dimension_mean'),
    ('area_mean', 'smoothness_se', 'smoothness_se', 'perimeter_mean'),
]
for left_x, left_y, right_x, right_y in negative_pairs:
    scatterPlots(data, left_x, left_y, right_x, right_y,
                 {0: 'wheat', 1: 'darkred'}, 'grey', 'Features with Negative Correlation')

In [None]:
# Reload the raw CSV so this section does not depend on the frames mutated by
# the EDA cells above.
df = pd.read_csv(path)

df = df.drop(['Unnamed: 32', 'id'], axis=1)

# Encode diagnosis (M = 1, B = 0) with an explicit assignment.
# `df.diagnosis.replace(..., inplace=True)` operates on a temporary Series: it
# warns on pandas 2.x and leaves `df` unchanged under Copy-on-Write. Labels
# other than 'M' / 'B' become NaN.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Note: this rebinds the notebook-level M / B used by distributionPlots.
M = df[df['diagnosis'] != 0]
B = df[df['diagnosis'] == 0]

# Target (encoded labels) and predictor matrix.
y = np.array(df.diagnosis.tolist())
X = df.drop('diagnosis', axis=1)

# standardize & PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA() with no arguments keeps all components.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# convert pc to df
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

# explained variance. using this to select the # best of components
def plot_variance(pca):
    """Plot per-component and cumulative explained-variance ratios of a fitted PCA.

    Left panel: bar chart of `pca.explained_variance_ratio_` per component.
    Right panel: running total of those ratios, starting from the point (0, 0).
    Both y-axes are fixed to the range [0, 1]. The figure is shown immediately
    and nothing is returned.
    """
    fig, (ax_each, ax_total) = plt.subplots(1, 2)

    component_ids = np.arange(1, pca.n_components_ + 1)
    ratios = pca.explained_variance_ratio_

    ax_each.bar(component_ids, ratios, color='indigo')
    ax_each.set(xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0))

    # Prepend a zero so the cumulative curve starts at the origin.
    running_total = np.cumsum(ratios)
    ax_total.plot(
        np.concatenate(([0], component_ids)),
        np.concatenate(([0], running_total)),
        "o-",
        color='indigo',
    )
    ax_total.set(xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0))

    fig.set(figwidth=8, dpi=100)
    plt.show()

# Show the variance plots for `pca`, the PCA fitted on X_scaled earlier in this cell.
plot_variance(pca)

In [None]:
# Refit PCA on the standardized predictors, keeping only two components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Rows of `components_` are the components; columns line up with X.columns.
loadings = pca.components_
abs_loadings = np.abs(loadings)


# top three for each PC (largest absolute loading first)
top_three_indices_pc1 = np.argsort(abs_loadings[0])[::-1][:3]
top_three_indices_pc2 = np.argsort(abs_loadings[1])[::-1][:3]
top_three_features_pc1 = X.columns[top_three_indices_pc1]
top_three_features_pc2 = X.columns[top_three_indices_pc2]
print("Top three features for PC1:", top_three_features_pc1)
print("Top three features for PC2:", top_three_features_pc2)

# .unique() keeps a feature from being selected twice if it ranks in the top
# three of both components (which would create duplicated columns).
X_subset = X[top_three_features_pc1.append(top_three_features_pc2).unique()]

# Standardize the subset of features
scaler_subset = StandardScaler()
X_subset_scaled = scaler_subset.fit_transform(X_subset)

#pca using best # of components
pca_subset = PCA(n_components=2)
X_pca_subset = pca_subset.fit_transform(X_subset_scaled)

component_names_subset = [f"PC{i+1}" for i in range(X_pca_subset.shape[1])]
X_pca_subset = pd.DataFrame(X_pca_subset, columns=component_names_subset)

# plot PCA w/ k=2. each PC has features selected using the three highest features
plt.figure(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue=y, data=X_pca_subset, palette=['wheat', 'darkred'])
plt.title('PCA Visualization with Subset of Features (Malignant vs. Benign)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend(title='Diagnosis', loc='upper right')
plt.grid(True)

#explained variance ratio
# Report the ratios of the PCA that is actually plotted (`pca_subset`). The
# previous version read them from `pca`, the model fitted on all predictors,
# so the annotations described a different projection than the one shown.
explained_variance_ratio = pca_subset.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

plt.annotate(f'Explained Variance Ratio PC1: {explained_variance_ratio[0]:.2f}',
             xy=(0.05, 0.95), xycoords='axes fraction', fontsize=10, color='indigo')
plt.annotate(f'Explained Variance Ratio PC2: {explained_variance_ratio[1]:.2f}',
             xy=(0.05, 0.90), xycoords='axes fraction', fontsize=10, color='darksalmon')
plt.annotate(f'Cumulative Explained Variance Ratio: {cumulative_variance_ratio[1]:.2f}',
             xy=(0.05, 0.85), xycoords='axes fraction', fontsize=10, color='orange')

plt.show()


In [None]:

# calculate mutual info scores
def make_mi_scores(X, y):
    """Return mutual-information scores of each column of X against y, highest first.

    Integer-typed columns of X are flagged as discrete features. The estimator
    is chosen from the target's dtype: a boolean, integer, string or object
    target is treated as class labels and scored with `mutual_info_classif`;
    any other target (e.g. float) keeps using `mutual_info_regression`.

    Parameters
    ----------
    X : DataFrame
        Feature matrix; its column names become the index of the result.
    y : array-like
        Target values, one per row of X.

    Returns
    -------
    Series
        Named "MI Scores", indexed by feature name, sorted in descending order.
    """
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]

    # The regression estimator treats the target as continuous, which is the
    # wrong model for class labels such as the 0/1 diagnosis used here.
    target_is_discrete = np.asarray(y).dtype.kind in "biuOSU"
    estimator = mutual_info_classif if target_is_discrete else mutual_info_regression

    mi_scores = estimator(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def plot_mi_scores(scores, color='darkred'):
    """Draw a horizontal bar chart of scores on the current matplotlib figure.

    Scores are sorted ascending before plotting and the bars are labelled with
    the index of `scores`. The figure is neither created nor shown here.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered, color=color)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

# Score every predictor against the diagnosis target and print the full ranking.
mi_scores = make_mi_scores(X, y)
print(mi_scores)

# Bar chart of the 20 highest-scoring features.
top_mi_scores = mi_scores.head(20)
plt.figure(figsize=(8, 5), dpi=100)
plot_mi_scores(top_mi_scores, color='indigo')