In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from pathlib import Path

# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = Path('Cancer.csv')

# Fail early with an actionable message: the recorded run of this cell stopped
# with a bare FileNotFoundError that gave no hint where the file was expected.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH.resolve()}'. "
        "Place Cancer.csv in the working directory or update DATA_PATH."
    )

data = pd.read_csv(DATA_PATH)

# Silence library warnings for the rest of the notebook.
# NOTE: this is a blanket filter, so deprecation notices are hidden as well.
warnings.filterwarnings('ignore')

FileNotFoundError: [Errno 2] No such file or directory: 'Cancer.csv'

In [None]:
# Preview the first five rows of the loaded dataset.
data.head(n=5)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
dataset_shape = data.shape
print(dataset_shape)

In [None]:
# Count the missing entries in every column.
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Display data types and basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
data.info()

In [None]:
# Descriptive statistics for the dataset's columns.
summary_statistics = data.describe()
print(summary_statistics)

In [None]:
# Drop the 'Unnamed: 32' column (the empty column) and 'id', then remove
# duplicate rows. errors='ignore' together with a fresh assignment (instead of
# inplace=True) keeps this cell safe to re-run: a second run no longer raises
# KeyError for columns that are already gone.
data = data.drop(columns=['Unnamed: 32', 'id'], errors='ignore').drop_duplicates()

# Encode the target as 1 (label 'M') / 0 (label 'B'). The mapping is applied only
# while the column still holds the raw labels; mapping an already-encoded column
# would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['diagnosis']):
    data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

## Correlation

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
correlation_matrix = data.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Breast Cancer Features')
plt.show()

## Univariate Analysis

In [None]:
# Histogram of radius_mean with a KDE curve overlaid.
fig, ax = plt.subplots()
sns.histplot(data['radius_mean'], kde=True, ax=ax)
ax.set_title('Distribution of Radius Mean')
plt.show()

In [None]:
# One 20-bin histogram per numeric column, laid out on a single large figure.
data.hist(bins=20, figsize=(16, 12))
# Tighten the spacing of the figure that DataFrame.hist just created.
plt.gcf().tight_layout()
plt.show()

In [None]:
# Number of rows in each diagnosis class.
fig, ax = plt.subplots()
sns.countplot(data=data, x='diagnosis', ax=ax)
ax.set_title('Diagnosis Count (0 = Benign, 1 = Malignant)')
plt.show()

In [None]:
# Kernel Density Estimation (KDE) plot for radius_mean.
# `fill=True` replaces `shade=True`, which seaborn has deprecated; the notebook's
# blanket warnings filter was hiding that deprecation notice.
sns.kdeplot(data['radius_mean'], fill=True)
plt.title('KDE plot for Radius Mean')
plt.show()

## Bivariate Analysis

In [None]:
# Compare the radius_mean distribution across the two diagnosis classes.
fig, ax = plt.subplots()
sns.boxplot(data=data, x='diagnosis', y='radius_mean', ax=ax)
ax.set_title('Boxplot of Radius Mean by Diagnosis')
plt.show()

In [None]:
# Compare the texture_mean distribution across the two diagnosis classes.
fig, ax = plt.subplots()
sns.violinplot(data=data, x='diagnosis', y='texture_mean', ax=ax)
ax.set_title('Violin Plot for Texture Mean by Diagnosis')
plt.show()

In [None]:
# Pairplot of four size-related features, coloured by diagnosis.
pairplot_columns = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'diagnosis']
pair_grid = sns.pairplot(data[pairplot_columns], hue='diagnosis')

# pairplot builds a grid of subplots, so plt.title() would label only the
# current (last) axes. Title the whole figure instead; y > 1 lifts the text
# above the top row so it does not overlap the plots.
pair_grid.figure.suptitle('Pairplot of Key Features', y=1.02)
plt.show()

## Multivariate Analysis

In [None]:
# Correlation matrix of every column, rendered as an annotated heatmap.
correlation_matrix = data.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix of Features')
plt.show()

In [None]:
# Scatter-plot matrix of the selected features (no class colouring).
scatter_columns = ['radius_mean', 'texture_mean', 'area_mean', 'perimeter_mean']
sns.pairplot(data[scatter_columns], kind='scatter')
plt.show()

In [None]:
# Correlation of every column with the diagnosis column, strongest positive first.
diagnosis_column = data.corr()['diagnosis']
correlation_with_diagnosis = diagnosis_column.sort_values(ascending=False)
print(correlation_with_diagnosis)

## Outlier Detection

In [None]:
# Side-by-side boxplots of every column except the target, to eyeball outliers.
feature_frame = data.drop(columns='diagnosis')

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=feature_frame, ax=ax)
ax.set_title('Boxplot for Outliers Detection')
plt.show()

In [None]:
from scipy import stats

# Absolute z-score of every value in the non-target columns; positions whose
# score exceeds 3 are printed as index arrays.
feature_values = data.drop(columns='diagnosis')
z_scores = np.abs(stats.zscore(feature_values))
outlier_positions = np.where(z_scores > 3)
print(outlier_positions)

## Dimensionality Reduction using PCA (Principal Component Analysis) 

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize every column except the target before running PCA.
feature_frame = data.drop(columns='diagnosis')
scaler = StandardScaler()
data_scaled = scaler.fit_transform(feature_frame)

# Project the standardized features onto two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(data_scaled)
print(f'Explained variance by 2 components: {pca.explained_variance_ratio_.sum()}')

# Scatter the two components against each other, coloured by diagnosis.
first_component = pca_result[:, 0]
second_component = pca_result[:, 1]

fig, ax = plt.subplots()
ax.scatter(first_component, second_component, c=data['diagnosis'], cmap='coolwarm')
ax.set_title('PCA of Breast Cancer Dataset')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()

## Feature Engineering

In [None]:
# Derived feature: radius_mean divided by perimeter_mean, stored as a new column.
data['radius_perimeter_ratio'] = data['radius_mean'].div(data['perimeter_mean'])

# Histogram (with KDE overlay) of the derived column.
fig, ax = plt.subplots()
sns.histplot(data['radius_perimeter_ratio'], kde=True, ax=ax)
ax.set_title('Distribution of Radius to Perimeter Ratio')
plt.show()