In [None]:
# Run in a terminal: these are shell commands, not Python.
# `-p` keeps the step re-runnable when the directory already exists, and `&&`
# stops the chain on the first failure so `git init` can never run in the
# wrong directory.
mkdir -p breast_cancer_analysis &&
cd breast_cancer_analysis &&
git init


In [None]:
# Run in a terminal: these are shell commands, not Python.
# Create an isolated environment in ./venv
python -m venv venv

# Activate it: run exactly ONE of the two lines below, depending on your OS.
# They are alternatives; executing both in sequence makes the second one fail.
source venv/bin/activate        # For MacOS/Linux
# venv\Scripts\activate         # For Windows (uncomment and skip the line above)


In [3]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset through scikit-learn's loader.
data = load_breast_cancer()

# One row per sample: feature columns named after `feature_names`,
# with the class label appended as the final `target` column.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)



Dataset Acquisition and Preparation

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('data 2.csv')

# Fail fast if the label column is missing.
# NOTE(review): the recorded run of this cell raised exactly this error, so the
# 'data 2.csv' used at that time had no 'target' column. Confirm which file and
# label column are intended before relying on the outputs below.
if 'target' not in df.columns:
    raise ValueError("The dataset does not contain a 'target' column.")

# Separate features from the label
X = df.drop('target', axis=1)
y = df['target']

# Hold out 20% of the rows for testing; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Remember the feature names: the scaler returns bare arrays, and without this
# the saved CSVs would only carry positional headers (0, 1, 2, ...).
feature_names = X.columns

# Standardize the features. The scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save preprocessed data, keeping the original feature names as CSV headers
pd.DataFrame(X_train, columns=feature_names).to_csv('X_train.csv', index=False)
pd.DataFrame(X_test, columns=feature_names).to_csv('X_test.csv', index=False)
pd.DataFrame(y_train).to_csv('y_train.csv', index=False)
pd.DataFrame(y_test).to_csv('y_test.csv', index=False)


ValueError: The dataset does not contain a 'target' column.

Feature Selection

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

# Read the training features and labels back from disk; the labels are
# flattened from a one-column table into a 1-D array.
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv').to_numpy().ravel()

# Fit a univariate selector that keeps 10 features scored with f_classif,
# and reduce the training matrix in the same step.
selector = SelectKBest(f_classif, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)

# Persist the reduced training matrix first ...
pd.DataFrame(X_train_selected).to_csv('X_train_selected.csv', index=False)

# ... then push the test features through the already-fitted selector and
# persist that result as well.
X_test_selected = selector.transform(pd.read_csv('X_test.csv'))
pd.DataFrame(X_test_selected).to_csv('X_test_selected.csv', index=False)


Grid Search CV for Model Tuning

In [None]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Load selected features and the matching labels (flattened to 1-D)
X_train_selected = pd.read_csv('X_train_selected.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()

# Define the parameter grid: every combination below is evaluated
param_grid = {
    'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

# Initialize the MLPClassifier.
# The network's weight initialisation is random, so an unseeded search can pick
# different "best" parameters on every run. Seed it with the same value used
# for the train/test split so the search is repeatable.
mlp = MLPClassifier(max_iter=100, random_state=42)

# Initialize GridSearchCV: 3-fold cross-validation, using all available cores
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, n_jobs=-1, cv=3)

# Fit the model
grid_search.fit(X_train_selected, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Save the best parameters as the dict's Python literal representation,
# which is the format the training step reads back.
with open('best_params.txt', 'w') as f:
    f.write(str(best_params))


Implementing an Artificial Neural Network (ANN) Model

In [None]:
import ast

import joblib
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load selected features and labels for both splits (labels flattened to 1-D)
X_train_selected = pd.read_csv('X_train_selected.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
X_test_selected = pd.read_csv('X_test_selected.csv')
y_test = pd.read_csv('y_test.csv').values.ravel()

# Load the best parameters.
# The file holds the literal repr of a dict (strings, floats, tuples), so
# ast.literal_eval is sufficient. Unlike eval(), it cannot execute arbitrary
# code if the file has been edited or replaced.
with open('best_params.txt', 'r') as f:
    best_params = ast.literal_eval(f.read())

if not isinstance(best_params, dict):
    raise ValueError("best_params.txt must contain a dict of MLPClassifier parameters.")

# Initialize the MLPClassifier with the best parameters
best_mlp = MLPClassifier(**best_params)

# Train the model
best_mlp.fit(X_train_selected, y_train)

# Evaluate the model on the held-out split
y_pred = best_mlp.predict(X_test_selected)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the trained model
joblib.dump(best_mlp, 'best_mlp_model.pkl')


Building a Streamlit App Locally

In [None]:
import streamlit as st
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Load data and model
df = pd.read_csv('/mnt/data/data 2.csv')
model = joblib.load('best_mlp_model.pkl')

# Preprocess data for display
# NOTE(review): the scaler and selector are re-fitted here on the full dataset
# rather than reusing the ones fitted on the training split, so the scaling and
# even the chosen features may differ from what the saved model was trained on.
# Consider persisting and loading the training-time scaler/selector instead.
X = df.drop('target', axis=1)
y = df['target']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X_scaled, y)

# Names of the columns the selector kept, in the same order as the columns of
# X_selected (the selector preserves the input column order).
selected_features = X.columns[selector.get_support()]

st.title("Breast Cancer Prediction App")
st.write("Interact with the dataset and get predictions")

# User input for prediction: one slider per *selected* feature.
# Iterating over every dataset column here would index past the 10 columns of
# X_selected and would hand the model more values than it was trained on.
# Slider values are in standardized units because they come from X_selected.
user_input = []
for i, col in enumerate(selected_features):
    column_values = X_selected[:, i]
    user_input.append(
        st.slider(
            col,
            float(column_values.min()),
            float(column_values.max()),
            float(column_values.mean()),
        )
    )

# Predict
if st.button("Predict"):
    prediction = model.predict([user_input])
    st.write(f"The predicted class is: {prediction[0]}")


Deployment and Version Control

In [None]:
# Run in a terminal from the project root: these are shell commands, not Python.

# Keep the virtual environment directory out of version control; `git add .`
# would otherwise stage everything under venv/.
echo "venv/" >> .gitignore

git add .
git commit -m "Initial commit"

# Replace the placeholder with your repository URL. It is quoted so the shell
# does not interpret the angle brackets as input/output redirection.
git remote add origin "<your-repository-url>"

# Push the current branch, whatever `git init` named it (master or main),
# and set it as the upstream.
git push -u origin HEAD
