In [5]:
 # For data manipulation
import pandas as pd 
# For numerical computations
import numpy as np  
# For data visualization
import seaborn as sns  
# For plotting graphs
import matplotlib.pyplot as plt
# For splitting data
from sklearn.model_selection import train_test_split  
# For scaling & encoding
from sklearn.preprocessing import StandardScaler, LabelEncoder  
# For handling missing values
from sklearn.impute import SimpleImputer  

In [7]:
# Read the breast-cancer CSV (expected in the working directory) into `df`
df = pd.read_csv(filepath_or_buffer="Breast_Cancer_Global_Dataset.csv")

In [9]:
# Labelled structural summary of the frame: index range, column names,
# non-null counts and dtypes. `info()` writes to stdout itself and
# returns None, so the cell shows no extra output value.
print("Dataset Info:", end="\n")
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 25 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Country                 1048575 non-null  object 
 1   Region                  1048575 non-null  object 
 2   HDI_Category            1048575 non-null  object 
 3   Population              1048575 non-null  int64  
 4   Screening_Rate          1048575 non-null  float64
 5   Median_Age              1048575 non-null  int64  
 6   Urbanization_Rate       1048575 non-null  float64
 7   Healthcare_Expenditure  1048575 non-null  int64  
 8   GDP_Per_Capita          1048575 non-null  int64  
 9   Obesity_Rate            1048575 non-null  float64
 10  Smoking_Rate            1048575 non-null  float64
 11  Alcohol_Consumption     1048575 non-null  float64
 12  Physical_Activity_Rate  1048575 non-null  float64
 13  Family_History_Rate     1048575 non-null  f

In [11]:
# Keep the first ten rows in `x`; the bare name on the last line makes
# the notebook render them as a table.
x = df.head(n=10)
x

Unnamed: 0,Country,Region,HDI_Category,Population,Screening_Rate,Median_Age,Urbanization_Rate,Healthcare_Expenditure,GDP_Per_Capita,Obesity_Rate,...,Average_Diagnosis_Age,Survival_Rate,Access_To_Care,Education_Level,Women_Population,Breast_Cancer_Cases,Breast_Cancer_Deaths,Cases_Per_100K,Deaths_Per_100K,Mortality_Rate
0,United Kingdom,Asia,High,191540888,66.17,33,67.05,5873,13759,25.52,...,47,80.36,48.78,Primary,95111187,205111,38170,215.65,40.13,18.61
1,Nigeria,North America,High,129793398,63.18,30,65.86,604,35571,15.78,...,42,96.64,35.5,Primary,64449968,138989,25865,215.65,40.13,18.61
2,Saudi Arabia,Africa,Low,3532234,12.91,42,69.01,9138,37653,12.11,...,64,68.65,86.96,Secondary,1753959,3782,703,215.63,40.08,18.59
3,Australia,South America,Low,41960151,12.64,38,26.66,3767,1947,30.66,...,57,73.11,25.97,Tertiary,20835654,44933,8361,215.65,40.13,18.61
4,France,South America,Medium,96709040,44.87,42,41.22,4364,12059,21.48,...,66,50.85,76.07,Primary,48021661,103560,19272,215.65,40.13,18.61
5,Egypt,Africa,Low,100152204,33.27,35,44.88,7775,28583,42.7,...,55,94.5,56.33,Primary,49731392,107247,19958,215.65,40.13,18.61
6,United Kingdom,Europe,Medium,78990845,55.33,42,27.93,3656,29391,45.54,...,35,51.95,54.18,Primary,39223547,84587,15741,215.65,40.13,18.61
7,South Korea,Asia,Low,91261004,72.61,48,29.49,132,34707,7.71,...,52,96.52,36.37,Primary,45316394,97726,18186,215.65,40.13,18.61
8,Bangladesh,Europe,Medium,145796310,83.96,37,73.3,6576,22677,32.4,...,52,80.06,68.1,Tertiary,72396344,156125,29054,215.65,40.13,18.61
9,Australia,North America,High,20815706,23.85,27,83.74,3244,44021,36.07,...,63,87.4,85.2,Primary,10336208,22290,4148,215.65,40.13,18.61


## Handle Missing Values

In [15]:
# Single boolean answer: does any cell in the frame hold a missing value?
print(df.isna().to_numpy().any())

False


In [18]:
# Collect the names of the object-dtype (string) columns; these are the
# ones that need encoding before modelling.
categorical_cols = (
    df
    .select_dtypes(include="object")
    .columns
)
print(categorical_cols)

Index(['Country', 'Region', 'HDI_Category', 'Education_Level'], dtype='object')


In [21]:
# Replace every categorical (object-dtype) column in `df` with integer codes.
# A separate fitted LabelEncoder is stored per column in `label_encoders`
# so the codes can later be mapped back to the original labels via
# `label_encoders[col].inverse_transform(...)`.
#
# NOTE(review): LabelEncoder numbers the classes in sorted label order, so
# ordered categories such as HDI_Category or Education_Level do not
# necessarily keep their natural ranking — confirm this is acceptable for
# the downstream model.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # overwrite the strings with integer codes
    label_encoders[col] = le  # keep the fitted encoder for decoding later

# This cell encodes; the previous message wrongly reported feature scaling.
print("\nLabel encoding completed!")


Feature scaling completed!


In [22]:
# Standardise the numeric feature columns of `df` to zero mean / unit variance.
#
# Two deliberate choices:
#   * The target column `Mortality_Rate` is excluded, so `y` keeps its
#     original units and model errors stay interpretable.
#   * The scaler's mean/std are learned from the training rows only and then
#     applied to every row, so no test-set statistics leak into preprocessing.
#     `train_test_split` chooses rows from the row count and `random_state`
#     alone, so passing the same `test_size` / `random_state` as the split
#     cell further down reproduces exactly that train/test partition here.
#     Keep these two arguments in sync with that cell.
#
# NOTE(review): columns that were label-encoded earlier now hold integer
# codes, so they are picked up as numeric and scaled too — confirm that is
# intended.
scaler = StandardScaler()
numeric_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop(['Mortality_Rate'], errors='ignore')  # never scale the target
)

# Row labels that will end up in the training set (same partition as the
# later split because n_rows, test_size and random_state are identical).
train_rows = train_test_split(df.index, test_size=0.2, random_state=42)[0]

scaler.fit(df.loc[train_rows, numeric_cols])           # statistics from train only
df[numeric_cols] = scaler.transform(df[numeric_cols])  # apply to all rows

print("\nFeature scaling completed!")


Feature scaling completed!


## Splitting the Data into Train & Test

In [23]:
# Separate the column being predicted from the predictors.
y = df['Mortality_Rate']
X = df.drop('Mortality_Rate', axis=1)

# Hold out one fifth of the rows for evaluation; the fixed seed keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("\nData successfully split into training and testing sets!")


Data successfully split into training and testing sets!


In [24]:
# Report (rows, columns) for each partition of the feature matrix.
for set_label, features in (("Training Set:", X_train), ("Test Set:", X_test)):
    print(set_label, features.shape)

Training Set: (838860, 24)
Test Set: (209715, 24)
