In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#from google.colab import drive

In [2]:
#drive.mount('/content/drive') # Uncomment if loading from Google Drive
#cancer_df = pd.read_csv('/content/drive/My Drive/PROYECTOS/DM_II_project/data/raw/patient_train_data.csv') # Uncomment if loading from Google Drive .Load the data from Google Drive
cancer_df = pd.read_csv('g:/Mi unidad/PROYECTOS/DM_II_project/data/raw/patient_train_data.csv') # Uncomment if loading from local directory

In [3]:
cancer_df.head()

Unnamed: 0,ID,Alcohol Consumption,Cancer Stage,Country,Date of Birth,Diabetes,Diabetes History,Diet Risk,Early Detection,Family History,...,Non Smoker,Obesity BMI,Physical Activity,Screening History,Smoking History,Transfusion History,Treatment Type,Tumor Size (mm),Urban or Rural,Survival Prediction
0,1,No,Localized,UK,29-01-1966,No,No,Moderate,No,No,...,Yes,Overweight,Low,Regular,No,-,Chemotherapy,33.0,Urban,Yes
1,2,No,Regional,Japan,21-12-1958,No,No,Low,No,No,...,No,Normal,Low,Irregular,Yes,-,Chemotherapy,17.0,Urban,No
2,3,No,Localized,France,16-06-1959,No,No,Low,Yes,No,...,No,Normal,Moderate,Never,Yes,-,Surgery,34.0,Urban,Yes
3,4,Yes,Localized,Japan,18-01-1956,Yes,No,Moderate,No,No,...,Yes,Obese,Low,Regular,No,-,Radiotherapy,71.0,Urban,No
4,5,No,Regional,USA,04-05-1975,No,No,Moderate,Yes,Yes,...,Yes,Overweight,Low,Regular,No,-,Radiotherapy,48.0,Rural,Yes


# 1. Basic cleaning and data split

Three columns were dropped from the `cancer_df` dataframe: 'Diabetes History', 'Marital Status', and 'Transfusion History'.

In [4]:
cancer_df_reduced = cancer_df.drop(columns=['Diabetes History', 'Marital Status', 'Transfusion History'], axis=1).copy()

The country names in the `cancer_df_reduced` dataframe were replaced by their respective continents to reduce the cardinality of the categorical variable. This helps in simplifying the analysis and modeling by grouping countries into broader regions.

In [5]:
cancer_df_reduced['Country'].unique()

array(['UK', 'Japan', 'France', 'USA', 'China', 'South Korea', 'Brazil',
       'Germany', 'Canada', 'Pakistan', 'Italy', 'New Zealand',
       'South Africa', 'India', 'Nigeria', 'Australia', nan], dtype=object)

In [6]:
continents = {
    'USA': 'North America',
    'Brazil': 'South America',
    'Germany': 'Europe',
    'France': 'Europe',
    'Italy': 'Europe',
    'UK': 'Europe',
    'China': 'Asia',
    'Japan': 'Asia',
    'India': 'Asia',
    'South Korea': 'Asia',
    'Pakistan': 'Asia',
    'South Africa': 'Africa',
    'Nigeria': 'Africa',
    'Australia': 'Oceania',
    'New Zealand': 'Oceania'
}
cancer_df_reduced['Continent'] = cancer_df_reduced['Country'].replace(continents)
cancer_df_reduced.drop(columns=['Country'], inplace=True)
cancer_df_reduced['Continent'].head()

0           Europe
1             Asia
2           Europe
3             Asia
4    North America
Name: Continent, dtype: object

The birth dates in the dataframe were converted to datetime format, and the age of each patient was calculated based on the current year. The original birth date column was then removed from the dataframe.

In [7]:
from datetime import datetime

cancer_df_reduced['Date of Birth'] = pd.to_datetime(cancer_df_reduced['Date of Birth'], format='%d-%m-%Y', errors='coerce')
cancer_df_reduced['Age'] = cancer_df_reduced['Date of Birth'].apply(lambda x: datetime.now().year - x.year if pd.notnull(x) else None)
cancer_df_reduced.drop(columns=['Date of Birth'], inplace=True)
cancer_df_reduced['Age'].head()

0    59.0
1    67.0
2    66.0
3    69.0
4    50.0
Name: Age, dtype: float64

The 'Urban or Rural' column values were standardized by replacing 'urban' with 'Urban' and 'rural' with 'Rural' to ensure consistency in the dataset.

In [8]:
cancer_df_reduced['Urban or Rural'].unique()

array(['Urban', 'Rural', 'urban', 'rural', nan], dtype=object)

In [9]:
cancer_df_reduced['Urban or Rural'].replace({'urban': 'Urban', 'rural': 'Rural'}, inplace=True)
cancer_df_reduced['Urban or Rural'].unique()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cancer_df_reduced['Urban or Rural'].replace({'urban': 'Urban', 'rural': 'Rural'}, inplace=True)


array(['Urban', 'Rural', nan], dtype=object)

In [10]:
cancer_df_reduced.head()

Unnamed: 0,ID,Alcohol Consumption,Cancer Stage,Diabetes,Diet Risk,Early Detection,Family History,Gender,Genetic Mutation,Healthcare Access,...,Obesity BMI,Physical Activity,Screening History,Smoking History,Treatment Type,Tumor Size (mm),Urban or Rural,Survival Prediction,Continent,Age
0,1,No,Localized,No,Moderate,No,No,M,No,High,...,Overweight,Low,Regular,No,Chemotherapy,33.0,Urban,Yes,Europe,59.0
1,2,No,Regional,No,Low,No,No,M,No,Moderate,...,Normal,Low,Irregular,Yes,Chemotherapy,17.0,Urban,No,Asia,67.0
2,3,No,Localized,No,Low,Yes,No,M,No,High,...,Normal,Moderate,Never,Yes,Surgery,34.0,Urban,Yes,Europe,66.0
3,4,Yes,Localized,Yes,Moderate,No,No,M,No,Moderate,...,Obese,Low,Regular,No,Radiotherapy,71.0,Urban,No,Asia,69.0
4,5,No,Regional,No,Moderate,Yes,Yes,M,No,Low,...,Overweight,Low,Regular,No,Radiotherapy,48.0,Rural,Yes,North America,50.0


The dataset was split into training, validation, and test sets to ensure robust model evaluation and to prevent overfitting. The initial split separated 15% of the data as the test set, which will be used for the final evaluation of the model's performance. This ensures that the test set remains completely unseen during the training and validation phases, providing an unbiased assessment of the model's generalization capability.

The proportions were chosen to maintain a balance between having enough data for training the model and having sufficient data for validation and testing to ensure reliable performance metrics.

In [11]:
from sklearn.model_selection import train_test_split

X = cancer_df_reduced.drop(columns=['Survival Prediction'])
y = cancer_df_reduced['Survival Prediction'] 

# First split to separate the test set (15% of the data)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

# Split the remaining data into training and validation sets
# test_size=0.1765 because 15/85 ≈ 0.1765 (to get 15% of the original total)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp)

# Verify the sizes and proportions
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (52522, 28)
X_val shape: (11257, 28)
X_test shape: (11256, 28)
y_train shape: (52522,)
y_val shape: (11257,)
y_test shape: (11256,)


# 2. PREPROCESSING MANUAL

In [16]:
from sklearn.preprocessing import FunctionTransformer, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif
from scipy import stats
import numpy as np

In [13]:
# Imputar valores faltantes en columnas numéricas
num_imputer = SimpleImputer(strategy='median')
X_train_num = num_imputer.fit_transform(X_train.select_dtypes(include='number'))

# Imputar valores faltantes en columnas categóricas
cat_imputer = SimpleImputer(strategy='most_frequent')
X_train_cat = cat_imputer.fit_transform(X_train.select_dtypes(exclude='number'))

# Escalar características numéricas
scaler = RobustScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num)

# Codificar variables categóricas
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_cat_encoded = encoder.fit_transform(X_train_cat).toarray()

# Combinar características numéricas y categóricas
X_train_preprocessed = np.hstack((X_train_num_scaled, X_train_cat_encoded))

In [28]:
from xgboost import XGBClassifier

# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Train the model with best parameters

scale_pos_weight = len(y_train_encoded[y_train_encoded == 0]) / len(y_train_encoded[y_train_encoded == 1])
model = XGBClassifier(scale_pos_weight=scale_pos_weight,
    random_state=42,
    colsample_bytree=0.8,
    learning_rate=0.2,
    max_depth=7,
    n_estimators=300,
    subsample=0.8
)
model = XGBClassifier(random_state=42)
model.fit(X_train_preprocessed, y_train_encoded)

In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Ensure the target variable is encoded
y_train_encoded = label_encoder.fit_transform(y_train)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_weighted', cv=5, n_jobs=-1)
grid_search.fit(X_train_preprocessed, y_train_encoded)
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}


In [29]:
from sklearn.metrics import classification_report
# Preprocesar el conjunto de validación
X_val_num = num_imputer.transform(X_val.select_dtypes(include='number'))
X_val_cat = cat_imputer.transform(X_val.select_dtypes(exclude='number'))
X_val_num_scaled = scaler.transform(X_val_num)
X_val_cat_encoded = encoder.transform(X_val_cat).toarray()
X_val_preprocessed = np.hstack((X_val_num_scaled, X_val_cat_encoded))

# Hacer predicciones y evaluar
y_pred = model.predict(X_val_preprocessed)

# Decode the predicted labels back to the original string labels
y_pred_decoded = label_encoder.inverse_transform(y_pred)

print(classification_report(y_val, y_pred_decoded))

              precision    recall  f1-score   support

          No       0.42      0.14      0.21      4496
         Yes       0.60      0.87      0.71      6761

    accuracy                           0.58     11257
   macro avg       0.51      0.50      0.46     11257
weighted avg       0.53      0.58      0.51     11257



# 3. PREPROCESSING PIPELINE

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif
from scipy import stats
import numpy as np

Winsorization of numeric columns with outliers

In [None]:
def winsorize_columns(X, limits=[0.01, 0.01]):
    X_winsorized = np.copy(X)
    for i in range(X.shape[1]):
        X_winsorized[:, i] = stats.mstats.winsorize(X[:, i], limits=limits)
    return X_winsorized

numerical_df = X_train.select_dtypes(include='number')
negative_columns = numerical_df.select_dtypes(include='number').columns[(numerical_df < 0).any()].tolist()


def negative_imputer(X, negative_columns):
    for i in range(X.shape[1]):
        if (X[:, i] < 0).any():
            column_median = np.median(X[:, i][X[:, i] >= 0])
            X[:, i] = np.where(X[:, i] < 0, column_median, X[:, i])
    return X

numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('negative imputer', FunctionTransformer(negative_imputer, kw_args={'negative_columns': negative_columns})), ('scaler', RobustScaler()), ('winsorizer', FunctionTransformer(winsorize_columns, kw_args={'limits': [0.01, 0.01]})), ('feature_selection', SelectKBest(score_func=f_classif, k=10))])
categorial_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, X_train.select_dtypes(include='number').columns), ('cat', categorial_transformer, X_train.select_dtypes(exclude='number').columns)])

random_forest = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=0)

pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', random_forest)])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_val)



In [None]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

          No       0.40      0.05      0.08      4496
         Yes       0.60      0.95      0.74      6761

    accuracy                           0.59     11257
   macro avg       0.50      0.50      0.41     11257
weighted avg       0.52      0.59      0.48     11257

