In [16]:
import pandas as pd
import numpy as np


In [17]:
# Load the dataset
file_path = 'Ovarian_patient_data.csv'
df = pd.read_csv(file_path)
print("Data loaded successfully. Data shape:", df.shape)

Data loaded successfully. Data shape: (200100, 34)


# Exploratory Data Analysis (EDA)
Before any data manipulation, it is important to understand the data structure, types, and potential issues like missing values or duplicates.  
This step helps in making informed preprocessing decisions, as described in Chen et al. (2019) :contentReference[oaicite:1]{index=1}.


In [18]:
# Display basic information about the dataset
print("===== Dataset Information =====")
print(df.info())



===== Dataset Information =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200100 entries, 0 to 200099
Data columns (total 34 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Timestamp               200100 non-null  object 
 1   Age                     200100 non-null  float64
 2   BMI                     200100 non-null  float64
 3   Comorbidity             200100 non-null  int64  
 4   Symptom                 200100 non-null  int64  
 5   CA125                   200100 non-null  float64
 6   CancerStage             200100 non-null  int64  
 7   Histopathology          200100 non-null  object 
 8   PreviousTreatment       200100 non-null  int64  
 9   MenstrualHistory        200100 non-null  object 
 10  Ethnicity               200100 non-null  object 
 11  Smoking                 200100 non-null  int64  
 12  Alcohol                 200100 non-null  int64  
 13  Residence               200100 non-null  o

In [19]:
print("\n===== Summary Statistics =====")
print(df.describe(include='all'))


===== Summary Statistics =====
                  Timestamp            Age            BMI    Comorbidity  \
count                200100  200100.000000  200100.000000  200100.000000   
unique               200100            NaN            NaN            NaN   
top     2019-01-01 00:00:00            NaN            NaN            NaN   
freq                      1            NaN            NaN            NaN   
mean                    NaN      59.901575      28.003226       0.300000   
std                     NaN      14.668235       4.980700       0.458259   
min                     NaN      18.000000      15.000000       0.000000   
25%                     NaN      49.891372      24.616558       0.000000   
50%                     NaN      60.025998      27.998428       0.000000   
75%                     NaN      70.141600      31.361152       1.000000   
max                     NaN      90.000000      49.096832       1.000000   

              Symptom          CA125    CancerStage His

In [20]:
print("\n===== Missing Values per Column =====")
print(df.isnull().sum())


===== Missing Values per Column =====
Timestamp                 0
Age                       0
BMI                       0
Comorbidity               0
Symptom                   0
CA125                     0
CancerStage               0
Histopathology            0
PreviousTreatment         0
MenstrualHistory          0
Ethnicity                 0
Smoking                   0
Alcohol                   0
Residence                 0
SocioeconomicStatus       0
BRCA_Mutation             0
GeneExpression            0
SNP_Status                0
DNAMethylation            0
miRNA                     0
TumorSize                 0
TumorLocation             0
EnhancementPattern        0
RadiomicTexture           0
RadiomicIntensity         0
RadiomicShape             0
DopplerVelocity           0
Parity                    0
OralContraceptives        0
HormoneTherapy            0
MenarcheAge               0
MenopauseAge              0
RiskLabel                 0
ProgressionProbability    0
dtype: in

# Remove Duplicate Records
Removing duplicate records is essential to prevent bias and redundancy in the analysis.  
Maintaining data quality is a critical step in the early detection modeling pipeline, as emphasized in relevant research (Chen et al., 2019) :contentReference[oaicite:2]{index=2}.


In [21]:
# Remove duplicate rows
df = df.drop_duplicates()
print("Duplicates removed. Data shape after duplicate removal:", df.shape)

Duplicates removed. Data shape after duplicate removal: (200100, 34)


# Identify Numerical and Categorical Features
It is necessary to separate numerical and categorical features because each type requires different preprocessing strategies.  
For instance, numerical features might need imputation and scaling, while categorical features require imputation and encoding.  
This strategy is commonly adopted in integrative data analysis for ovarian cancer detection (Chen et al., 2019) :contentReference[oaicite:3]{index=3}.


In [22]:
# Identify numerical and categorical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

print("Numerical columns:", num_cols)
print("Categorical columns:", cat_cols)


Numerical columns: ['Age', 'BMI', 'Comorbidity', 'Symptom', 'CA125', 'CancerStage', 'PreviousTreatment', 'Smoking', 'Alcohol', 'BRCA_Mutation', 'GeneExpression', 'SNP_Status', 'DNAMethylation', 'miRNA', 'TumorSize', 'EnhancementPattern', 'RadiomicTexture', 'RadiomicIntensity', 'RadiomicShape', 'DopplerVelocity', 'Parity', 'OralContraceptives', 'HormoneTherapy', 'MenarcheAge', 'MenopauseAge', 'RiskLabel', 'ProgressionProbability']
Categorical columns: ['Timestamp', 'Histopathology', 'MenstrualHistory', 'Ethnicity', 'Residence', 'SocioeconomicStatus', 'TumorLocation']


# Preprocess Numerical Features
For numerical data, we first impute missing values using the median.  
Median imputation is robust to outliers. After imputation, we apply standard scaling to normalize the data.  
Scaling improves model convergence and is a standard procedure in clinical data preprocessing (Chen et al., 2019) :contentReference[oaicite:4]{index=4}.


In [23]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Create an imputer and scaler for numerical features
num_imputer = SimpleImputer(strategy='median')
num_scaler = StandardScaler()

# Impute missing values in numerical columns
df[num_cols] = num_imputer.fit_transform(df[num_cols])
# Scale the numerical features
df[num_cols] = num_scaler.fit_transform(df[num_cols])

print("Numerical features preprocessed.")


Numerical features preprocessed.


# Preprocess Categorical Features
For categorical data, missing values are filled using the most frequent value (mode) strategy.  
After imputation, we convert categorical variables to a numerical format using one-hot encoding, which is crucial for machine learning models.  
These techniques are widely validated in research focused on early detection of ovarian cancer (Chen et al., 2019) :contentReference[oaicite:5]{index=5}.


In [24]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Impute missing values in categorical columns
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cat = encoder.fit_transform(df[cat_cols])
encoded_cat_df = pd.DataFrame(encoded_cat, columns=encoder.get_feature_names_out(cat_cols))
# Exclude timestamp column from encoding
timestamp_col = 'Timestamp'
cat_cols.remove(timestamp_col)
encoded_cat = encoder.fit_transform(df[cat_cols])
encoded_cat_df = pd.DataFrame(encoded_cat, columns=encoder.get_feature_names_out(cat_cols))
encoded_cat_df[timestamp_col] = df[timestamp_col].values
# Drop the original categorical columns and merge the encoded features
df = df.drop(columns=cat_cols)
df = pd.concat([df.reset_index(drop=True), encoded_cat_df.reset_index(drop=True)], axis=1)

print("Categorical features preprocessed.")


Categorical features preprocessed.


In [25]:
original_cat_dims = len(cat_cols)
encoded_cat_dims = encoded_cat.shape[1]
increased_dims = encoded_cat_dims - original_cat_dims
# Remove the last dimension of Timestamp
encoded_cat_df = encoded_cat_df.drop(columns=[timestamp_col])
encoded_cat_dims -= 1
print("Original categorical dimensions:", original_cat_dims)
print("Encoded categorical dimensions:", encoded_cat_dims)
print("Increased dimensions due to encoding:", increased_dims)

Original categorical dimensions: 6
Encoded categorical dimensions: 16
Increased dimensions due to encoding: 11


# Save Cleaned Dataset
After preprocessing, we save the cleaned dataset to disk.  
A reproducible and well-documented dataset is crucial for subsequent modeling and ensures that research findings can be validated by peers (Chen et al., 2019) :contentReference[oaicite:6]{index=6}.


In [30]:
# Save the cleaned dataset
output_path = 'Ovarian_patient_data_clean.csv'
df = df.iloc[:, :-1]
print(f"Shape: {df.shape}")
df.to_csv(output_path, index=False)

print("Cleaned dataset saved successfully to:", output_path)


Shape: (200100, 43)
Cleaned dataset saved successfully to: Ovarian_patient_data_clean.csv
