In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot

In [2]:
# Location of the cervical cancer risk-factor CSV on the UCI archive server.
target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv"

In [5]:
# Load the cervical cancer CSV from `target_url` into a pandas DataFrame.
df = pd.read_csv(target_url)

# Print a plain-text preview of the first rows, then of the last rows.
for preview in (df.head(), df.tail()):
    print(preview)

   Age Number of sexual partners First sexual intercourse Num of pregnancies  \
0   18                       4.0                     15.0                1.0   
1   15                       1.0                     14.0                1.0   
2   34                       1.0                        ?                1.0   
3   52                       5.0                     16.0                4.0   
4   46                       3.0                     21.0                4.0   

  Smokes Smokes (years) Smokes (packs/year) Hormonal Contraceptives  \
0    0.0            0.0                 0.0                     0.0   
1    0.0            0.0                 0.0                     0.0   
2    0.0            0.0                 0.0                     0.0   
3    1.0           37.0                37.0                     1.0   
4    0.0            0.0                 0.0                     1.0   

  Hormonal Contraceptives (years)  IUD  ...    \
0                             0.0  0.0  ...

In [6]:
# perform some data validations

# Check whether the cervical cancer data set contains any null (NaN/None) cells.
# NOTE(review): the "?" entries visible in the preview are ordinary text, not
# nulls, so this check does not count them.
has_null_cells = pd.isnull(df).values.any()
has_null_cells

False

In [7]:
# Show the dtype pandas inferred for every column. The columns are expected to
# be numeric; a column reported as `object` was not parsed as numbers
# (presumably because of the "?" text entries seen in the preview).
df.dtypes

Age                                    int64
Number of sexual partners             object
First sexual intercourse              object
Num of pregnancies                    object
Smokes                                object
Smokes (years)                        object
Smokes (packs/year)                   object
Hormonal Contraceptives               object
Hormonal Contraceptives (years)       object
IUD                                   object
IUD (years)                           object
STDs                                  object
STDs (number)                         object
STDs:condylomatosis                   object
STDs:cervical condylomatosis          object
STDs:vaginal condylomatosis           object
STDs:vulvo-perineal condylomatosis    object
STDs:syphilis                         object
STDs:pelvic inflammatory disease      object
STDs:genital herpes                   object
STDs:molluscum contagiosum            object
STDs:AIDS                             object
STDs:HIV  

In [10]:
# there seem to be many columns with missing values

# Columns to convert to numeric dtype, in the order they are processed.
COLUMNS_TO_COERCE = (
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes',
    'Smokes (years)',
    'Smokes (packs/year)',
    'Hormonal Contraceptives',
    'Hormonal Contraceptives (years)',
    'IUD',
    'IUD (years)',
    'STDs',
    'STDs (number)',
    'STDs:condylomatosis',
    'STDs:cervical condylomatosis',
    'STDs:vaginal condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:pelvic inflammatory disease',
    'STDs:genital herpes',
    'STDs:molluscum contagiosum',
    'STDs:AIDS',
    'STDs:HIV',
    'STDs:Hepatitis B',
    'STDs:HPV',
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
)

# errors='coerce' turns every value that cannot be parsed as a number into NaN
# instead of raising, so unparseable entries become proper missing values.
for column_name in COLUMNS_TO_COERCE:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')


In [11]:
# Replace the NaN values in each of these columns with that column's median.
for column_name in (
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
):
    column_median = df[column_name].median()
    df[column_name] = df[column_name].fillna(column_median)

