### Importing libraries

In [43]:
import pandas as pd

### Reading data set

In [44]:
# Load the thyroid cancer risk dataset from a CSV file located in the
# notebook's working directory into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='thyroid_cancer_risk_data.csv')

## Cleaning the Data
### 1. Duplicate records
Check for duplicated records; if there are any, remove them.

In [45]:
# Collect every row that exactly repeats an earlier row (all columns equal),
# then remove those repeats from `df` in place and report how many there were.
duplicates = df[df.duplicated()]
count = len(duplicates)

if count == 0:
    print("There are no duplicate records in the dataset.")
elif count == 1:
    df.drop_duplicates(inplace=True)
    print("One duplicate record was removed from the dataset.")
else:
    df.drop_duplicates(inplace=True)
    print(f"{count} duplicate records were removed from the dataset.")

There are no duplicate records in the dataset.


### 2. Missing Values

#### a. Replace Missing Values
    
##### Median: Replace missing Age values with median if there are any missing values for Age.


In [46]:
# Impute missing Age values with the column median, rounded to a whole number.
#
# The number of missing values is captured BEFORE filling. Counting after the
# fill (as this cell previously did) always yields 0, so the report could never
# show how many values were actually imputed.
missing_count = df["Age"].isna().sum()

# Series.median() skips NaN by default, so the gaps do not distort the statistic.
medianAge = round(df["Age"].median())

df.fillna({"Age": medianAge}, inplace=True)
print(f"Median for Age attribute is: {medianAge}")
print(f"{missing_count} missing 'Age' values were filled with the median.")

Median for Age attribute is: 52
0 missing 'Age' values were filled with the median.


##### Mean: If there are any empty cells for the Nodule_Size attribute, replace them with the mean.

In [47]:
# Impute missing Nodule_Size values with the column mean, rounded to 2 decimals.
#
# The number of missing values is captured BEFORE filling. Counting after the
# fill (as this cell previously did) always yields 0, so the report could never
# show how many values were actually imputed.
missing_count = df["Nodule_Size"].isna().sum()

# Series.mean() skips NaN by default, so the gaps do not distort the statistic.
meanSize = round(df["Nodule_Size"].mean(), 2)

df.fillna({"Nodule_Size": meanSize}, inplace=True)
print(f"Mean for Nodule_Size attribute is: {meanSize}")
print(f"{missing_count} missing 'Nodule_Size' values were filled with the mean.")

Mean for Nodule_Size attribute is: 2.5
0 missing 'Nodule_Size' values were filled with the mean.


##### Mode: Calculate the MODE for Country, and replace any empty values of the Country column with it

In [48]:
# Impute missing Country values with the most frequent country (the mode).
#
# The number of missing values is captured BEFORE filling. Counting after the
# fill (as this cell previously did) always yields 0, so the report could never
# show how many values were actually imputed.
missing_count = df["Country"].isna().sum()

# mode() returns a Series (there can be ties); take the first entry.
modeForCountry = df["Country"].mode()[0]

df.fillna({"Country": modeForCountry}, inplace=True)
print(f"Mode for Country attribute is: {modeForCountry}")
print(f"{missing_count} missing 'Country' values were filled with the mode.")

Mode for Country attribute is: India
0 missing 'Country' values were filled with the mode.


#### b. Delete the entire record if its Diagnosis value is missing

In [49]:
# Remove every record whose Diagnosis value is missing.
#
# The count is taken BEFORE dropping. Counting afterwards (as this cell
# previously did) always yields 0, because the offending rows are already gone.
missing_count = df["Diagnosis"].isna().sum()

df.dropna(subset=['Diagnosis'], inplace = True)

print(f"{missing_count} records removed because of missing 'Diagnosis' values.")

0 records removed because of missing 'Diagnosis' values.


### 3. Wrong Data
#### a. Replace with correct data

In [50]:
# Cap implausible ages: any Age greater than MAX_VALID_AGE is replaced with
# MAX_VALID_AGE. A single boolean-mask assignment replaces the former
# row-by-row Python loop and updates all offending rows at once.
# NaN ages compare as False and are left untouched.
MAX_VALID_AGE = 100
df.loc[df["Age"] > MAX_VALID_AGE, "Age"] = MAX_VALID_AGE

#### b. Remove records that have wrong data

In [51]:
# Remove records whose Nodule_Size exceeds MAX_VALID_NODULE_SIZE.
# All offending index labels are selected with one boolean mask and dropped in
# a single call, instead of calling df.drop() once per row inside a Python loop
# (each in-place drop rebuilds the frame, so the loop cost grew with both the
# row count and the number of bad rows).
# NaN sizes compare as False and are kept.
MAX_VALID_NODULE_SIZE = 20
df.drop(df.index[df["Nodule_Size"] > MAX_VALID_NODULE_SIZE], inplace=True)