# Pima Data cleaning



In [None]:
# Import Modules Here
import pandas as pd

## Part 1: Read the Data into Jupyter

In [None]:
# Read the data into a dataframe named pima_df.
# Load the raw, uncleaned CSV; every later cleaning step derives from this frame.
pima_df = pd.read_csv(filepath_or_buffer='diabetes_uncleaned_data.csv')

In [None]:
#Question 1:  Find Columns with missing data
# Per-column count of missing cells (isnull is the alias of isna).
pima_df.isnull().sum()

In [None]:
# Question 2:  Write code to remove rows with missing data and save the new dataframe as pima_missing_fixed_df
# Summarise column dtypes and non-null counts of the raw frame, before any rows are dropped.
pima_df.info()

In [None]:
# dropna() defaults to dropping every row that has at least one missing value.
pima_missing_fixed_df = pima_df.dropna()
# Sanity check: every column should now report zero missing cells.
pima_missing_fixed_df.isnull().sum()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
pima_missing_fixed_df.info()

In [None]:
# Question 3:  Print out the number of rows in the dataframe.
# Row count after dropping rows with missing values (first element of shape).
pima_missing_fixed_df.shape[0]

In [None]:
# Question 4: In your jupyter notebook, discuss why this could be a problem. What other methods could you use in this situation? 
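Discuss why missing data could be a problem? Missing data could lead to incomplete analysis: if not handled properly, it could skew 
statistical measures and affect the reliability of the conclusions that are being drawn from the data. We don't want stakeholders viewing
this data. We first need to understand why the data is missing, what kind of data is missing, and whether it needs to be filled in before making 
conclusions. Dropping every row with a missing value also shrinks the dataset; other methods include imputing the missing values (for example
with the column mean or median) or dropping only the columns that are mostly empty.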

In [None]:
# Question 1: find the rows of duplicated data in the pima_missing_fixed_df and print them.  
# duplicated() marks every repeat of an earlier row; select just those rows.
pima_missing_fixed_df.loc[pima_missing_fixed_df.duplicated()]

In [None]:
# Question 2:  Write code to remove the duplicate rows and save the new dataframe as pima_dedupped_df
# Remove duplicate rows and save
# Keep the first occurrence of each row and discard its later repeats.
pima_dedupped_df = pima_missing_fixed_df.drop_duplicates(keep='first')

In [None]:
# Question 3:  Print out the number of rows in the dataframe.
# Row count after de-duplication (first element of shape).
pima_dedupped_df.shape[0]

In [None]:
# Question 1: find the columns where most of the data seems to be one type of data, and there seems to be a data error. In the markdown, after exploring in code, add a markdown cell discussing which columns have the data error. 
# Preview the first five rows to eyeball what each column holds.
pima_dedupped_df.head(n=5)

In [None]:
# Show each column's dtype; a measurement column reported as object likely
# contains non-numeric entries (to be confirmed by inspecting its values).
print(pima_dedupped_df.dtypes)

In [None]:
# Print the observed min -> max of every numeric column so implausible values stand out.
# select_dtypes(include="number") matches every numeric dtype; comparing each
# dtype to the builtin int/float depends on the platform's default integer
# width and can silently skip columns.
for col in pima_dedupped_df.select_dtypes(include="number").columns:
    print(f'{col}: {pima_dedupped_df[col].min()} -> {pima_dedupped_df[col].max()}')

In [None]:
# Largest BloodPressure entry, used to spot suspicious values in that column.
pima_dedupped_df['BloodPressure'].max()

In [None]:
# Largest BMI entry, used to spot suspicious values in that column.
pima_dedupped_df['BMI'].max()

In [None]:
# Frequency of each distinct Outcome entry; unexpected labels show up here.
pima_dedupped_df['Outcome'].value_counts()

In [None]:
### Columns with Identified Data Errors (Based on Unique Values)

Upon examining the unique values in each column:

- **Pregnancies**: contains -100 which is an error based on num pregnancies in df.
- **Glucose**: given that 70 to 200 mg/dL covers both normal and diabetic populations, values like 0 and very low integers are likely errors or missing
data points. Values close to 200 should be checked for their validity.
- **BloodPressure**: Contains values like 0 and 10000, which are likely errors given the context of blood pressure, as well as the string value 'Error'.
- **SkinThickness**: Zero (0) might indicate a missing value or an error, as it's unusual to have zero skin thickness.
- **Insulin**: The value 10000 stands out as it is significantly higher than other values and might be an error or a rare extreme case.
- **BMI**: Includes values such as 0, which are biologically implausible for body mass index, as well as some extremes. It also contains the string 'Error' in place of a number.
- **DiabetesPedigreeFunction**: need more insights
- **Age**: Fairly reasonable, could check validity of extreme high values
- **Outcome**: some error values

These unique values suggest potential errors or inconsistencies in the dataset that need to be addressed


In [None]:
import numpy as np

In [None]:
# Question 2: Remove the rows with data errors in your jupyter notebook.
# Boolean row mask: True where any of the three columns holds its error marker.
error_columns = (
    pima_dedupped_df['BloodPressure'].eq('Error')
    | pima_dedupped_df['BMI'].eq('Error')
    | pima_dedupped_df['Outcome'].eq('ERROR')
)
# Keep the unflagged rows; copy so later column assignments act on an independent frame.
pima_fixed_columns_df = pima_dedupped_df.loc[~error_columns].copy()
# Confirm which Outcome labels remain after the filter.
pima_fixed_columns_df['Outcome'].value_counts()

In [None]:
# Question 3: After the data is fixed in your columns change the columns to the correct type and save this as the pima_fixed_columns_df.
# astype converts a whole column in one call instead of a per-element lambda.
pima_fixed_columns_df['BloodPressure'] = pima_fixed_columns_df['BloodPressure'].astype(int)
pima_fixed_columns_df['BMI'] = pima_fixed_columns_df['BMI'].astype(float)
# Outcome is filtered on the string 'ERROR' when the error rows are removed, so
# it presumably loaded as an object column as well; convert it too, otherwise
# it would stay non-numeric after this step.
pima_fixed_columns_df['Outcome'] = pima_fixed_columns_df['Outcome'].astype(int)

In [None]:
# Question 4: Run pima_fixed_columns_df.info() to confirm the columns have changed.
# Print the dtype summary to confirm the converted columns are no longer object dtype.
pima_fixed_columns_df.info()

## Part 5: Outlier Detection and Removal


In [None]:
# Question 1: Print out the Outliers in each column in the pima_fixed_columns_df dataframe, use the IQR method of outlier detection.
def is_outlier(column: pd.Series, factor: float = 1.5) -> pd.Series:
    """Flag the values of ``column`` that lie outside its IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        column: Numeric series to inspect.
        factor: Multiplier applied to the IQR when building the fences;
            the default 1.5 is the conventional IQR rule.

    Returns:
        A boolean series aligned with ``column``: True where the value is
        strictly below the lower fence or strictly above the upper fence.
        Missing values are reported as False.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Explicit comparisons evaluate to False for NaN, so missing values are not
    # reported as outliers (negating ``between`` would flag every NaN).
    return (column < lower_bound) | (column > upper_bound)

# Report the IQR outliers of every numeric column.
# select_dtypes(include="number") matches every numeric dtype; comparing each
# dtype to the builtin int/float depends on the platform's default integer
# width and can silently skip columns.
for col in pima_fixed_columns_df.select_dtypes(include="number").columns:
    outlier_mask = is_outlier(pima_fixed_columns_df[col])
    outliers = pima_fixed_columns_df.loc[outlier_mask, col]
    # Stay silent for columns without any flagged value.
    if not outliers.empty:
        print(f'{col}: {outliers.values}')

In [None]:
# Question 2: Use loc to remove outliers in each of the columns that have outliers, save this as pima_outlier_removed_df.
# Combine the per-column IQR masks: a row is flagged when any of the three
# columns marks it as an outlier.
outliers = (
    is_outlier(pima_fixed_columns_df['Pregnancies'])
    | is_outlier(pima_fixed_columns_df['BloodPressure'])
    | is_outlier(pima_fixed_columns_df['Insulin'])
)
# Keep only the rows that no column flagged.
pima_outlier_removed_df = pima_fixed_columns_df.loc[~outliers]

In [None]:
# Question 3:  Print out the row count in the pima_outlier_removed_df and confirm this number is correct.
# Row count after outlier removal (first element of shape).
pima_outlier_removed_df.shape[0]