In [1]:
# Python function that takes a Pandas DataFrame as input and returns another DataFrame 
# that shows the percentage of missing values in each column of the input DataFrame.
def getDataFrameWithNullValuesPercentage(df):
    """Summarise how much of each column of ``df`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (indexed by the column label) with two
        columns: ``column_name`` and ``percent_missing`` (0-100). Rows are
        ordered from the most-missing column to the least-missing one.
    """
    # Count nulls per column first, then scale to a percentage of the row count.
    null_counts = df.isnull().sum()
    pct_by_column = null_counts * 100 / len(df)

    summary = pd.DataFrame(
        {
            'column_name': df.columns,
            'percent_missing': pct_by_column,
        }
    )
    return summary.sort_values(by='percent_missing', ascending=False)

In [None]:
def getDataFrameWithNotNullValuesPercentage(df):
    """Return a copy of ``df`` restricted to the columns that contain nulls.

    Every column without a single missing value is dropped; columns with at
    least one missing value are kept unchanged. ``df`` itself is not modified.
    """
    fully_populated = []
    for name in df.columns:
        if df[name].isnull().any():
            # At least one null here, so this column stays in the result.
            continue
        fully_populated.append(name)

    return df.drop(columns=fully_populated)

In [2]:
# Python function that takes a Pandas DataFrame, a column name, and a threshold value 
# as input, and returns the index labels of the entries whose value in the specified 
# column is strictly greater than the threshold value.
def getSeriesFromDataFrameThreesholdedByNullValues(df, column_name, threshoold):
    """Return the index labels of ``df`` whose ``column_name`` value exceeds a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter, e.g. a per-column missing-value summary whose index
        holds the column labels.
    column_name : str
        Column of ``df`` holding the values to compare.
    threshoold : float
        Exclusive lower bound; only entries strictly greater than it are kept.

    Returns
    -------
    pandas.Index
        Index labels of the rows where ``df[column_name] > threshoold``, in the
        order they appear in ``df``.
    """
    # Read from the frame that was passed in. The previous version looked up a
    # global named ``missing_value_df`` and ignored ``df`` entirely, so it only
    # worked when that global happened to exist in the kernel.
    series = df[column_name]
    selected_series = series[series > threshoold]
    return selected_series.index

In [3]:
# Python function that performs K-Nearest Neighbors (KNN) imputation on a Pandas DataFrame 
# to fill in missing values in a specified target column.

from sklearn.neighbors import KNeighborsRegressor

def knn_impute(df, na_target):
    """Fill the missing values of one numeric column with KNN regression.

    A ``KNeighborsRegressor`` (default settings) is trained on the rows where
    ``na_target`` is present, using every numeric column that has no missing
    values as a feature, and its predictions replace the missing entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied; the caller's frame is never modified.
    na_target : str
        Name of the numeric column whose missing values should be imputed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the missing values of ``na_target`` filled in. If
        the column has no missing values the copy is returned unchanged.
    """
    df = df.copy()

    numeric_df = df.select_dtypes(np.number)
    missing_mask = numeric_df[na_target].isna()

    # Nothing to impute: predicting on an empty feature matrix would make
    # scikit-learn raise, so hand back the untouched copy instead.
    if not missing_mask.any():
        return df

    # Only fully observed numeric columns can serve as features.
    non_na_columns = numeric_df.columns[numeric_df.isna().sum() == 0]

    y_train = numeric_df.loc[~missing_mask, na_target]
    X_train = numeric_df.loc[~missing_mask, non_na_columns]
    X_test = numeric_df.loc[missing_mask, non_na_columns]

    knn = KNeighborsRegressor()
    knn.fit(X_train, y_train)

    # Predictions come back in the row order of X_test, which matches the
    # order of the True entries in missing_mask.
    df.loc[missing_mask, na_target] = knn.predict(X_test)

    return df

In [7]:

# Python function that prints the number of missing values in a specified variable column of a 
# Pandas DataFrame, creates a countplot to visualize the distribution of values in that column, 
# and then performs mode imputation to fill in the column's missing values.
def mode_impute_and_plot(df, variable):
    """Plot the value counts of ``df[variable]``, then mode-impute it in place.

    Prints the number of missing values, shows a countplot of the column as it
    is before imputation, and finally replaces the missing values with the
    most frequent value of that same column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to plot and impute. The column is overwritten on this object.
    variable : str
        Name of the column to plot and impute.

    Returns
    -------
    None
        The result is the side effect on ``df[variable]``.
    """
    print('# of missing values: ' + str(df[variable].isna().sum()))
    plt.figure(figsize=(8,4))
    # Pass the column as ``x=``: recent seaborn releases interpret the first
    # positional argument as ``data``, not as the variable to count.
    ax = sns.countplot(x=df[variable])
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    plt.tight_layout()
    plt.show()

    # Take the mode from the frame that was passed in. The previous version
    # read it from a global ``all_data``, so the result depended on kernel state.
    modes = df[variable].mode()
    if modes.empty:
        # The column holds no observed values, so there is nothing to impute with.
        return

    # Assign back explicitly: a chained ``df[variable].replace(..., inplace=True)``
    # acts on a temporary Series and is not guaranteed to update ``df``.
    df[variable] = df[variable].fillna(modes.iloc[0])