In [25]:
import os
# Move the kernel's working directory to the sibling `quafing` folder. Every relative path
# used after this point is resolved against this directory, not the notebook's own location.
# NOTE(review): presumably also needed so that `import quafing` below can be found — confirm.
os.chdir('../quafing/')
# Echo the resulting directory so the cell output shows where relative paths resolve from.
print(f"Working directory: {os.getcwd()}")
import quafing as q


import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter


# Reset every matplotlib setting to its built-in default first, then layer the
# notebook-wide overrides on top.
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({
    # figure geometry
    "figure.figsize": (6, 5),
    "figure.dpi": 100,
    # text sizes
    "font.size": 15,
    "legend.fontsize": 13,
    # grid: switched on for all axes, faint, and kept below the plotted artists
    "axes.grid": True,
    "axes.axisbelow": True,
    "grid.alpha": 0.3,
})

Working directory: /Users/charlesdupont/Desktop/Thesis/code/quafing


In [2]:
def load_data(path):
    """
    Reads the Stata (.dta) file located at `path` and returns its contents as a DataFrame.

    Categorical conversion is switched off, so labelled variables come back as the
    values stored in the file rather than as pandas Categoricals.
    """
    frame = pd.read_stata(path, convert_categoricals=False)
    return frame


def plot_missing_prop(df):
    """
    Draws a bar chart of the fraction of missing entries in every column of df,
    with the columns ordered from the largest fraction to the smallest.
    """
    n_rows = len(df)
    # Fraction of missing entries per column, keyed by column label.
    missing_prop = {name: df[name].isna().sum()/n_rows for name in df.columns}
    # Largest fraction first; columns with equal fractions keep their order in df.
    ranked_cols = sorted(missing_prop, key=missing_prop.get, reverse=True)
    heights = [missing_prop[name] for name in ranked_cols]
    positions = np.arange(len(missing_prop))

    plt.figure(figsize=(10, 4))
    plt.bar(positions, heights, color="dodgerblue")
    plt.xticks(positions, ranked_cols, rotation=90)
    plt.ylabel("Proportion Missing")
    plt.show()
    
    
def plot_categorical_feature_frequency(df, col, sort=True, xlabel=None, xticks_dict=None,
                                       save=False, save_filename=None):
    """
    Plots bar graph of how often each distinct non-missing value occurs in column `col` of df.

    Parameters
    ----------
    df : DataFrame holding the column to summarise.
    col : label of the column whose values are counted.
    sort : if True, bars are ordered by decreasing frequency (ties keep first-appearance
        order); otherwise bars are ordered by increasing category value.
    xlabel : label for the x axis; `col` is used when not provided.
    xticks_dict : optional mapping from category value to the tick label shown for it.
        When given and non-empty, it must contain an entry for every category plotted.
    save : if True, the figure is written to `save_filename` before it is shown.
    save_filename : destination handed to plt.savefig; required when `save` is True.

    Raises
    ------
    ValueError : if `save` is True but `save_filename` is None.
    """
    # Fail before any figure is created instead of erroring inside savefig.
    if save and save_filename is None:
        raise ValueError("save_filename must be provided when save=True")

    # dropna() discards missing entries for every dtype; the previous np.isnan filter
    # raised TypeError as soon as a category was not numeric (e.g. a string).
    freq = Counter(df[col].dropna())
    L = len(freq)
    if sort:
        ordered_keys = sorted(freq, key=freq.get, reverse=True)
    else:
        ordered_keys = sorted(freq)

    plt.figure(figsize=(12,4))
    plt.bar(np.arange(L), [freq[k] for k in ordered_keys], color="dodgerblue", alpha=0.8)
    if xticks_dict:
        tick_labels = [xticks_dict[k] for k in ordered_keys]
    else:
        tick_labels = ordered_keys
    plt.xticks(np.arange(L), tick_labels, rotation=90)
    plt.ylabel("Frequency")
    plt.xlabel(xlabel if xlabel else col, labelpad=20)

    if save:
        plt.savefig(save_filename)

    plt.show()

In [3]:
# Relative path, so it is resolved against the kernel's current working directory.
# NOTE(review): presumably the root folder of the BCCASII data files — confirm.
DATA_DIR = "../../BCCASII/"

# Community

In [4]:
# NOTE(review): presumably the sub-folder (under DATA_DIR) holding the community-level
# files used in this section — confirm against the cells that build the full path.
DATA_SUBDIR = "Community/"