In [1]:
#from ipynb.fs.full.my_functions import *

In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns; sns.set()

In [3]:
def unflag(column_series):
    """Return the labels of ``column_series`` that do not end in ``'(flag)'``.

    Parameters
    ----------
    column_series : iterable
        Column labels (for example ``df.columns``). It is iterated exactly
        once, so one-shot iterators are filtered correctly as well.

    Returns
    -------
    list
        The labels in their original order with every string label ending
        in ``'(flag)'`` left out. Non-string labels are always kept.
    """
    # Single pass: no repeated list.remove() scans, and labels that are not
    # strings (e.g. integer column names) cannot raise on slicing.
    return [
        label
        for label in column_series
        if not (isinstance(label, str) and label.endswith('(flag)'))
    ]

In [4]:
# Column labels gathered under the name "irrelevant"; this cell only defines
# the list -- where it is consumed is not visible in this section.
# NOTE(review): 'Houshold' is kept exactly as spelled, presumably to match the
# dataset's column header -- confirm before "correcting" it.
irrelevant_columns = [
    'Unnamed: 0',
    'Year of survey',
    'Country',
    'Month of birth',
    'Father ID',
    'Mother ID',
    'Spouse/partner ID',
    'Month of the personal interview',
    'Year of the personal interview',
    'Minutes to complete the personal questionnaire',
    'Household ID',
    'Primary sampling units (PSU)',
    'Houshold cross-sectional weight',
    'Month of household interview',
    'Year of household interview',
    'Person responding the household questionnaire',
    'Person 2 responsible for the accommodation',
    'Number of minutes to complete the household questionnaire',
    'Residential status',
]

In [5]:
def weighted_freq(df, cat_column):
    """Return the weighted relative frequency of each category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the ``weight`` column.
    cat_column : pandas.Series or array-like
        Categorical values, normally a column of ``df``. It is one-hot
        encoded with ``pd.get_dummies`` and multiplied with ``df.weight``
        (pandas aligns the two on their index).

    Returns
    -------
    dict
        ``{category: sum of weights in that category / df.weight.sum()}``.
        Keys are the category labels exactly as ``pd.get_dummies`` reports
        them, so callers can look results up with the values found in the
        column. The denominator is the total of ``df.weight``, including
        rows whose category is missing.
    """
    dummy = pd.get_dummies(cat_column)
    total_weight = df.weight.sum()
    # Iterate over the real category columns only. Detecting helper columns
    # by a leading "w" would also pick up categories such as "widowed" and
    # add a bogus, unweighted entry to the result.
    return {
        category: (dummy[category] * df.weight).sum() / total_weight
        for category in dummy.columns
    }

In [6]:
def weighted_cat(df, cat_column, scale=1000):
    """Expand a categorical column by repeating each value per its weight.

    Each row's category is repeated ``int(round(weight / scale, 0))`` times,
    where ``weight`` is that row's entry in ``df.weight``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the ``weight`` column.
    cat_column : pandas.Series or sequence
        Categorical values, one per row of ``df`` and in the same row order
        (normally ``df[some_column]``).
    scale : number, optional
        Divisor applied to the weight before rounding. Defaults to 1000.

    Returns
    -------
    list
        The repeated category values in row order. Rows whose category is
        missing (NaN/None) contribute nothing.
    """
    expanded = []
    # Pair values and weights by position instead of looking them up with
    # integer labels, so frames whose index is not 0..n-1 (e.g. filtered
    # subsets) work too.
    for category, weight in zip(cat_column, df.weight):
        copies = int(round(weight / scale, 0))
        if pd.isna(category):
            # Missing categories get no dummy column in a one-hot encoding,
            # so they are skipped here as well.
            continue
        # A non-positive count yields an empty list, i.e. no copies.
        expanded.extend([category] * copies)
    return expanded

In [7]:
def weighted_num(df, num_column, scale=1000):
    """Expand a column by repeating each value according to its weight.

    Each row's value is repeated ``int(round(weight / scale, 0))`` times,
    where ``weight`` is that row's entry in ``df.weight``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the ``weight`` column.
    num_column : pandas.Series or sequence
        Values to repeat, one per row of ``df`` and in the same row order
        (normally ``df[some_column]``).
    scale : number, optional
        Divisor applied to the weight before rounding. Defaults to 1000.

    Returns
    -------
    list
        The repeated values in row order.
    """
    expanded = []
    # Pair values and weights by position instead of integer labels, so
    # frames whose index is not 0..n-1 (e.g. filtered subsets) work too.
    for value, weight in zip(num_column, df.weight):
        copies = int(round(weight / scale, 0))
        # A non-positive count yields an empty list, i.e. no copies.
        expanded.extend([value] * copies)
    return expanded

In [8]:
def histplot(x, data, bins):
    """Draw a density histogram of column ``x``, weighted by ``weight``.

    Parameters
    ----------
    x : str
        Name of the column of ``data`` to plot.
    data : pandas.DataFrame
        Frame containing ``x`` and a ``weight`` column.
    bins
        Passed straight through to ``sns.histplot`` as ``bins``.

    A new figure of size (6.4 * 1.5, 4.8 * 1.5) is opened for the plot;
    nothing is returned.
    """
    enlarge = 1.5
    plt.figure(figsize=(6.4 * enlarge, 4.8 * enlarge))
    plot_options = dict(weights='weight', bins=bins, stat='density')
    sns.histplot(x=data[x], data=data, **plot_options)

In [9]:
def barplot(x, data, legend=False):
    """Draw a bar chart of the weighted relative frequencies of column ``x``.

    Parameters
    ----------
    x : str
        Name of the categorical column of ``data`` to plot.
    data : pandas.DataFrame
        Frame containing ``x`` and the ``weight`` column that
        ``weighted_freq`` reads.
    legend : bool, optional
        If false (default) the category labels are written on the x axis.
        If true the tick labels are blanked and the categories are listed
        in a legend outside the axes, truncated to 40 characters.

    A new figure is opened for the plot; nothing is returned.
    """
    labels = sorted(data[x].unique())
    label_colors = ['C' + str(i) for i in range(len(labels))]
    x_ticks = range(len(labels))
    freq = weighted_freq(data, data[x])

    plt.figure(figsize=(6.4*1.5, 4.8*1.5))
    ax = plt.subplot()
    ax.bar(x_ticks, [freq[label] for label in labels], color=label_colors)
    ax.set_xticks(x_ticks)
    ax.set_xlabel(x)
    ax.set_ylabel('Frequency')

    if not legend:
        ax.set_xticklabels(labels)
        return

    ax.set_xticklabels(['' for _ in labels])
    # Zero-height bars serve only as legend entries carrying each colour.
    for label, color in zip(labels, label_colors):
        # str() keeps the truncation working for non-string categories.
        ax.bar([0], [0], label=str(label)[:40], color=color)
    if labels:
        # Build the legend once, after every entry exists, rather than
        # re-creating it on each loop iteration.
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [10]:
def boxplot(x, y, data, legend=False):
    """Draw box plots of ``y`` per category of ``x`` on weight-expanded data.

    The plotted frame is built from ``weighted_cat(data, data[x])`` and
    ``weighted_num(data, data[y])``, i.e. rows repeated according to their
    weight; both lists must come out with the same length.

    Parameters
    ----------
    x : str
        Name of the categorical column of ``data``.
    y : str
        Name of the value column of ``data``.
    data : pandas.DataFrame
        Frame containing ``x``, ``y`` and the ``weight`` column used by the
        expansion helpers.
    legend : bool, optional
        If true the x tick labels are removed and the categories are listed
        in a legend outside the axes, truncated to 40 characters.

    A new figure is opened for the plot; nothing is returned.
    """
    weighted_data = pd.DataFrame({
        x: weighted_cat(data, data[x]),
        y: weighted_num(data, data[y]),
    })
    # Sorted once and reused for the box order, the palette and the legend.
    labels = sorted(data[x].unique())
    palette = {label: 'C' + str(i) for i, label in enumerate(labels)}

    plt.figure(figsize=(6.4*1.5, 4.8*1.5))
    ax = plt.subplot()
    # Draw on the axes created above explicitly instead of relying on the
    # implicit "current axes".
    sns.boxplot(x=x, y=y, data=weighted_data, order=labels, palette=palette, ax=ax)

    if legend:
        ax.set(xticklabels=[])
        # Zero-height bars serve only as legend entries carrying each colour.
        for label in labels:
            # str() keeps the truncation working for non-string categories.
            ax.bar([0], [0], label=str(label)[:40], color=palette[label])
        if labels:
            # Build the legend once rather than on every loop iteration.
            ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [11]:
def length_calculator(labels):
    """Return ``(multiplier, rows)`` for a grid holding ``labels`` panels.

    ``labels`` is a count (an integer). The result starts at ``(1, 2)`` and,
    for each of the thresholds 7, 10, 13, ... that ``labels`` reaches,
    ``rows`` grows by 1 and ``multiplier`` by 0.5.
    """
    # Thresholds reached: 7, 10, 13, ... up to and including `labels`.
    extra_rows = len(range(7, labels + 1, 3))
    multiplier = 1
    if extra_rows:
        multiplier += 0.5 * extra_rows
    return multiplier, 2 + extra_rows

In [12]:
def bardiagram(x, hue, data, legend=False):
    """Draw one weighted-frequency bar chart of ``hue`` per value of ``x``.

    The panels are laid out on a grid with 3 columns; the number of rows
    and the figure height factor come from ``length_calculator``.

    Parameters
    ----------
    x : str
        Column of ``data`` whose sorted unique values define the panels;
        each value (truncated to 40 characters) is the panel title.
    hue : str
        Categorical column whose weighted relative frequencies (via
        ``weighted_freq``) are the bars of every panel. Categories absent
        from a panel's subset are drawn with height 0.
    data : pandas.DataFrame
        Frame containing ``x``, ``hue`` and the ``weight`` column.
    legend : bool, optional
        If false (default) the ``hue`` labels are written on each x axis.
        If true the tick labels are blanked and a legend (labels truncated
        to 20 characters) is attached to the last panel.

    A new figure is opened for the plot; nothing is returned.
    """
    titles = sorted(data[x].unique())
    labels = sorted(data[hue].unique())
    label_colors = ['C' + str(i) for i in range(len(labels))]
    x_ticks = range(len(labels))
    length_multiplier, subplot_rows = length_calculator(len(titles))
    plt.figure(figsize=(6.4*2, 4.8*2*length_multiplier))

    # Stays None when there is nothing to plot, so the legend step below
    # cannot touch an undefined axes.
    ax = None
    for position, title in enumerate(titles, start=1):
        subplot_data = data[data[x] == title]
        freq = weighted_freq(subplot_data, subplot_data[hue])
        ax = plt.subplot(subplot_rows, 3, position)
        # str() keeps the truncation working for non-string titles.
        ax.title.set_text(str(title)[:40])
        ax.bar(x_ticks, [freq.get(label, 0) for label in labels], color=label_colors)
        ax.set_xticks(x_ticks)
        if legend:
            ax.set_xticklabels(['' for _ in labels])
        else:
            ax.set_xticklabels(labels)

    if legend and ax is not None:
        # Zero-height bars on the last panel serve only as legend entries.
        for label, color in zip(labels, label_colors):
            ax.bar([0], [0], label=str(label)[:20], color=color)
        # Build the legend once rather than on every loop iteration.
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [13]:
def get_parent(key, column, data):
    """Fill ``'Missing'`` entries of ``column`` from the row referenced by ``key``.

    For every row whose ``column`` value equals ``'Missing'``, the value of
    ``key`` is converted with ``int()`` and matched against ``data.id``; the
    ``column`` value of the first matching row is used instead. All other
    rows keep their own value.

    Parameters
    ----------
    key : str
        Column holding the id of the row to copy from.
    column : str
        Column whose ``'Missing'`` entries are to be replaced.
    data : pandas.DataFrame
        Frame containing ``key``, ``column`` and an ``id`` column.

    Returns
    -------
    list
        One value per row of ``data``, in row order. A row stays
        ``'Missing'`` when its reference cannot be converted to an integer
        or matches no ``id``.
    """
    # One pass builds the id -> value table; setdefault keeps the first row
    # for a duplicated id, which is what a "first match" lookup returns.
    value_by_id = {}
    for row_id, value in zip(data.id, data[column]):
        value_by_id.setdefault(row_id, value)

    resolved = []
    # Rows are paired by position, so the frame's index does not have to be
    # 0..n-1.
    for value, parent_ref in zip(data[column], data[key]):
        if value == 'Missing':
            try:
                parent_id = int(parent_ref)
            except (TypeError, ValueError, OverflowError):
                # NaN/None/non-numeric reference: nothing to look up.
                pass
            else:
                value = value_by_id.get(parent_id, value)
        resolved.append(value)
    return resolved

In [None]:
def scatterplot(x, y, data):
    """Scatter ``y`` against ``x`` using weight-expanded copies of both columns.

    Parameters
    ----------
    x, y : str
        Names of the two columns of ``data`` to plot.
    data : pandas.DataFrame
        Frame containing ``x``, ``y`` and the ``weight`` column that
        ``weighted_num`` reads.

    A new figure of size (6.4 * 1.5, 4.8 * 1.5) is opened for the plot;
    nothing is returned.
    """
    # Expand both columns the same way so the repeated rows stay paired.
    expanded = {name: weighted_num(data, data[name]) for name in (x, y)}
    weighted_data = pd.DataFrame(expanded)

    enlarge = 1.5
    plt.figure(figsize=(6.4 * enlarge, 4.8 * enlarge))
    sns.scatterplot(data=weighted_data, x=x, y=y)

In [1]:
def weighted_df(df):
    """Build a frame in which every row is repeated according to its weight.

    Every column from the third one onwards (``df.columns[2:]``) is
    expanded: columns whose first value is a string go through
    ``weighted_cat``, all others through ``weighted_num``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must provide the ``weight`` column the two expansion
        helpers read.
        NOTE(review): the first two columns are skipped by position --
        presumably identifier/weight columns; confirm against the data.

    Returns
    -------
    pandas.DataFrame
        One column per expanded source column. An empty ``df`` yields an
        empty frame with the same column names.
    """
    expanded = {}
    for c in df.columns[2:]:
        column = df[c]
        # Look at the first value by position (.iloc), so the check also
        # works when the index does not contain the label 0.
        is_text = len(column) > 0 and isinstance(column.iloc[0], str)
        expand = weighted_cat if is_text else weighted_num
        expanded[c] = expand(df, column)
    # Construct the frame once instead of inserting column by column.
    return pd.DataFrame(expanded)