In [1]:

import numpy as np     
import pandas as pd 
import re 
import gzip  
import csv
import random   
import matplotlib.pyplot as plt     
from pandas.plotting import scatter_matrix
import seaborn as sns   

from sklearn.utils import resample

# Distribution of Grade and Year in the Original Dataset

In [2]:
def percentage(dataframe):
    """Return the share of class-1 loans in every issue-year/grade cell.

    For each combination of ``issued_yr`` and ``grade`` present in
    ``dataframe``, counts the rows with ``loan_status_n == 1`` (rows whose
    ``term`` is null are not counted) and divides that count by the total
    number of ``loan_status_n == 1`` rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``issued_yr``, ``grade``, ``loan_status_n``
        and ``term``. ``grade`` values must be strings because they are
        concatenated into the dictionary keys.

    Returns
    -------
    dict
        Maps ``"<year>_<grade>"`` to the proportion rounded to 5 decimals.
        Keys are inserted in sorted-year, then sorted-grade order.
    """
    years = sorted(dataframe['issued_yr'].unique())
    grades = sorted(dataframe['grade'].unique())

    # Select the class-1 rows once; the denominator is the same for every cell.
    class_one = dataframe[dataframe['loan_status_n'] == 1]
    total = len(class_one)

    dist = {}
    for year in years:
        for grade in grades:
            # One combined mask instead of chained ``df[a][b][c]`` filters:
            # chaining forces pandas to realign each mask against an already
            # filtered frame, which emits a UserWarning and breaks when the
            # index contains duplicate labels.
            in_cell = (class_one['grade'] == grade) & (class_one['issued_yr'] == year)
            num = class_one.loc[in_cell, 'term'].count()
            dist[str(year) + '_' + grade] = round(num / total, 5)
    return dist

# Down Sample with Proportion to Year and Grade

In [5]:
def pro_down_sample_combined_dict(data):
    """Down-sample the class-1 rows to the number of class-0 rows.

    The sample is stratified by issue year and grade: each
    ``"<year>_<grade>"`` cell contributes
    ``int(n_class_0 * dist[cell])`` rows, where ``dist`` is the output of
    ``percentage(data)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``loan_status_n``, ``issued_yr`` and ``grade`` plus
        whatever ``percentage`` needs.

    Returns
    -------
    pandas.DataFrame
        The sampled class-1 rows with their original index labels, ordered
        by year then grade in order of first appearance in ``data``. An
        empty frame with ``data``'s columns is returned when nothing is
        sampled.

    Notes
    -----
    Sampling is done without replacement with a fixed ``random_state`` of
    123, so ``resample`` will fail if a cell is asked for more rows than it
    holds.
    """
    # 'dist' is the year/grade distribution produced by percentage(data).
    dist = percentage(data)
    # Desired total size of the down-sample: the number of class-0 rows.
    size_of_sample = data[data['loan_status_n'] == 0].shape[0]
    data_1 = data[data['loan_status_n'] == 1]
    years_ = list(data['issued_yr'].unique())
    grade_ = list(data['grade'].unique())

    # Collect the per-cell samples and concatenate once at the end; growing a
    # frame with pd.concat inside the loop copies all rows on every pass and,
    # when seeded from an empty frame, can change the resulting dtypes.
    pieces = []
    for x in years_:
        for y in grade_:
            year_class_prop = dist[str(x) + '_' + y]
            target_num = int(size_of_sample * year_class_prop)
            if target_num == 0:
                # Nothing to draw for this cell; never hand resample an
                # empty request.
                continue
            # Single combined mask instead of chained boolean filters, which
            # would make pandas realign the second mask and warn.
            in_cell = (data_1['grade'] == y) & (data_1['issued_yr'] == x)
            pieces.append(resample(data_1[in_cell],
                                   replace=False,
                                   n_samples=target_num,
                                   random_state=123))

    if not pieces:
        return pd.DataFrame(columns=data.columns)
    return pd.concat(pieces)

# Distribution of Grade and Year of Down Sample Data 

In [4]:
def percentage_down_sample(dataframe):
    """Return each year/grade cell's class-1 count as a share of all rows.

    Unlike a per-class distribution, the denominator here is the total number
    of rows in ``dataframe``. For each combination of ``issued_yr`` and
    ``grade`` the numerator is the number of ``loan_status_n == 1`` rows in
    that cell (rows whose ``term`` is null are not counted).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``issued_yr``, ``grade``, ``loan_status_n``
        and ``term``. ``grade`` values must be strings because they are
        concatenated into the dictionary keys.

    Returns
    -------
    dict
        Maps ``"<year>_<grade>"`` to the proportion rounded to 5 decimals.
        Keys are inserted in sorted-year, then sorted-grade order.
    """
    years = sorted(dataframe['issued_yr'].unique())
    grades = sorted(dataframe['grade'].unique())

    # Both of these are loop-invariant, so compute them once.
    total_rows = len(dataframe)
    class_one = dataframe[dataframe['loan_status_n'] == 1]

    dist_test = {}
    for year in years:
        for grade in grades:
            # One combined mask instead of chained ``df[a][b][c]`` filters:
            # chaining forces pandas to realign each mask against an already
            # filtered frame, which emits a UserWarning and breaks when the
            # index contains duplicate labels.
            in_cell = (class_one['grade'] == grade) & (class_one['issued_yr'] == year)
            num = class_one.loc[in_cell, 'term'].count()
            dist_test[str(year) + '_' + grade] = round(num / total_rows, 5)
    return dist_test

# Down Sample the Minority Class with Proportion to Year and Grade

In [8]:
def pro_down_sample_minority(data, percent_of_data):
    """Down-sample the class-0 rows to a fraction of their current size.

    The sample is stratified by issue year and grade: each
    ``"<year>_<grade>"`` cell contributes
    ``int(percent_of_data * n_class_0 * dist[cell])`` rows, where ``dist``
    is the output of ``percentage_minority(data)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``loan_status_n``, ``issued_yr`` and ``grade`` plus
        whatever ``percentage_minority`` needs.
    percent_of_data : float
        Multiplier applied to the number of class-0 rows to get the desired
        sample size (e.g. ``0.5`` for roughly half).

    Returns
    -------
    pandas.DataFrame
        The sampled class-0 rows with their original index labels, ordered
        by year then grade in order of first appearance in ``data``. An
        empty frame with ``data``'s columns is returned when nothing is
        sampled.

    Notes
    -----
    Sampling is done without replacement with a fixed ``random_state`` of
    123, so ``resample`` will fail if a cell is asked for more rows than it
    holds.
    """
    dist = percentage_minority(data)
    data_1 = data[data['loan_status_n'] == 0]
    years_ = list(data['issued_yr'].unique())
    grade_ = list(data['grade'].unique())
    # Desired total size of the down-sample.
    size_of_sample = percent_of_data * data_1.shape[0]

    # Collect the per-cell samples and concatenate once at the end; growing a
    # frame with pd.concat inside the loop copies all rows on every pass and,
    # when seeded from an empty frame, can change the resulting dtypes.
    pieces = []
    for x in years_:
        for y in grade_:
            year_class_prop = dist[str(x) + '_' + y]
            target_num = int(size_of_sample * year_class_prop)
            if target_num == 0:
                # Nothing to draw for this cell; never hand resample an
                # empty request.
                continue
            # Single combined mask instead of chained boolean filters, which
            # would make pandas realign the second mask and warn.
            in_cell = (data_1['grade'] == y) & (data_1['issued_yr'] == x)
            pieces.append(resample(data_1[in_cell],
                                   replace=False,
                                   n_samples=target_num,
                                   random_state=123))

    if not pieces:
        return pd.DataFrame(columns=data.columns)
    return pd.concat(pieces)

# If Downsizing of Minority Class (dictionary proportion for minority class)


In [7]:
def percentage_minority(dataframe, class_label=0):
    """Return the share of minority-class loans in every year/grade cell.

    For each combination of ``issued_yr`` and ``grade`` present in
    ``dataframe``, counts the rows with ``loan_status_n == class_label``
    (rows whose ``term`` is null are not counted) and divides that count by
    the total number of ``loan_status_n == class_label`` rows.

    Previously this function filtered on ``loan_status_n == 1``, making it an
    exact duplicate of the class-1 distribution even though the rows sampled
    with it are the ``loan_status_n == 0`` rows. It now defaults to class 0;
    pass ``class_label=1`` to get the old numbers.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``issued_yr``, ``grade``, ``loan_status_n``
        and ``term``. ``grade`` values must be strings because they are
        concatenated into the dictionary keys.
    class_label : int, optional
        Value of ``loan_status_n`` whose distribution is computed.
        Defaults to ``0``.

    Returns
    -------
    dict
        Maps ``"<year>_<grade>"`` to the proportion rounded to 5 decimals.
        Keys are inserted in sorted-year, then sorted-grade order.
    """
    years = sorted(dataframe['issued_yr'].unique())
    grades = sorted(dataframe['grade'].unique())

    # Select the rows of the requested class once; the denominator is the
    # same for every cell.
    selected = dataframe[dataframe['loan_status_n'] == class_label]
    total = len(selected)

    dist = {}
    for year in years:
        for grade in grades:
            # One combined mask instead of chained ``df[a][b][c]`` filters:
            # chaining forces pandas to realign each mask against an already
            # filtered frame, which emits a UserWarning and breaks when the
            # index contains duplicate labels.
            in_cell = (selected['grade'] == grade) & (selected['issued_yr'] == year)
            num = selected.loc[in_cell, 'term'].count()
            dist[str(year) + '_' + grade] = round(num / total, 5)
    return dist

# If the Minority Class Was Downsized: Down Sample the Majority Class to the Same Size

In [None]:
def pro_down_sample_combined_dict_minority(data, downsized_minority_size):
    """Down-sample the class-1 rows to a caller-supplied total size.

    The sample is stratified by issue year and grade: each
    ``"<year>_<grade>"`` cell contributes
    ``int(downsized_minority_size * dist[cell])`` rows, where ``dist`` is
    the output of ``percentage(data)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``loan_status_n``, ``issued_yr`` and ``grade`` plus
        whatever ``percentage`` needs.
    downsized_minority_size : int or float
        Desired total number of class-1 rows in the result.

    Returns
    -------
    pandas.DataFrame
        The sampled class-1 rows with their original index labels, ordered
        by year then grade in order of first appearance in ``data``. An
        empty frame with ``data``'s columns is returned when nothing is
        sampled.

    Notes
    -----
    Sampling is done without replacement with a fixed ``random_state`` of
    123, so ``resample`` will fail if a cell is asked for more rows than it
    holds.
    """
    # 'dist' is the year/grade distribution produced by percentage(data).
    dist = percentage(data)
    data_1 = data[data['loan_status_n'] == 1]
    years_ = list(data['issued_yr'].unique())
    grade_ = list(data['grade'].unique())
    # Desired total size of the down-sample.
    size_of_sample = downsized_minority_size

    # Collect the per-cell samples and concatenate once at the end; growing a
    # frame with pd.concat inside the loop copies all rows on every pass and,
    # when seeded from an empty frame, can change the resulting dtypes.
    pieces = []
    for x in years_:
        for y in grade_:
            year_class_prop = dist[str(x) + '_' + y]
            target_num = int(size_of_sample * year_class_prop)
            if target_num == 0:
                # Nothing to draw for this cell; never hand resample an
                # empty request.
                continue
            # Single combined mask instead of chained boolean filters, which
            # would make pandas realign the second mask and warn.
            in_cell = (data_1['grade'] == y) & (data_1['issued_yr'] == x)
            pieces.append(resample(data_1[in_cell],
                                   replace=False,
                                   n_samples=target_num,
                                   random_state=123))

    if not pieces:
        return pd.DataFrame(columns=data.columns)
    return pd.concat(pieces)