# In [97]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

# Columns removed before training because, per the notes below, each one
# duplicates or is a coarser grouping of another column that is kept.
drop_feature = [
    # identical to 'quantity'
    'quantity_group',
    # 1-to-1 subset of 'source'
    'source_type',
    # NOTE(review): the original note read "1-to-1 subset of 'source_class'",
    # i.e. it named the column itself -- presumably 'source' was meant; confirm.
    'source_class',
    # 1-to-1 subset of 'waterpoint_type'
    'waterpoint_type_group',
    # 1-to-1 subset of 'water_quality'
    'quality_group',
    # 1-to-1 subset of 'payment'
    'payment_type',
    # 1-to-1 subset of 'management'
    'management_group',
    # 1-to-1 subset of 'extraction_type'
    'extraction_type_group',
    # 1-to-1 subset of 'extraction_type'
    'extraction_type_class',
]
def _out_csv_name(path):
    """Return the output file name for *path*: '<stem>_out.csv'.

    Only a trailing '.csv' is stripped, so a '.csv' occurring earlier in the
    path (e.g. in a directory name) does not truncate the result.
    """
    stem = path[:-len('.csv')] if path.endswith('.csv') else path
    return stem + '_out.csv'


def balance_class(traincsv, label_name, drop_list=None, balance_rate=None, labelcsv=None):
    """
    Oversample the smaller classes of a csv data set and write the result to
    '<traincsv stem>_out.csv' (and '<labelcsv stem>_out.csv' when labelcsv is
    given). The original rows are kept first, in their original order; the
    randomly re-sampled (with replacement) rows are appended after them.

    traincsv:
        type: str
        train csv file name
    label_name:
        type: str
        column name of label in label csv or train csv
    drop_list:
        type: list
        a list of column names which will be dropped out;
        None (default) or an empty list drops nothing
    balance_rate:
        type: float
        smaller classes will increase to balance_rate * number of biggest class
        ie, 250 data are class1, 100 data are class2, if balance_rate = 0.6
        class2 will increase to 150( = 250*0.6)
        None (default) is treated as 1.0, i.e. every class is raised to the
        size of the biggest class
    labelcsv:
        type: str
        label csv file name; when given, the label column is read from this
        file (matched to the train rows by position) and written to its own
        output file instead of staying in the train output

    Returns None; the results are only written to disk.
    """
    # DataFrame.from_csv was removed in pandas 1.0; read_csv with its default
    # index_col=None is the equivalent of from_csv(..., index_col=None).
    train = pd.read_csv(traincsv)
    if labelcsv is not None:
        label = pd.read_csv(labelcsv)
        train[label_name] = label[label_name]

    if balance_rate is None:
        balance_rate = 1.0

    # value_counts() skips NaN labels, which could not be re-sampled anyway.
    class_count = train[label_name].value_counts()
    target = int(class_count.max() * balance_rate) if len(class_count) else 0

    aug_list = []
    for cls, count in class_count.items():
        aug_num = target - int(count)
        if aug_num > 0:
            members = train[train[label_name] == cls]
            aug_list.append(members.sample(n=aug_num, replace=True))

    if aug_list:
        # sample(frac=1) returns all augmented rows in random order, so rows
        # of the different classes are interleaved.
        train_aug_shuf = pd.concat(aug_list).sample(frac=1)
        trainout = pd.concat([train, train_aug_shuf])
    else:
        # Nothing to add: every class already reaches the target size.
        trainout = train

    if drop_list:
        trainout = trainout.drop(list(drop_list), axis=1)

    if labelcsv is not None:
        # Labels came from a separate file, so they go back out separately.
        labelout = trainout[[label_name]]
        trainout = trainout.drop([label_name], axis=1)
        labelout.to_csv(_out_csv_name(labelcsv), index=False)

    trainout.to_csv(_out_csv_name(traincsv), index=False)

# In [98]:
# Example: raise every smaller class of "status_group" to 80% of the biggest
# class, dropping the columns listed in drop_feature. The labels live in a
# separate file, so two outputs are written: train_out.csv and label_out.csv.
balance_class(traincsv="train.csv", label_name="status_group", drop_list=drop_feature,
              balance_rate=0.8, labelcsv="label.csv")

# Load the generated files back for inspection. pd.read_csv replaces
# DataFrame.from_csv(..., index_col=None), which was removed in pandas 1.0.
ddd = pd.read_csv("train_out.csv")
kkk = pd.read_csv("label_out.csv")