In [None]:
from datetime import datetime, timedelta

def last_day_of_previous_month(any_day):
    """Return the last day of the month preceding ``any_day``.

    Stepping back by the day-of-month number always lands on the final day
    of the previous month.  Only whole days are subtracted, so a time-of-day
    component of ``any_day`` (if it has one) is carried over unchanged.
    """
    days_into_month = any_day.day
    step_back = timedelta(days=days_into_month)
    return any_day - step_back

def current_month(any_day):
    """Return the last day of the month that contains ``any_day``.

    Despite the name, the result is a date (the month-end), not a month
    number.  A time-of-day component of ``any_day`` is carried over as is.
    """
    # Day 28 exists in every month, and four days after it is always
    # somewhere in the following month.
    inside_next_month = any_day.replace(day=28) + timedelta(days=4)
    # Walking back by that date's day number ends on the last day of the
    # month we started in.
    overshoot = timedelta(days=inside_next_month.day)
    return inside_next_month - overshoot

# Parse the 'evt_dt' column of target_df into datetimes (expects 'YYYY-MM-DD').
target_df['evt_dt'] = pd.to_datetime(target_df['evt_dt'], format='%Y-%m-%d')
# Add a 'mnth' column to dates_df: current_month() maps every 'evt_dt' value
# to the last day of its month.
# NOTE(review): the parse above is applied to target_df, while this line reads
# dates_df - confirm that dates_df['evt_dt'] already holds date-like values.
dates_df['mnth'] = dates_df['evt_dt'].apply(current_month)

def gen_date_list(start_date, end_date):
    """Build the list of consecutive calendar days between two date strings.

    Args:
        start_date: First day, formatted as ``YYYY-MM-DD``.
        end_date: Last day (inclusive), formatted as ``YYYY-MM-DD``.

    Returns:
        Every day from ``start_date`` through ``end_date`` as a
        ``YYYY-MM-DD`` string; an empty list when ``end_date`` precedes
        ``start_date``.
    """
    iso_format = '%Y-%m-%d'
    first_day = datetime.strptime(start_date, iso_format).date()
    last_day = datetime.strptime(end_date, iso_format).date()

    span = (last_day - first_day).days + 1
    return [str(first_day + timedelta(days=offset)) for offset in range(span)]

In [None]:
def single_unique(dataset):
    """Return the columns of ``dataset`` that hold at most one distinct value.

    Missing values are not counted as a value (the ``DataFrame.nunique``
    default), so a completely empty column is reported as well.  The list is
    also written to ``dop_files/drop_single.csv`` for later reference.

    Args:
        dataset: pandas DataFrame to inspect.

    Returns:
        List of column labels, in column order.
    """
    import os

    distinct_counts = dataset.nunique()
    # Take the labels straight from the Series index.  The previous
    # reset_index()/rename() round-trip raised KeyError whenever the columns
    # index carried a name.
    constant_columns = list(distinct_counts[distinct_counts <= 1].index)

    # The bookkeeping folder may not exist on a fresh checkout.
    os.makedirs('dop_files', exist_ok=True)
    pd.DataFrame(constant_columns).to_csv('dop_files/drop_single.csv', index=False)
    return constant_columns

def dropping(dataset):
    """Remove business-excluded attributes and date columns from ``dataset``.

    Two groups of columns are dropped in place:

    * attributes listed in the 'Атрибут' column of 'Приложение2.xlsx' that
      are actually present in ``dataset`` (the original note mentions the
      VSP / OSB fields here);
    * every column whose name contains ``'_dt'`` (dates).

    Args:
        dataset: pandas DataFrame; modified in place.

    Returns:
        The same ``dataset`` object, for call chaining.
    """
    excluded_attributes = pd.read_excel('Приложение2.xlsx')['Атрибут']
    present = set(dataset.columns)
    # dict.fromkeys keeps the spreadsheet order while removing repeats.
    on_drop = list(dict.fromkeys(
        name for name in excluded_attributes if name in present
    ))
    # ``axis`` must be passed by keyword: the positional form was removed in
    # pandas 2.0.
    dataset.drop(on_drop, axis=1, inplace=True)

    # Non-string labels cannot carry the '_dt' marker, so they are skipped
    # instead of raising on the substring test.
    date_columns = [
        name for name in dataset.columns
        if isinstance(name, str) and '_dt' in name
    ]
    dataset.drop(date_columns, axis=1, inplace=True)
    return dataset

def types(dataset):
    """Split the columns of ``dataset`` into categorical, numeric and date groups.

    Rules:

    * a dtype whose name contains ``'obj'`` makes the column categorical,
      ``'float'`` / ``'int'`` make it numeric;
    * a column whose name contains ``'_cd'`` (codes, e.g. a territorial bank
      code) is forced into the categorical group and taken out of the
      numeric one;
    * a column whose name contains ``'_dt'`` is a date feature and is removed
      from the categorical group (it is not removed from the numeric group).

    Returns:
        Tuple ``(cat_features, numeric_features, date_features)`` of lists;
        the numeric list holds float columns followed by integer columns.
    """
    dtype_names = {
        column: str(dtype) for column, dtype in dict(dataset.dtypes).items()
    }

    def columns_with_dtype(fragment):
        # Columns whose dtype name contains the given fragment.
        return [column for column, name in dtype_names.items() if fragment in name]

    cat_features = columns_with_dtype('obj')
    float_features = columns_with_dtype('float')
    int_features = columns_with_dtype('int')

    date_features = [column for column in dataset.columns if column.find('_dt') >= 0]
    code_columns = [column for column in dataset.columns if column.find('_cd') >= 0]

    # Code columns are identifiers, not quantities: treat them as categories.
    for column in code_columns:
        if column in cat_features:
            continue
        cat_features.append(column)
        if column in float_features:
            float_features.remove(column)
        if column in int_features:
            int_features.remove(column)

    # Dates are handled separately, so they must not stay among the categories.
    cat_features = [column for column in cat_features if column not in date_features]

    return cat_features, float_features + int_features, date_features

def anomalies(dataset, columns):
    """Clip outliers in numeric columns to their 1st / 99th percentiles, in place.

    For every column in ``columns`` the values above the 99th percentile are
    replaced by that percentile and the values below the 1st percentile by
    the 1st percentile.  Missing values are ignored when the percentiles are
    computed and are left untouched; empty or all-missing columns are skipped.

    Args:
        dataset: pandas DataFrame; modified in place.
        columns: names of numeric (not categorical) columns to process.

    Returns:
        None.
    """
    for name in columns:
        values = dataset[name]
        if values.isna().all():  # also covers an empty column
            continue
        # nanpercentile: a single NaN would otherwise turn both bounds into
        # NaN and silently disable the clipping for the whole column.
        lower, upper = np.nanpercentile(
            values.to_numpy(dtype=float, na_value=np.nan), [1, 99]
        )
        # Assign the whole column back.  The former chained form
        # ``dataset[name][mask] = bound`` writes to a temporary object and is
        # a no-op under pandas Copy-on-Write.
        dataset[name] = values.clip(lower=lower, upper=upper)

def transform_data(dataset, cat_features, numeric_features, date_features, fillna = True):
    """Prepare features for gradient-boosting models (``dataset`` is modified in place).

    Steps:

    1. Every date column is parsed as ``YYYY-MM-DD`` and replaced by the
       number of whole days between that date and today; missing dates stay
       missing at this point.
    2. With ``fillna`` true, gaps in numeric and date columns become
       ``-99999`` and gaps in categorical columns become the string
       ``'NULL'``.
    3. Categorical columns are cast to ``str`` and label-encoded to integer
       codes.  With ``fillna`` false the cells that were missing before the
       encoding are blanked out again afterwards.

    Bookkeeping lists are written to ``dop_files/``: ``fillna_date.csv``
    always, and ``fillna_999.csv``, ``fillna_NULL.csv`` and
    ``labelEncoding.csv`` when ``fillna`` is true.

    Args:
        dataset: pandas DataFrame to transform.
        cat_features: names of categorical columns.
        numeric_features: names of numeric columns.
        date_features: names of date columns.
        fillna: whether to fill missing values (see above).

    Returns:
        The same ``dataset`` object.
    """
    import os
    # Imported locally: the name ``datetime`` at file level is the class, so
    # ``datetime.date.today()`` is not a valid way to get today's date.
    from datetime import date

    os.makedirs('dop_files', exist_ok=True)

    # Date columns -> age in days.  The vectorised subtraction yields NaN for
    # missing dates instead of failing on NaT.
    today = pd.Timestamp(date.today())
    for name in date_features:
        parsed = pd.to_datetime(dataset[name], format='%Y-%m-%d')
        dataset[name] = (today - parsed.dt.normalize()).dt.days
    pd.DataFrame(date_features).to_csv('dop_files/fillna_date.csv', index=False, header=False)

    if fillna:
        row_count = dataset.shape[0]

        # Numeric and date gaps -> -99999.  A column may be listed in both
        # groups, so repeats are removed while keeping the order.
        fill_columns = list(dict.fromkeys(list(numeric_features) + list(date_features)))
        incomplete = [name for name in fill_columns if dataset[name].count() != row_count]
        pd.DataFrame(incomplete).to_csv('dop_files/fillna_999.csv', index=False, header=False)
        if fill_columns:
            dataset[fill_columns] = dataset[fill_columns].fillna(-99999)

        # Categorical gaps -> 'NULL', then label encoding.
        incomplete = [name for name in cat_features if dataset[name].count() != row_count]
        pd.DataFrame(incomplete).to_csv('dop_files/fillna_NULL.csv', index=False, header=False)
        pd.DataFrame(cat_features).to_csv('dop_files/labelEncoding.csv', index=False, header=False)
        for name in cat_features:
            column = dataset[name]
            # The fill result has to be assigned back: calling
            # ``dataset[cols].fillna(..., inplace=True)`` only changes a
            # temporary copy.
            filled = column.astype(object).where(column.notna(), 'NULL')
            codes = LabelEncoder().fit_transform(filled.astype(str))
            dataset[name] = codes.astype(int)
    else:
        for name in cat_features:
            column = dataset[name]
            was_present = column.notna().to_numpy()
            codes = LabelEncoder().fit_transform(column.astype(str)).astype(int)
            # Keep the gaps: originally-missing cells lose their code again.
            dataset[name] = pd.Series(codes, index=dataset.index).where(was_present)

    return dataset


def corr_drop(X,y):
    """Drop features that are strongly correlated with another feature (in place).

    The target is attached to ``X`` as a temporary column ``'y'``, missing
    values are treated as 0 and the correlation matrix of the numeric
    columns is computed.  For every pair of features whose absolute
    correlation exceeds 0.7, the one with the weaker absolute correlation to
    ``y`` is marked for removal (on a tie, the second of the pair).  The
    marked columns and ``'y'`` are then dropped from ``X``.

    Files written to ``dop_files/``: ``corr_1.csv`` (matrix before the drop),
    ``drop_corr.csv`` (dropped names) and ``corr_2.csv`` (matrix after).

    NOTE(review): a function with the same name is defined again further
    down in this file and returns a list of column names instead; once both
    definitions have run, the later one is in effect.

    Args:
        X: pandas DataFrame of features; modified in place.
        y: target values, assigned as column ``'y'``.

    Returns:
        ``X`` with the correlated columns removed.
    """
    import os

    import numpy as np

    os.makedirs('dop_files', exist_ok=True)

    dataset = X
    dataset['y'] = y
    # Only numeric/bool columns can be correlated; selecting them explicitly
    # keeps the call working on pandas versions where corr() no longer skips
    # other columns by itself.
    corr = dataset.select_dtypes(include=['number', 'bool']).fillna(0).corr()
    corr.to_csv('dop_files/corr_1.csv')

    features = [name for name in corr.columns if name != 'y']
    pairwise = corr.loc[features, features].abs().to_numpy()
    with_target = corr.loc[features, 'y'].abs().to_numpy()

    # Upper triangle (k=1) visits each unordered pair once, in the same
    # row-major order as a nested ``for i / for j > i`` loop.
    first, second = np.nonzero(np.triu(pairwise > 0.7, k=1))
    on_drop = ['y']
    for i, j in zip(first, second):
        on_drop.append(features[i] if with_target[i] < with_target[j] else features[j])
    on_drop = list(dict.fromkeys(on_drop))  # a feature can lose several pairs

    pd.DataFrame(on_drop).to_csv('dop_files/drop_corr.csv', index=False, header=False)
    dataset.drop(on_drop, axis=1, inplace=True)

    corr = dataset.select_dtypes(include=['number', 'bool']).fillna(0).corr()
    corr.to_csv('dop_files/corr_2.csv')
    return dataset


def select_statistic(X, y, count, percent=10):
    """Return the ``count`` features with the highest univariate score against ``y``.

    A ``SelectPercentile`` selector (with its default score function) is
    fitted on ``X`` with missing values replaced by 0, and the features are
    ranked by the fitted ``scores_``.

    Args:
        X: pandas DataFrame of features.
        y: target values.
        count: number of top-ranked features to return.  This argument used
            to be ignored in favour of a hard-coded 60.
        percent: forwarded to ``SelectPercentile(percentile=...)``; the
            ranking below only reads ``scores_``.

    Returns:
        Array of feature names, best first.
    """
    selector = SelectPercentile(percentile=percent)
    selector.fit(X.fillna(0), y)
    ranking = pd.DataFrame({'feature': X.columns, 'importance': selector.scores_})
    ranking = ranking.sort_values(by=['importance'], ascending=False)
    return ranking['feature'].iloc[:count].values


def transform_for_rfe_logregr(dataset, cat_features, numeric_features, date_features):
    """Prepare features for logistic regression / random forest models.

    Works like the boosting transformation, with these differences:

    * missing categories become the string ``'NULL'``; then a categorical
      column with at most 2 distinct values is label-encoded, one with fewer
      than 10 distinct values (or named ``'tb_name'``) is one-hot encoded,
      and any other categorical column is dropped;
    * date columns (``YYYY-MM-DD``) become the number of days before today;
    * gaps in numeric and date columns are filled with the column mean.

    Use the returned frame: one-hot encoding creates a new DataFrame, so the
    object passed in is only partially modified when that branch is taken.

    Args:
        dataset: pandas DataFrame to transform.
        cat_features: names of categorical columns.
        numeric_features: names of numeric columns.
        date_features: names of date columns.

    Returns:
        The transformed DataFrame.
    """
    # Imported locally: the name ``datetime`` at file level is the class, so
    # ``datetime.date.today()`` is not a valid way to get today's date.
    from datetime import date

    high_cardinality = []
    for name in cat_features:
        column = dataset[name]
        # Assign the filled column back; ``dataset[cols].fillna(...,
        # inplace=True)`` would only touch a temporary copy.
        dataset[name] = column.astype(object).where(column.notna(), 'NULL').astype(str)
        distinct = dataset[name].nunique()
        if distinct <= 2:
            dataset[name] = LabelEncoder().fit_transform(dataset[name])
        elif distinct < 10 or name == 'tb_name':
            dataset = pd.get_dummies(dataset, columns=[name])
        else:
            high_cardinality.append(name)
    dataset.drop(high_cardinality, axis=1, inplace=True)

    # Date columns -> age in days; missing dates stay NaN until the mean fill.
    today = pd.Timestamp(date.today())
    for name in date_features:
        parsed = pd.to_datetime(dataset[name], format='%Y-%m-%d')
        dataset[name] = (today - parsed.dt.normalize()).dt.days

    # A column may be listed in both groups; keep each name once.
    fill_columns = list(dict.fromkeys(list(numeric_features) + list(date_features)))
    if fill_columns:
        dataset[fill_columns] = dataset[fill_columns].fillna(dataset[fill_columns].mean())
    return dataset


def calc_iv(df, feature, target, pr=False):
    """
    Compute the information value (IV) of ``feature`` against a 0/1 ``target``.

    Missing feature values are grouped under the label 'NULL'.  For every
    distinct value the table holds the row count ('All'), the rows with
    target == 0 ('Good') and target == 1 ('Bad'), their shares, the weight
    of evidence ('WoE', infinite values replaced by 0) and the IV
    contribution.  ``df`` itself is not modified.

    Set pr=True to enable printing of output.

    Output:
      * iv: float,
      * data: pandas.DataFrame
    """
    # Work on a filled copy of the column so the caller's frame keeps its gaps.
    values = df[feature].fillna('NULL')
    labels = df[target]

    flags = pd.DataFrame({
        'Value': values.to_numpy(),
        'Good': (labels == 0).to_numpy().astype(int),  # think: Fraud == 0
        'Bad': (labels == 1).to_numpy().astype(int),   # think: Fraud == 1
    })
    # One grouped pass instead of re-scanning the frame for every value.
    data = flags.groupby('Value', sort=False).agg(
        All=('Good', 'size'),
        Good=('Good', 'sum'),
        Bad=('Bad', 'sum'),
    ).reset_index()
    data.insert(0, 'Variable', feature)

    data['Share'] = data['All'] / data['All'].sum()
    data['Bad Rate'] = data['Bad'] / data['All']
    data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
    data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()
    # log(0) and x/0 are expected for one-sided groups; they are zeroed below.
    with np.errstate(divide='ignore', invalid='ignore'):
        data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])

    data = data.replace({'WoE': {np.inf: 0, -np.inf: 0}})

    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    try:
        data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True])
    except TypeError:
        # 'NULL' next to numeric values cannot be ordered directly; fall back
        # to ordering by the text form.
        data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True],
                                key=lambda column: column.astype(str))
    data = data.reset_index(drop=True)

    iv = data['IV'].sum()

    if pr:
        print(data)
        print('IV = ', iv)

    return iv, data

def iv_sort(dataset, y):
    """Rank the columns of ``dataset`` by information value against ``y``.

    ``y`` is attached as a temporary column ``'target'``, missing values are
    replaced by 0, and ``calc_iv`` is evaluated for every column.  A progress
    line is printed for every fifth column.  The temporary column is removed
    again before returning, also when the scoring fails.

    NOTE(review): the loop runs over all columns after ``'target'`` has been
    added, so the result contains a row for ``'target'`` itself.

    Args:
        dataset: pandas DataFrame of features.
        y: binary target values.

    Returns:
        DataFrame with columns ``feature`` and ``iv``, sorted by ``iv`` in
        descending order, with a fresh 0..n-1 index.
    """
    dataset['target'] = y
    try:
        # Fill once; this used to make a full copy of the frame per column.
        filled = dataset.fillna(0)
        scores = []
        dashes = 1
        for position, column in enumerate(dataset.columns):
            if position % 5 == 0:
                print(' ' + '-'*dashes + ' ', position)
                dashes += 1
            iv, _ = calc_iv(filled, column, 'target')
            scores.append([column, iv])
    finally:
        dataset.drop('target', axis=1, inplace=True)

    IV = pd.DataFrame(scores, columns=['feature', 'iv'])
    IV.sort_values('iv', ascending=False, inplace=True)
    IV.index = range(IV.shape[0])

    return IV

def _check_param_value(parameters: dict) -> dict:
    """Cast the hyperparameters that must be whole numbers to ``int``.

    The dictionary is updated in place and returned.  Keys that are absent,
    or whose value is falsy (``None``, ``0``, ``0.0``), are left as they are.
    """
    integer_keys = (
        "max_depth",
        "num_leaves",
        "min_data_in_leaf",
        "subsample_for_bin",
        "n_estimators",
    )

    for key in integer_keys:
        raw_value = parameters.get(key)
        if not raw_value:
            continue
        parameters[key] = int(raw_value)
    return parameters

def objective(parameters):
    """Objective function for hyperparameter tuning.

    Builds a LightGBM dataset from the module-level ``X_train`` / ``y_train``,
    casts the integer-valued hyperparameters, and runs a 10-fold
    cross-validation (up to 10000 boosting rounds, early stopping after 100,
    AUC metric, seed 50).

    Returns:
        Dict with ``loss`` (1 minus the best mean AUC), the ``params`` that
        were evaluated, and ``status`` set to ``STATUS_OK`` (presumably the
        hyperopt constant - confirm against the file's imports).
    """
    train_set = lgb.Dataset(X_train, y_train)

    parameters = _check_param_value(parameters)

    cv_options = dict(
        nfold=10,
        num_boost_round=10000,
        early_stopping_rounds=100,
        metrics='auc',
        seed=50,
    )
    cv_history = lgb.cv(parameters, train_set, **cv_options)

    best_auc = max(cv_history['auc-mean'])
    return {
        'loss': 1 - best_auc,
        'params': parameters,
        'status': STATUS_OK,
    }

In [None]:
def corr_drop(X,y):
    """Return the columns of ``X`` that survive the correlation filter.

    The correlation matrix of the numeric columns of ``X`` together with the
    target (as column ``'y'``) is computed on a working copy, so ``X`` itself
    is left unchanged.  For every pair of features whose absolute correlation
    exceeds 0.7, the one with the weaker absolute correlation to ``y`` is
    discarded (on a tie, the second of the pair).

    NOTE(review): this redefines a function of the same name that appears
    earlier in this file and returns a DataFrame; once both have run, this
    definition is the one in effect.

    Args:
        X: pandas DataFrame of features.
        y: target values.

    Returns:
        List of kept column names, in the column order of ``X``.  A column
        named ``'y'`` is never part of the result.
    """
    import numpy as np

    # assign() works on a new frame, so the target column does not leak into
    # the caller's feature matrix.  Numeric/bool columns are selected
    # explicitly because corr() no longer skips other columns by itself on
    # current pandas versions.
    numeric = X.select_dtypes(include=['number', 'bool']).assign(y=y)
    corr = numeric.corr().abs()

    features = [name for name in corr.columns if name != 'y']
    pairwise = corr.loc[features, features].to_numpy()
    with_target = corr.loc[features, 'y'].to_numpy()

    # Upper triangle (k=1) visits each unordered pair once, in the same
    # row-major order as a nested ``for i / for j > i`` loop; being
    # vectorised, it needs no progress bar.
    first, second = np.nonzero(np.triu(pairwise > 0.7, k=1))
    on_drop = {'y'}
    for i, j in zip(first, second):
        on_drop.add(features[i] if with_target[i] < with_target[j] else features[j])

    return [name for name in X.columns if name not in on_drop]