In [134]:
import uuid

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [135]:
# Generation parameters for the synthetic applications table.
n = 10_000              # number of rows to generate
start_id = 4_633_210    # first value of the sequential `id` index

# Application timestamps are drawn from the window between these two instants.
start_date = pd.Timestamp('2024-01-01')
end_date = pd.Timestamp('2024-04-30 23:59:59')

# Seed NumPy's global RNG so every np.random draw below repeats across runs.
np.random.seed(0)

In [136]:
def gini(y_true, y_score):
    """Return the Gini coefficient of a score, computed as ``2 * ROC AUC - 1``.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels, passed straight to ``roc_auc_score``.
    y_score : array-like
        Scores to evaluate against ``y_true``.

    Returns
    -------
    float
        ``2 * roc_auc_score(y_true, y_score) - 1``.
    """
    return 2 * roc_auc_score(y_true, y_score) - 1

In [137]:
# Sequential application ids: start_id, start_id + 1, ..., start_id + n - 1.
ids = np.arange(n) + start_id

# Draw n uniform instants between start_date and end_date. `.value` is the
# timestamp in nanoseconds since the epoch, which is also how pd.to_datetime
# interprets plain numbers by default.
raw_ns = np.random.uniform(start_date.value, end_date.value, n)

# Truncate to whole seconds and sort, so that ids increase with application time.
timestamps = np.sort(pd.to_datetime(raw_ns).floor('s').values)

df = pd.DataFrame(
    {'application_datetime': timestamps},
    index=pd.Index(ids, name='id'),
)
df

Unnamed: 0_level_0,application_datetime
id,Unnamed: 1_level_1
4633210,2024-01-01 00:12:37
4633211,2024-01-01 00:12:50
4633212,2024-01-01 00:26:12
4633213,2024-01-01 00:44:13
4633214,2024-01-01 00:53:24
...,...
4643205,2024-04-30 23:47:59
4643206,2024-04-30 23:51:05
4643207,2024-04-30 23:52:25
4643208,2024-04-30 23:53:42


In [138]:
# Binary target `def_45`: 3840 ones and 6160 zeros in random order.
# The two counts sum to 10000 and must equal len(df) for the assignment to work.
df['def_45'] = np.random.permutation(np.repeat([1, 0], [3840, 6160]))
df

Unnamed: 0_level_0,application_datetime,def_45
id,Unnamed: 1_level_1,Unnamed: 2_level_1
4633210,2024-01-01 00:12:37,1
4633211,2024-01-01 00:12:50,0
4633212,2024-01-01 00:26:12,0
4633213,2024-01-01 00:44:13,1
4633214,2024-01-01 00:53:24,1
...,...,...
4643205,2024-04-30 23:47:59,1
4643206,2024-04-30 23:51:05,0
4643207,2024-04-30 23:52:25,1
4643208,2024-04-30 23:53:42,0


#### генерация дженериков (скоров, коррелирующих с таргетом def_45)

In [139]:
def generate_feature(df, name, multiplier, noise, round=3):
    """Add a noisy score column derived from ``def_45`` and return its Gini.

    The column is built as ``def_45 * multiplier`` plus zero-mean Gaussian
    noise whose standard deviation is ``std(def_45) * noise``. Values that
    land outside ``[0, 1]`` are replaced by fresh uniform draws from
    ``[0, 1)``, and the result is rounded to ``round`` decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``def_45`` column; it is modified in place.
    name : str
        Name of the column to create (or overwrite).
    multiplier : float
        Weight of the target inside the generated score.
    noise : float
        Noise level, as a multiple of the standard deviation of ``def_45``.
    round : int, default 3
        Number of decimals kept in the stored column.

    Returns
    -------
    float
        ``gini(df['def_45'], df[name])`` for the finished column.
    """
    target = df['def_45']
    sigma = target.std() * noise
    df[name] = target * multiplier + np.random.normal(loc=0, scale=sigma, size=len(df))

    # Out-of-range values are redrawn uniformly rather than clipped.
    # The low side is handled first, then the high side, so the order of
    # RNG draws stays fixed.
    below_zero = df[name] < 0
    df.loc[below_zero, name] = np.random.uniform(0, 1, below_zero.sum())

    above_one = df[name] > 1
    df.loc[above_one, name] = np.random.uniform(0, 1, above_one.sum())

    df[name] = df[name].round(round)
    return gini(target, df[name])

In [140]:
# Score with the largest target weight of the four generated below; the cell
# output is its Gini against def_45.
generate_feature(df, name='MTS_score', multiplier=0.6, noise=0.6)

0.43610685538419913

In [141]:
# Score with a smaller target weight and less noise; the cell output is its Gini.
generate_feature(df, name='fraud_score', multiplier=0.4, noise=0.3)

0.3410475006764071

In [142]:
# Same noise level as fraud_score but a smaller target weight, stored with
# 5 decimals instead of the default 3.
generate_feature(df, name='OKB_score', multiplier=0.3, noise=0.3, round=5)

0.17327571191828994

In [143]:
# Lowest weight and lowest noise of the four scores, stored with 2 decimals.
generate_feature(df, name='antibankrupt_score', multiplier=0.25, noise=0.15, round=2)

0.25453471658549787

In [144]:
# Flip the score (x -> 1 - x) so that it decreases with def_45 instead of
# increasing with it.
# NOTE: this cell is not idempotent - running it a second time flips the
# column back.
df['antibankrupt_score'] = df['antibankrupt_score'].rsub(1)

#### генерация мусорных фич

In [149]:
# Existing df columns from which the `<name>_half` and `<name>_worse`
# variants are derived in the cells below.
columns_to_copy = [
    'avg_credit_card_utilization_last_12m',
    'num_late_payments_last_24m',
    'max_num_simultaneous_loans',
    'max_utilization_single_card',
    'revolving_balance_trend_12m',
]

In [None]:
# Precondition: every source column must already be present in df.
# NOTE(review): no cell visible in this notebook creates these columns (the
# execution counts jump from In[144] to In[149]), so on a fresh
# "Restart & Run All" this check fails up front with an actionable message
# instead of a bare KeyError from inside the loop.
missing_cols = [col for col in columns_to_copy if col not in df.columns]
if missing_cols:
    raise KeyError(
        f'columns_to_copy entries not found in df: {missing_cols}. '
        'Create or load these columns before running this cell.'
    )

# For each source column add `<col>_half`: a copy in which a freshly drawn
# random half of the rows (sampled without replacement) is set to NaN.
half_size = len(df) // 2
for col in columns_to_copy:
    nan_indices = np.random.choice(df.index, size=half_size, replace=False)
    # Series.mask returns a new Series with NaN where the condition holds and
    # upcasts the dtype when needed, so this also works for integer-typed
    # sources, which cannot hold NaN in place.
    df[f'{col}_half'] = df[col].mask(df.index.isin(nan_indices))
df

In [None]:
# Add 204 sparse junk columns. Each one is named after one of the first 70
# columns of df plus a running suffix, is NaN in 95% of the rows, and holds
# standard-normal noise in the remaining rows.
filled_per_column = len(df) - int(0.95 * len(df))
for suffix in range(204):
    base_name = df.columns[suffix % 70]
    sparse_col = f'{base_name}_{suffix}'
    df[sparse_col] = np.nan
    # A different random subset of rows is filled for every column.
    filled_rows = np.random.choice(df.index, size=filled_per_column, replace=False)
    df.loc[filled_rows, sparse_col] = np.random.randn(len(filled_rows))
df

In [None]:
# For each source column add `<col>_worse`: the original values plus Gaussian
# noise whose standard deviation is 20% of the column's own (NaN-skipping)
# standard deviation.
for source in columns_to_copy:
    sigma = 0.2 * df[source].std(skipna=True)
    jitter = np.random.normal(0, sigma, size=len(df))
    df[f'{source}_worse'] = df[source] + jitter
df

In [None]:
# Add 69 binary junk columns with randomly generated names of the form
# `id_<4-digit number>_<word>`, each filled with random 0/1 values.
for _ in range(69):
    # Names are random, so two draws can coincide; assigning to an existing
    # name would silently overwrite that column and leave fewer than 69 new
    # ones. Redraw until the name is unused. When no collision occurs the
    # sequence of RNG draws is exactly one name draw followed by one value draw.
    while True:
        col = f'id_{np.random.randint(1000, 9999)}_{np.random.choice(["foo", "bar", "baz", "qux", "quux"])}'
        if col not in df.columns:
            break
    df[col] = np.random.randint(0, 2, size=len(df))
df

In [None]:
# Bookkeeping-style junk columns. Every column is sized from len(df) rather
# than a hard-coded 10000, so the cell keeps working if the row count changes.
n_rows = len(df)

# uuid.uuid4() draws from OS entropy and is not affected by np.random.seed,
# which would make `random_id` and `checksum` differ on every run. A dedicated
# seeded generator keeps them reproducible without consuming the global
# np.random stream used by the other columns.
uuid_rng = np.random.RandomState(0)


def _seeded_uuid4():
    """Return a version-4 UUID built from 16 bytes of the seeded generator."""
    return uuid.UUID(bytes=uuid_rng.bytes(16), version=4)


df['system_name'] = ['sys_%03d' % i for i in range(n_rows)]
df['random_id'] = [str(_seeded_uuid4()) for _ in range(n_rows)]
df['usbank_runentry_id'] = np.arange(10000, 10000 + n_rows)
df['status'] = np.random.choice(['0', '1', '2'], size=n_rows)
df['user_id'] = np.random.randint(10000, 99999, size=n_rows)
df['flag'] = np.random.choice([0, 1], size=n_rows)
df['zone'] = np.random.choice(['A', 'B', 'C'], size=n_rows)
df['priority'] = np.random.randint(1, 5, size=n_rows)
# First 8 decimal digits of the UUID's integer value.
df['checksum'] = [str(_seeded_uuid4().int)[:8] for _ in range(n_rows)]

df

In [None]:
# Re-attach the sequential ids as the index: add them as an `id` column and
# promote that column to the index, replacing the current one.
df = df.assign(id=ids).set_index('id')
df

In [None]:
# Ten columns of pure standard-normal noise under arbitrary names.
col_names_random = [
    'ekapusta_vector_9', 'Rusinterfinance_vector',
    'gamma_refactor_pod', 'quanta_latch_jx',
    'hexmux_torque_sr', 'delta_noise_pin',
    'polysync_emitter_vx', 'wrangle_bitport_8',
    'trace_mux_qr', 'scramble_node_5',
]
# One independent draw of len(df) values per column, in list order.
for noise_name in col_names_random:
    df[noise_name] = np.random.standard_normal(len(df))
df

#### сохранение

In [None]:
# Write the finished frame to t.csv in the working directory; the `id` index
# is written as a column of the file.
df.to_csv(path_or_buf='t.csv', index=True)