In [1]:
import pandas as pd
import numpy as np
import pickle
import catboost as catb

# Load the pickled `categories` object used by the encoding helpers below:
# get_map / label_encode_df iterate over it for column names, and the scoring
# loop passes a copy of it as the `mapper` argument (presumably a dict of
# {column: {raw value: encoded value}} -- confirm against the producer).
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# from a trusted location.
with open('/home/hchase/shared_data/data/venri/harrison_tmp/categories.pkl', 'rb') as f:
        categories = pickle.load(f)

def get_map(df):
    """Count value frequencies for every categorical column present in ``df``.

    Column names are taken from the module-level ``categories``; names that
    are not columns of ``df`` are skipped.  Values are cast to ``str`` before
    counting.

    Returns:
        dict: ``{column: {str(value): count}}`` for each matching column.
    """
    present = (col for col in categories if col in df.columns)
    return {
        col: dict(df[col].astype(str).value_counts())
        for col in present
    }
    
def label_encode_df(df, mapper):
    """Encode the categorical columns of ``df`` in place and blank out 'NO_DATA'.

    For every name in the module-level ``categories`` that is a column of
    ``df``, each value is replaced by ``mapper[name].get(value)``; values that
    are not keys of that lookup therefore become missing.  Afterwards every
    remaining cell equal to the string 'NO_DATA' is set to NaN.

    Args:
        df: frame to encode; it is modified in place.
        mapper: ``{column: {raw value: encoded value}}``; must contain every
            encoded column (a missing one raises ``KeyError``).

    Returns:
        The same ``df`` object that was passed in.
    """
    encodable = [col for col in categories if col in df.columns]
    for col in encodable:
        lookup = mapper[col].get
        df[col] = df[col].apply(lookup)
    # Applied after encoding, so it only touches cells that still hold the
    # literal string.
    no_data_mask = df == 'NO_DATA'
    df[no_data_mask] = np.nan
    return df

# Read the full list of column names: the header file is a single string split
# on the BEL character ('\x07', the same byte as the '\a' separator used by
# the read_csv calls in this notebook).
# NOTE(review): the content is not stripped, so a trailing newline in the file
# would end up inside the last column name -- confirm.
with open('/home/hchase/header2184_8.encode') as f:
    headers = f.read().split('\x07')

def load_file(filename, columns=None, col_names=None):
    """Read a headerless, BEL-separated file into a DataFrame and name its columns.

    Args:
        filename: path of the file to read.
        columns: passed to ``read_csv`` as ``usecols``; ``None`` reads every
            column.
        col_names: names to assign to the loaded columns.  When ``None`` or
            empty, the module-level ``headers`` list is used instead.

    Returns:
        pandas.DataFrame with ``col_names`` (or ``headers``) as its columns.

    Raises:
        ValueError: if the number of names does not match the number of
            loaded columns (raised by pandas on assignment).
    """
    df = pd.read_csv(filename, usecols=columns, sep='\a', header=None)
    # An explicit emptiness test instead of `col_names or headers`: the
    # truthiness of a numpy array / pandas Index is ambiguous and raises.
    if col_names is None or len(col_names) == 0:
        col_names = headers
    df.columns = col_names
    return df

def get_X_df(df):
    """Return the sub-frame of ``df`` made of the columns whose name starts with 'v'."""
    feature_cols = []
    for name in df.columns:
        if name.startswith('v'):
            feature_cols.append(name)
    return df[feature_cols]

def get_X_y(df):
    """Split ``df`` into ``(features, target)``.

    Features come from ``get_X_df``; the target is the 'tag' column.
    """
    features = get_X_df(df)
    target = df['tag']
    return features, target

def load_file_downsampled(filename, ratio, usecols=None, nrows=None, skiprows=None):
    """Load a BEL-separated file with a header row, optionally dropping negatives.

    When ``ratio`` is given, the 'tag' column is read first, the rows with
    ``tag == 0`` are shuffled with ``np.random.shuffle`` (seed ``np.random``
    beforehand for a reproducible sample) and only ``int(positives * ratio)``
    of them are kept; every row with a non-zero tag is kept.

    Args:
        filename: path of the file; its first line must be the header.
        ratio: number of negatives to keep per positive, or ``None`` to
            disable downsampling.  May be fractional.
        usecols: forwarded to ``read_csv``.
        nrows: only the first ``nrows`` data rows are considered; ``None``
            means the whole file.
        skiprows: forwarded to ``read_csv`` only when ``ratio`` is ``None``;
            it is ignored when downsampling.

    Returns:
        pandas.DataFrame with the selected rows in file order.
    """
    if ratio is None:
        skip_inds = skiprows
    else:
        tags = pd.read_csv(
            filename,
            sep='\a',
            usecols=['tag'],
            nrows=nrows,
        )
        print('loaded all tags')
        # +1 converts a data-row position into a file line number, because
        # line 0 of the file is the header.
        neg_ind_list = list(tags[tags['tag'] == 0].index + 1)
        pos_egs = tags['tag'].sum()
        np.random.shuffle(neg_ind_list)
        # int(): a fractional ratio would otherwise produce a float slice
        # index, which raises TypeError.
        n_neg_keep = int(pos_egs * ratio)
        skip_inds = neg_ind_list[n_neg_keep:]
        # With nrows=None the whole file is read, so there is nothing to
        # adjust (the subtraction used to raise TypeError in that case).
        if nrows is not None:
            nrows = nrows - len(skip_inds)
        print(len(skip_inds))
    return pd.read_csv(
        filename,
        header=0,
        sep='\a',
        skiprows=skip_inds,
        usecols=usecols,
        nrows=nrows,
    )
    

In [2]:
# Input file that gets scored and output CSV that receives the per-model predictions.
# NOTE(review): despite its name, `all_train_file` points at 'all_test_oot'
# (presumably an out-of-time test set) -- confirm this is the intended input.
all_train_file = '/home/hchase/shared_data/data/venri/training_set/all_test_oot'
save_path = '/home/hchase/shared_data/data/venri/harrison_tmp/harrison_preds_by_model_1_16.csv'

In [3]:
# Column names assigned to the scoring file: the full header list minus
# 'cc_control', 'tag' and 'brm_bad_tag_assigned' (presumably because those
# columns are absent from the file being scored -- confirm).
headers_test = [col for col in headers if col not in {'cc_control', 'tag', 'brm_bad_tag_assigned'}]

In [79]:
# Count the lines of the scoring input; the same number is hard-coded as
# `total_row_count` in the chunked scoring cell.
!cat /home/hchase/shared_data/data/venri/training_set/all_test_oot  | wc -l

7760698


In [4]:
def logit(p):
    """Return the log-odds ``log(p / (1 - p))`` of probability ``p``.

    Args:
        p: a scalar, a list/tuple of numbers, a numpy array or a pandas
            Series.  Array-like inputs (anything exposing ``__array__``) are
            used as-is, so a Series comes back as a Series.

    Returns:
        The element-wise log-odds; ``inf`` / ``-inf`` at ``p == 1`` / ``p == 0``.
    """
    # Plain Python inputs are converted first: `1 - [..]` is a TypeError for
    # the lists returned by get_ps, and a Python float 1.0 would raise
    # ZeroDivisionError instead of yielding inf like numpy scalars do.
    if not hasattr(p, '__array__'):
        p = np.asarray(p, dtype=float)
    return np.log(p/(1-p))
def get_ps(mod, X):
    """Return, as a list, the second entry of every row of ``mod.predict_proba(X)``."""
    second_column = []
    for row in mod.predict_proba(X):
        second_column.append(row[1])
    return second_column

In [5]:
# File names of the saved models to score with.  The scoring loop loads each
# one from the harrison_tmp directory and writes one output column per model,
# or one column per class (named '<model>__<k>') when predict_proba returns
# more than two columns.
models_to_run = [
    '6m_all_1st_weighted_log_tag.md', 
    '6m_all_2nd_weighted_log_tag.md',
    '6m_all_3rd_weighted_log_tag.md', 
    '6m_all_4th_weighted_log_tag.md',
    '12m_feats_4_2nd_weighted_log_tag.md',
    '12m_feats_4_weighted_log_tag.md',
    'all_all_weighted_log_tag_downsampled_2_num2.md',
    'all_all_weighted_log_tag_downsampled_2.md',
    '24m_feats_4_weighted_log_tag.md',
    '6m_feats_4_weighted_log_tag_iterative_29_3.md',
    '12m_all_1st_weighted_log_tag.md', 
    '12m_all.md', 
    '6m_all.md',
    '6m_all_2nd.md',
    '6m_feats_4_1st_weighted_log_mult_tag_hard_classes_only.md',
    '12m_feats_4_1st_weighted_log_tag_multiclass.md',
    '12m_feats_4_2nd_weighted_log_tag_multiclass.md',
    '12m_feats_4_2nd_weighted_log_tag_everyclass.md',
    '24m_feats_4_weighted_log_tag_multi.md',
    '12m_all_weighted_log_tag_multi.md',
]

In [7]:
# Score the input file chunk by chunk with every model in `models_to_run` and
# append the predictions to `save_path` (the header is written with the first
# chunk only).
chunksize = 1000000
# Expected number of input rows (from `wc -l`); used only for the sanity
# check after the loop, no longer to decide how many chunks to read.
total_row_count = 7760698

# Load every model once: the models do not change between chunks, so
# reloading all of them for each chunk was pure overhead.
loaded_models = {}
for mod_name in models_to_run:
    mod = catb.CatBoostClassifier(thread_count=10)
    mod.load_model(f'/home/hchase/shared_data/data/venri/harrison_tmp/{mod_name}')
    loaded_models[mod_name] = mod

# label_encode_df only reads the mapper, so one copy serves all chunks.
mapper = categories.copy()

# Stream the file in a single pass.  Re-opening it with skiprows=chunksize*i
# re-scanned all earlier lines for every chunk, and the hand-computed chunk
# count (`total // chunksize + 1`) asked for an empty extra chunk whenever the
# row count was an exact multiple of the chunk size.
chunk_reader = pd.read_csv(all_train_file, sep='\a', header=None, chunksize=chunksize)
rows_written = 0
for i, raw_df in enumerate(chunk_reader):
    print(i)
    is_first_chunk = i == 0
    raw_df.columns = headers_test

    # Encode a copy so raw_df keeps the values exactly as read.
    df = label_encode_df(raw_df.copy(), mapper)

    X = get_X_df(df)
    pred_df = df[['trans_id', 'dol_wgt']].copy()
    for mod_name, mod in loaded_models.items():
        print(mod_name)
        preds = mod.predict_proba(X)
        n_classes = preds.shape[1]
        if n_classes == 2:
            # Two columns: keep only the second one.
            pred_df[mod_name] = preds[:, 1]
        else:
            # More than two columns: keep one output column per class.  A
            # dedicated index name avoids clobbering the chunk counter `i`.
            for class_idx in range(n_classes):
                pred_df[f'{mod_name}__{class_idx}'] = preds[:, class_idx]

    pred_df.to_csv(
        save_path,
        index=False,
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
    )
    rows_written += len(pred_df)

if rows_written != total_row_count:
    print(f'WARNING: scored {rows_written} rows but expected {total_row_count}')

0
6m_all_1st_weighted_log_tag.md
6m_all_2nd_weighted_log_tag.md
6m_all_3rd_weighted_log_tag.md
6m_all_4th_weighted_log_tag.md
12m_feats_4_2nd_weighted_log_tag.md
12m_feats_4_weighted_log_tag.md
all_all_weighted_log_tag_downsampled_2_num2.md
all_all_weighted_log_tag_downsampled_2.md
24m_feats_4_weighted_log_tag.md
6m_feats_4_weighted_log_tag_iterative_29_3.md
12m_all_1st_weighted_log_tag.md
12m_all.md
6m_all.md
6m_all_2nd.md
6m_feats_4_1st_weighted_log_mult_tag_hard_classes_only.md
12m_feats_4_1st_weighted_log_tag_multiclass.md
12m_feats_4_2nd_weighted_log_tag_multiclass.md
12m_feats_4_2nd_weighted_log_tag_everyclass.md
24m_feats_4_weighted_log_tag_multi.md
12m_all_weighted_log_tag_multi.md
1


  interactivity=interactivity, compiler=compiler, result=result)


6m_all_1st_weighted_log_tag.md
6m_all_2nd_weighted_log_tag.md
6m_all_3rd_weighted_log_tag.md
6m_all_4th_weighted_log_tag.md
12m_feats_4_2nd_weighted_log_tag.md
12m_feats_4_weighted_log_tag.md
all_all_weighted_log_tag_downsampled_2_num2.md
all_all_weighted_log_tag_downsampled_2.md
24m_feats_4_weighted_log_tag.md
6m_feats_4_weighted_log_tag_iterative_29_3.md
12m_all_1st_weighted_log_tag.md
12m_all.md
6m_all.md
6m_all_2nd.md
6m_feats_4_1st_weighted_log_mult_tag_hard_classes_only.md
12m_feats_4_1st_weighted_log_tag_multiclass.md
12m_feats_4_2nd_weighted_log_tag_multiclass.md
12m_feats_4_2nd_weighted_log_tag_everyclass.md
24m_feats_4_weighted_log_tag_multi.md
12m_all_weighted_log_tag_multi.md
2


  interactivity=interactivity, compiler=compiler, result=result)


6m_all_1st_weighted_log_tag.md
6m_all_2nd_weighted_log_tag.md
6m_all_3rd_weighted_log_tag.md
6m_all_4th_weighted_log_tag.md
12m_feats_4_2nd_weighted_log_tag.md
12m_feats_4_weighted_log_tag.md
all_all_weighted_log_tag_downsampled_2_num2.md
all_all_weighted_log_tag_downsampled_2.md
24m_feats_4_weighted_log_tag.md
6m_feats_4_weighted_log_tag_iterative_29_3.md
12m_all_1st_weighted_log_tag.md
12m_all.md
6m_all.md
6m_all_2nd.md
6m_feats_4_1st_weighted_log_mult_tag_hard_classes_only.md
12m_feats_4_1st_weighted_log_tag_multiclass.md
12m_feats_4_2nd_weighted_log_tag_multiclass.md
12m_feats_4_2nd_weighted_log_tag_everyclass.md
24m_feats_4_weighted_log_tag_multi.md
12m_all_weighted_log_tag_multi.md
3
6m_all_1st_weighted_log_tag.md
6m_all_2nd_weighted_log_tag.md
6m_all_3rd_weighted_log_tag.md
6m_all_4th_weighted_log_tag.md
12m_feats_4_2nd_weighted_log_tag.md
12m_feats_4_weighted_log_tag.md
all_all_weighted_log_tag_downsampled_2_num2.md
all_all_weighted_log_tag_downsampled_2.md
24m_feats_4_weighted

  interactivity=interactivity, compiler=compiler, result=result)


6m_all_1st_weighted_log_tag.md
6m_all_2nd_weighted_log_tag.md
6m_all_3rd_weighted_log_tag.md
6m_all_4th_weighted_log_tag.md
12m_feats_4_2nd_weighted_log_tag.md
12m_feats_4_weighted_log_tag.md
all_all_weighted_log_tag_downsampled_2_num2.md
all_all_weighted_log_tag_downsampled_2.md
24m_feats_4_weighted_log_tag.md
6m_feats_4_weighted_log_tag_iterative_29_3.md
12m_all_1st_weighted_log_tag.md
12m_all.md
6m_all.md
6m_all_2nd.md
6m_feats_4_1st_weighted_log_mult_tag_hard_classes_only.md
12m_feats_4_1st_weighted_log_tag_multiclass.md
12m_feats_4_2nd_weighted_log_tag_multiclass.md
12m_feats_4_2nd_weighted_log_tag_everyclass.md
24m_feats_4_weighted_log_tag_multi.md
12m_all_weighted_log_tag_multi.md
6


  interactivity=interactivity, compiler=compiler, result=result)


6m_all_1st_weighted_log_tag.md
6m_all_2nd_weighted_log_tag.md
6m_all_3rd_weighted_log_tag.md
6m_all_4th_weighted_log_tag.md
12m_feats_4_2nd_weighted_log_tag.md
12m_feats_4_weighted_log_tag.md
all_all_weighted_log_tag_downsampled_2_num2.md
all_all_weighted_log_tag_downsampled_2.md
24m_feats_4_weighted_log_tag.md
6m_feats_4_weighted_log_tag_iterative_29_3.md
12m_all_1st_weighted_log_tag.md
12m_all.md
6m_all.md
6m_all_2nd.md
6m_feats_4_1st_weighted_log_mult_tag_hard_classes_only.md
12m_feats_4_1st_weighted_log_tag_multiclass.md
12m_feats_4_2nd_weighted_log_tag_multiclass.md
12m_feats_4_2nd_weighted_log_tag_everyclass.md
24m_feats_4_weighted_log_tag_multi.md
12m_all_weighted_log_tag_multi.md
7
6m_all_1st_weighted_log_tag.md
6m_all_2nd_weighted_log_tag.md
6m_all_3rd_weighted_log_tag.md
6m_all_4th_weighted_log_tag.md
12m_feats_4_2nd_weighted_log_tag.md
12m_feats_4_weighted_log_tag.md
all_all_weighted_log_tag_downsampled_2_num2.md
all_all_weighted_log_tag_downsampled_2.md
24m_feats_4_weighted

In [6]:
# Load the predictions written to `save_path`.
# NOTE(review): the preview recorded below shows only trans_id/harrison_preds,
# whereas the scoring loop writes trans_id, dol_wgt and one column per model
# to this path -- presumably this cell ran against an earlier version of the
# file; confirm before re-running the cells that follow.
df1 = pd.read_csv(save_path)

In [7]:
# Preview the first rows of the loaded predictions.
df1.head()

Unnamed: 0,trans_id,harrison_preds
0,147781333032,0.015718
1,147781779038,0.000428
2,147783146838,0.011318
3,147787692999,0.133143
4,147795181902,0.019916


In [10]:
# Re-read only columns 0 and 3 of the headerless input file; per the df4
# preview further down these positions are trans_id and dol_wgt.
df2 = pd.read_csv(all_train_file, sep='\a', header=None, usecols=[0,3])

In [12]:
# Row/column count of the two-column extract (rows should equal the input line count).
df2.shape

(7760698, 2)

In [15]:
# Attach dol_wgt (column 3 of the input file) to the predictions.  The values
# are copied by position, so first make sure both frames list the same
# trans_id (column 0 of the input file) in the same order; otherwise weights
# would be silently paired with the wrong transactions.
if len(df1) != len(df2) or not (df1['trans_id'].values == df2[0].values).all():
    raise ValueError('df1 and df2 rows are not aligned on trans_id; cannot attach dol_wgt by position')
df1['dol_wgt'] = df2[3].values

In [17]:
# Positional rename: relies on df1 holding exactly three columns in the order
# trans_id, <prediction>, dol_wgt (pandas raises on a length mismatch).
df1.columns=['trans_id', 'score', 'dol_wgt']

In [18]:
# Persist the renamed predictions (trans_id, score, dol_wgt) without the index column.
df1.to_csv('/home/hchase/shared_data/data/venri/harrison_tmp/second_pass_predictions_harrison.csv', index=False)

In [19]:
# Read back the first 5 rows of the file just written, as a quick check of its contents.
df3 = pd.read_csv('/home/hchase/shared_data/data/venri/harrison_tmp/second_pass_predictions_harrison.csv', nrows=5)

In [20]:
# Preview the rows read back from the saved file.
df3.head()

Unnamed: 0,trans_id,score,dol_wgt
0,147781333032,0.015718,300.0
1,147781779038,0.000428,79.5
2,147783146838,0.011318,498.5
3,147787692999,0.133143,2000.0
4,147795181902,0.019916,478.5


In [21]:
# Load the first 5 rows of the headerless input file with all columns, to
# compare against the saved predictions.
df4 = pd.read_csv(all_train_file, sep='\a', header=None, nrows=5)

In [24]:
# Name the columns positionally using the scoring header list (pandas raises
# if the number of names differs from the number of columns).
df4.columns = headers_test

In [25]:
# Preview the raw input rows; trans_id and dol_wgt here can be compared with the df3 preview.
df4.head()

Unnamed: 0,trans_id,pmt_month,pmt_start_date,dol_wgt,usd_amt,v1,v2,v3,v4,v5,...,v2175,v2176,v2177,v2178,v2179,v2180,v2181,v2182,v2183,v2184
0,147781333032,2018/11,2018/11/01,300.0,6.0,426,14,36890766,1.0,GOOD_CONSISTENCY,...,0,-1.0,-1.0,8108.4,44,US,0.006,0.015,0,0
1,147781779038,2018/11,2018/11/01,79.5,1.59,2405,80,207853378,0.8,MED_CONSISTENCY,...,3,0.333333,1.0,6912.37,44,C2,0.004,0.001,0,0
2,147783146838,2018/11,2018/11/02,498.5,9.97,1278,42,110449780,0.9,MED_CONSISTENCY,...,0,-1.0,-1.0,8108.4,44,FR,0.0,0.0,0,0
3,147787692999,2018/11,2018/11/02,2000.0,40.0,0,0,4,-1.0,NO_DATA,...,0,-1.0,-1.0,28725.3,44,US,-1.0,-1.0,0,1
4,147795181902,2018/11,2018/11/02,478.5,9.57,13,0,1194782,0.192173,GOOD_CONSISTENCY,...,0,-1.0,-1.0,8108.4,44,IN,-1.0,-1.0,0,0


In [1]:
# Count the lines of first_pass_predictions_harrison.csv.
# NOTE(review): this file is not written by any cell visible in this notebook
# (the cells above write harrison_preds_by_model_1_16.csv and
# second_pass_predictions_harrison.csv); the count is presumably one header
# line plus one line per input row -- confirm.
!cat /home/hchase/shared_data/data/venri/harrison_tmp/first_pass_predictions_harrison.csv | wc -l

7760699
