In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [2]:
%config Completer.use_jedi = False

In [3]:
# Load the raw receipt/transaction matching data. The file is colon-delimited, hence sep=':'.
# NOTE(review): the receipt and transaction id columns appear to load as strings with a
# thousands separator (later cells compare them against values such as "10,001") — confirm.
input_df = pd.read_csv("../data/data_interview_test.csv", sep=':')
print(len(input_df))
input_df.head()

12034


Unnamed: 0,receipt_id,company_id,matched_transaction_id,feature_transaction_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch
0,10000,10000,10468,10000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,10000,10000,10468,10001,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,10000,10000,10468,10003,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,10000,10000,10468,10004,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,10000,10000,10468,10005,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
input_df[input_df.receipt_id.eq("10,001")]

Unnamed: 0,receipt_id,company_id,matched_transaction_id,feature_transaction_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch
20,10001,10000,10605,10596,0.0,0.4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
21,10001,10000,10605,10597,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
22,10001,10000,10605,10598,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
23,10001,10000,10605,10599,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
24,10001,10000,10605,10600,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
25,10001,10000,10605,10605,0.9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
26,10001,10000,10605,10606,0.9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Sanity check: distinct matched_transaction_id values per (receipt_id, company_id) pair.
(
    input_df
    .groupby(['receipt_id', 'company_id'])['matched_transaction_id']
    .nunique()
    .unique()
)

array([1])

In [7]:
input_df.matched_transaction_id.eq(input_df.feature_transaction_id).mean()

0.071214891141765

#### Target Feature

In [8]:
# Target column: best_match is 1 on the candidate row whose feature_transaction_id
# equals the receipt's matched_transaction_id, and 0 on every other candidate row.
is_best_match = input_df['matched_transaction_id'] == input_df['feature_transaction_id']
input_df['best_match'] = is_best_match.astype(int)

In [9]:
input_df.head()

Unnamed: 0,receipt_id,company_id,matched_transaction_id,feature_transaction_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch,best_match
0,10000,10000,10468,10000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,10000,10000,10468,10001,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,10000,10000,10468,10003,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,10000,10000,10468,10004,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,10000,10000,10468,10005,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0


#### Train Test Split

In [10]:
# Train/test split at the group level: one group is one
# (receipt_id, company_id, matched_transaction_id) combination, so every candidate
# row of a given receipt ends up on the same side of the split.
indices = input_df[['receipt_id', 'company_id', 'matched_transaction_id']].drop_duplicates()
# No test_size is passed, so scikit-learn's default proportion applies; random_state fixes the shuffle.
train_indices, test_indices = train_test_split(indices, random_state=0)

In [11]:
# Move the group keys into the index so whole groups can be selected with .loc further down.
# NOTE(review): this rebinds input_df, so the cell is not re-runnable on its own — a second
# run would look for key columns that have already been moved into the index.
input_df = input_df.set_index(['receipt_id', 'company_id', 'matched_transaction_id'])
input_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,feature_transaction_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch,best_match
receipt_id,company_id,matched_transaction_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10000,10000,10468,10000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
10000,10000,10468,10001,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
10000,10000,10468,10003,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
10000,10000,10468,10004,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
10000,10000,10468,10005,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [12]:
train_indices.values

array([['50,056', 50000, '50,079'],
       ['10,067', 10000, '10,500'],
       ['50,176', 50000, '50,287'],
       ...,
       ['40,079', 40000, '40,108'],
       ['30,153', 30000, '30,913'],
       ['30,284', 30000, '31,199']], dtype=object)

In [13]:
# Materialise the train and test sets from the group keys picked by the split.
train_keys = train_indices.values.tolist()
test_keys = test_indices.values.tolist()

# Select whole groups, push feature_transaction_id into the index, then split off the target.
x_train = input_df.loc[train_keys, :].set_index('feature_transaction_id', append=True)
y_train = x_train.pop("best_match")

x_test = input_df.loc[test_keys, :].set_index('feature_transaction_id', append=True)
y_test = x_test.pop("best_match")

# Index the full frame the same way as the two splits.
input_df = input_df.set_index('feature_transaction_id', append=True)

In [14]:
print(f"Train Data. Length {len(x_train)}")
display(x_train.head())
display(y_train.head())
print(f"\nTest Data. Length {len(x_test)}")
display(x_test.head())
display(y_test.head())

Train Data. Length 9175


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch
receipt_id,company_id,matched_transaction_id,feature_transaction_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
50056,50000,50079,50077,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50056,50000,50079,50079,0.95,0.0,0.4,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50056,50000,50079,50080,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10067,10000,10500,10496,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
10067,10000,10500,10497,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


receipt_id  company_id  matched_transaction_id  feature_transaction_id
50,056      50000       50,079                  50,077                    0
                                                50,079                    1
                                                50,080                    0
10,067      10000       10,500                  10,496                    0
                                                10,497                    0
Name: best_match, dtype: int64


Test Data. Length 2859


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch
receipt_id,company_id,matched_transaction_id,feature_transaction_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
50068,50000,50308,50307,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50068,50000,50308,50308,0.95,0.0,0.4,1.0,0.0,0.8,1.0,0.0,0.0,1.0
50068,50000,50308,50309,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50219,50000,50173,50170,0.0,0.4,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
50219,50000,50173,50171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


receipt_id  company_id  matched_transaction_id  feature_transaction_id
50,068      50000       50,308                  50,307                    0
                                                50,308                    1
                                                50,309                    0
50,219      50000       50,173                  50,170                    0
                                                50,171                    0
Name: best_match, dtype: int64

In [15]:
input_df.dtypes

DateMappingMatch           float64
AmountMappingMatch         float64
DescriptionMatch           float64
DifferentPredictedTime     float64
TimeMappingMatch           float64
PredictedNameMatch         float64
ShortNameMatch             float64
DifferentPredictedDate     float64
PredictedAmountMatch       float64
PredictedTimeCloseMatch    float64
best_match                   int64
dtype: object

In [16]:
input_df.describe()

Unnamed: 0,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch,best_match
count,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0,12034.0
mean,0.217901,0.03166,0.021522,0.986455,0.013877,0.024215,0.037893,0.753532,0.001005,0.076533,0.071215
std,0.384535,0.122611,0.116995,0.115597,0.116987,0.128646,0.190945,0.430972,0.020134,0.26586,0.257194
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,0.9,0.8,1.0,1.0,0.8,1.0,1.0,0.6,1.0,1.0


In [17]:
x_train.describe()

Unnamed: 0,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch
count,9175.0,9175.0,9175.0,9175.0,9175.0,9175.0,9175.0,9175.0,9175.0,9175.0
mean,0.21149,0.032033,0.019488,0.98703,0.013406,0.022779,0.039782,0.760436,0.001003,0.07564
std,0.380426,0.122726,0.110942,0.113151,0.115012,0.125216,0.195457,0.426841,0.020381,0.264436
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,0.9,0.8,1.0,1.0,0.8,1.0,1.0,0.6,1.0


In [18]:
x_test.describe()

Unnamed: 0,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch
count,2859.0,2859.0,2859.0,2859.0,2859.0,2859.0,2859.0,2859.0,2859.0,2859.0
mean,0.238475,0.030465,0.028052,0.98461,0.01539,0.028821,0.031829,0.731375,0.001014,0.079398
std,0.396803,0.122255,0.134407,0.12312,0.12312,0.139007,0.175576,0.443322,0.019322,0.270407
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.65,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,0.9,0.8,1.0,1.0,0.8,1.0,1.0,0.6,1.0


In [19]:
pd.concat((input_df.best_match.value_counts(), input_df.best_match.value_counts(normalize=True)), axis=1)

Unnamed: 0,best_match,best_match.1
0,11177,0.928785
1,857,0.071215


In [20]:
y_train.value_counts()

0    8540
1     635
Name: best_match, dtype: int64

In [21]:
y_test.value_counts()

0    2637
1     222
Name: best_match, dtype: int64

#### Models and their evaluation results

##### Random Forest

In [22]:
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
model_rf = random_forest.fit(x_train, y_train)

In [23]:
train_preds = model_rf.predict(x_train)
test_preds = model_rf.predict(x_test)

In [24]:
(train_preds==(y_train)).mean()

0.9665395095367847

In [25]:
(test_preds==(y_test)).mean()

0.9576775096187479

In [26]:
def get_scores(y, y_hat):
    """
    Print accuracy, precision, recall and F1 for a set of predictions.

    y     -- true labels
    y_hat -- predicted labels

    Nothing is returned; each metric is written to stdout on its own line.
    """
    metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1Score", f1_score),
    )
    for label, metric in metrics:
        print(f"{label}: {metric(y, y_hat)}")

In [57]:
print(f"Random Forest scores\n")
get_scores(y_test, test_preds)

Random Forest scores

Accuracy: 0.9576775096187479
Precision: 0.8023952095808383
Recall: 0.6036036036036037
F1Score: 0.6889460154241647


##### XGBoost

In [29]:
xgboost = XGBClassifier(n_estimators=10, random_state=0)
model_xgb = xgboost.fit(x_train, y_train)





In [30]:
train_preds_xgb = model_xgb.predict(x_train)
test_preds_xgb = model_xgb.predict(x_test)

In [58]:
print("XGB scores\n")
get_scores(y_test, test_preds_xgb)

XGB scores

Accuracy: 0.9597761455054215
Precision: 0.8282208588957055
Recall: 0.6081081081081081
F1Score: 0.7012987012987013


###### XGBoost is slightly better than Random Forest on the test set (F1 ≈ 0.70 vs ≈ 0.69)

In [32]:
# Per-row class probabilities on the test set for both models; the columns are labelled
# 0 and 1 (later cells treat column 1 as the match probability).
test_prob_rf = pd.DataFrame(model_rf.predict_proba(x_test))
test_prob_xgb = pd.DataFrame(model_xgb.predict_proba(x_test))

display(test_prob_rf.head())
test_prob_xgb.head()

Unnamed: 0,0,1
0,1.0,0.0
1,0.109306,0.890694
2,0.848567,0.151433
3,1.0,0.0
4,1.0,0.0


Unnamed: 0,0,1
0,0.977492,0.022508
1,0.217482,0.782518
2,0.833774,0.166226
3,0.96547,0.03453
4,0.977492,0.022508


In [34]:
get_scores(y_test, (test_prob_rf[1]>=0.5).astype(int))

Accuracy: 0.9576775096187479
Precision: 0.8023952095808383
Recall: 0.6036036036036037
F1Score: 0.6889460154241647


In [60]:
print("Scores after changing the threshold to 0.3 \n")
get_scores(y_test, (test_prob_rf[1]>=0.3).astype(int))

Scores after changing the threshold to 0.3 

Accuracy: 0.956278419027632
Precision: 0.7621621621621621
Recall: 0.6351351351351351
F1Score: 0.6928746928746928


In [61]:
# Max/min probability of the predicted class, separately for rows the random forest
# labelled 0 and rows it labelled 1.
temp = pd.Series(test_preds)
labelled_negative = test_prob_rf.loc[temp.eq(0), 0]
labelled_positive = test_prob_rf.loc[temp.eq(1), 1]
print(labelled_negative.max(), labelled_negative.min())
labelled_positive.max(), labelled_positive.min()

1.0 0.5837053427768296


(1.0, 0.50616987301387)

In [37]:
test_preds

array([0, 1, 0, ..., 0, 0, 0])

In [38]:
x_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch
receipt_id,company_id,matched_transaction_id,feature_transaction_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
50068,50000,50308,50307,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50068,50000,50308,50308,0.95,0.0,0.4,1.0,0.0,0.8,1.0,0.0,0.0,1.0
50068,50000,50308,50309,0.95,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50219,50000,50173,50170,0.0,0.4,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
50219,50000,50173,50171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [39]:
threshold = 0.5

#### Calculate the rank based on the probabilities obtained

In [62]:
# Index levels that identify one receipt group (a receipt and its candidate transactions).
GROUP_LEVELS = ['receipt_id', 'company_id', 'matched_transaction_id']


def _match_rank(probs, min_top_prob, group_levels):
    """
    Rank the candidates of every group by descending match probability.

    probs        -- Series of match probabilities, indexed so that `group_levels`
                    are levels of its index
    min_top_prob -- a group is only ranked if its highest probability reaches this value
    group_levels -- index level names that define a group

    Returns an integer Series aligned with `probs`: rank 1 is the most likely
    candidate (ties share the lowest rank, i.e. method 'min'). Every row of a
    group whose best probability is below `min_top_prob` gets rank 0, meaning
    "no match predicted for this receipt".
    """
    by_group = probs.groupby(level=group_levels)
    ranks = by_group.rank('min', ascending=False).astype(int)
    # The rank-1 rows are exactly the rows holding the group maximum, so comparing
    # the group maximum with the threshold decides the whole group in one step.
    top_prob = by_group.transform('max')
    return ranks.where(top_prob.ge(min_top_prob), 0)


test_preds_df = pd.DataFrame(test_prob_rf[1].values, index=x_test.index, columns=["rf_probs"])
test_preds_df["rf_preds"] = test_preds
test_preds_df["rf_match_rank"] = _match_rank(test_preds_df["rf_probs"], threshold, GROUP_LEVELS)

test_preds_df["xgb_probs"] = test_prob_xgb[1].values
test_preds_df["xgb_preds"] = test_preds_xgb
test_preds_df["xgb_match_rank"] = _match_rank(test_preds_df["xgb_probs"], threshold, GROUP_LEVELS)

test_preds_df["actual"] = y_test

# Built only after every column exists; later cells reuse this groupby object.
grouper = test_preds_df.groupby(level=GROUP_LEVELS)

test_preds_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,rf_probs,rf_preds,rf_match_rank,xgb_probs,xgb_preds,xgb_match_rank,actual
receipt_id,company_id,matched_transaction_id,feature_transaction_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
50068,50000,50308,50307,0.0,0,3,0.022508,0,3,0
50068,50000,50308,50308,0.890694,1,1,0.782518,1,1,1
50068,50000,50308,50309,0.151433,0,2,0.166226,0,2,0
50219,50000,50173,50170,0.0,0,3,0.03453,0,3,0
50219,50000,50173,50171,0.0,0,3,0.022508,0,4,0


In [42]:
test_preds_df.rf_match_rank.eq(1).astype(int).eq(test_preds_df.actual).mean()

0.9580272822665268

In [70]:
# Per test group: did the random forest predict any match, and does a true match exist?
preds_match_present = grouper['rf_preds'].apply(lambda labels: (labels.values == 1).any())
actual_match_present = grouper['actual'].apply(lambda labels: (labels.values == 1).any())

display(preds_match_present)
display(actual_match_present)

# Share of groups where prediction and ground truth agree on whether a match exists.
presence_agreement = preds_match_present.eq(actual_match_present).mean()
f"% of times detecting whether there is a match available or not at all - {presence_agreement}"

receipt_id  company_id  matched_transaction_id
10,001      10000       10,605                    False
10,002      10000       10,286                     True
10,005      10000       10,474                    False
10,008      10000       10,303                    False
10,014      10000       10,650                     True
                                                  ...  
50,198      50000       50,251                     True
50,199      50000       50,075                     True
50,219      50000       50,173                     True
50,221      50000       50,034                     True
50,224      50000       50,050                     True
Name: rf_preds, Length: 289, dtype: bool

receipt_id  company_id  matched_transaction_id
10,001      10000       10,605                     True
10,002      10000       10,286                     True
10,005      10000       10,474                     True
10,008      10000       10,303                    False
10,014      10000       10,650                     True
                                                  ...  
50,198      50000       50,251                     True
50,199      50000       50,075                     True
50,219      50000       50,173                     True
50,221      50000       50,034                     True
50,224      50000       50,050                     True
Name: actual, Length: 289, dtype: bool

'% of times detecting whether there is a match available or not at all - 0.7058823529411765'

In [43]:
test_preds_df[test_preds_df.rf_match_rank.eq(2)].rf_match_rank

receipt_id  company_id  matched_transaction_id  feature_transaction_id
50,068      50000       50,308                  50,309                    2
50,219      50000       50,173                  50,174                    2
50,072      50000       50,160                  50,156                    2
                                                50,157                    2
                                                50,158                    2
                                                                         ..
50,224      50000       50,050                  50,045                    2
                                                50,046                    2
                                                50,047                    2
                                                50,048                    2
                                                50,049                    2
Name: rf_match_rank, Length: 285, dtype: int64

In [44]:
test_preds_df.rf_match_rank.eq(1).astype(int).eq(test_preds_df.actual).sum()

2739

In [46]:
# Count the actual matches (actual == 1) that the random forest ranked second within their group.
# NOTE(review): the output stored with this cell looks stale — it exceeds the number of rows
# that have rank 2 at all — so re-run the cell before relying on it.
(test_preds_df.rf_match_rank.isin((2,)) & test_preds_df.actual.eq(1)).sum()

2747

In [47]:
test_preds_df[test_preds_df.rf_match_rank.eq(2)].actual.value_counts()

0    277
1      8
Name: actual, dtype: int64

In [48]:
test_preds_df.head(8)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,rf_probs,rf_preds,rf_match_rank,xgb_probs,xgb_preds,xgb_match_rank,actual
receipt_id,company_id,matched_transaction_id,feature_transaction_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
50068,50000,50308,50307,0.0,0,3,0.022508,0,3,0
50068,50000,50308,50308,0.890694,1,1,0.782518,1,1,1
50068,50000,50308,50309,0.151433,0,2,0.166226,0,2,0
50219,50000,50173,50170,0.0,0,3,0.03453,0,3,0
50219,50000,50173,50171,0.0,0,3,0.022508,0,4,0
50219,50000,50173,50172,0.0,0,3,0.022508,0,4,0
50219,50000,50173,50173,0.833198,1,1,0.767543,1,1,1
50219,50000,50173,50174,0.151433,0,2,0.166226,0,2,0


In [341]:
len(test_preds_df)

2859

In [51]:
# Keep only the test groups in which the random forest predicted at least one match.
test_groups = test_preds_df.groupby(level=['receipt_id', 'company_id', 'matched_transaction_id'])
temp = test_groups.filter(lambda group: bool(group['rf_preds'].eq(1).any()))
print(len(temp))
temp.head()

959


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,rf_probs,rf_preds,rf_match_rank,xgb_probs,xgb_preds,xgb_match_rank,actual
receipt_id,company_id,matched_transaction_id,feature_transaction_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
50068,50000,50308,50307,0.0,0,3,0.022508,0,3,0
50068,50000,50308,50308,0.890694,1,1,0.782518,1,1,1
50068,50000,50308,50309,0.151433,0,2,0.166226,0,2,0
50219,50000,50173,50170,0.0,0,3,0.03453,0,3,0
50219,50000,50173,50171,0.0,0,3,0.022508,0,4,0


In [52]:
temp.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,rf_probs,rf_preds,rf_match_rank,xgb_probs,xgb_preds,xgb_match_rank,actual
receipt_id,company_id,matched_transaction_id,feature_transaction_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
50068,50000,50308,50307,0.0,0,3,0.022508,0,3,0
50068,50000,50308,50308,0.890694,1,1,0.782518,1,1,1
50068,50000,50308,50309,0.151433,0,2,0.166226,0,2,0
50219,50000,50173,50170,0.0,0,3,0.03453,0,3,0
50219,50000,50173,50171,0.0,0,3,0.022508,0,4,0
50219,50000,50173,50172,0.0,0,3,0.022508,0,4,0
50219,50000,50173,50173,0.833198,1,1,0.767543,1,1,1
50219,50000,50173,50174,0.151433,0,2,0.166226,0,2,0
50072,50000,50160,50156,0.0,0,2,0.022508,0,2,0
50072,50000,50160,50157,0.0,0,2,0.022508,0,2,0


In [53]:
y_test[pd.IndexSlice['10,005', 10000, '10,474', :]]

feature_transaction_id
10,461    0
10,462    0
10,465    0
10,466    0
10,467    0
10,468    0
10,469    0
10,470    0
10,471    0
10,472    0
10,473    0
10,474    1
10,477    0
10,478    0
10,479    0
Name: best_match, dtype: int64

In [54]:
# Number of candidate rows in every test group.
temp = (
    test_preds_df
    .groupby(level=['receipt_id', 'company_id', 'matched_transaction_id'])
    .size()
)
temp

receipt_id  company_id  matched_transaction_id
10,001      10000       10,605                     7
10,002      10000       10,286                     7
10,005      10000       10,474                    15
10,008      10000       10,303                    20
10,014      10000       10,650                     7
                                                  ..
50,198      50000       50,251                     1
50,199      50000       50,075                     7
50,219      50000       50,173                     5
50,221      50000       50,034                     4
50,224      50000       50,050                    12
Length: 289, dtype: int64

In [55]:
# Mean and median number of candidate transactions per receipt group in the test set
# (temp holds the per-group row counts computed in the previous cell).
temp.mean(), temp.median()

9.892733564013842