In [1]:
import math
import pickle
import warnings
from collections import defaultdict, Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from hs_gimme.constants.clients import ClientsNames
# NOTE(review): this silences every warning category for the rest of the session,
# including anything pandas or statsmodels would report from the cells below.
warnings.filterwarnings('ignore')

# Client accounts whose recommendations are pulled for this analysis.
# This is a set, so the order in which the fetch loop visits the accounts is not fixed.
ACCOUNTS = {
    ClientsNames.ANTALYA,
    ClientsNames.PORTO,
    ClientsNames.SEATTLE,
    ClientsNames.MOSCOW,
    ClientsNames.ATHENS,
    ClientsNames.CANCUN,
    ClientsNames.INDIANA,
    ClientsNames.ARIZONA,
    ClientsNames.OXFORD,
}

In [2]:
# Fields requested from the `req` collection (used as the find() projection) and
# later copied onto each recommendation row as same-named columns.
req_fields = [
    # job description
    'job_department',
    'country',
    'external_job_band',
    'external_recruiters',
    'top_category',
    'sub_category',
    'job_education',
    'seniority_level',
    'min_years_of_relevant_experience',
    'max_years_of_relevant_experience',
    'max_salary',
    'job_type',
    'is_visa_required',
    'is_remote_location',
    'external_status',
    'industry',
    # dates and flags
    'open_date',
    'past_candidates_distribution_date',
    'is_recent_grad',
    'is_intern',
    '_created_at',
    'date_posted',
    'job_create_date',
    'last_edit_date',
    # recruiting context
    'recruiter_roles',
    'recruiting_type',
]

In [3]:
# Dismissal reasons that override the stored status, mapped to the label they imply.
_STATUS_BY_DISMISSED_REASON = {
    **dict.fromkeys(
        (
            'Candidate invited to apply',
            'Qualified but candidate not interested at this time',
            'Qualified but candidate could not be reached',
            'Candidate invited to apply, other: Candidate invited to apply',
        ),
        'contacted',
    ),
    **dict.fromkeys(
        (
            'other: not needed',
            'other: This position is not posted externally. It is posted Internally only and is intended for a promotion for an Internal employee.',
            'other: not needed right now',
            'In process in another req',
            'Current Employee',
        ),
        'other',
    ),
}


def fix_recommendation_status(row):
    """Return the status label to use for one recommendation row.

    Args:
        row: mapping-like object (dict or DataFrame row) exposing
            'dismissed_reason' and 'recommendation_status'.

    Returns:
        'contacted' or 'other' when the dismissal reason is one of the known
        override reasons; otherwise the row's own 'recommendation_status'.
    """
    override = _STATUS_BY_DISMISSED_REASON.get(row['dismissed_reason'])
    return row['recommendation_status'] if override is None else override

In [4]:
# Pull, per account: recommendations created after the cutoff, dynamic-fetch actions
# created after the cutoff, and the reqs those recommendations point to.
# NOTE(review): `gmcdb` is not imported or defined in the visible cells.
since = datetime(2024, 2, 6)
rec_projection = [
    'req_id',
    'applied_recipes',
    'recommendation_status',
    'dismissed_reason',
    '_updated_at',
    '_created_at',
    'is_from_dynamic_fetch',
    'current_talent_id',
    'removed_date',
]

data, actions, reqs = [], [], {}
for i, account in enumerate(ACCOUNTS):
    mongo = gmcdb('production', account.lower())

    recs = list(mongo.recommended_candidate.find({'_created_at': {'$gt': since}}, rec_projection))

    # The actions collection is filtered on 'created_at' (no leading underscore).
    act = list(mongo.dynamic_fetch_action.find({'created_at': {'$gt': since}}))
    actions.extend(act)

    # Some recommendations carry no req_id; only the ones that do are looked up.
    req_ids = list({r['req_id'] for r in recs if 'req_id' in r})
    for req in mongo.req.find({'_id': {'$in': req_ids}}, req_fields):
        reqs[req['_id']] = req

    recipe_counts = Counter(r.get('applied_recipes') for r in recs)
    print(i, '/', len(ACCOUNTS), account, len(recs), recipe_counts)

    # Tag every record with its source account before pooling.
    for r in recs:
        r['account_id'] = account
    data.extend(recs)

[2m2024-05-19T15:24:05.807217Z[0m [[32m[1minfo     [0m] [1mMongos instances selected     [0m [36menvironment[0m=[35mlocal[0m [36mfile_path[0m=[35m/Users/dima/hiredscore/gimme/hs_gimme/db_facade/connection_string_builder.py[0m [36mfunction_name[0m=[35mget_selected_mongos_instances[0m [36mhostname[0m=[35mDima-Shulga-MacBook-Pro[0m [36mhosts[0m=[35m['applicativedb-prod-mongos-4.omcomcom.com', 'applicativedb-prod-mongos-0.omcomcom.com'][0m [36mline_number[0m=[35m38[0m [36mmodule[0m=[35mhs_gimme.db_facade.connection_string_builder[0m [36mpid[0m=[35m2589[0m



KeyboardInterrupt



In [None]:
# One row per fetched recommendation; the status column is rewritten row by row
# through fix_recommendation_status.
df = pd.DataFrame(data).assign(
    recommendation_status=lambda frame: frame.apply(fix_recommendation_status, axis=1)
)

In [None]:
# Number of rows per recommendation_status value.
df['recommendation_status'].value_counts()

In [None]:
# Keep rows that (a) have a status other than 'new'/'other', (b) are not flagged as
# coming from dynamic fetch, and (c) have a non-null applied_recipes value.
#
# The dynamic-fetch flag is missing on some rows, so the filled column is cast to bool
# explicitly: `~` is only a logical NOT on a boolean Series, and the result of
# fillna(False) is not guaranteed to have bool dtype.
is_dynamic_fetch = df['is_from_dynamic_fetch'].fillna(False).astype(bool)
has_outcome = ~df['recommendation_status'].isin({'new', 'other'})
df = df[has_outcome & ~is_dynamic_fetch].dropna(subset=['applied_recipes'])
df.shape

In [None]:
# Copy every requested req field onto the recommendation rows, looked up by req_id.
# Rows whose req_id is missing, or whose req was not fetched, get None instead of
# raising KeyError (the fetch step itself skips recommendations without a req_id).
# NOTE(review): req_fields contains '_created_at', so this overwrites the
# recommendation's own '_created_at' column with the req's value — confirm that is intended.
for f in req_fields:
    df[f] = df['req_id'].apply(lambda x: reqs.get(x, {}).get(f))

In [None]:
# Outcome flag: statuses treated as a positive recruiter action.
df['is_positive'] = df['recommendation_status'].isin({'contacted', 'shared_with_hm', 'shared_with_recruiter', 'profile_sent'})

# The req lookup stores None when a req has no recruiter_roles, so guard the access
# instead of subscripting None; such rows get user = None.
df['user'] = df['recruiter_roles'].apply(lambda x: x.get('recruiter') if isinstance(x, dict) else None)

# Explicit bool cast so later boolean operations do not depend on fillna's result dtype.
df['is_from_dynamic_fetch'] = df['is_from_dynamic_fetch'].fillna(False).astype(bool)

# Treatment indicator: True only when applied_recipes is exactly 'base_fetch'.
df['is_new_algo'] = df['applied_recipes'] == 'base_fetch'

# Min-max scale the minimum required experience to [0, 1] (NaN rows stay NaN).
min_years = df['min_years_of_relevant_experience']
min_years_low = min_years.min()
df['min_years_of_relevant_experience_scaled'] = (min_years - min_years_low) / (min_years.max() - min_years_low)

In [None]:
# Replace each account label with the pair ['all', <account>].
# Not idempotent: running this cell twice nests the lists.
df['account_id'] = df['account_id'].map(lambda account_label: ['all', account_label])

In [None]:
# Turn each ['all', <account>] pair into two rows, so every recommendation appears once
# under the pooled 'all' label and once under its own account.
df = df.explode('account_id')

In [None]:
# Row/column count of df at this point in the notebook.
df.shape

In [None]:
# Per-account comparison of the two recipe arms.
gb = ['account_id', 'applied_recipes']
# Alternative, finer grouping: gb = ['account_id', 'top_category', 'applied_recipes']

# Count recommendations per (group, status) and pivot statuses into columns.
gdf = df.groupby(gb + ['recommendation_status'])['_id'].count()
udf = gdf.unstack().fillna(0)

# A status that never occurs has no column after the pivot; create it so the
# arithmetic below cannot raise KeyError (the bootstrap cell does the same).
for status in ('viewed', 'contacted', 'dismissed'):
    if status not in udf:
        udf[status] = 0

udf['num_of_reqs'] = df.groupby(gb)['req_id'].nunique()
udf['num_of_users'] = df.groupby(gb)['user'].nunique()

# Add-one smoothing on viewed and contacted keeps the rate away from 0,
# so the ratio below is never a division by zero.
udf['viewed'] += 1
udf['contacted'] += 1
udf['total'] = udf['contacted'] + udf['dismissed'] + udf['viewed']
udf['rate'] = udf['contacted'] / udf['total']

# Pivot the recipe arm into the columns and compare new vs. old contact rate.
xdf = udf.unstack()
xdf['ratio'] = xdf[('rate', 'base_fetch')] / xdf[('rate', 'base_fetch&legacy_query_fetch_for_ab_test')]
# dropna returns a new frame; the result must be assigned for the rows without a
# ratio (one arm missing) to actually be excluded.
xdf = xdf.dropna(subset=[('ratio', '')])

xdf.reset_index().groupby([('account_id', '')]).agg(num_of_recommendations_new=(('total', 'base_fetch') , 'sum'), 
                                                    num_of_recommendations_old=(('total', 'base_fetch&legacy_query_fetch_for_ab_test') , 'sum'),
                                                    num_of_reqs_new=(('num_of_reqs', 'base_fetch') , 'sum'), 
                                                    num_of_reqs_old=(('num_of_reqs', 'base_fetch&legacy_query_fetch_for_ab_test') , 'sum'),
                                                    num_of_users_new=(('num_of_users', 'base_fetch') , 'sum'), 
                                                    num_of_users_old=(('num_of_users', 'base_fetch&legacy_query_fetch_for_ab_test') , 'sum'),
                                                    adjusted_change_median=(('ratio', ''), 'median'),
                                                    adjusted_change_mean=(('ratio', ''), 'mean'),  
                                                   )

In [None]:
# Write the per-account comparison table to a CSV in the user's Downloads folder.
# NOTE(review): the file is named "athens" although xdf is grouped over every
# account_id present in df — confirm the intended file name.
xdf.to_csv('~/Downloads/athens.csv')

In [None]:
# Box plot of the new/old contact-rate ratio per account.
xdf.boxplot(column='ratio', by='account_id', figsize=(13, 5))
# Reference line at ratio == 1 (no difference between arms). axhline spans the whole
# axis regardless of how many boxes are drawn, unlike a fixed-length plotted list.
plt.axhline(1, color='red', alpha=0.5)
plt.show()

In [None]:
# Persist the row-level frame to the working directory.
# After the account_id explode step each recommendation is present twice
# (once as 'all', once under its own account).
df.to_csv('ab_test_results_mar_2.csv')

# Bootstrap

In [None]:
# Percentile bootstrap of the new-vs-old contact-rate ratio, one interval per account.
# Relies on `gb` (the grouping columns) defined in the summary-table cell.
N_BOOTSTRAP = 1000
BOOTSTRAP_SEED = 42
NEW_ARM = 'base_fetch'
OLD_ARM = 'base_fetch&legacy_query_fetch_for_ab_test'
OUTCOME_STATUSES = ('viewed', 'contacted', 'dismissed')

# One seeded generator shared by every draw: reruns reproduce the same interval,
# while successive replicates still differ because the generator state advances.
bootstrap_rng = np.random.RandomState(BOOTSTRAP_SEED)

for account, account_df in list(df.groupby('account_id')):
    bootstrap_ratios = []
    for _ in range(N_BOOTSTRAP):
        # Resample the account's rows with replacement, same size as the original.
        resampled = account_df.sample(n=len(account_df), replace=True, random_state=bootstrap_rng)
        status_counts = resampled.groupby(gb + ['recommendation_status'])['_id'].count()
        arm_table = status_counts.unstack().fillna(0)

        # A status absent from this resample has no column; create it as zero.
        for status in OUTCOME_STATUSES:
            if status not in arm_table:
                arm_table[status] = 0

        # Same add-one smoothing as the summary table.
        arm_table['viewed'] += 1
        arm_table['contacted'] += 1
        arm_table['total'] = arm_table['contacted'] + arm_table['dismissed'] + arm_table['viewed']
        arm_table['rate'] = arm_table['contacted'] / arm_table['total']

        account_xdf = arm_table.unstack()
        # If a whole arm is missing from the resample there is nothing to compare;
        # skip the replicate instead of failing on the missing column.
        if ('total', NEW_ARM) not in account_xdf.columns or ('total', OLD_ARM) not in account_xdf.columns:
            continue
        account_xdf = account_xdf.dropna(subset=[('total', NEW_ARM), ('total', OLD_ARM)])
        ratio = account_xdf[('rate', NEW_ARM)] / account_xdf[('rate', OLD_ARM)]

        m = ratio.dropna().median()

        # Reject NaN as well as +/-inf: a single NaN would turn the mean and the
        # percentiles below into NaN.
        if np.isfinite(m):
            bootstrap_ratios.append(m)

    print(account, len(account_df), 'actions')
    if not bootstrap_ratios:
        print('No usable bootstrap replicates for this account')
        print()
        continue
    print('Mean of median:', np.mean(bootstrap_ratios))
    confidence_interval = np.percentile(bootstrap_ratios, [5, 95])
    print("Bootstrap 90% Confidence Interval:", confidence_interval)
    print(round(np.mean(bootstrap_ratios), 2), [round(x, 2) for x in confidence_interval])
    print()

# Logistic Regression

In [None]:
# Logistic regression of is_positive on the treatment flag (is_new_algo), adjusted for
# categorical confounders, fitted once per account_id group and once on the pooled rows.
conf = ['account_id', 'top_category',  'job_education']

# df holds every recommendation twice after the account_id explode step ('all' plus its
# own account). The pooled fit must use only the per-account rows, otherwise each
# observation is counted twice.
pooled_df = df[df['account_id'] != 'all']

for account, account_df in list(df.groupby('account_id')) + [('ALL', pooled_df)]:
    model_df = pd.get_dummies(account_df[conf]).astype(int)

    # Drop one reference level per confounder. Dummy columns are named
    # '<field>_<value>', so match on that prefix rather than on a substring, and
    # tolerate a confounder that produced no dummy columns at all.
    to_drop = []
    for f in conf:
        levels = [c for c in model_df.columns if c.startswith(f + '_')]
        if levels:
            to_drop.append(levels[0])

    print('Dropping:', to_drop)
    model_df = model_df.drop(to_drop, axis=1)
    model_df['is_new_algo'] = account_df['is_new_algo'].astype(int)
    # Missing experience is imputed with 1, the top of the scaled range.
    model_df['min_years_of_relevant_experience_scaled'] = account_df['min_years_of_relevant_experience_scaled'].fillna(1)
    model_df = sm.add_constant(model_df)

    # maxiter is an optimiser option and belongs to fit(), not to the model constructor.
    res = sm.Logit(account_df['is_positive'].astype(int), model_df).fit(maxiter=200)
    print(f"OR={np.exp(res.params['is_new_algo'])}, p={res.pvalues['is_new_algo']}")
    print()

In [None]:
# Per-account logistic regression with all dummy levels included; whenever the fit
# raises, the predictor with the highest variance inflation factor is removed and the
# fit is retried.
results = []
conf = ['account_id', 'sub_category', 'user', 'job_education']
# Columns that must survive pruning: the intercept and the effect being estimated.
protected_columns = {'const', 'is_new_algo'}

for account_id, account_df in df.groupby('account_id'):
    # The pooled 'all' pseudo-account is skipped in this cell.
    if account_id == 'all':
        continue

    model_df = pd.get_dummies(account_df[conf]).astype(int)
    model_df['is_new_algo'] = account_df['is_new_algo'].astype(int)
    # Missing experience is imputed with 1, the top of the scaled range.
    model_df['min_years_of_relevant_experience_scaled'] = account_df['min_years_of_relevant_experience_scaled'].fillna(1)
    model_df = sm.add_constant(model_df)
    print(account_id)

    while True:
        try:
            res = sm.Logit(account_df['is_positive'].astype(int), model_df).fit(disp=0)
        except Exception as e:
            # Best effort: any fitting failure triggers one pruning step.
            print(e, model_df.shape)

            candidates = [
                (c, variance_inflation_factor(model_df.values, i))
                for i, c in enumerate(model_df.columns)
                if c not in protected_columns
            ]
            if not candidates:
                # Nothing left to remove: stop instead of looping forever.
                print('giving up on', account_id, '- no removable predictors left')
                break

            to_drop = max(candidates, key=lambda x: x[1])
            model_df = model_df.drop(to_drop[0], axis=1)

            print('dropping', to_drop, 'new shape', model_df.shape)
        else:
            # Label the row with the account of this iteration.
            results.append({'account_id': account_id, 'odds_ratio':  np.exp(res.params['is_new_algo']), 'p_value': res.pvalues['is_new_algo']})
            break

In [None]:
# Shape of whichever design matrix model_df currently holds.
model_df.shape

In [None]:
# Variance inflation factor of every column of the current design matrix, largest first.
X = add_constant(model_df)
VIF = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
# Pair the values with X's own columns: if add_constant had to prepend a constant,
# model_df.columns would be shifted by one relative to VIF.
for c, v in sorted(zip(X.columns, VIF), key=lambda x: x[1], reverse=True):
    print(c, v)

In [None]:
# NOTE(review): the label is set by hand while `res` is whatever model was fitted last
# in the kernel; nothing here ties the printed odds ratio to 'seattle' — confirm.
account_id = 'seattle'

        

print(f"{account_id}, OR={np.exp(res.params['is_new_algo'])}, p={res.pvalues['is_new_algo']}")

In [None]:
# Shape of whichever design matrix model_df currently holds.
model_df.shape

In [None]:
# Column names of the current design matrix.
model_df.columns

In [None]:
# Pooled logistic regression over all per-account rows (the duplicated 'all' rows are
# excluded), adjusting for account, job category, recruiter and education.
conf = ['account_id', 'top_category', 'user', 'job_education']
xdf = df[df['account_id'] != 'all']
model_df = pd.get_dummies(xdf[conf]).astype(int)

# Drop one reference level per confounder. Dummy columns are named '<field>_<value>';
# matching on that prefix avoids picking a column of another field whose value merely
# contains the field name (e.g. a category containing "user"). A confounder without
# any dummy columns is skipped.
to_drop = []
for f in conf:
    levels = [c for c in model_df.columns if c.startswith(f + '_')]
    if levels:
        to_drop.append(levels[0])

print('Dropping:', to_drop)
model_df = model_df.drop(to_drop, axis=1)
model_df['is_new_algo'] = xdf['is_new_algo'].astype(int)
# Missing experience is imputed with 1, the top of the scaled range.
model_df['min_years_of_relevant_experience_scaled'] = xdf['min_years_of_relevant_experience_scaled'].fillna(1)
model_df = sm.add_constant(model_df)

# maxiter is an optimiser option and belongs to fit(), not to the model constructor.
res = sm.Logit(xdf['is_positive'].astype(int), model_df).fit(maxiter=200)
print(f"OR={np.exp(res.params['is_new_algo'])}, p={res.pvalues['is_new_algo']}")
print()