In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import pytz
import numpy as np
import random
import seaborn as sns
import yaml

from datetime import datetime
from datetime import timedelta

from scipy import sparse
from sklearn.model_selection import train_test_split

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k

from google.cloud import storage

import warnings

warnings.simplefilter('ignore')
%matplotlib inline

In [None]:
# District processed by this run: used to filter the users table and to name the output file.
district_id = 1

### Declaring paths and GCS URIs

In [None]:
# GCS client used below to download the users file.
gcs_client = storage.Client(project="linkbal-dp")

# Current date in Asia/Tokyo, formatted as YYYYMMDD.
# NOTE(review): `today` is not referenced anywhere else in the visible notebook — confirm it is still needed.
today = datetime.strftime(datetime.now(tz=pytz.timezone("Asia/Tokyo")), "%Y%m%d")

bucket = "cl-personalization.datasets.linkbal.com"
beacon_path = "gs://" + bucket + "/inputs/beacon_events/"
# URI template for one day's dislikes CSV; `{}` is filled with a YYYYMMDD date string.
dislikes_file_base = beacon_path + "recommend-dislikes/{}_daily_recs_dislikes.csv"

### both users and likes data are provided by the collect-recs-data DAG in Airflow
prefix = "inputs/lightfm"
tmp_dir = "/tmp"
users_file = "users.csv"
# Blob name relative to the bucket (fetched through the GCS client), unlike `likes_path`, which is a full gs:// URI.
users_path = prefix + "/" + users_file
likes_file = "likes.csv"
likes_path = "gs://" + bucket + "/" + prefix + "/" + likes_file

# Output URI template; `{}` is filled with the district id.
rec_output_path = "gs://" + bucket + "/outputs/lightfm_v1_district_{}.csv"

# Global variables

In [None]:
# Threads used for parallel LightFM training: leave one core free, but never go below 1.
# os.cpu_count() may return None, and on a single-core host "cores - 1" would be 0.
NUM_THREADS = max(1, (os.cpu_count() or 1) - 1)
EPOCHS = 50  # number of training epochs passed to LightFM.fit

TARGET_RECS = 50  # also known as k: size of the top-scored slice that is shuffled before delivery
MAX_RECS = 100  # upper bound on the number of recommendations built per user

N_DAYS_DISLIKES_DATA = 14  # how many days of daily dislike files to look back over

Khoa-san had this variable below:

`M_MAX_REC = 400` — the maximum number of times a target user gets recommended.


I'm removing the max-rec limitation on individual target users. In some situations we have to recommend the same user to everybody, because we simply don't have enough users to recommend. If we do want a limit later, it should be a function of the number of target users versus the number of users available, rather than a hard-coded value that may not apply well to all districts.

# User data

### Quick and dirty fix with bash utils

In [None]:
# Download the users CSV from the bucket into the local temp directory.
# NOTE(review): get_blob presumably returns None when the object is missing, which would surface here
# as an AttributeError — confirm that failure mode is acceptable.
bucket_obj = gcs_client.get_bucket(bucket)
bucket_obj.get_blob(f'{users_path}').download_to_filename(f'{tmp_dir}/{users_file}')

In [None]:
# Rewrite the users file in place: replace every `"N,` with `,`, then strip all remaining double quotes.
! cat $tmp_dir/$users_file | sed 's/"N,/,/g' | sed 's/"//g' > $tmp_dir/clean_users.csv
! mv $tmp_dir/clean_users.csv $tmp_dir/$users_file

In [None]:
# Column names applied positionally to users.csv (passed as `names=` to read_csv),
# so the order here must match the column order of the file.
user_cols = [
    'user_id',
    'gender',
    'age',
    'self_introduction',
    'blood_type',
    'brother_and_sister',
    'job_id',
    'annual_salary_range',
    'body_shape',
    'education_background',
    'hometown_prefecture_id',
    'nationality',
    'holiday',
    'smoking',
    'drinking',
    'housemate',
    'sociality',
    'intention_to_marry',
    'marital_status',
    'absence_or_presence_of_child',
    'whether_want_child',
    'housework_and_child_rearing',
    'meeting_wish_type',
    'first_dating_expense_type',
    'height',
    'personal_color',
    'no_lover_history',
    'gamble',
    'cooking_skill',
    'completion_rate',
    'district_id',
    'created_at'
]

In [None]:
# Load the cleaned users file and keep only the rows of the selected district.
# NOTE(review): the dtype mapping names `target_user_id`, which is not one of `user_cols` —
# looks like a leftover from the likes loader; confirm it can be dropped.
user_df = pd.read_csv(
    tmp_dir + '/' + users_file,
    names = user_cols,
    parse_dates = ['created_at'],
    dtype = {'user_id': np.int32, 'target_user_id':np.int32}
).query('district_id == @district_id').reset_index(drop = True)

In [None]:
# (rows, columns) of the district-filtered users table.
user_df.shape

In [None]:
# Columns treated as continuous: missing values are replaced by the column mean.
continuous_vars = ['height', 'age', 'self_introduction']

# Columns that must keep their native dtype and therefore must NOT be filled with the string 'na':
# `completion_rate` is later compared against numeric thresholds and `created_at` against datetimes,
# and both comparisons would fail on a column that contains 'na' strings.
non_feature_vars = ['user_id', 'gender', 'completion_rate', 'district_id', 'created_at']

# Everything else is categorical; sorted so the list order is reproducible between runs.
discrete_vars = sorted(set(user_df.columns) - set(continuous_vars) - set(non_feature_vars))

### Missing values

In [None]:
def fillna(df, continuous_vars, discrete_vars):
    """Fill missing values of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; its columns are reassigned, nothing is returned.
    continuous_vars : iterable of str
        Columns whose missing values are replaced by the column mean
        (the mean is computed over the non-missing values).
    discrete_vars : iterable of str
        Columns whose missing values are replaced by the string ``'na'``.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on `df[col]`:
    # the in-place call acts on an intermediate Series and is not guaranteed to reach `df`
    # (it is a no-op under pandas Copy-on-Write).
    for col in continuous_vars:
        df[col] = df[col].fillna(df[col].mean())
    for col in discrete_vars:
        df[col] = df[col].fillna('na')

In [None]:
# Fill missing profile values in user_df (mean for continuous columns, 'na' for discrete ones).
fillna(user_df, continuous_vars, discrete_vars)

### Categorize age, height and self-introduction

In [None]:
def categorize_age(age, low, high):
    """Turn an age into a categorical label.

    Returns ``'lt_<low>'`` for ages below ``low``, ``'gt_<high>'`` for ages above
    ``high`` and ``str(age)`` for everything else (including values that compare
    neither below nor above, such as NaN).
    """
    is_below = age < low
    is_above = age > high
    if not (is_below or is_above):
        return str(age)
    return 'lt_' + str(low) if is_below else 'gt_' + str(high)

In [None]:
# Bucket ages: below 18 and above 60 are collapsed, ages in between keep their own label.
user_df['age_cat'] = user_df.age.apply(lambda x: categorize_age(x, 18, 60))

In [None]:
def categorize_height(height, low, high):
    """Turn a height into a categorical label.

    Returns ``'lt_<low>'`` for heights below ``low``, ``'gt_<high>'`` for heights
    above ``high``; otherwise the height truncated to an integer, as a string.
    """
    is_below = height < low
    is_above = height > high
    if not (is_below or is_above):
        return str(int(height))
    return 'lt_' + str(low) if is_below else 'gt_' + str(high)

In [None]:
# Bucket heights with gender-specific bounds: 150-190 when gender == 1, 140-180 otherwise.
user_df['height_cat'] = user_df.apply(
    lambda x: categorize_height(x.height, 150, 190) if x.gender == 1 else categorize_height(x.height, 140, 180), 
    axis = 1
)

In [None]:
def categorize_introduction(intro):
    """Bucket the numeric self-introduction value into a label.

    ``0`` maps to ``'na'``; values in ``(0, 500]`` map to 100-wide buckets
    (upper bound inclusive); anything else falls through to ``'gt_500'``.
    """
    if intro == 0:
        return 'na'

    # (exclusive lower bound, inclusive upper bound, label)
    buckets = (
        (0, 100, 'lt_100'),
        (100, 200, 'gt_100_lt_200'),
        (200, 300, 'gt_200_lt_300'),
        (300, 400, 'gt_300_lt_400'),
        (400, 500, 'gt_400_lt_500'),
    )
    for lower, upper, label in buckets:
        if lower < intro <= upper:
            return label
    return 'gt_500'

In [None]:
# Bucket the self_introduction value into the labels produced by categorize_introduction.
user_df['intro_cat']= user_df.self_introduction.apply(categorize_introduction)

# Interaction data

### Like data

In [None]:
# Column names applied positionally to likes.csv (passed as `names=` to read_csv).
likes_cols = [
    'user_id',
    'target_user_id',
    'checked',
    'matched',
    'deleted'
]

# One row per like sent from user_id to target_user_id, read directly from the gs:// URI.
like_df = pd.read_csv(
    likes_path,
    names = likes_cols
)

In [None]:
# Keep a single row per (user_id, target_user_id) pair.
like_df = like_df.drop_duplicates(subset=['user_id', 'target_user_id'])

In [None]:
# Keep only likes whose sender and target both appear in the district-filtered users table.
like_df = like_df[like_df.user_id.isin(user_df.user_id) & like_df.target_user_id.isin(user_df.user_id)]

In [None]:
# Cast the flag columns to bool.
# NOTE(review): astype(bool) maps missing values to True — presumably these columns are never empty; confirm.
like_df['checked'] = like_df['checked'].astype(bool)
like_df['matched'] = like_df['matched'].astype(bool)
like_df['deleted'] = like_df['deleted'].astype(bool)

### Dislike data

In [None]:
def create_dislikes_df(n_days=14):
    """Collect the daily dislike files of the previous ``n_days`` days into one frame.

    For each of the last ``n_days`` days (yesterday first) the CSV at
    ``dislikes_file_base.format(<YYYYMMDD>)`` is read. Days whose file cannot be
    found are reported and skipped.

    Parameters
    ----------
    n_days : int, default 14
        Number of days to look back, starting from yesterday.

    Returns
    -------
    pandas.DataFrame
        De-duplicated concatenation of all files found. When no file is found,
        an empty frame with the columns ``user_id`` and ``target_user_id``.
    """
    columns = ['user_id', 'target_user_id']
    # NOTE(review): dates are derived from the machine's local clock, while the notebook's `today`
    # uses Asia/Tokyo — confirm which timezone the daily file names follow.
    start = datetime.today()

    # Collect the daily frames and concatenate once: DataFrame.append no longer exists in
    # pandas 2.x, and growing a frame inside a loop copies the accumulated data on every pass.
    daily_frames = []
    for offset in range(1, n_days + 1):
        date = (start - timedelta(days=offset)).strftime("%Y%m%d")
        uri = dislikes_file_base.format(date)
        print(f"Obtaining dislikes for {date}")
        try:
            daily_frames.append(
                pd.read_csv(
                    uri,
                    dtype={'user_id': np.int32, 'target_user_id': np.int32}
                )
            )
        except FileNotFoundError:
            # Best effort: a missing day is reported, not fatal.
            print(f"Could not find dislikes data at {uri}. Please examine URI.")

    if not daily_frames:
        return pd.DataFrame([], columns=columns)

    return pd.concat(daily_frames, sort=False).drop_duplicates()

In [None]:
# Use the notebook-level look-back constant instead of relying on the function's default.
dislike_df = create_dislikes_df(N_DAYS_DISLIKES_DATA)

In [None]:
# dislike_df = pd.read_csv('dislikes.csv')

In [None]:
### reducing dislikes to users and target users in the users dataset

# Both the disliking user and the disliked target must be present in user_df.
dislike_df = dislike_df[(dislike_df.user_id.isin(user_df.user_id.unique())) & (dislike_df.target_user_id.isin(user_df.user_id.unique()))]

# Remove rows that are identical across all columns.
dislike_df.drop_duplicates(inplace=True)

In [None]:
# (number of distinct users with a dislike, number of distinct users in the district)
dislike_df.user_id.nunique(), user_df.user_id.nunique()

### Merge data

#### Male - female matches as female - male liking

In [None]:
# Likes that are flagged as matched.
match_df = like_df[like_df.matched]

In [None]:
# Append, for every match, a mirrored row with sender and target swapped, so that a match
# also appears as a like in the opposite direction. The flag columns are copied unchanged.
like_df = pd.concat(
    [
        like_df,
        match_df.rename(
            columns =
            {
                'user_id': 'target_user_id', 
                'target_user_id': 'user_id'
            }
        )
    ], 
    sort=False
)

del match_df

In [None]:
# A mirrored row can collide with an existing like in that direction; keep one row per pair.
like_df = like_df.drop_duplicates(subset=['user_id', 'target_user_id'])

In [None]:
# Number of like rows after mirroring and de-duplication.
like_df.shape[0]

In [None]:
# Number of like rows per sender.
user_likes_sent = like_df.groupby('user_id').target_user_id.count().\
                            reset_index().\
                            rename(columns={"target_user_id": "user_likes_sent"})
# Number of like rows per target.
target_likes_received = like_df.groupby('target_user_id').user_id.count().\
                            reset_index().\
                            rename(columns={"user_id": "target_likes_received"})
# Number of matched like rows per target.
target_matches = like_df[like_df.matched].groupby('target_user_id').user_id.count().\
                            reset_index().\
                            rename(columns={"user_id": "target_matches"})

In [None]:
# Attach the per-sender and per-target counts to every like row.
# Left joins: targets without any matched row end up with NaN in `target_matches`.
like_df = like_df.merge(user_likes_sent, on = 'user_id', how = 'left')
like_df = like_df.merge(target_likes_received, on='target_user_id', how = 'left')
like_df = like_df.merge(target_matches, on='target_user_id', how = 'left')

In [None]:
# Mark every row that originates from the likes data; dislike-only rows get False after the merge below.
like_df['liked'] = True

In [None]:
# Outer join keeps pairs that appear only as a like, only as a dislike, or as both.
merge_df = like_df.merge(dislike_df, on=['user_id', 'target_user_id'], how='outer')

In [None]:
# Release the intermediate frames; only merge_df is used from here on.
del like_df, user_likes_sent, target_likes_received, target_matches

In [None]:
# Rows that came only from the dislike side of the outer merge have NaN in every like-derived column.
# Fill them with a single assignment instead of `merge_df.<col>.fillna(..., inplace=True)`:
# that form mutates a column obtained through attribute access and is not guaranteed to write
# back to merge_df (it is a no-op under pandas Copy-on-Write).
merge_df = merge_df.fillna(
    {
        'matched': False,
        'deleted': False,
        'checked': False,
        'user_likes_sent': 0,
        'target_likes_received': 0,
        'target_matches': 0,
        'liked': False,
    }
)

# The outer merge turned the flag columns into object dtype; restore plain booleans so they
# can be used safely as masks further down.
merge_df = merge_df.astype(
    {
        'matched': bool,
        'deleted': bool,
        'checked': bool,
        'liked': bool,
    }
)

In [None]:
### The effect of this is to drop dislike records if a like was eventually sent,
### since there should only be one interaction record b/t user and target pair, either like or dislike.
### All dislikes duplicated have already been removed, so any duplicates remaining are 1 like and 1 dislike

# duplicated(keep=False) flags every row of a repeated pair; of those, only the non-liked rows are dropped.
merge_df.drop(
    index=merge_df[
        (merge_df[['user_id', 'target_user_id']].duplicated(keep=False)) & 
        (merge_df.liked == 0)].index, 
    inplace = True
)

In [None]:
# Attach the gender of the acting user (user_id) to every interaction row.
merge_df = merge_df.merge(user_df[['user_id', 'gender']], on='user_id')

In [None]:
# Preview the merged interaction table.
merge_df.head()

# Split gender

### Interaction

In [None]:
# Split interactions by the gender of the acting user: 1 -> male frame, 2 -> female frame.
male_interaction_df = merge_df[merge_df.gender == 1].copy()
female_interaction_df = merge_df[merge_df.gender == 2].copy()

In [None]:
# (interactions initiated by males, interactions initiated by females)
male_interaction_df.shape[0], female_interaction_df.shape[0]

In [None]:
# merge_df has been split by gender above and is no longer needed.
del merge_df

# Build lightfm dataset

### Feature list

In [None]:
# Profile columns turned into LightFM features; each (column, value) pair becomes one
# feature named "<column>_<value>".
feature_vars = [
       'blood_type', 'brother_and_sister', 'annual_salary_range',
       'education_background', 'holiday', 'smoking', 'drinking',
       'sociality', 'intention_to_marry', 'marital_status',
       'absence_or_presence_of_child', 'whether_want_child',
       'housework_and_child_rearing', 'meeting_wish_type',
       'first_dating_expense_type', 'body_shape', 'housemate',
       'personal_color', 'gamble', 'cooking_skill', 'job_id',
       'hometown_prefecture_id', 'age_cat', 'height_cat', 'intro_cat'
]

In [None]:
## creating indicator variables

# Vocabulary of feature names per gender, later passed to Dataset.fit.
male_feature_list = []
female_feature_list = []

# categorical features
# One "<column>_<value>" name for every distinct value seen among users of that gender.
for var in feature_vars:
    male_feature_list += [var + '_' + str(val) for val in user_df[user_df.gender == 1][var].unique()]
    female_feature_list += [var + '_' + str(val) for val in user_df[user_df.gender == 2][var].unique()]

len(male_feature_list), len(female_feature_list)

### User feature

In [None]:
def create_features(x):
    """Build the ``(user_id, [feature names])`` pair for one user row.

    ``x`` is a row of the users table; for every column in the module-level
    ``feature_vars`` a name of the form ``"<column>_<value>"`` is produced.
    """
    feature_names = []
    for var in feature_vars:
        feature_names.append(var + '_' + str(x[var]))
    return (x['user_id'], feature_names)

## Data Preparation and Model Training

In [None]:
# Load the LightFM constructor keyword arguments (they are unpacked into LightFM(**hyperparameters)).
# safe_load is used because calling yaml.load without an explicit Loader is rejected by current
# PyYAML, and a hyperparameter mapping does not need arbitrary object construction.
with open("hyperparameters.yaml", "r") as file:
    hyperparameters = yaml.safe_load(file)

In [None]:
def train_model(interactions, weights, item_features, user_features, hyperparameters):
    """Create a LightFM model from ``hyperparameters`` and fit it.

    ``interactions`` and ``weights`` are the matrices passed to ``fit`` as the
    interactions and ``sample_weight``; ``item_features`` / ``user_features`` are the
    feature matrices. Epoch and thread counts come from the module-level ``EPOCHS``
    and ``NUM_THREADS``. Returns the fitted model.
    """
    fit_options = dict(
        item_features=item_features,
        user_features=user_features,
        sample_weight=weights,
        epochs=EPOCHS,
        num_threads=NUM_THREADS,
        verbose=False,
    )

    lightfm_model = LightFM(**hyperparameters)
    lightfm_model.fit(interactions, **fit_options)
    return lightfm_model

### For Male Users

#### Profile data

In [None]:
### "select users who have interactions or who have completed at least half of their profiles."

# Users of the male model: males who initiated at least one interaction or whose
# completion_rate is above 0.5.
m_male_df = user_df[
    (user_df.gender == 1) &
    (
        (user_df.user_id.isin(male_interaction_df.user_id.unique())) | 
        (user_df.completion_rate > 0.5)
    )
].copy()

# Items of the male model: females who were the target of a male interaction or whose
# completion_rate is above 0.5.
m_female_df = user_df[
    (user_df.gender == 2) & 
    (
        (user_df.user_id.isin(male_interaction_df.target_user_id.unique())) |
        (user_df.completion_rate > 0.5)
    )
].copy()

In [None]:
# (number of male users, number of female items) in the male model.
m_male_df.shape[0], m_female_df.shape[0]

#### Build dataset

In [None]:
# Per row: the (user_id, [feature names]) tuple produced by create_features.
m_male_df['features'] = m_male_df.apply(create_features, axis=1)
m_female_df['features'] = m_female_df.apply(create_features, axis=1)

In [None]:
# LightFM dataset for the male model: males are the users, females are the items.
dataset_m = Dataset()

dataset_m.fit(
    set(m_male_df['user_id']), 
    set(m_female_df['user_id']),
    user_features=male_feature_list, 
    item_features=female_feature_list
)

In [None]:
# Feature matrices built from the (user_id, [feature names]) tuples.
user_features_m = dataset_m.build_user_features(m_male_df['features'])
item_features_m = dataset_m.build_item_features(m_female_df['features'])

#### Calculate interaction score

In [None]:
def match_score(x):
    """Base score of one interaction row.

    * 5 when the pair matched;
    * 1 when the like was checked or deleted, or the row is not a like at all;
    * otherwise 4 when the acting user's gender is 2, 3 when it is 1
      (``None`` for any other gender value).
    """
    if x['matched']:
        return 5
    if x['checked'] or x['deleted'] or not x['liked']:
        return 1
    # Plain like: the score depends on the gender of the user who sent it.
    return {2: 4, 1: 3}.get(x['gender'])

In [None]:
# Base score per male-initiated interaction.
male_interaction_df['match_score'] = male_interaction_df.apply(match_score, axis=1)

#### Penalties

In [None]:
# 1 / (ln(likes received by the target + 1) + 1): equals 1 for a target with no likes and shrinks as the count grows.
male_interaction_df['tg_received_penalty'] = 1 / (np.log(male_interaction_df['target_likes_received'] + 1) + 1)

In [None]:
# Same shape as the received penalty, driven by how many likes the acting user has sent.
male_interaction_df['sent_penalty'] = 1 / (np.log(male_interaction_df['user_likes_sent'] + 1) + 1)

In [None]:
# Generousness of the target: log-scaled number of matches the target has, min-max normalised to [0, 1].
male_interaction_df['tg_generousness_score'] = np.log(male_interaction_df['target_matches'] + 1)
_gen_min = male_interaction_df['tg_generousness_score'].min()
_gen_range = male_interaction_df['tg_generousness_score'].max() - _gen_min
# When every target has the same match count (or the frame is empty) the range is 0 / NaN and
# plain min-max scaling would produce NaN scores that flow into the training weights;
# fall back to a constant 0 in that case.
male_interaction_df['tg_generousness_score'] = (
    (male_interaction_df['tg_generousness_score'] - _gen_min) / _gen_range
    if _gen_range > 0
    else 0.0
)

#### Score

In [None]:
# Weights of the four components summed into the male interaction score.
M_MATCH_SCORE_WEIGHT = 1
M_RECEIVED_PENALTY_WEIGHT = 2
M_SENT_PENALTY_WEIGHT = 1
M_TG_GENEROUSNESS_WEIGHT =  1

In [None]:
# Summary statistics of the received-likes penalty.
male_interaction_df['tg_received_penalty'].describe()

In [None]:
# Weighted sum of base score, both penalties and the target's generousness.
male_interaction_df['score'] = \
    male_interaction_df['match_score'] * M_MATCH_SCORE_WEIGHT + \
    male_interaction_df['tg_received_penalty'] * M_RECEIVED_PENALTY_WEIGHT + \
    male_interaction_df['sent_penalty'] * M_SENT_PENALTY_WEIGHT + \
    male_interaction_df['tg_generousness_score'] * M_TG_GENEROUSNESS_WEIGHT

# Rows that are not likes (i.e. dislikes) are forced to a score of 0.
male_interaction_df['score'] = male_interaction_df.apply(lambda x: 0 if x.liked == 0 else x.score, axis = 1)

In [None]:
# Summary statistics of the final male interaction score.
male_interaction_df.score.describe()

In [None]:
# Keep only interactions whose user and target were both registered in dataset_m.
male_interaction_df = male_interaction_df[
    male_interaction_df.user_id.isin(m_male_df.user_id) & 
    male_interaction_df.target_user_id.isin(m_female_df.user_id)
]

In [None]:
# Number of male interactions used for training.
male_interaction_df.shape[0]

#### Train Male Model

In [None]:
# Build the interaction and weight matrices from (user, target, score) triples.
male_interactions, male_weights = dataset_m.build_interactions(
    list(zip(male_interaction_df.user_id, male_interaction_df.target_user_id, male_interaction_df.score))
)

In [None]:
# Interaction matrix restricted to matched pairs (no explicit weights).
# NOTE(review): male_match_interactions / male_match_weights are not used anywhere else in the
# visible notebook — presumably kept for evaluation; confirm.
male_match_df = male_interaction_df[male_interaction_df.matched]
male_match_interactions, male_match_weights = dataset_m.build_interactions(
    list(zip(male_match_df.user_id, male_match_df.target_user_id))
)

In [None]:
# Train the model that recommends females to males.
model_m = train_model(male_interactions, male_weights, item_features_m, user_features_m, hyperparameters)

### For Female Users

#### Profile data

In [None]:
### this arbitrary decision to increase the req for male completeness is based on the assumption
### that women will want more info in the profile when considering taking action

# Items of the female model: males who were the target of a female interaction or whose
# completion_rate is above 0.7. Males appear in female_interaction_df as target_user_id
# (user_id there is always the acting female), so the membership test must use target_user_id.
f_male_df = user_df[
    (user_df.gender == 1) &
    (
        (user_df.user_id.isin(female_interaction_df.target_user_id.unique())) |
        (user_df.completion_rate > 0.7)
    )
].copy()

# Users of the female model: females who initiated at least one interaction or whose
# completion_rate is above 0.5.
f_female_df = user_df[
    (user_df.gender == 2) &
    (
        (user_df.user_id.isin(female_interaction_df.user_id.unique())) |
        (user_df.completion_rate > 0.5)
    )
].copy()

#### Build dataset

In [None]:
# Per row: the (user_id, [feature names]) tuple produced by create_features.
f_male_df['features'] = f_male_df.apply(create_features, axis=1)
f_female_df['features'] = f_female_df.apply(create_features, axis=1)

In [None]:
# LightFM dataset for the female model: females are the users, males are the items.
dataset_f = Dataset()

dataset_f.fit(
    set(f_female_df['user_id']),
    set(f_male_df['user_id']),
    user_features=female_feature_list,
    item_features=male_feature_list
)

In [None]:
# Feature matrices built from the (user_id, [feature names]) tuples.
user_features_f = dataset_f.build_user_features(f_female_df['features'])
item_features_f = dataset_f.build_item_features(f_male_df['features'])

#### Calculate interaction score

#### Match score

In [None]:
def match_score(x):
    """Base score of one interaction row (same rules as the definition in the male section).

    * 5 when the pair matched;
    * 1 when the like was checked or deleted, or the row is not a like at all;
    * otherwise 4 when the acting user's gender is 2, 3 when it is 1
      (``None`` for any other gender value).
    """
    if x['matched']:
        return 5
    if x['checked'] or x['deleted'] or not x['liked']:
        return 1
    # Plain like: the score depends on the gender of the user who sent it.
    return {2: 4, 1: 3}.get(x['gender'])

In [None]:
# Base score per female-initiated interaction.
female_interaction_df['match_score'] = female_interaction_df.apply(match_score, axis=1)

#### Penalties

In [None]:
# 1 / (ln(likes received by the target + 1) + 1): equals 1 for a target with no likes and shrinks as the count grows.
female_interaction_df['tg_received_penalty'] = 1 / (np.log(female_interaction_df['target_likes_received'] + 1) + 1)

In [None]:
# Same shape as the received penalty, driven by how many likes the acting user has sent.
female_interaction_df['sent_penalty'] = 1 / (np.log(female_interaction_df['user_likes_sent'] + 1) + 1)

In [None]:
# Weights of the three components summed into the female interaction score
# (unlike the male score, no generousness term is used here).
F_MATCH_SCORE_WEIGHT = 1
F_RECEIVED_PENALTY_WEIGHT = 2
F_SENT_PENALTY_WEIGHT = 1

#### Score

In [None]:
# Weighted sum of base score and both penalties.
female_interaction_df['score'] = \
    female_interaction_df['match_score'] * F_MATCH_SCORE_WEIGHT + \
    female_interaction_df['tg_received_penalty'] * F_RECEIVED_PENALTY_WEIGHT + \
    female_interaction_df['sent_penalty'] * F_SENT_PENALTY_WEIGHT

# Rows that are not likes (i.e. dislikes) are forced to a score of 0.
female_interaction_df['score'] = female_interaction_df.apply(lambda x: 0 if x.liked == 0 else x.score, axis = 1)

In [None]:
# Keep only interactions whose user and target were both registered in dataset_f.
female_interaction_df = female_interaction_df[
    female_interaction_df.user_id.isin(f_female_df.user_id) & 
    female_interaction_df.target_user_id.isin(f_male_df.user_id)
]

In [None]:
# Build the interaction and weight matrices from (user, target, score) triples.
female_interactions, female_weights = dataset_f.build_interactions(
    list(zip(female_interaction_df.user_id, female_interaction_df.target_user_id, female_interaction_df.score))
)

### Train female model

In [None]:
# Train the model that recommends males to females.
model_f = train_model(female_interactions, female_weights, item_features_f, user_features_f, hyperparameters)

## Prediction - function declarations and generation

In [None]:
# Mappings produced by each dataset; the *_id_map dicts are used below to translate
# user ids into the indices expected by model.predict.
user_id_map_m, user_feature_map_m, item_id_map_m, item_feature_map_m = dataset_m.mapping()
user_id_map_f, user_feature_map_f, item_id_map_f, item_feature_map_f = dataset_f.mapping()

In [None]:
def apply_cl_conditions(this_user, tg_users):
    """Return the ids of target users that must NOT be recommended to ``this_user``.

    A target is excluded when either of the following holds:

    * the target's account was created less than one day ago
      (new users get special treatment in CL);
    * the target's age differs from the user's age by more than 5 years.

    Two further CL conditions are handled elsewhere: the district restriction (by
    subsetting the data to one district) and previously liked targets (in
    ``filter_target_user``).

    Parameters
    ----------
    this_user : int
        Id of the user receiving recommendations (either gender).
    tg_users : pandas.DataFrame
        Candidate targets; must provide ``user_id``, ``age`` and ``created_at``.

    Returns
    -------
    pandas.Series
        ``user_id`` values of the targets to exclude.
    """
    one_day_ago = datetime.today() - timedelta(days=1)
    # targets whose account is more recent than one day ago
    excluded = tg_users.created_at > one_day_ago

    # Look the user up in the full users table so the check works for male and female users alike,
    # and compare against a scalar age. Subtracting a one-row Series from `tg_users.age` would align
    # on the index, produce NaN everywhere and silently disable the age filter.
    user_ages = user_df.loc[user_df.user_id == this_user, 'age']
    if not user_ages.empty:
        age = user_ages.iloc[0]
        # targets more than 5 years apart from the user
        excluded = excluded | ((tg_users.age - age).abs() > 5)

    return tg_users[excluded].user_id

In [None]:
def filter_target_user(this_user, tg_users, interactions):
    """List the target user ids that may still be recommended to ``this_user``.

    Starting from every id in ``tg_users``, the following are removed:

    1. targets the user already liked (rows of ``interactions`` with ``liked`` set);
    2. targets the user disliked, according to the module-level ``dislike_df``;
    3. targets rejected by ``apply_cl_conditions``.
    """
    own_likes = interactions[(interactions.user_id == this_user) & (interactions.liked)]
    own_dislikes = dislike_df[dislike_df.user_id == this_user]

    all_ids = set(tg_users.user_id)
    liked_ids = set(own_likes.target_user_id)
    disliked_ids = set(own_dislikes.target_user_id)
    cl_rejected_ids = set(apply_cl_conditions(this_user, tg_users))

    remaining = all_ids - liked_ids - disliked_ids - cl_rejected_ids
    return list(remaining)

In [None]:
def append_and_count_valid_users(tg_list, rec_list, max_recs):
    """Append ids from ``tg_list`` to ``rec_list`` until it holds ``max_recs`` entries.

    - tg_list - candidate target_user_ids, in the order they should be tried
    - rec_list - the list of recs being built; it is extended in place
    - max_recs - our stopping point, but for some users there won't be enough people to recommend

    Every appended id also increments its counter in the module-level
    ``target_user_counts`` dict. The function returns ``None``.
    """
    for tg_user_id in tg_list:
        if len(rec_list) < max_recs:
            target_user_counts[tg_user_id] = target_user_counts.get(tg_user_id, 0) + 1
            rec_list.append(tg_user_id)
        else:
            # the rec list has already reached max_recs
            break

    return None

In [None]:
def get_top_recommended_tg(rec_df, target_recs, max_recs):
    """Build the final recommendation list for one user.

    - rec_df: frame with a ``user_id`` column, ordered from highest to lowest predicted score
    - target_recs: k, the number of top-scored ids that are shuffled among themselves
    - max_recs: maximum length of the returned list; lower-ranked ids pad the list up to it

    Returns the list of recommended target ids (shuffled top-k first, then the remainder in
    score order), truncated to ``max_recs``.
    """
    ranked_ids = rec_df.user_id.tolist()
    head, tail = ranked_ids[:target_recs], ranked_ids[target_recs:]

    # randomise the order within the top k only
    random.shuffle(head)

    picked = []
    # first the shuffled top k, then the rest to try to reach max_recs
    for pool in (head, tail):
        append_and_count_valid_users(pool, picked, max_recs)

    return picked

In [None]:
def get_sorted_recs_list(model, this_user, target_user_df, interaction_df, item_features, user_features, user_id_map, item_id_map):
    """Score the eligible targets of ``this_user`` and return the recommendation list.

    Targets are first reduced with ``filter_target_user``; the remaining ones are scored with
    ``model.predict``, ordered by descending score and handed to ``get_top_recommended_tg``
    with the module-level ``TARGET_RECS`` / ``MAX_RECS``. Returns ``[]`` when no target is eligible.
    """
    candidate_ids = filter_target_user(this_user, target_user_df, interaction_df)
    if not candidate_ids:
        return []

    # predict is called once per user, on the already filtered candidates only
    user_index = user_id_map[this_user]
    item_indices = [item_id_map[candidate] for candidate in candidate_ids]
    raw_scores = model.predict(
        user_index,
        item_indices,
        item_features=item_features,
        user_features=user_features
    )

    ranked = pd.DataFrame(
        {'user_id': candidate_ids, 'score': raw_scores}
    ).sort_values(by='score', ascending=False)

    return get_top_recommended_tg(ranked, TARGET_RECS, MAX_RECS)

In [None]:
def build_records(model, user_df, target_user_df, interaction_df, item_features, user_features, user_id_map, item_id_map):
    """Produce the recommendation list of every user in ``user_df``.

    Returns a dict mapping each ``user_id`` to the list returned by
    ``get_sorted_recs_list`` for that user (possibly empty).
    """
    rec_records = {}
    for this_user in user_df.user_id:
        rec_records[this_user] = list(
            get_sorted_recs_list(
                model,
                this_user,
                target_user_df,
                interaction_df,
                item_features,
                user_features,
                user_id_map,
                item_id_map
            )
        )
    return rec_records

def generate_pairs(all_recs):
    """Yield one ``(user, rec)`` tuple of ints per recommendation in ``all_recs``.

    ``all_recs`` maps a user id to its list of recommended ids; users with an
    empty list yield nothing.
    """
    for user in all_recs:
        yield from ((int(user), int(rec)) for rec in all_recs[user])

def create_long_df(records):
    """Flatten ``{user_id: [rec ids]}`` into a two-column frame, one row per recommendation."""
    pair_columns = ['user_id', 'target_user_id']
    pairs = generate_pairs(records)
    return pd.DataFrame(pairs, columns=pair_columns)

#### For Men

In [None]:
# must be declared in this namespace for the below functions to have access outside the scope of application on a per row basis
# (append_and_count_valid_users updates this module-level dict; it is reset here before the male run)
target_user_counts = {}

# Recommend females (items of model_m) to every male in m_male_df.
male_rec_records = build_records(
    model_m,
    m_male_df, 
    m_female_df, 
    male_interaction_df, 
    item_features_m, 
    user_features_m, 
    user_id_map_m, 
    item_id_map_m
)
male_rec_df = create_long_df(male_rec_records)

In [None]:
# Number of distinct targets recommended at least once in the male run.
len(target_user_counts)

In [None]:
# Histogram of how many times each target was recommended to males.
male_rec_df.groupby("target_user_id").user_id.count().plot(kind = 'hist')

In [None]:
# Coverage: share of candidate females that appear in at least one male's recommendations.
print("Coverage male: %f%%"%(male_rec_df.target_user_id.nunique() / m_female_df.shape[0] * 100))

In [None]:
# Ten (target id, times recommended) pairs with the highest counts from the male run.
print("But, the top 10 most recommended users have been recommended as follows:")
    
sorted(target_user_counts.items(), key = lambda x: x[1], reverse = True)[:10]

#### For Women

In [None]:
# Reset the module-level counter so the female run is counted separately from the male run.
target_user_counts = {}

# Recommend males (items of model_f) to every female in f_female_df.
female_rec_records = build_records(
    model_f,
    f_female_df, 
    f_male_df, 
    female_interaction_df, 
    item_features_f, 
    user_features_f, 
    user_id_map_f, 
    item_id_map_f
)

female_rec_df = create_long_df(female_rec_records)

In [None]:
# Histogram of how many times each target was recommended to females.
female_rec_df.groupby("target_user_id").user_id.count().plot(kind = 'hist')

In [None]:
# Coverage: share of candidate males that appear in at least one female's recommendations.
print("Coverage female: %f%%"%(female_rec_df.target_user_id.nunique() / f_male_df.shape[0] * 100))

In [None]:
# Ten (target id, times recommended) pairs with the highest counts from the female run.
print("But, the top 10 most recommended users have been recommended as follows:")
sorted(target_user_counts.items(), key = lambda x: x[1], reverse = True)[:10]

## Write and deliver output

In [None]:
# Stack the male and female recommendation pairs into one frame.
merged_recs_df = pd.concat(
    [
        male_rec_df,
        female_rec_df
    ]
)

In [None]:
# Write the (user_id, target_user_id) pairs to the district's output URI, without header or index.
merged_recs_df.to_csv(rec_output_path.format(district_id), index=False, header=False)

### Clean up file just in case

In [None]:
# Remove the downloaded users file from the temp directory.
! rm $tmp_dir/$users_file