In [None]:
import pandas as pd
import numpy as np
from rapidfuzz import process
import tqdm
pd.options.mode.chained_assignment = None

In [21]:
# Folds Turkish capitals to ASCII. 'Ý', 'Þ' and 'Ð' are folded as well, exactly
# as in the original replacement table (presumably mis-decoded 'İ', 'Ş' and 'Ğ'
# -- TODO confirm against the raw data).
_TR_ASCII_TABLE = str.maketrans('İŞÖÜĞÇÝÞÐ', 'ISOUGCISG')

# Neighbourhood / village designators, matched on text that is already
# upper-cased and ASCII-folded. The trailing word boundary keeps names such as
# "YENI MAHALLE" or "KOYUNCU" intact; an optional abbreviation dot is consumed.
_DESIGNATOR_PATTERN = r'\s+(?:MAHALLESI|MAH|MH|KOYU|KOY)\b\.?'


def _fold(values):
    """Upper-case a string Series and fold Turkish capitals to ASCII."""
    return values.str.upper().str.translate(_TR_ASCII_TABLE)


def _fold_mahalle(values):
    """Fold a neighbourhood-name Series and strip Mahalle/Köy designators."""
    return _fold(values).str.replace(_DESIGNATOR_PATTERN, '', regex=True)


def _last_value_by_key(keys, values):
    """Build {key: value} from two aligned iterables; non-string keys (NaN) are skipped and later rows win."""
    return {key: value for key, value in zip(keys, values) if isinstance(key, str)}


def _best_fuzzy_candidates(pending_keys, location_keys):
    """Return {customer key: closest location key of the same il} using rapidfuzz."""
    if len(pending_keys) == 0:
        return {}

    # Group the candidates by il once instead of rescanning the full list for
    # every query. Only keys whose il part is identical are compared.
    keys_by_city = {}
    for key in location_keys:
        keys_by_city.setdefault(key.split('|', 1)[0], []).append(key)

    best = {}
    for key in tqdm.tqdm(pending_keys):
        candidates = keys_by_city.get(key.split('|', 1)[0])
        if not candidates:
            # No location rows for this il: nothing to compare against.
            continue
        hits = process.extract(key, candidates, limit=1)
        if hits:
            best[key] = hits[0][0]
    return best


def ags_address_matching(customer_df, location_df, output_path="ags_customers_matched.csv"):
    """Assign a ``mahalle_id`` from ``location_df`` to every customer address.

    Matching runs in three stages on upper-cased, ASCII-folded keys:

    1. exact match on ``il|ilce|mahalle``;
    2. fuzzy match (rapidfuzz) against location keys of the same il, kept only
       when the matched row shares the customer's ilce or mahalle name;
    3. exact match on ``il|mahalle`` for whatever is still unmatched.

    Rows that match in no stage get ``mahalle_id`` -1.

    Parameters
    ----------
    customer_df : pandas.DataFrame
        Needs string columns ``il``, ``ilce`` and ``mahalle_ad``. An existing
        ``mahalle_id`` column is discarded, and rows with a missing value in
        any of the three address columns are dropped.
    location_df : pandas.DataFrame
        Needs columns ``il``, ``ilce``, ``mahalle`` and ``mahalle_id``. When
        several rows share a key, the last one wins.
    output_path : str or None, optional
        Where the result is written as CSV (without the index). ``None`` skips
        writing. Defaults to ``"ags_customers_matched.csv"``.

    Returns
    -------
    pandas.DataFrame
        The cleaned customer rows with an integer ``mahalle_id`` as the last
        column, ordered as exact matches, verified fuzzy matches, then the
        remaining rows.

    Notes
    -----
    Neither input frame is modified.
    """
    customers = (
        customer_df
        .drop(columns=['mahalle_id'], errors='ignore')
        .dropna(subset=['il', 'ilce', 'mahalle_ad'])
        .reset_index(drop=True)
        .copy()
    )

    # Correct some known bad records.
    customers['il'] = (
        customers['il']
        .str.replace(' ', '', regex=False)
        .replace({'Çankkale': 'Çanakkale', 'Kırkkale': 'Kırıkkale'})
    )
    customers['ilce'] = customers['ilce'].replace(
        {'EYÜP': 'EYÜPSULTAN', 'EYÜP SULTAN': 'EYÜPSULTAN'}
    )

    # Normalised comparison keys for both sides.
    cust_il = _fold(customers['il'])
    cust_mahalle = _fold_mahalle(customers['mahalle_ad'])
    cust_il_ilce = cust_il + '|' + _fold(customers['ilce'])
    cust_il_mahalle = cust_il + '|' + cust_mahalle
    cust_key = cust_il_ilce + '|' + cust_mahalle

    loc_il = _fold(location_df['il'])
    loc_mahalle = _fold_mahalle(location_df['mahalle'])
    loc_il_ilce = loc_il + '|' + _fold(location_df['ilce'])
    loc_il_mahalle = loc_il + '|' + loc_mahalle
    loc_key = loc_il_ilce + '|' + loc_mahalle

    id_by_key = _last_value_by_key(loc_key, location_df['mahalle_id'])
    id_by_il_mahalle = _last_value_by_key(loc_il_mahalle, location_df['mahalle_id'])
    il_ilce_by_key = _last_value_by_key(loc_key, loc_il_ilce)
    il_mahalle_by_key = _last_value_by_key(loc_key, loc_il_mahalle)

    # Stage 1: exact match on il|ilce|mahalle.
    exact_id = cust_key.map(id_by_key)

    # Stage 2: fuzzy match for the distinct keys that are still unmatched.
    pending = cust_key[exact_id.isna() & cust_key.notna()].unique()
    best_key = cust_key.map(_best_fuzzy_candidates(pending, list(id_by_key)))

    # A fuzzy hit is trusted only if it agrees with the customer on il+ilce
    # or on il+mahalle; anything else is treated as unmatched.
    verified = (
        (cust_il_ilce == best_key.map(il_ilce_by_key))
        | (cust_il_mahalle == best_key.map(il_mahalle_by_key))
    )
    fuzzy_id = best_key.where(verified).map(id_by_key)

    # Stage 3: il|mahalle mapping for records that failed the exact mapping
    # because of a wrong ilce name (most commonly 'Merkez') and were then also
    # mismatched by the fuzzy stage. It runs after the fuzzy stage on purpose:
    # in some provinces the same mahalle name exists in several districts, so
    # trying the fuzzy match first lowers the error rate there.
    fallback_id = cust_il_mahalle.map(id_by_il_mahalle)

    mahalle_id = exact_id.fillna(fuzzy_id).fillna(fallback_id).fillna(-1)
    customers['mahalle_id'] = mahalle_id.astype(int)

    # Keep the historical row order of the output: exact matches first, then
    # verified fuzzy matches, then everything else (stable within each group).
    stage = np.select(
        [exact_id.notna().to_numpy(), fuzzy_id.notna().to_numpy()],
        [0, 1],
        default=2,
    )
    final_df = customers.iloc[np.argsort(stage, kind='stable')].reset_index(drop=True)

    if output_path is not None:
        final_df.to_csv(output_path, index=False)
    return final_df