In [202]:
from sqlalchemy import create_engine
import pandas as pd
pd.options.display.max_rows= 200
#pd.set_option('display.max_rows', None)
pd.set_option("display.max_columns", None)

In [203]:
engine = create_engine('postgresql://user_name:password@IP_Address:port_number/service_name')

In [None]:
# see the tables in the db schema
tables = """ SELECT * FROM information_schema.tables WHERE table_schema = 'development' ORDER BY table_name  """
df = pd.read_sql(tables, engine)
df

In [204]:
query = """ SELECT * FROM development.cfs_customer"""
crf_df = pd.read_sql(query, engine)

query2 = """ SELECT * FROM external.tr_locations """
location_df = pd.read_sql(query2, engine)

In [205]:
crf_df[['city','town','district']][crf_df['city'].notnull()].head()

Unnamed: 0,city,town,district
4163867,Ýstanbul,Ataþehir,Örnek Mh.
4163909,Ýstanbul,Maltepe,Feyzullah Mh.
4164075,Ýstanbul,Maltepe,Yalý Mh.
4164199,Ýstanbul,Esenyurt,Mevlana Mh.
4164281,Ýstanbul,Þiþli,Cumhuriyet Mh.


In [206]:
# changing the Turkish characters
def char_updater(inputstr):
    old =  ['ý', 'Ý', 'þ', 'Þ', 'ð', 'Ð']
    new =  ['ı', 'İ', 'ş', 'Ş', 'ğ', 'Ğ']
    for i in range(len(old)):
        sub_o = old[i]
        sub_n = new[i]
        if sub_o in inputstr:
            inputstr = inputstr.replace(sub_o,sub_n)
    return inputstr

In [207]:
crf_df = crf_df.astype({"city": str, "town": str, "district": str})

In [208]:
crf_df['city'] = crf_df['city'].apply(char_updater)
crf_df['town'] = crf_df['town'].apply(char_updater)
crf_df['district'] = crf_df['district'].apply(char_updater)

In [209]:
crf_df[['city','town','district']].tail()

Unnamed: 0,city,town,district
4991619,İstanbul,Güngören,Merkez Mh.
4991620,İstanbul,Beşiktaş,Ortaköy Mh.
4991621,İstanbul,Ataşehir,Barbaros Mh.
4991622,İstanbul,Maltepe,Altıntepe Mh.
4991623,Ankara,Çankaya,Erzurum Mh.


In [210]:
location_df.head()

Unnamed: 0,il,ilce,mahalle,il_id,ilce_id,mahalle_id
0,Tokat,Erbaa,Meydandüzü Köyü,4019,33859,107249
1,Kastamonu,Tosya,Mimar Sinan Mah.,4011,37911,14133
2,Kars,Kağızman,Kuloğlu Köyü,3988,38726,41411
3,Tokat,Erbaa,Keçeci Köyü,4019,33859,48928
4,Tokat,Erbaa,Sokutaş Köyü,4019,33859,107252


In [211]:
# creating new concated column: il-ilce-mahalle after lowering and Replacing 
location_df["concat"] = location_df.il.str.lower() + '|' + location_df.ilce.str.lower() + '|' +                                \
                        location_df.mahalle.str.replace(' Köyü| Köy| Koy| Mah.| Mh.| Mahallesi', '', regex=True).str.lower()

In [212]:
location_df.head()

Unnamed: 0,il,ilce,mahalle,il_id,ilce_id,mahalle_id,concat
0,Tokat,Erbaa,Meydandüzü Köyü,4019,33859,107249,tokat|erbaa|meydandüzü
1,Kastamonu,Tosya,Mimar Sinan Mah.,4011,37911,14133,kastamonu|tosya|mimar sinan
2,Kars,Kağızman,Kuloğlu Köyü,3988,38726,41411,kars|kağızman|kuloğlu
3,Tokat,Erbaa,Keçeci Köyü,4019,33859,48928,tokat|erbaa|keçeci
4,Tokat,Erbaa,Sokutaş Köyü,4019,33859,107252,tokat|erbaa|sokutaş


In [213]:
# creating new concated column: city-town-district after lowering and Replacing 
crf_df["concat"] = crf_df.city.str.lower() + '|' + crf_df.town.str.lower() + '|' +                                             \
                   crf_df.district.str.replace(' Köyü| Köy| Koy| Mah.| Mh.| Mahallesi', '', regex=True).str.lower()

In [214]:
# Indicating None values as Null, the reason for this situation is due to the str updating before
crf_df['city'][crf_df['city']=='None'] = None

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [215]:
# drop the null city rows
crf_df = crf_df.dropna(subset=['city'])

In [217]:
location_df[location_df['mahalle_id'] == 437]

Unnamed: 0,il,ilce,mahalle,il_id,ilce_id,mahalle_id,concat
52339,İstanbul,Eyüpsultan,Güzeltepe Mah.,3969,589,437,i̇stanbul|eyüpsultan|güzeltepe


In [218]:
#creating dict and mapping the mahalle_id
mapping = dict(location_df[['concat', 'mahalle_id']].values)
crf_df['mahalle_id'] = crf_df['concat'].map(mapping)

In [220]:
from thefuzz import fuzz
from thefuzz import process

In [221]:
#see an example with matching percentage
choices = location_df.concat.tolist()
sonuc = process.extract("zonguldak merkez yeşil mah.", choices, limit=1)
sonuc

[('zonguldak|merkez|yeşil', 95)]

In [None]:
# creating _dict and update it by using fuzzy matching for null(unmatched) mahalle_id values
_dict = {}
for i in crf_df['concat'][crf_df.mahalle_id.isnull()].unique():       # For each record that unmatched mahalle_id before
    try:
        concat = i                                                                             
        city = i.split('|')[0]          # seperate the city from the each 'concat' column from the crf_df
        matched = process.extract(concat, [k for k in choices if city in k], limit=1)  # filtering choices contains related city
        mahalle_id = location_df[ location_df['concat'] == matched[0][0] ]['mahalle_id'].tolist()[0] # Take the value of matched
        _dict.update({concat:mahalle_id})     
        print(concat, mahalle_id)
    except Exception as ex:
        print(ex)

In [None]:
matched_df = crf_df[~crf_df['mahalle_id'].isnull()]    # '~'' is used for reverse logic

unmatched_df = crf_df[crf_df['mahalle_id'].isnull()]

In [226]:
unmatched_df = unmatched_df.reset_index(drop=True)
concat_list = unmatched_df['concat'].tolist()
other_location_df = location_df[['concat','mahalle_id']]

In [227]:
unmatched_df['mahalle_id'] = 0

In [228]:
# tqdm is used for to monitor the process time
# for loop to find the unmatched_df mahalle_id from the fuzzy _dict

import time
import tqdm

updated_mahalle_id = []
for i in tqdm.tqdm(range(len(unmatched_df))):  
    try:
        final_mahalle = concat_list[i]
        mahalle_id = _dict[final_mahalle]           # it will look the value from the _dict   
        updated_mahalle_id.append(mahalle_id)
    except:
        updated_mahalle_id.append(-1)
unmatched_df['mahalle_id'] = updated_mahalle_id

100%|██████████| 43809/43809 [00:00<00:00, 1251947.02it/s]


In [229]:
# after updating the unmatched_df, see the ones that hasn't matched yet

unmatched_df[unmatched_df['mahalle_id']==-1][['city','town','district','concat','mahalle_id']]

Unnamed: 0,city,town,district,concat,mahalle_id
12964,Marina,Çeşme,Çeşme Marina,marina|çeşme|çeşme marina,-1
13578,Marina,Marmaris,Marmaris Marina,marina|marmaris|marmaris marina,-1
22288,Marina,Çeşme,Çeşme Marina,marina|çeşme|çeşme marina,-1
25810,Marina,Didim,Didim Marina,marina|didim|didim marina,-1
26849,Marina,Çeşme,Çeşme Marina,marina|çeşme|çeşme marina,-1
34797,Marina,Marmaris,Marmaris Marina,marina|marmaris|marmaris marina,-1
34798,Marina,Bodrum,Yalıkavak Marina,marina|bodrum|yalıkavak marina,-1
34799,Marina,Çeşme,Çeşme Marina,marina|çeşme|çeşme marina,-1
35360,Marina,Fethiye,Ece Marina,marina|fethiye|ece marina,-1
37373,Marina,Göcek,Göcek Marina,marina|göcek|göcek marina,-1


In [230]:
# combine the dfs
crf_df_final = pd.concat([matched_df,unmatched_df])

In [232]:
# there are 18 records that hasn't matched because they have missed or false data
bad = len(crf_df_final[crf_df_final['mahalle_id']==-1])
good =len(crf_df_final[crf_df_final['mahalle_id']!=-1])
total =len(crf_df_final['mahalle_id'])
print(bad,good, f'{(good / total):.6f}')

18 791005 0.999977


In [None]:
# We must be sure that the matching process was done correctly, will check it by compating city  - town
merged_df = pd.merge(crf_df_final, location_df, how='left', left_on = 'mahalle_id', right_on = 'mahalle_id')

In [234]:
# create columns to check equality in all data for city and town
# lower() was problematic in another data case because of turkish characters changes length, so upper() was more guarantee.

merged_df["city_town"] = merged_df.city.str.upper() + '|' + merged_df.town.str.upper()
merged_df["il_ilce"] = merged_df.il.str.upper() + '|' + merged_df.ilce.str.upper()                  

In [235]:
def char_updater2(inputstr):
    old =  ['İ', 'Ş', 'Ö', 'Ü', 'Ğ', 'Ç']
    new =  ['I', 'S', 'O', 'U', 'G', 'C']
    for i in range(len(old)):
        sub_o = old[i]
        sub_n = new[i]
        if sub_o in inputstr:
            inputstr = inputstr.replace(sub_o,sub_n)
    return inputstr

In [236]:
merged_df = merged_df.astype({"city_town": str, "il_ilce": str})

merged_df['city_town'] = merged_df['city_town'].apply(char_updater2)
merged_df['il_ilce'] = merged_df['il_ilce'].apply(char_updater2)

In [237]:
# Check the accuracy of matching
import numpy as np

merged_df['mahalle_id2'] = np.where( merged_df['city_town'] == merged_df['il_ilce'], merged_df['mahalle_id'], int(-1) )
  
merged_df[['city','town','district','il','ilce','mahalle','mahalle_id','mahalle_id2']][merged_df['mahalle_id2']== -1]

Unnamed: 0,city,town,district,il,ilce,mahalle,mahalle_id,mahalle_id2
747220,İstanbul,Şişli,43239,Yozgat,Akdağmadeni,İstanbulluoğlu Mah.,12255.0,-1.0
747268,İstanbul,Çatalca,Diğer,Yozgat,Akdağmadeni,İstanbulluoğlu Mah.,12255.0,-1.0
747281,İstanbul,Beşiktaş,İş Kuleleri,İstanbul,Tuzla,Cami Mah.,957.0,-1.0
747284,İstanbul,Seyhan,Tellidere Mh.,İstanbul,Küçükçekmece,Küçükçekmece İkitelli OSB,27498.0,-1.0
747301,İstanbul,Şile,Diğer,Yozgat,Akdağmadeni,İstanbulluoğlu Mah.,12255.0,-1.0
...,...,...,...,...,...,...,...,...
791009,Hatay,Antakya,Diğer,Hatay,Samandağ,Tomruksuyu Bld.,39870.0,-1.0
791014,Çanakkale,Gelibolu,Diğer,Çanakkale,Biga,Karabiga Bld. (Ulucamii Mah.),168726.0,-1.0
791015,Tekirdağ,Çorlu,Diğer,Tekirdağ,Çerkezköy,Veliköy - Yalıboyu OSB,114334.0,-1.0
791017,Hatay,Arsuz,Diğer,Hatay,Samandağ,Tomruksuyu Bld.,39870.0,-1.0


In [None]:
final_df = merged_df.drop(['concat_x','mahalle_id','il','ilce','mahalle','il_id','ilce_id','concat_y','city_town','il_ilce'],axis=1)
final_df.rename(columns = {'mahalle_id2':'mahalle_id'}, inplace = True)

In [240]:
# after controlling 6583 records do not have any proper matched 'mahalle_id' 
bad = len(final_df[final_df['mahalle_id']==-1])
good =len(final_df[final_df['mahalle_id']!=-1])
total =len(final_df['mahalle_id'])
print(bad,good, f'{(good / total):.6f}')

6583 784440 0.991678
