In [3]:
import numpy as np
import pandas as pd
import fiona
import geopandas as gpd
import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import matplotlib.pyplot as plt

In [4]:
# Notebook-wide pandas settings.
# Turn off the chained-assignment (SettingWithCopy) warning.
pd.set_option('mode.chained_assignment', None)
# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

### Combine data sources

In [28]:
# Load the two single-source facility lists (one CSV per source).
# NOTE: inside a raw string, `\\` is two literal backslashes; the path below is
# kept exactly as written.
dataDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\DRC\\Cleaned\\Single source\\2nd round"
acasus_csv = dataDir + "\\acasus_dropped2_0921.csv"
who_csv = dataDir + "\\who_dropped2_0921.csv"
acasus_dropped2 = pd.read_csv(acasus_csv)
who_dropped2 = pd.read_csv(who_csv)

In [30]:
# Drop columns that are not needed in the combined list.
# Both frames shed exactly the same columns, so the list is defined once and
# each label appears a single time.
# (presumably these are outputs of an earlier matching pass — TODO confirm)
_unneeded_cols = [
    'freq_count', 'match_name', 'n_subclusters', 'match_score', 'match_type',
    'name_seq', 'type_seq', 'clusterID', 'IN_FID_y', 'FEAT_SEQ_y', 'eval2',
]
# NOTE: in-place drop; re-running this cell without reloading the CSVs raises
# KeyError because the columns are already gone.
acasus_dropped2.drop(columns=_unneeded_cols, inplace=True)
who_dropped2.drop(columns=_unneeded_cols, inplace=True)

In [32]:
# Tag every row with the dataset it came from, so the origin is still known
# after the two frames are stacked.
for _frame, _label in ((acasus_dropped2, 'acasus'), (who_dropped2, 'who')):
    _frame['source'] = _label

In [33]:
# Rename one column in each frame before stacking
# (presumably to line the two schemas up — TODO confirm against the CSV headers).
acasus_dropped2.rename(columns={'date': 'date_dt'}, inplace=True)
who_dropped2.rename(columns={'province': 'province1'}, inplace=True)

In [35]:
# Stack the two source lists row-wise. Each frame keeps its own index labels
# (ignore_index is not set), so labels can repeat in the result.
df = pd.concat(objs=[acasus_dropped2, who_dropped2], axis=0)

In [36]:
# Size of the combined list as (rows, columns).
df.shape

(10854, 54)

In [37]:
# Number of rows contributed by each source.
df['source'].value_counts()

acasus    7965
who       2889
Name: source, dtype: int64

In [38]:
# Write the combined list to disk; the DataFrame index is not written.
dataDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\DRC\\Cleaned"
combined_csv = dataDir + "\\combined_list_0927.csv"
df.to_csv(combined_csv, index=False)

The `IN_FID` and `IN_FID_x` columns are kept so that rows can be matched back to the original datasets.

### Add feature sequence

In [39]:
# Locations of the combined CSV and of the ArcGIS project holding the
# feature-sequence layer.
dataDir = r"C:\Users\DUANYUEYUN\Documents\GRID3\Health facilities\Data\DRC\\Cleaned"
dataDir2 = r"C:\Users\DUANYUEYUN\Documents\ArcGIS\Projects\Fuzzy Match Testing"

# Re-read the combined list so it gets a fresh 0..n-1 index.
combined_list = pd.read_csv(dataDir + "\\combined_list_0927.csv")

# NOTE(review): the layer name carries 0922 while the CSV carries 0927 —
# confirm both were built from the same rows in the same order, because the
# next cell joins them by row position.
seq2 = gpd.read_file(
    dataDir2+"\\Fuzzy Match Testing.gdb",
    layer='combined_list_0922_FindIdent',
    driver='FileGDB',
)

In [40]:
# Shift IN_FID down by one so it lines up with the row index of combined_list
# (presumably the layer's ids start at 1 — TODO confirm).
# NOTE: seq2 is modified here, so re-running this cell without reloading the
# layer shifts the ids a second time.
seq2['IN_FID'] = seq2['IN_FID'] - 1

# Some points are assigned to more than one cluster; keep, for each IN_FID,
# only the row with the smallest FEAT_SEQ.
seq2 = (
    seq2
    .sort_values(by=['IN_FID', 'FEAT_SEQ'])
    .drop_duplicates(subset='IN_FID', keep='first')
)

# Attach the cluster id to each facility by matching combined_list's index to
# IN_FID. The default inner join drops facilities with no matching IN_FID.
seq2_attributes = seq2.drop(columns=['geometry'])
combined_list2 = combined_list.merge(
    seq2_attributes,
    left_index=True,
    right_on='IN_FID',
)

In [41]:
# Report how many facilities were loaded and how many distinct clusters remain
# after de-duplicating seq2.
n_points = len(combined_list)
n_clusters = seq2['FEAT_SEQ'].nunique()
print("Number of data points:", n_points)
print("Number of clusters:", n_clusters)

Number of data points: 10854
Number of clusters: 7854


### Match facility names within clusters

In [43]:
def match_names(df, name='face_name1', clean_name='face_name2_corr',
                cluster='FEAT_SEQ', source='source',
                simple_score=80, simple_score_min=50,
                partial_score=80):
    """Fuzzy-match facility names inside each cluster and label sub-clusters.

    Within a cluster the names are visited in sorted order. The first name
    seeds a list of candidates; every later name is compared against that
    list with ``fuzz.ratio`` and ``fuzz.partial_ratio`` and either adopts the
    best candidate as its ``match_name`` or becomes a new candidate itself.

    The frame is modified IN PLACE (rows without a usable name are dropped,
    rows are re-sorted, columns are added) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``name``, ``clean_name``, ``cluster`` and ``source``
        columns.
    name : str
        Column with the original facility name. It is whitespace-stripped and
        blank / ``'NA'`` entries are turned into NaN.
    clean_name : str
        Column with the cleaned short name. Entries that are not strings are
        replaced by the (stripped) original name.
    cluster : str
        Column identifying the cluster each row belongs to.
    source : str
        Column used only as the last sort key.
    simple_score : int
        Minimum ``fuzz.ratio`` score for a 'Simple match'.
    simple_score_min : int
        Minimum ``fuzz.ratio`` score required before a partial match is
        considered.
    partial_score : int
        Minimum ``fuzz.partial_ratio`` score for a 'Partial match'.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, sorted by ``[cluster, clean_name, source]``, with the
        added columns ``match_name``, ``n_subclusters`` (number of candidates
        in the row's cluster), ``match_score`` (NaN for 'Self') and
        ``match_type`` ('Self', 'Simple match' or 'Partial match').

    Raises
    ------
    ValueError
        If ``cluster`` still contains missing values after nameless rows have
        been dropped; such rows cannot be placed in any group.
    """
    # Trim whitespace, then treat blank strings and the literal 'NA' as
    # missing. The result is assigned back to the column: a chained
    # `df[name].replace(..., inplace=True)` acts on a temporary object under
    # pandas Copy-on-Write and would leave `df` unchanged.
    df[name] = df[name].str.strip().replace(['', 'NA'], np.nan)

    # If the cleaned short name is not a string, use the original name instead.
    df[clean_name] = [
        cleaned if isinstance(cleaned, str) else original
        for cleaned, original in zip(df[clean_name], df[name])
    ]

    # Rows that still have no name cannot be matched; report and drop them.
    print("Number of NA values in name column:", df[clean_name].isna().sum())
    df.dropna(subset=[clean_name], inplace=True)

    # groupby silently skips rows whose key is missing, which would leave the
    # result lists shorter than the frame. Fail with a clear message instead.
    if df[cluster].isna().any():
        raise ValueError(
            "Column %r contains missing values; every row needs a cluster id"
            % cluster
        )

    # The result lists below are filled cluster by cluster and assigned
    # positionally, so the rows must already be in cluster order.
    df.sort_values(by=[cluster, clean_name, source], inplace=True)

    matched_names = []   # name each row was matched to
    sub_counts = []      # number of sub-clusters (candidates) in the row's cluster
    match_scores = []    # score of the accepted match, NaN for 'Self'
    match_types = []     # 'Self', 'Simple match' or 'Partial match'

    for _, df_group in df.groupby(cluster):
        names = df_group[clean_name].to_list()

        # The first name in the cluster is the first candidate and matches itself.
        match_candidates = [names[0]]
        matched_names.append(names[0])
        match_scores.append(np.nan)
        match_types.append('Self')

        for current in names[1:]:
            # Best candidate under each scorer.
            best_simple, score_simple = process.extractOne(
                current, match_candidates, scorer=fuzz.ratio)
            best_partial, score_partial = process.extractOne(
                current, match_candidates, scorer=fuzz.partial_ratio)

            # Accept on the simple ratio. For very short strings a single
            # differing letter already gives a low ratio, so also accept when
            # the estimated number of differing characters is at most one.
            if score_simple >= simple_score or (1 - score_simple / 100) * len(current) <= 1:
                matched_names.append(best_simple)
                match_scores.append(score_simple)
                match_types.append('Simple match')

            # Simple ratio is moderate: fall back to the partial ratio.
            elif score_simple >= simple_score_min and score_partial >= partial_score:
                matched_names.append(best_partial)
                match_scores.append(score_partial)
                match_types.append('Partial match')

            # No acceptable match: the name starts its own sub-cluster.
            else:
                match_candidates.append(current)
                matched_names.append(current)
                match_scores.append(np.nan)
                match_types.append('Self')

        # Every row of the cluster gets the final candidate count.
        sub_counts.extend([len(match_candidates)] * len(names))

    df['match_name'] = matched_names
    df['n_subclusters'] = sub_counts
    df['match_score'] = match_scores
    df['match_type'] = match_types

    return df

In [44]:
# match_names works on combined_list2 in place and returns that same frame,
# so list_matched and combined_list2 refer to one object from here on.
list_matched = match_names(
    combined_list2,
    name='face_name1',
    clean_name='face_name2_corr',
    cluster='FEAT_SEQ',
    source='source',
)

Number of NA values in name column: 1


In [45]:
#list_matched[list_matched['FEAT_SEQ'].isin([9,10,11])]

### Evaluate if points in one cluster are from both sources

In [46]:
def cross_val(df, cluster='FEAT_SEQ', match_name='match_name',
              source='source', n_source=2, val_col='val'):
    """Flag rows whose (cluster, matched name) group is confirmed by all sources.

    Rows are grouped by ``cluster`` and ``match_name``. A row gets 1 when its
    group contains exactly ``n_source`` distinct values of ``source`` (and more
    than one), otherwise 0. The flag is written to ``df[val_col]`` in place and
    ``df`` itself is returned.

    The per-group count is computed with ``transform`` so every value stays on
    the row it was computed for, whatever the row order of ``df``. Collecting
    the flags group by group and assigning them positionally puts them on the
    wrong rows whenever the frame is not sorted by the grouping keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``cluster``, ``match_name`` and ``source`` columns.
    cluster, match_name : str
        Grouping columns.
    source : str
        Column naming the dataset each row came from.
    n_source : int
        Number of distinct sources a group needs in order to count as
        validated.
    val_col : str
        Name of the output column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``val_col`` added or overwritten.
    """
    # Distinct sources in each row's group, in the row order of df.
    sources_in_group = df.groupby([cluster, match_name])[source].transform('nunique')

    # A single-source group is never validated, even when n_source == 1.
    validated = (sources_in_group == n_source) & (sources_in_group > 1)

    # Assign positionally: transform keeps df's row order, and this avoids
    # label alignment on an index that may contain repeated labels.
    df[val_col] = validated.astype(int).to_numpy()
    return df

In [47]:
# cross_val writes the 'val' column onto list_matched and returns that same
# object, so list_val is another name for list_matched.
list_val = cross_val(list_matched)

In [48]:
# Count the distinct clusters that contain at least one validated row.
validated_rows = list_val['val'] == 1
n_validated_clusters = list_val.loc[validated_rows, 'FEAT_SEQ'].nunique()
print("Number of facilities match by both sources:", n_validated_clusters)

Number of facilities match by both sources: 1574


In [50]:
#list_val[list_val['FEAT_SEQ'].isin([9,10,11])]

In [51]:
# Keep only the validated rows and write them out (index not written).
# dataDir is whatever an earlier cell last assigned.
matched_points = list_val.loc[list_val['val'] == 1]
matched_points_csv = dataDir+'\\matched_points_0927.csv'
matched_points.to_csv(matched_points_csv, index=False)

### Consider facility type as well

In [52]:
# Examine the distinct values of type_corr in the Acasus frame
# (near-duplicate spellings seen here are what the mapping further down targets).
acasus_dropped2['type_corr'].unique()

array(['Centre de Sante', 'Centre de Sante de Reference',
       'Poste de Sante', 'Centre Hospitalier', nan,
       'Hopital General de Reference', 'Polyclinique',
       'Centre Hopitalier', 'Dispensaire', 'Clinique',
       'Clinique Universitaires', 'Hopital', 'Centre Medical',
       'Centre de Sante Reference', 'Maternite',
       'Centre Hopital General de Reference',
       'Centre Medical Evangelique', 'Hopital Provincial de Reference',
       'Hopital Secondaire', 'Hopital Militaire de Reference'],
      dtype=object)

In [53]:
# Distinct values of type_corr in the WHO frame, for comparison with the Acasus list.
who_dropped2['type_corr'].unique()

array([nan, 'Centre de Sante', 'Hopital General de Reference',
       'Poste de Sante', 'Clinique', 'Dispensaire', 'Hopital',
       'Centre de Sante de Reference', 'Polyclinique',
       'Centre Hopitalier', 'Clinique Universitaires',
       'Centre Hospitalier', 'Centre Medical', 'Maternite',
       'Centre Pediatrique', 'Hopital Secondaire', 'Pharmacy',
       'Cliniques Universitaires', 'Hopital Militaire'], dtype=object)

In [54]:
# Simple facility-type mapping: collapse spelling variants onto one label so
# the same type compares equal across sources.
type_dict = {"Centre Hospitalier":"Centre Hopitalier", 
             "Cliniques Universitaires":"Clinique Universitaires",
            "Centre de Sante Reference":"Centre de Sante de Reference"}

# Values without an entry in type_dict (including NaN) pass through unchanged.
# NOTE: the column is written to combined_list2, which is the same object as
# list_matched (match_names returns the frame it was given); that is how the
# type-aware cross validation further down finds it.
combined_list2['type_corr2'] = [
    type_dict.get(facility_type, facility_type)
    for facility_type in combined_list2['type_corr']
]

In [55]:
def cross_val2(df, cluster='FEAT_SEQ', match_name='match_name',
               fac_type='type_corr2',
               source='source', n_source=2, val_col='val'):
    """Flag rows confirmed by all sources, also requiring the same facility type.

    Rows are grouped by ``cluster``, ``match_name`` and ``fac_type``; missing
    facility types are treated as one shared (blank) type. A row gets 1 when
    its group contains exactly ``n_source`` distinct values of ``source`` (and
    more than one), otherwise 0. The flag is written to ``df[val_col]`` in
    place and ``df`` itself is returned.

    The per-group count is computed with ``transform`` so every value stays on
    the row it was computed for. The facility type is a grouping key but not
    necessarily part of the frame's sort order, so collecting flags group by
    group and assigning them positionally would put them on the wrong rows.

    Side effect: blank strings in ``fac_type`` are converted to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``cluster``, ``match_name``, ``fac_type`` and
        ``source`` columns.
    cluster, match_name, fac_type : str
        Grouping columns.
    source : str
        Column naming the dataset each row came from.
    n_source : int
        Number of distinct sources a group needs in order to count as
        validated.
    val_col : str
        Name of the output column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``val_col`` added or overwritten.
    """
    # Group missing types together under '' without touching the stored column.
    type_key = df[fac_type].fillna('')

    # Distinct sources in each row's group, in the row order of df.
    sources_in_group = (
        df.groupby([df[cluster], df[match_name], type_key])[source]
        .transform('nunique')
    )

    # A single-source group is never validated, even when n_source == 1.
    validated = (sources_in_group == n_source) & (sources_in_group > 1)

    # Assign positionally: transform keeps df's row order, and this avoids
    # label alignment on an index that may contain repeated labels.
    df[val_col] = validated.astype(int).to_numpy()

    # Blank facility types are stored as missing.
    df[fac_type] = df[fac_type].replace('', np.nan)
    return df

In [56]:
# cross_val2 overwrites the 'val' column on list_matched and returns that same
# object, so list_val (also the same object) now shows the type-aware flags too.
list_val2 = cross_val2(list_matched)

In [57]:
# Count the distinct clusters that contain at least one row validated with
# the facility type taken into account.
validated_rows2 = list_val2['val'] == 1
n_validated_clusters2 = list_val2.loc[validated_rows2, 'FEAT_SEQ'].nunique()
print("Number of facilities match by both sources:", n_validated_clusters2)

Number of facilities match by both sources: 1370


In [58]:
# Keep only the rows validated with facility type considered and write them
# out (index not written). dataDir is whatever an earlier cell last assigned.
matched_points2 = list_val2.loc[list_val2['val'] == 1]
matched_points2_csv = dataDir+'\\matched_points2_0927.csv'
matched_points2.to_csv(matched_points2_csv, index=False)

In [62]:
#matched_points[~matched_points['IN_FID_y'].isin(matched_points2['IN_FID_y'])].head(6)