In [12]:
import re

# Random augmentation
import nlpaug.augmenter.char as nac
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm


In [6]:
# labeled_data_dedupe_fixed
# labeled_data_dedupe_fixed_2
def load_data_fill_na(path="data/labeled_data_dedupe__address_fixed_2.xlsx"):
    """
    Load the labeled dataset and fill missing ``similar`` labels with 1.

    The frame (with the filled labels) is written back to ``path``; rows whose
    ``address`` is missing are dropped from the returned frame only, not from
    the file.
    :param path: path to excel file
    :return: dataframe without the rows whose address is NaN
    """
    df = pd.read_excel(path)
    missing_label = df["similar"].isna()
    if missing_label.any():
        # A single .loc assignment writes into the frame itself; the chained
        # form df["similar"][mask] = 1 may write into a temporary copy instead.
        df.loc[missing_label, "similar"] = 1
    df.to_excel(path, index=False)
    return df.dropna(subset=["address"])


def get_classes(df, cols="similar"):
    """
    Split the cids by the labels they carry and print the three counts.

    A cid is "full" when its rows carry both labels 1 and 0, "positive only"
    when every row is labeled 1 and "negative only" when every row is labeled 0.
    A cid with any other label mix (e.g. a NaN label) lands in none of the sets.
    :param df: dataframe with a ``cid`` column and the label column
    :param cols: name of the label column to inspect
    :return: positive-only, negative-only and full sets of cids
    """
    pos_set = set()
    neg_set = set()
    full_set = set()
    # One pass over the groups instead of re-filtering the frame per cid.
    for cid, labels in df.groupby("cid")[cols]:
        observed = set(labels)
        if observed == {1, 0}:
            full_set.add(cid)
        elif observed == {1}:
            pos_set.add(cid)
        elif observed == {0}:
            neg_set.add(cid)

    print("{} Full Positive and Negative".format(len(full_set)))
    print("{} Positive only".format(len(pos_set)))
    print("{} Negative only".format(len(neg_set)))

    return pos_set, neg_set, full_set


def reset_cis(dataframe):
    """
    Re-encode the ``cid`` column as consecutive int64 codes.

    The cids are compared as strings: the distinct string values are sorted and
    each row receives the rank of its value (0 for the smallest string). The
    input dataframe is not modified.
    :param dataframe: dataframe with a ``cid`` column
    :return: Series named ``cid`` holding the codes, indexed like ``dataframe``
    """
    labels = dataframe["cid"].astype(str)
    # sort=True ranks the distinct strings, like a label encoder would, without
    # copying the whole frame or importing an extra library.
    codes, _ = pd.factorize(labels, sort=True)
    return pd.Series(codes, index=dataframe.index, name="cid").astype("int64")

In [10]:
# Load the labeled data; re-encode cid only when it was not read as int64.
df = load_data_fill_na()
if df["cid"].dtype != "int64":
    df["cid"] = reset_cis(df)

# Keep the three columns used downstream, then bucket the cids into
# positive-only, negative-only and full (both labels) sets.
df = df[["cid", "address", "similar"]]
pos_set, neg_set, full_set = get_classes(df)

26 Full Positive and Negative
104 Positive only
7 Negative only


In [11]:
"""
(DO NOT RUN IF NOT NECESSARY)
Random pair a partial of dataset to an only POS class 
to create negative samples to make all full sets
"""
# Collect the sampled frames and concatenate once: DataFrame.append no longer
# exists in pandas 2.x and re-copies the accumulated frame on every iteration.
negative_samples = []
# NOTE(review): the loop covers every cid, while the message printed below
# reports len(pos_set) - confirm which of the two is intended.
for cid in df["cid"].unique():
    # sample 10% of all labeled dataset
    df_sample = df.sample(frac=0.1)
    # choose rows that have different cid from main one
    df_sample = df_sample[~df_sample["cid"].isin([cid])]
    # relabel the sampled rows as negatives of the current cid
    negative_samples.append(df_sample.assign(cid=cid, similar=0))
df_gen = pd.concat(negative_samples) if negative_samples else pd.DataFrame()
# Append to main dataset
df = pd.concat([df, df_gen]).sort_values("cid")
df["address"] = (
    df["address"]
    .str.lower()
    .str.replace("\n", " ", regex=False)
    .str.replace(r"[ ]+", " ", regex=True)
    # Remove stringified missing values only when they are whole words, so that
    # names such as "hunan" or "nanjing" keep their letters.
    .str.replace(r"\b(?:null|nan)\b", "", regex=True)
)
print(
    "Have done appending {} records "
    "for {} positive set".format(len(df_gen), len(pos_set))
)

Have done appending 4490 records for 104 positive set


In [14]:
# Augment data based on keyboard mistakes and random character deletion.
# Both augmenters are built with fixed arguments, so one instance of each is
# created up front instead of once per row.
augmenters = [nac.KeyboardAug(), nac.RandomCharAug(action="delete")]

new_rows = []
for row in df.itertuples(index=False):
    values = list(row)  # [cid, address, similar]
    for aug in augmenters:
        # up to three augmented variants of the address per augmenter
        augmented_texts = aug.augment(values[1], n=3)
        for text in augmented_texts:
            # fresh list per variant, so rows never share state
            new_rows.append(values[:1] + [text] + values[2:])

# Save file random augmentation
test_df = pd.DataFrame(new_rows, columns=['cid', 'address', 'similar'])
test_df.to_csv("data/random_augment_train.csv", index=False)

# Overall Differences
In this segment, I analyze and collect the differences found across the whole dataset, and I use those statistics to generate new records from the original ones. Each kind of difference is applied only with a certain probability, to reduce bias.

In [17]:
# Clusters of interchangeable spellings, keyed by the kind of text they apply to.
# Every entry is a regex used to find a spelling; the same entry, stripped of its
# regex syntax, is also the text that gets inserted as a replacement.
synonym_dict = {
    "name": [
        ["[^a-zA-Z]inc.", "[^a-zA-Z]inc"],
        ["[^a-zA-Z]co.", "[^a-zA-Z]co[^a-zA-Z]", "company"],
        ["ltd.", "ltd", "limited"],
        ["pvt.", "[^a-zA-Z]pvt", "private"],
        ["[^a-zA-Z]llc.", "[^a-zA-Z]llc"],
        ["[^a-zA-Z]no.", "[^a-zA-Z]no"],
        ["[^a-zA-Z]us[^a-zA-Z]", "[^a-zA-Z]usa[^a-zA-Z]", "united states"],
        ["[^0-9]3[^0-9]", "[^a-zA-Z0-9]iii[^a-zA-Z0-9]", "[^0-9]03[^0-9]"],
        ["[^0-9]2[^0-9]", "[^a-zA-Z0-9]ii[^a-zA-Z0-9]", "[^0-9]02[^0-9]"],
        ["[^0-9]1[^0-9]", "[^a-zA-Z0-9]i[^a-zA-Z0-9]", "[^0-9]01[^0-9]"],
    ],
    "address": [
        # single digit <-> zero-padded digit
        ["[^0-9]1[^0-9]", "[^0-9]01[^0-9]"],
        ["[^0-9]2[^0-9]", "[^0-9]02[^0-9]"],
        ["[^0-9]3[^0-9]", "[^0-9]03[^0-9]"],
        ["[^0-9]4[^0-9]", "[^0-9]04[^0-9]"],
        ["[^0-9]5[^0-9]", "[^0-9]05[^0-9]"],
        # the original paired 7 with 06; 6 <-> 06 and 7 <-> 07 are the
        # consistent pairs for this series
        ["[^0-9]6[^0-9]", "[^0-9]06[^0-9]"],
        ["[^0-9]7[^0-9]", "[^0-9]07[^0-9]"],
        ["[^0-9]8[^0-9]", "[^0-9]08[^0-9]"],
        ["[^0-9]9[^0-9]", "[^0-9]09[^0-9]"],
        ["1st", "first"],
        ["2nd", "second"],
        ["3rd", "third"],
        ['[^a-zA-Z]k.[^a-zA-Z]', '[^a-zA-Z]kat.[^a-zA-Z]', '[^a-zA-Z]k:[^a-zA-Z]', '[^a-zA-Z]kat:[^a-zA-Z]', '[^a-zA-Z]k[^a-zA-Z]', '[^a-zA-Z]kat[^a-zA-Z]'],
        ["[^a-zA-Z]area[^a-zA-Z]", "[^a-zA-Z]zone[^a-zA-Z]"],
        ["[^a-zA-Z]no-", "[^a-zA-Z]no.","[^a-zA-Z]no:", "[^a-zA-Z]no[^a-zA-Z.-]"],
        ["country", "county"],
        ["road", "rd[.]{1}", "[^a-zA-Z0-9]rd[^a-zA-Z.]"],
        [
            "street",
            "[^a-zA-Z0-9]str[.]{1}",
            "[^a-zA-Z0-9]str[^a-zA-Z.]",
            "[^a-zA-Z0-9]st[.]{1}",
            "[^a-zA-Z0-9]st[^a-zA-Z.]",
        ],
        ["drive", "[^a-zA-Z]dr[^a-zA-Z.]", "[^a-zA-Z]dr[.]{1}"],
        ["avenue", "[^a-zA-Z]ave[.]{1}", "[^a-zA-Z]ave[^a-zA-Z.]"],
        ["boulevard", "[^a-zA-Z]blvd[.]{1}", "blvd[^.]"],
        ["lane", "[^a-zA-Z]ln[.]{1}", "[^a-zA-Z0-9]ln[^a-zA-Z.]"],
        ["sector[^-]", "sector-"],
        ["[^a-zA-Z]court[^a-zA-Z]", "[^a-zA-Z]ct[^a-zA-Z.]"],
        ["china", "[^a-zA-Z]cn[^a-zA-Z]", "c[.]{1}n"],
        ["united states", "u[.]{1}s", "[^a-zA-Z]us[^a-zA-Z]", "usa"],
        ["vietnam", "viet nam", "[^a-zA-Z]vn[^a-zA-Z]"],
    ],
}

# ---- Generate new rows - start
def _pattern_to_text(pattern):
    """Strip the regex syntax from a synonym pattern, leaving the text to insert."""
    text = pattern.lower()
    text = re.sub(r"\[\.\]", ".", text)  # "[.]" stands for a literal dot
    text = re.sub(r"\[([a\-z.\^09]*)\]", "", text)  # drop classes such as "[^a-z0-9]"
    text = re.sub(r"\{[0-9]+\}", "", text)  # drop quantifiers such as "{1}"
    return text


def generate_row(target, synonym_dict, syn_type):
    """
    Generate new row by using synonym and acronym.

    For every synonym cluster, the first pattern that matches the lower-cased
    target is replaced, with probability 3/4, by another randomly chosen member
    of the same cluster. At most one replacement is made per cluster, and the
    text is lower-cased as soon as a replacement happens.
    :param target: content
    :param synonym_dict: dictionary of synonym and acronym
    :param syn_type: type of synonym and acronym (key of synonym_dict)
    :return: {'code': 0} when target is not a string (e.g. NaN), otherwise
             {'code': 1, 'content': new content}
    """
    if not isinstance(target, str):
        return {"code": 0}

    for cluster in synonym_dict[syn_type]:
        if len(cluster) < 2:
            # nothing to swap with; drawing "another" member would never end
            continue
        for j, pattern in enumerate(cluster):
            lowered = target.lower()
            match = re.search(pattern, lowered)
            if match is None:
                continue

            # Use the span of the match, not the length of the pattern string:
            # "[^0-9]1[^0-9]" is 13 characters long but matches only 3.
            start, end = match.start(), match.end()
            if end > start and end < len(lowered):
                last_char, next_char = lowered[end - 1], lowered[end]
                if (last_char.isdigit() or last_char.isalpha()) and (
                    next_char.isdigit() or next_char.isalpha()
                ):
                    # the match stops in the middle of a word or a number
                    continue

            # gamble or not (replace in 75% of the cases)
            if np.random.randint(0, 4, 1)[0] == 0:
                break

            # pick any member of the cluster except the one that matched
            syn_rep_pos = np.random.randint(0, len(cluster) - 1, 1)[0]
            if syn_rep_pos >= j:
                syn_rep_pos += 1

            # Only the inserted text is stripped of regex syntax, so brackets or
            # braces that belong to the original content are kept.
            target = (
                lowered[:start]
                + " "
                + _pattern_to_text(cluster[syn_rep_pos])
                + " "
                + lowered[end:]
            )
            break
    return {"code": 1, "content": target}


def generate_new_rows(df, synonym_dict, rep_column=["name", "address"], low=3, high=5):
    """
    Generate rows by using synonym and acronym.

    Every row is duplicated between ``low`` and ``high - 1`` times. In each
    duplicate, the text in the second dataframe column is rewritten once per
    entry of ``rep_column``, each rewrite starting from the previous result.
    :param df: dataframe whose second column holds the text to rewrite
    :param synonym_dict: dictionary of synonym and acronym
    :param rep_column: keys of synonym_dict to apply, in order
    :param low: min duplications for each row
    :param high: exclusive upper bound of duplications for each row
    :return: new dataframe with the generated rows only (same columns as df)
    """
    new_rows = []
    for row in df.itertuples(index=False):
        # Random duplicating a row from low times to high times
        for _ in range(np.random.randint(low, high, 1)[0]):
            new_row = list(row)
            for column in rep_column:
                # new_row has no index entry, so position 1 is the second
                # column; row[1] of an indexed tuple would be the first (cid).
                result = generate_row(new_row[1], synonym_dict, syn_type=column)
                if result["code"] == 1:
                    new_row[1] = result["content"]
            new_rows.append(new_row)
    return pd.DataFrame(new_rows, columns=df.columns)
# ---- Generate new rows - end


# ---- Full pipeline for pre-processing and generate rows
def pre_processing_pipeline(df):
    """
    Inject synthetic mistakes into the ``address`` column of a copy of ``df``.

    Every string address goes through four independent random steps, in order:
    1. system error (draw < 16 out of 100): drop up to three comma-separated
       parts, never one of the first two;
    2. zip-code error (draw <= 13): replace a trailing run of 3-6 digits
       (optionally followed by dots) with a placeholder code;
    3. misunderstanding (draw < 27): swap a street/unit type for a random one
       from a different class of the same group, lower-casing the address;
    4. typing error (draw < 41): turn the first "country" into "county".
    Addresses that are not strings (e.g. NaN) are left untouched.
    :param df: dataframe with an ``address`` column
    :return: copy of df holding the modified addresses
    """
    # Placeholder codes used by the zip-code error.
    zipcode = ["000000", "111111", "123456", "0", "1"]
    # Groups of classes that get confused with each other; a match in one class
    # is replaced by a random member of another class of the same group.
    mis_understanding = [
        [
            ["avenue", "[^a-zA-Z]ave[.]{1}", "[^a-zA-Z]ave[^a-zA-Z.]"],
            ["boulevard", "[^a-zA-Z]blvd[.]{1}", "blvd[^.]"],
            ["drive", "[^a-zA-Z]dr[^a-zA-Z.]", "[^a-zA-Z]dr[.]{1}"],
        ],
        [
            ["lane", "[^a-zA-Z]ln[.]{1}", "[^a-zA-Z0-9]ln[^a-zA-Z.]"],
            ["road", "rd[.]{1}", "[^a-zA-Z0-9]rd[^a-zA-Z.]"],
        ],
        [["[^a-zA-Z]zone[^a-zA-Z]"], ["[^a-zA-Z]area[^a-zA-Z]"]],
        [
            ["[^a-zA-Z]suite[^a-zA-Z]", "[^a-zA-Z]ste[^a-zA-Z]"],
            ["[^a-zA-Z]plot[^a-zA-Z]"],
        ],
    ]

    def _pattern_to_literal(pattern):
        # Remove all regex special characters from the text that is inserted.
        text = pattern.lower()
        text = re.sub(r"\[\.\]", "", text)
        text = re.sub(r"\[([a\-z.\^09]*)\]", "", text)
        text = re.sub(r"\{[0-9]+\}", "", text)
        return text

    temp_df = df.copy()
    address_list = df["address"].values.copy()
    # All the cases are handled with a certain probability to prevent bias
    for index, target in tqdm(enumerate(address_list), total=len(address_list)):
        if not isinstance(target, str):
            # missing address: nothing to split or search
            continue

        # ---- System Error - missing some thing
        draw = np.random.randint(0, 100, 1)[0]
        target_as_list = target.split(",")  # Split sentence as a list
        if draw < 16 and len(target_as_list) > 2:
            pos_to_drop = list(set(np.random.randint(2, len(target_as_list), 3)))
            # pop from the back so that the remaining positions stay valid
            for pos in sorted(pos_to_drop, reverse=True):
                target_as_list.pop(pos)
            target = ",".join(target_as_list)

        # ---- Zip-code case ~ 13%
        # NOTE(review): "<= 13" accepts 14 of the 100 possible draws, while the
        # other steps use a strict "<" - confirm which is intended.
        draw = np.random.randint(0, 100, 1)[0]
        if draw <= 13:
            match = re.search(r"[0-9]{3,6}[\.]*$", target)
            if match is not None:
                target = (
                    target[: match.start()]
                    + zipcode[np.random.randint(0, len(zipcode), 1)[0]]
                )

        # ---- Misunderstanding
        draw = np.random.randint(0, 100, 1)[0]
        if draw < 27:
            for each_type in mis_understanding:
                for c_id, patterns in enumerate(each_type):
                    # first pattern of this class that occurs in the address
                    match = None
                    for element in patterns:
                        match = re.search(element, target)
                        if match is not None:
                            break
                    if match is None:
                        continue

                    # replace it by a random element of another class of the group
                    change_class = c_id
                    while change_class == c_id:
                        change_class = np.random.randint(0, len(each_type), 1)[0]
                    new_class = np.random.choice(each_type[change_class])
                    # Only the inserted text is stripped of regex syntax, so
                    # brackets/braces of the address itself are preserved.
                    target = (
                        target[: match.start()]
                        + " "
                        + _pattern_to_literal(str(new_class))
                        + " "
                        + target[match.end():]
                    ).lower()
                    # at most one replacement per group
                    break

        # Typing error (current country -> county)
        draw = np.random.randint(0, 100, 1)[0]
        if draw < 41:
            match = re.search("country", target)
            if match is not None:
                target = target[: match.start()] + "county" + target[match.end():]

        address_list[index] = target
    temp_df["address"] = address_list
    return temp_df


def generate_df(df, columns=["name", "address"], low=3, high=7):
    """
    Add random negative pairs to the positive-only cids and generate synonym
    variants of every row.

    :param df: dataframe with ``cid``, ``address`` and ``similar`` columns
    :param columns: synonym types passed to generate_new_rows; the entries that
                    are also columns of df are text-normalised at the end
    :param low: min duplications for each row
    :param high: exclusive upper bound of duplications for each row
    :return: dataframe made of df, df again and the generated rows
    """
    pos_set, neg_set, full_set = get_classes(df)
    # Pairing Neg, Pos and Anc together. The samples are collected in a list and
    # concatenated once: DataFrame.append no longer exists in pandas 2.x.
    negative_samples = []
    for cid in pos_set:
        # sample 1% of all labeled dataset
        df_sample = df.sample(frac=0.01)
        # choose rows that have different cid from main one
        df_sample = df_sample[~df_sample["cid"].isin([cid])]
        # relabel the sampled rows as negatives of the current cid
        negative_samples.append(df_sample.assign(cid=cid, similar=0))
    df_gen = pd.concat(negative_samples) if negative_samples else pd.DataFrame()
    print(
        "Have done appending {} records "
        "for {} positive set".format(len(df_gen), len(pos_set))
    )
    # ---- Generate new rows by both name and address
    result = generate_new_rows(
        pd.concat([df, df_gen]).sort_values("cid"),
        synonym_dict,
        rep_column=columns,
        low=low,
        high=high,
    )
    # ---- Append to generated dataframe
    df_generated = pd.concat([df, result], ignore_index=True)
    df_generated["address"] = df_generated["address"].str.lower()
    print(len(df_generated))
    # NOTE(review): the pre_processing_pipeline(df_generated) call that used to
    # sit here is disabled, so the rows of df end up twice, unchanged, in the
    # returned frame - confirm this is wanted.
    df = pd.concat([df, df_generated])

    for column in columns:
        if column not in df.columns:
            # e.g. the default "name" when the frame only holds addresses
            continue
        df[column] = (
            df[column]
            .str.lower()
            .str.replace("\n", " ", regex=False)
            .str.replace(r"[ ]+", " ", regex=True)
            # drop stringified missing values only as whole words, so that
            # names such as "hunan" or "nanjing" keep their letters
            .str.replace(r"\b(?:null|nan)\b", "", regex=True)
        )

    return df

# Augmentation Mistakes
I collected and analyzed the mistakes found in our labeled data. I then use their observed distribution to inject the same kinds of mistakes automatically, so that the generated data looks like the original data.
1. Misunderstanding: doing
2. Spelling mistake
3. Typing error
4. System error
5. Zipcode error

In [19]:
# Synonym variants of every labeled row, appended to the labeled data.
result = generate_new_rows(df, synonym_dict, rep_column=['address'], low=3, high=5)
df_generated = pd.concat([df, result], ignore_index=True)
print(len(df_generated))
# Inject the synthetic mistakes into the combined frame.
df_generated = pre_processing_pipeline(df_generated)

df_generated["address"] = (
    df_generated["address"]
    .str.lower()
    .str.replace("\n", " ", regex=False)
    .str.replace(r"[ ]+", " ", regex=True)
    # Remove stringified missing values only when they are whole words, so that
    # names such as "hunan" or "nanjing" keep their letters.
    .str.replace(r"\b(?:null|nan)\b", "", regex=True)
)
df_generated.to_csv("data/new_generated_labeled_data.csv", index=False)

21694



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

# Open map preprocessing

In [6]:
import sklearn

df = pd.read_csv(
    "/data/dac/dedupe-project/openmap-us.csv", header=None, names=["cid", "address"]
)

df = sklearn.utils.shuffle(df)
# every OpenMap row starts as a positive sample
df["similar"] = 1
# Independent copies: assigning a column to a bare iloc slice can raise
# SettingWithCopyWarning and may not stick, depending on the pandas version.
df_train = df.iloc[:2000, :].copy()
df_test = df.iloc[20000:21000, :].copy()

if df_train.dtypes["cid"] != "int64":
    df_train["cid"] = reset_cis(df_train)

if df_test.dtypes["cid"] != "int64":
    df_test["cid"] = reset_cis(df_test)

In [10]:
# Number of rows currently held in `df`.
len(df)

26871004

In [44]:
# Expand the training split (2-3 synonym variants per row, default synonym types)
# and store the result as the training CSV.
train_csv_path = "/data/dac/dedupe-project/openmap/openmap-us-train.csv"
df_train_generated = generate_df(df_train, low=2, high=4)
df_train_generated.to_csv(train_csv_path, index=False)

0 Full Positive and Negative
2000 Positive only
0 Negative only
Have done appending 39985 records for 2000 positive set
68009


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
# NOTE(review): the frame is named df_test_generated but is built from df_train;
# df_test is never used in this notebook - confirm which split is intended.
df_test_generated = generate_df(df_train, columns=['address'], low=1, high=2)

# Augment data based on keyboard mistakes and random character deletion.
# Both augmenters are built with fixed arguments, so one instance of each is
# created up front instead of once per row.
augmenters = [nac.KeyboardAug(), nac.RandomCharAug(action="delete")]

new_rows = []
for row in df_test_generated.itertuples(index=False):
    values = list(row)  # [cid, address, similar]
    for aug in augmenters:
        augmented = aug.augment(values[1], n=1)
        # NOTE(review): depending on the nlpaug version, n=1 yields either the
        # text itself or a one-element list - unwrap so a plain string is stored.
        if isinstance(augmented, (list, tuple)):
            augmented = augmented[0] if augmented else values[1]
        new_rows.append(values[:1] + [augmented] + values[2:])


0 Full Positive and Negative
2000 Positive only
0 Negative only
Have done appending 39978 records for 2000 positive set
43978


In [9]:
# Turn the augmented rows back into a frame with the three standard columns.
augmented_columns = ['cid', 'address', 'similar']
df_test_generated = pd.DataFrame(new_rows, columns=augmented_columns)
# NOTE(review): this writes to the *train* CSV path although the frame is named
# df_test_generated - confirm the destination before running.
df_test_generated.to_csv(
    "/data/dac/dedupe-project/openmap/openmap-us-train.csv",
    index=False,
)

# Pre-processing Ground Truth

In [37]:
# Directory of the ground-truth files; later cells append file names to it
# directly, so the trailing slash matters.
path = "/data/dac/dedupe-project/test/"
ground_truth_csv = path + "GT_added.csv"
test_df = pd.read_csv(ground_truth_csv, encoding="ISO-8859-1")

In [39]:
test_df = test_df.drop(["test_id", "id"], axis=1)
# every ground-truth pair is a positive pair; the row position identifies the pair
test_df["similar"] = 1
test_df["cid"] = test_df.index
# Work on a copy: with a plain alias (temp_df = test_df) the assignment below
# also overwrites test_df["address"], and both halves of the concat end up
# holding the duplicated address.
# Assumes GT_added.csv already has an `address` column with the anchor address
# - TODO confirm.
temp_df = test_df.copy()
temp_df["address"] = temp_df["duplicated_address"]
# first half: original addresses, second half: their duplicates (same cid)
test_df = pd.concat([test_df, temp_df]).loc[:, ["cid", "address", "similar"]]

In [40]:
# Expand the ground truth (one synonym variant per row, default synonym types)
# and store it next to the source file.
generated_gt_csv = path + "GT_added_new.csv"
test_df_generated = generate_df(test_df, low=1, high=2)
test_df_generated.to_csv(generated_gt_csv, index=False)

0 Full Positive and Negative
421 Positive only
0 Negative only
Have done appending 3357 records for 421 positive set
7013


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [41]:
# NOTE(review): the pairs are built from test_df, not from test_df_generated
# produced by the previous cell - confirm which frame is intended.
cid_list = test_df.cid.unique()
df_1_as_list = []
df_2_as_list = []
for cid in tqdm(cid_list):
    # Generate two dataset with label to compare: df_1 repeats the anchor row,
    # df_2 holds the rows compared against it, in the same order.
    # (a dedicated name is used so the notebook-level `df` is not overwritten)
    group = test_df[test_df.cid == cid]
    pos = group[group.similar == 1]
    neg = group[group.similar == 0]
    if pos.empty:
        # without a positive row there is no anchor to pair the others with
        continue
    anchor = pos.iloc[0, :]

    # Positive case: the anchor against every other positive row
    df_2_as_list.append(pos.iloc[1:, :])
    df_1_as_list.extend([anchor] * (len(pos) - 1))

    # Negative case: the anchor against every negative row
    df_2_as_list.append(neg)
    df_1_as_list.extend([anchor] * len(neg))

HBox(children=(IntProgress(value=0, max=421), HTML(value='')))




In [42]:
# Anchor rows: every entry of df_1_as_list is a Series (one row), so the entries
# are joined as columns and transposed back into one row per entry.
df_1 = pd.concat(df_1_as_list, axis=1).T
# Rows compared against the anchors, stacked in the order they were collected,
# so row i of df_2 is positionally paired with row i of df_1.
df_2 = pd.concat(df_2_as_list)

In [43]:
# Persist the anchor rows and the rows they are checked against (index included).
for pair_frame, pair_file in (
    (df_1, "GT_added_new_anchor.csv"),
    (df_2, "GT_added_new_check.csv"),
):
    pair_frame.to_csv(path + pair_file)