In [1]:
import pathlib
import pandas as pd
import numpy as np

In [2]:
SUBMISSION_N = 21

In [3]:
DATA_DIR = pathlib.Path("data")

In [4]:
%%time

train, test = [
    pd.read_csv(DATA_DIR.joinpath(filename), index_col="pair_id")
    for filename in (
        "train.csv",
        "test.csv",
    ) 
]

CPU times: user 402 ms, sys: 93.2 ms, total: 495 ms
Wall time: 494 ms


In [5]:
%%time

for dataset in (train, test):
    for column in ("name_1", 'name_2'):
        dataset[column] = dataset[column].str.lower()

CPU times: user 424 ms, sys: 63.8 ms, total: 488 ms
Wall time: 487 ms


In [6]:
import re
import string

In [7]:
business_types = [
    'industries',
    'trading',
    'logistics',
    'products',
    'sports',
    'chemicals',
    'polymers',
    'technology',
    'textile',
    'plastic',
    'trade',
    'services',
    'plastics',
    'comercio',
    'technologies',
    'engineering',
    'industria',
    'bank',
    'shipping',
    'freight',
    'materials',
    'electronics',
    'distribution',
    'marketing',
    'supply',
    'transport',
    'traders',
    'logistica',
    'forwarding',
    'equipment',
    'sport',
    'textiles',
    'synthetic',
    'comercial',
    'polymer',
    'service',
    'medical',
    'commercial',
    'electronic',
]

organization_types = [
    'international',
    'enterprises',
    'industrial',
    'group',
    'global',
    'enterprise',
    'industry',
    'worldwide',
    'corporation',
    'overseas',
]

In [8]:
transliteration = dict([
    (a, a) for a in string.ascii_lowercase + ' '
] + [
    (a, '') for a in string.digits 
] + [
    (a, ' ') for a in string.punctuation
])

transliteration.update({
    'а': 'a',
    'б': 'b',
    'в': 'v',
    'г': 'g',
    'д': 'd',
    'е': 'e',
    'ё': 'yo',
    'ж': 'zh',
    'з': 'z',
    'и': 'i',
    'й': 'i',
    'к': 'k',
    'л': 'l',
    'м': 'm',
    'н': 'n',
    'о': 'o',
    'п': 'p',
    'р': 'r',
    'с': 's',
    'т': 't',
    'у': 'u',
    'ф': 'f',
    'х': 'h',
    'ц': 'ts',
    'ч': 'ch',
    'ш': 'sh',
    'щ': 'sch',
    'ъ': '',
    'ы': 'y',
    'ь': '',
    'э': 'e',
    'ю': 'yu',
    'я': 'ya',
    'ü': 'u',
    'ş': 'sh',
    'ö': 'oe',
    '\xa0': ' ',
    'á': 'a',
    'ç': 'ch',
    'é': 'e',
    'ł': 'w',
    'ó': 'o',
    'ú': 'y',
    'ñ': 'n',
    'í': 'i',
    '\u3000': ' ',
    '\u0e00': ' ',
    'å': 'a',
    'ű': 'u',
    'ő': 'o',
    'ę': 'e',
    'õ': 'o',
    'è': 'e',
    'ã': 'a',
})

def transliterate(s, transliteration=transliteration):
    return ''.join([transliteration.get(c, c) for c in s])

strip_non_alphanum = re.compile(r'\W+', re.UNICODE)
whitespace_normalization = re.compile(r'\s+', re.UNICODE)
abbreviation_glue = re.compile(r'(?<=(?<!\w)\w) (?=\w(?!\w))', re.UNICODE) 
inside_brackets = re.compile(r"\s*\(.*?\)")
def normalize(s):
    s = inside_brackets.sub(' ', s)
    s = strip_non_alphanum.sub(' ', s)
    s = whitespace_normalization.sub(' ', s)
    s = abbreviation_glue.sub('', s)
    s = transliterate(s)
    
    return s.strip()

def multi_str_replace(strings, debug=True, borders=True):
    re_str = '|'.join(
        [re.escape(s) for s in strings]
    )
    if borders:
        re_str = r'\b(?:' + re_str + r')(?!\S)'
        
    if debug:
        print(re_str)
    return re.compile(re_str, re.UNICODE)

legal_entities = [
    "ltd", "co", "inc", "bv", "scrl", 
    "gmbh", "pvt", "private", "limited", "incorporated", 
    "company", "sa", "ltda", "llc", "corp",
]

In [9]:
import pycountry

legal_re = multi_str_replace(legal_entities)
countries = [normalize(country.name.lower()) for country in pycountry.countries]
countries_re = multi_str_replace(countries)
org_re = multi_str_replace(organization_types)
business_re = multi_str_replace(business_types)

\b(?:ltd|co|inc|bv|scrl|gmbh|pvt|private|limited|incorporated|company|sa|ltda|llc|corp)(?!\S)
\b(?:aruba|afghanistan|angola|anguilla|aland\ islands|albania|andorra|united\ arab\ emirates|argentina|armenia|american\ samoa|antarctica|french\ southern\ territories|antigua\ and\ barbuda|australia|austria|azerbaijan|burundi|belgium|benin|bonaire\ sint\ eustatius\ and\ saba|burkina\ faso|bangladesh|bulgaria|bahrain|bahamas|bosnia\ and\ herzegovina|saint\ barthelemy|belarus|belize|bermuda|bolivia\ plurinational\ state\ of|brazil|barbados|brunei\ darussalam|bhutan|bouvet\ island|botswana|central\ african\ republic|canada|cocos\ islands|switzerland|chile|china|c\ôte\ d\ ivoire|cameroon|congo\ the\ democratic\ republic\ of\ the|congo|cook\ islands|colombia|comoros|cabo\ verde|costa\ rica|cuba|curachao|christmas\ island|cayman\ islands|cyprus|czechia|germany|djibouti|dominica|denmark|dominican\ republic|algeria|ecuador|egypt|eritrea|western\ sahara|spain|estonia|ethiopia|finland|fiji|falkland\ is

In [10]:
%%time

for dataset in (train, test):
    dataset[['name_1_n', 'name_2_n']] = dataset[['name_1', 'name_2']].apply(np.vectorize(normalize))

CPU times: user 12.5 s, sys: 599 ms, total: 13.1 s
Wall time: 13.1 s


In [11]:
def get_tokens(s):
    return s.split(' ')


def word_intersection(values):
    a, b = values
    a, b = [get_tokens(s) for s in (a, b)]
    
    return ' '.join([w for w in a if w in set(a).intersection(b)])


def word_difference(values):
    a, b = values
    a, b = [get_tokens(s) for s in (a, b)]
    
    return ' '.join([w for w in a + b if w in set(a).symmetric_difference(b)])

In [12]:
%%time

for dataset in (train, test):
    dataset['intersection'] = dataset[['name_1_n', 'name_2_n']].apply(word_intersection, axis=1)
    dataset['word_difference'] = dataset[['name_1_n', 'name_2_n']].apply(word_difference, axis=1)

CPU times: user 17.1 s, sys: 364 ms, total: 17.4 s
Wall time: 17.5 s


In [13]:
%%time

for dataset in (train, test):
    texts = dataset[['intersection', 'word_difference']]
    texts = texts.replace(legal_re, "LEGAL", inplace=False)
    texts = texts.replace(countries_re, "COUNTRY", inplace=False)
    texts = texts.replace(org_re, "ORG", inplace=False)
    dataset[['intersection', 'word_difference']] = texts

CPU times: user 11.7 s, sys: 93.2 ms, total: 11.8 s
Wall time: 11.8 s


In [14]:
from tqdm.notebook import tqdm

pd.concat([
    train.sample(10),
    train[train.is_duplicate == 1].sample(10),
], axis=0, ignore_index=True)[['is_duplicate', 'name_1_n', 'name_2_n', 'intersection', 'word_difference']]

Unnamed: 0,is_duplicate,name_1_n,name_2_n,intersection,word_difference
0,0,quimiplast ingenieria ltda,mir international co ltd,,quimiplast ingenieria LEGAL mir ORG LEGAL LEGAL
1,0,kerry logistics mexico sa de cv,bmc logistic sas,,kerry logistics COUNTRY LEGAL de cv bmc logist...
2,0,briggs industrial footwear ltd,mti llc,,briggs ORG footwear LEGAL mti LEGAL
3,0,trelleborg industrial products india private ltd,dsa,,trelleborg ORG products COUNTRY LEGAL LEGAL dsa
4,0,nakagawa kaname trading co ltd,ag trading co,trading LEGAL,nakagawa kaname LEGAL ag
5,0,extreme importers services ltd,trex co inc,,extreme importers services LEGAL trex LEGAL LEGAL
6,0,cbc trading co ltd,ralhum trading co pvt ltd,trading LEGAL LEGAL,cbc ralhum LEGAL
7,0,shenzhen huadingsheng imp exp trade co ltd,mmr imp exp llp,imp exp,shenzhen huadingsheng trade LEGAL LEGAL mmr llp
8,0,mathiesen colombia sas,quad graphics colombia sa,COUNTRY,mathiesen sas quad graphics LEGAL
9,0,nri inc,steve madden south africa ltd,,nri LEGAL steve madden COUNTRY LEGAL


In [15]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse 

model = LinearSVC(C=1)

# Using only train because of rules limitation to exploit test part
vectorizer = CountVectorizer(lowercase=False, ngram_range=(1, 1))
vectorizer.fit(train['intersection'] + ' ' + train['word_difference'])


def build_features(dataset):
    X = sparse.hstack([
        vectorizer.transform(dataset['intersection']),
        vectorizer.transform(dataset['word_difference']),
    ])
    return X


def build_features_same_space(dataset):
    X = (
        vectorizer.transform(dataset['intersection'])
        - vectorizer.transform(dataset['word_difference'])
    )
    
    return X

In [16]:
%%time

X_train, X_test = [
    build_features_same_space(dataset)
    for dataset in (train, test)
]

CPU times: user 6.19 s, sys: 72.8 ms, total: 6.26 s
Wall time: 6.27 s


In [17]:
X_train.shape, X_test.shape

((497819, 15968), (213249, 15968))

In [18]:
y_train = train['is_duplicate'].values
model.fit(X_train, y_train)

LinearSVC(C=1)

In [19]:
p_train = model.decision_function(X_train)

In [20]:
train['p'] = p_train
threshold = 0.95 * train.is_duplicate.mean()
train[(train['p'].rank(pct=True) >=  (1 - threshold)) & (train['is_duplicate'] == 0)]

Unnamed: 0_level_0,name_1,name_2,is_duplicate,name_1_n,name_2_n,intersection,word_difference,p
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
104190,yokohama re mfg.,"the yokohama rubber co., ltd.",0,yokohama re mfg,the yokohama rubber co ltd,yokohama,re mfg the rubber LEGAL LEGAL,0.761664
264480,"total oil india private limited, total",total polska sp. z o.o.,0,total oil india private limited total,total polska sp zoo,total total,oil COUNTRY LEGAL LEGAL polska sp zoo,0.821822
345098,yokohama tyre,"the yokohama rubber co., ltd.",0,yokohama tyre,the yokohama rubber co ltd,yokohama,tyre the rubber LEGAL LEGAL,0.851947


In [21]:
p_test = model.decision_function(X_test)

In [22]:
sample_sub = pd.read_csv(
    DATA_DIR.joinpath("sample_submission.csv"), 
    index_col="pair_id"
)

In [23]:
rank_threshold = 0.9 * train['is_duplicate'].mean()

test['is_duplicate'] = p_test
test['is_duplicate'] = test['is_duplicate'].rank(pct=True)

test['is_duplicate'] = (test.is_duplicate >= 1 - rank_threshold).astype(int)

In [24]:
test['p'] = p_test

In [25]:
test[test['is_duplicate'] == 1].sample(5)

Unnamed: 0_level_0,name_1,name_2,name_1_n,name_2_n,intersection,word_difference,is_duplicate,p
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
186272,"sojitz (thailand) co., ltd.",sojitz (shanghai),sojitz co ltd,sojitz,sojitz,LEGAL LEGAL,1,-0.713619
111762,shaoguang debao electronic science & technolog...,"shaoguang kangrui science & technology co., ltd.",shaoguang debao electronic science technology ...,shaoguang kangrui science technology co ltd,shaoguang science technology LEGAL LEGAL,debao electronic kangrui,1,-0.470339
139255,rpi de mexicos de rl de cv,cuno mexicos de rl de cv,rpi de mexicos de rl de cv,cuno mexicos de rl de cv,de mexicos de rl de cv,rpi cuno,1,-0.851562
103299,"ооо""пауль хартманн сибирь""","ооо""пауль хартманн сибирь""",ooo paul hartmann sibir,ooo paul hartmann sibir,ooo paul hartmann sibir,,1,-0.772879
154011,общество с ограниченной ответственностью asia ...,мишлен,obschestvo s ogranichennoi otvetstvennostyu as...,mishlen,,obschestvo s ogranichennoi otvetstvennostyu as...,1,-0.819801


In [26]:
test[test['is_duplicate'] == 0].sample(5)

Unnamed: 0_level_0,name_1,name_2,name_1_n,name_2_n,intersection,word_difference,is_duplicate,p
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
138067,c.o.i.m. s.p.a. chimica organica industriale m...,parker kawakami,coimspa chimica organica industriale milanese,parker kawakami,,coimspa chimica organica industriale milanese ...,0,-2.305935
108075,daksma s.a. de c.v.,danosa caribbean in,daksma sa de cv,danosa caribbean in,,daksma LEGAL de cv danosa caribbean in,0,-2.965076
94578,"joe fu qingyuan plastic co., ltd.",asimco international inc.,joe fu qingyuan plastic co ltd,asimco international inc,,joe fu qingyuan plastic LEGAL LEGAL asimco ORG...,0,-1.30677
150078,rankin usa inc.,adk international,rankin usa inc,adk international,,rankin usa LEGAL adk ORG,0,-1.107713
102499,mafer ind e com de artefatos de borracha ltda,coachap,mafer ind e com de artefatos de borracha ltda,coachap,,mafer ind e com de artefatos de borracha LEGAL...,0,-1.728003


In [27]:
test_result = test[['is_duplicate', 'p']]

In [28]:
del sample_sub['is_duplicate']

In [29]:
sample_sub = sample_sub.merge(test_result, left_index=True, right_index=True, how='left')

In [30]:
sample_sub.is_duplicate.mean()

0.0066166781555833795

In [31]:
train.is_duplicate.mean()

0.007348052203712594

In [32]:
RESULT_DIR = pathlib.Path("submissions")

In [33]:
sample_sub[['is_duplicate']].to_csv(
    RESULT_DIR.joinpath(f"model_{SUBMISSION_N}_submission.csv.gz"), 
    compression='infer'
)

In [34]:
sample_sub[['p']].to_csv(
    RESULT_DIR.joinpath(f"model_{SUBMISSION_N}_submission_raw.csv.gz"), 
    compression='infer'
)

In [35]:
reverse = [word for _, word in sorted([(v, k) for k, v in vectorizer.vocabulary_.items()])]

In [36]:
from itertools import chain
from collections import Counter

characters = Counter(chain(*train['name_1_n'].values, *train['name_2_n'].values)).most_common()

In [37]:
sorted(
    zip(list(model.coef_[0]), [f'i_{w}' for w in reverse] + [f'd_{w}' for w in reverse]), 
    key=lambda x: -abs(x[0])
)[:300]

[(2.822308879731156, 'i_api'),
 (-2.4835919392440657, 'i_zs'),
 (-2.4495503735543855, 'i_aiko'),
 (2.3641088958290997, 'i_lotte'),
 (2.304404333088943, 'i_fenner'),
 (2.128929013914652, 'i_bridgestone'),
 (2.091339389039545, 'i_polimarket'),
 (-2.0840006588401154, 'i_zika'),
 (2.056409748681573, 'i_wuxi'),
 (2.0236123410189375, 'i_basf'),
 (2.0234802507628444, 'i_artsana'),
 (2.0234802069031135, 'i_guchchi'),
 (1.9637436946180602, 'i_brenntag'),
 (1.9591944805338197, 'i_soprema'),
 (1.9570739732137001, 'i_reliance'),
 (1.9569161265962465, 'i_giti'),
 (1.9554553383696118, 'i_exxonmobil'),
 (1.9522067976545725, 'i_contitech'),
 (1.9492524715065533, 'i_ravago'),
 (1.9344833697297166, 'i_daewoo'),
 (1.9279201713286938, 'i_pirelli'),
 (1.923991305092052, 'i_zeon'),
 (1.9170553326237114, 'i_sika'),
 (1.9150115000099555, 'i_arlanxeo'),
 (-1.8890711835856755, 'i_poliglas'),
 (1.8879713404335465, 'i_synthomer'),
 (1.8721299793895938, 'i_ineos'),
 (1.8709025535187613, 'i_kenda'),
 (1.86345329922

In [38]:
[c for c in characters if c[0].lower() not in transliteration ]

[('双', 130),
 ('日', 130),
 ('株', 130),
 ('式', 130),
 ('会', 130),
 ('社', 130),
 ('س', 121),
 ('ر', 121),
 ('ف', 121),
 ('ي', 121),
 ('ك', 121),
 ('و', 121),
 ('م', 121),
 ('上', 96),
 ('海', 96),
 ('北', 1),
 ('京', 1),
 ('东', 1),
 ('方', 1),
 ('雨', 1),
 ('虹', 1),
 ('防', 1),
 ('水', 1),
 ('技', 1),
 ('术', 1),
 ('股', 1),
 ('份', 1),
 ('有', 1),
 ('限', 1),
 ('公', 1),
 ('司', 1)]

In [39]:
Counter(chain(*dataset['intersection'].apply(get_tokens).values)).most_common(300)

[('', 129841),
 ('LEGAL', 42473),
 ('industries', 8875),
 ('trading', 8871),
 ('logistics', 8023),
 ('de', 7000),
 ('COUNTRY', 6854),
 ('ORG', 6633),
 ('rubber', 4527),
 ('cv', 3004),
 ('imp', 2892),
 ('exp', 2850),
 ('products', 1965),
 ('sports', 1018),
 ('chemicals', 865),
 ('america', 774),
 ('chemical', 763),
 ('polymers', 756),
 ('technology', 624),
 ('textile', 584),
 ('shanghai', 578),
 ('plastic', 456),
 ('trade', 436),
 ('services', 430),
 ('plastics', 427),
 ('comercio', 395),
 ('dongguan', 395),
 ('usa', 365),
 ('rl', 357),
 ('technologies', 356),
 ('s', 351),
 ('general', 343),
 ('spinning', 320),
 ('engineering', 319),
 ('industria', 315),
 ('brasil', 291),
 ('qingdao', 275),
 ('polychem', 270),
 ('bank', 260),
 ('solutions', 258),
 ('shipping', 257),
 ('shenzhen', 238),
 ('freight', 232),
 ('e', 225),
 ('sociedad', 200),
 ('automotive', 199),
 ('materials', 192),
 ('anonima', 185),
 ('the', 180),
 ('do', 176),
 ('mills', 175),
 ('electronics', 165),
 ('goodyear', 160),
 