### Preliminaries

In [12]:
import re

import pandas as pd

from stringcluster import StringCluster

In [34]:
# Read the company-name table from disk, then preview its first ten rows.
data = pd.read_csv(filepath_or_buffer='../data/companies.csv')
data.head(n=10)

Unnamed: 0,company
0,MICROSOFT CORP
1,APPLE INC
2,FACEBOOK INC
3,ISHARES TR
4,ORACLE CORP
5,ALPHABET INC - A
6,JOHNSON & JOHNSON
7,WESTERN DIGITAL CORP
8,AMAZON.COM INC
9,VISA INC


### Data

In [35]:
# Pull out every company name that contains the substring 'FACEBOOK';
# these rows are the running example for the rest of the notebook.
companies = data['company']
mask = companies.str.contains('FACEBOOK')
facebook = companies.loc[mask]
facebook

2                                           FACEBOOK INC
408     FACEBOOK INC            CLASS                  A
474                                    FACEBOOK INC CL A
998                                           FACEBOOK-A
1042                                FACEBOOK INC CLASS A
1101                                      FACEBOOK INC A
1448                                      FACEBOOK INC-A
3020                                FACEBOOK INC COM NPV
3626                                     FACEBOOK INC -A
3638                                            FACEBOOK
4340                                      FACEBOOK, INC.
Name: company, dtype: object

### Deduplicating

In [36]:
# Pattern handed to StringCluster as `stop_tokens`: runs of non-word characters
# or underscores, plus common corporate suffixes anchored at the end of the name.
# The pattern is lowercase while the raw names are uppercase -- presumably
# StringCluster normalises case before applying it; TODO confirm.
# The dots in `corp\.` and `inc\.` are escaped so that they match a literal
# period. Unescaped, `inc.$` matches "inc" followed by ANY character, so it
# would also match the tail of a name ending in e.g. "...ince".
STOP_TOKENS = r'[\W_]+|(corporation$)|(corp\.$)|(corp$)|(incorporated$)|(inc\.$)|(inc$)|(company$)|(common$)|(com$)'

# Cluster the company names using 2-grams and a threshold of 0.7;
# `labels` holds the value returned by fit_transform for the 'company' column.
cluster = StringCluster(ngram_size=2, threshold=0.7, stop_tokens=STOP_TOKENS)
labels = cluster.fit_transform(data['company'])

In [37]:
# Show the entries of `labels` at the index values of the FACEBOOK rows selected above.
labels[facebook.index]

2       FACEBOOK INC
408     FACEBOOK INC
474     FACEBOOK INC
998     FACEBOOK INC
1042    FACEBOOK INC
1101    FACEBOOK INC
1448    FACEBOOK INC
3020    FACEBOOK INC
3626    FACEBOOK INC
3638    FACEBOOK INC
4340    FACEBOOK INC
Name: company, dtype: object

### Trialing Different Threshold Values

In [41]:
# Re-run the clustering for thresholds 0.70, 0.75, ..., 0.95 and print the
# labels given to the FACEBOOK rows at each setting.
# Each threshold is derived from an integer percentage instead of repeatedly
# adding 0.05 to a float: accumulated float addition drifts
# (0.8 + 0.05 == 0.8500000000000001), which garbles the printed threshold and
# makes the stopping point depend on rounding error.
for percent in range(70, 100, 5):
    thresh = percent / 100
    cluster = StringCluster(ngram_size=2, threshold=thresh, stop_tokens=STOP_TOKENS)
    labels = cluster.fit_transform(data['company'])
    print(f'Threshold: {thresh}')
    print('----------------------------------------')
    print(labels[facebook.index])
    print('========================================')

Threshold: 0.7
----------------------------------------
2       FACEBOOK INC
408     FACEBOOK INC
474     FACEBOOK INC
998     FACEBOOK INC
1042    FACEBOOK INC
1101    FACEBOOK INC
1448    FACEBOOK INC
3020    FACEBOOK INC
3626    FACEBOOK INC
3638    FACEBOOK INC
4340    FACEBOOK INC
Name: company, dtype: object
Threshold: 0.75
----------------------------------------
2               FACEBOOK INC
408             FACEBOOK INC
474             FACEBOOK INC
998             FACEBOOK INC
1042            FACEBOOK INC
1101            FACEBOOK INC
1448            FACEBOOK INC
3020    FACEBOOK INC COM NPV
3626            FACEBOOK INC
3638            FACEBOOK INC
4340            FACEBOOK INC
Name: company, dtype: object
Threshold: 0.8
----------------------------------------
2                                           FACEBOOK INC
408     FACEBOOK INC            CLASS                  A
474                                         FACEBOOK INC
998                                         FACEBOOK