In [1]:
import numpy as np
import pandas as pd


## Load the companies and business patterns datasets

In [4]:
# Read every column as text, blank out missing values, record the length of each
# description, keep only companies that actually have a description, and finally
# lower-case the column names.
corps = (
    pd.read_csv("test-task-companies.csv", encoding="ISO-8859-1", dtype=str)
    .fillna('')
    .assign(len=lambda frame: frame['Description'].apply(len))
    .loc[lambda frame: frame['len'] > 0]
    .rename(str.lower, axis='columns')
)
corps.head()

Unnamed: 0,company name,description,patterns,len
1,Skin Pixel,Skin Pixel uses novel gene therapy to create a...,,117
2,REView Analytics,REview Analytics empowers real estate investor...,,364
4,Kawsay,Kawsay is the platform that connects communiti...,,616
6,Dormio,Dormio is a platform to interface with your dr...,,483
7,Alight,Alight is an app that provides bus riders with...,,295


In [6]:
# Read every column as text, blank out missing values, record the length of each
# description and keep only patterns whose description is longer than 2 characters.
ptt = (
    pd.read_csv("test-task-patterns.csv", encoding="ISO-8859-1", dtype=str)
    .fillna('')
    .assign(len=lambda frame: frame['description'].apply(len))
    .loc[lambda frame: frame['len'] > 2]
)
ptt.head()

Unnamed: 0,#,pattern,description,len
0,1,Add-on,Offer a basic product at a competitive price a...,74
1,2,Advertising model,Provide a product or service and mix it with a...,65
2,3,Advisors,Provide consulting and advice,29
3,4,Affiliation,Refer customers to a third party and receive a...,139
4,5,Affinity clubs,Partner with membership associations and other...,108


## Display some arbitrary descriptions

In [11]:
# `.loc` selects by index label, not position: rows with an empty description
# were dropped when `corps` was loaded, so the labels are not contiguous.
corps.loc[2,'description']

"REview Analytics empowers real estate investors to make better data-driven decisions. By anonymizing, aggregating, analyzing, and visualizing the projections of actual market participants, REview Analytics provides a new kind of real estate data: market intelligence that provides insight about the future, rather than merely reporting what's happened in the past."

In [12]:
# `.loc` selects by index label, not position; label 1 is the
# "Advertising model" pattern in the preview above.
ptt.loc[1,'description']

'Provide a product or service and mix it with advertising messages'

## Load language models

In [22]:
from time import time
# Timestamp taken when this cell starts.
# NOTE(review): `start_nb` is not read anywhere in the visible cells.
start_nb = time()

import spacy
import gensim
# NOTE(review): `Word2Vec` is not referenced in the visible cells; only
# `gensim.models.KeyedVectors` is used below.
from gensim.models import Word2Vec

start = time()
# Load the large English spaCy pipeline; `nlp` is the global used by the
# tokenizer helpers defined further down.
nlp = spacy.load('en_core_web_lg')

# Load the Google News word2vec vectors (binary format) as gensim KeyedVectors;
# `model.wmdistance` is what later computes the document distances.
# NOTE(review): the path is relative and points outside this project directory,
# so it only resolves when that sibling folder exists.
model = gensim.models.KeyedVectors.load_word2vec_format('../DeDub/models/GoogleNews-vectors-negative300.bin.gz', binary=True)

# Report how long loading both models took (about 148 s in the recorded run).
print('Cell took {:.2f} seconds to run.'.format(time() - start))

Cell took 148.52 seconds to run.


## Helper functions: tokenization and Word Mover's Distance between companies and patterns

In [27]:
def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time.

    Parameters
    ----------
    s1, s2 : sequences supporting ``len`` and iteration (typically ``str``)

    Returns
    -------
    int
        The edit distance; symmetric in its arguments.
    """
    # The longer sequence drives the outer loop, so each row has
    # len(shorter) + 1 cells.
    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)

    if len(shorter) == 0:
        # Everything in `longer` has to be inserted.
        return len(longer)

    prev = range(len(shorter) + 1)
    for row_no, item_long in enumerate(longer, start=1):
        curr = [row_no]
        for col, item_short in enumerate(shorter):
            curr.append(min(
                prev[col + 1] + 1,                   # insertion
                curr[col] + 1,                       # deletion
                prev[col] + (item_long != item_short),  # substitution (free on match)
            ))
        prev = curr

    return prev[-1]


# Tokenizer that uses the spaCy pipeline (`nlp`) under the hood.
# Stop words are intentionally kept: per the original author's note, removing
# them is unnecessary with word embeddings and may hurt performance.
def spacy_tokenize(text):
    """Return the lower-cased text of every spaCy token in `text`, in order."""
    return [token.text.lower() for token in nlp(text)]

# Company description tokenizer.
# A description often mentions the company's own name, which probably does not
# exist in the embedding model, so (fuzzy) occurrences of the name are replaced
# with the neutral word "it".
def company_tokenize(corp_name, text):
    """Tokenize a company description, masking mentions of the company name.

    Two passes are made:

    1. Every token whose Levenshtein distance to the lower-cased company name
       is at most 20% of the name's length is replaced by ``"it"``.
    2. The resulting text is parsed again and every noun chunk within the same
       distance of the name (this catches multi-word names) is replaced by
       ``"it"`` wherever it occurs as a whole whitespace-delimited phrase.

    Parameters
    ----------
    corp_name : str
        Company name as it appears in the dataset.
    text : str
        Company description.

    Returns
    -------
    list of str
        Lower-cased tokens with company-name mentions replaced by ``"it"``.
    """
    import re  # local import so the function does not rely on a later cell

    # Both values are loop-invariant, so compute them once.
    name = corp_name.lower()
    max_distance = (2 * len(corp_name)) / 10

    # Pass 1: single tokens that are (almost) the company name.
    tokens = [
        "it" if levenshtein(token, name) <= max_distance else token
        for token in spacy_tokenize(text)
    ]
    text2 = " ".join(tokens)

    # Pass 2: noun chunks that are (almost) the company name.
    matches = [
        chunk.text
        for chunk in nlp(text2).noun_chunks
        if levenshtein(chunk.text, name) <= max_distance
    ]

    for phrase in matches:
        # Replace only whole, whitespace-delimited occurrences. A plain
        # str.replace would also rewrite the phrase inside longer words
        # (e.g. "pay" inside "payment").
        whole_phrase = r'(?<!\S)' + re.escape(phrase) + r'(?!\S)'
        text2 = re.sub(whole_phrase, "it", text2)

    return text2.split()


def cross_distance(corps, corp_list, ptt):
    """Compute the Word Mover's Distance between companies and business patterns.

    Parameters
    ----------
    corps : pandas.DataFrame
        Companies; must have 'company name' and 'description' columns.
    corp_list : iterable
        Index labels of the rows of `corps` to process.
    ptt : pandas.DataFrame
        Business patterns; must have a 'description' column.

    Returns
    -------
    pandas.DataFrame
        One row per (company, pattern) pair with columns 'corp_ix' (index
        label in `corps`), 'ptt_ix' (index label in `ptt`) and 'distance'
        (result of ``model.wmdistance``).
    """
    # Pattern tokens do not depend on the company, so tokenize each pattern
    # once instead of once per company. Iterating over the Series keeps the
    # real index labels of `ptt`, which may have gaps after row filtering
    # (positions from range(len(ptt)) would then point at the wrong rows or
    # raise KeyError).
    patterns = [
        (ptt_ix, spacy_tokenize(description))
        for ptt_ix, description in ptt['description'].items()
    ]

    records = []
    for corp in corp_list:
        corp_name = corps.loc[corp, 'company name']
        text = corps.loc[corp, 'description']
        # The tokenizer defined in this notebook is `company_tokenize`.
        corp_desc = company_tokenize(corp_name, text)

        for ptt_ix, pattern in patterns:
            records.append((corp, ptt_ix, model.wmdistance(corp_desc, pattern)))

    # Build the frame once rather than growing it row by row. object dtype is
    # kept so downstream cells see the same kind of values as when the frame
    # was filled cell by cell from an empty DataFrame.
    return pd.DataFrame(records, columns=['corp_ix', 'ptt_ix', 'distance'], dtype=object)


In [39]:
# Index labels (in `corps`) of the small sample of companies to process.
corp_list = [1,2,4,6,7,9,11]

# Uncomment the line below to compute WMD distances for the whole company dataset
# (may take several hours, depending on the machine)
# corp_list = corps.index.tolist()

# One row per (company, pattern) pair: corp_ix, ptt_ix, distance.
corp_distances = cross_distance(corps, corp_list, ptt)
corp_distances.head()

Unnamed: 0,corp_ix,ptt_ix,distance
0,1,0,3.5302
1,1,1,3.27175
2,1,2,3.81055
3,1,3,3.64481
4,1,4,3.56378


## Quick view on the distances

In [35]:
import re

def convert_to_str(res):
    """Format each row of `res` as ``"<first>(<second, 4 decimals>)"``.

    `res` must expose `.values` yielding rows whose first two items are the
    pattern index and the distance. Returns a list of strings.
    """
    return ["{}({:.4f})".format(row[0], row[1]) for row in res.values]

# For every processed company, print its seven closest patterns as
# "pattern_index(distance)", nearest first.
for corp in corp_list:
    is_corp = corp_distances['corp_ix'] == corp
    res = (
        corp_distances.loc[is_corp, ['ptt_ix', 'distance']]
        .sort_values('distance')
        .head(7)
    )
    print("Corp: {} {}".format(corp, convert_to_str(res)))

Corp: 1 ['323(3.0026)', '50(3.1089)', '116(3.1360)', '347(3.1365)', '187(3.1447)', '251(3.1507)', '333(3.1579)']
Corp: 2 ['305(2.3515)', '210(2.3642)', '102(2.4573)', '348(2.4621)', '265(2.4640)', '86(2.4678)', '213(2.4720)']
Corp: 4 ['293(2.4866)', '228(2.4929)', '205(2.5434)', '202(2.5461)', '161(2.5479)', '288(2.5530)', '210(2.5532)']
Corp: 6 ['293(2.3788)', '164(2.4729)', '348(2.4851)', '210(2.5165)', '305(2.5440)', '86(2.5446)', '202(2.5462)']
Corp: 7 ['293(2.2475)', '305(2.2724)', '164(2.3212)', '348(2.3594)', '228(2.3676)', '123(2.3687)', '323(2.3723)']
Corp: 9 ['348(2.2720)', '288(2.5127)', '228(2.5165)', '291(2.5240)', '323(2.5257)', '265(2.5360)', '188(2.5509)']
Corp: 11 ['348(2.3476)', '293(2.4084)', '188(2.4309)', '305(2.4367)', '187(2.5000)', '210(2.5101)', '291(2.5132)']


## Filter the cross-distance data frame, keeping only the business patterns with the smallest distances

In [45]:
number_to_keep = 6     # maximum number of patterns kept per company
factor = 0.12          # keep patterns whose distance is within this margin of the best one
excluded_ptt_ix = 348  # pattern index that is always skipped
                       # NOTE(review): presumably too generic (it ranks near the top
                       # for most companies in the listing above) -- confirm


def keep_nearest_patterns(distances, max_patterns, tolerance, excluded_ix):
    """Keep, for every company, only its closest business patterns.

    Rows are visited per company in order of increasing distance. The first
    non-excluded row of a company is always kept and defines the company's
    nearest distance; further rows are kept while their distance is at most
    ``nearest + tolerance`` and fewer than `max_patterns` rows have been kept.

    Parameters
    ----------
    distances : pandas.DataFrame
        Frame with 'corp_ix', 'ptt_ix' and 'distance' columns.
    max_patterns : int
        Maximum number of rows kept per company.
    tolerance : float
        Allowed distance margin above the company's nearest pattern.
    excluded_ix
        Value of 'ptt_ix' that is never kept.

    Returns
    -------
    pandas.DataFrame
        Columns 'corp_ix', 'ptt_ix', 'distance', indexed 0..n-1.
    """
    # Sort explicitly instead of grouping on (corp_ix, distance) and summing
    # ptt_ix: that aggregation added the indices of patterns that tie on
    # distance together, yielding a pattern index that does not exist.
    ordered = distances.sort_values(['corp_ix', 'distance'])

    kept = []
    last_corp = None
    nearest_distance = 0
    inside = 0
    for corp, ptt_ix, distance in zip(ordered['corp_ix'], ordered['ptt_ix'], ordered['distance']):
        if ptt_ix == excluded_ix:
            continue
        if last_corp is None or corp != last_corp:
            # First usable row of a new company: always kept.
            last_corp = corp
            nearest_distance = distance
            inside = 1
            kept.append((corp, ptt_ix, distance))
        elif distance <= nearest_distance + tolerance and inside < max_patterns:
            inside += 1
            kept.append((corp, ptt_ix, distance))

    # Built in one go; object dtype matches a frame filled cell by cell from
    # an empty DataFrame, which is how this result was produced before.
    return pd.DataFrame(kept, columns=['corp_ix', 'ptt_ix', 'distance'], dtype=object)


filtered_distances = keep_nearest_patterns(corp_distances, number_to_keep, factor, excluded_ptt_ix)

filtered_distances.head()


Unnamed: 0,corp_ix,ptt_ix,distance
0,1,323,3.00263
1,1,50,3.10893
2,2,305,2.35155
3,2,210,2.36416
4,2,102,2.45731


In [46]:
# Attach human-readable names to every kept (company, pattern) pair.
# The column key 'business patern' (sic) is kept as is because other cells read it.
filtered_distances['company name'] = [
    str(corps.loc[corp_idx, 'company name'])
    for corp_idx in filtered_distances['corp_ix']
]
filtered_distances['business patern'] = [
    str(ptt.loc[ptt_idx, 'pattern'])
    for ptt_idx in filtered_distances['ptt_ix']
]

# Preview: pattern name(s) per company, ordered by distance.
filtered_distances.groupby(['company name', 'distance']).agg({'business patern':sum}).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,business patern
company name,distance,Unnamed: 2_level_1
AdaViv,2.512701,Product-oriented Services
AdaViv,2.516467,Online waste exchange platform
AdaViv,2.523987,Result-oriented Services
AdaViv,2.525738,Video Comedy
AdaViv,2.535952,Co-Product Generation
AdaViv,2.550902,Endless aisles
Alight,2.247532,Sharing Business
Alight,2.272433,Spreading the Word Offline
Alight,2.32122,Prosumers
Dormio,2.378754,Sharing Business


## Save the company-to-business-pattern matching to a CSV file

In [47]:
def convert_to_str(res):
    """Join the items of `res.values` into one comma-separated string.

    Each item is converted with ``"{}".format`` and rendered in its Python
    ``repr`` form (i.e. quoted), exactly as it would appear inside a printed
    list without the surrounding brackets.
    """
    return ", ".join(repr("{}".format(item)) for item in res.values)

# One row per processed company: its index label, its name and the names of the
# business patterns that survived filtering.
mapping_rows = [
    (
        corp,
        corps.loc[corp, 'company name'],
        convert_to_str(
            filtered_distances.loc[filtered_distances['corp_ix'] == corp, 'business patern']
        ),
    )
    for corp in corp_list
]
corp_ptt_mappings = pd.DataFrame(
    mapping_rows, columns=['corp_ix', 'company name', 'business patterns']
)

corp_ptt_mappings.to_csv("company-pattern-matching.csv", index=False)
