## Tokenization&stopwords removal

In [202]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
STOP = set(nltk.corpus.stopwords.words("english"))

class Sentence:
    def __init__(self, sentence):
        self.raw = sentence
        normalized_sentence = sentence.replace("‘", "'").replace("’", "'").replace(",","").replace(".","").replace("'","")
        self.tokens = [t.lower() for t in nltk.word_tokenize(normalized_sentence)]
        self.tokens_without_stop = [t for t in self.tokens if t not in STOP]

[nltk_data] Error loading stopwords: <urlopen error [Errno 11004]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading punkt: <urlopen error [Errno 11004]
[nltk_data]     getaddrinfo failed>


- Example for Class Sentence

In [203]:
st = Sentence("Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaint’s own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the company’s success in building and engaging audiences. Media companies can license Wetpaint’s platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.")

In [204]:
st.tokens_without_stop

['wetpaint',
 'technology',
 'platform',
 'company',
 'uses',
 'proprietary',
 'state-of-the-art',
 'technology',
 'expertise',
 'social',
 'media',
 'build',
 'monetize',
 'audiences',
 'digital',
 'publishers',
 'wetpaints',
 'online',
 'property',
 'wetpaint',
 'entertainment',
 'entertainment',
 'news',
 'site',
 'attracts',
 '12',
 'million',
 'unique',
 'visitors',
 'monthly',
 '2',
 'million',
 'facebook',
 'fans',
 'proof',
 'point',
 'companys',
 'success',
 'building',
 'engaging',
 'audiences',
 'media',
 'companies',
 'license',
 'wetpaints',
 'platform',
 'includes',
 'dynamic',
 'playbook',
 'tailored',
 'individual',
 'needs',
 'comprehensive',
 'training',
 'founded',
 'internet',
 'pioneer',
 'ben',
 'elowitz',
 'offices',
 'new',
 'york',
 'seattle',
 'wetpaint',
 'backed',
 'accel',
 'partners',
 'investors',
 'behind',
 'facebook']

## Word2vec - WordEmbedding

In [4]:
import gensim
from gensim.models import Word2Vec
import os
PATH_TO_WORD2VEC = os.path.expanduser("data/GoogleNews-vectors-negative300.bin")
word2vec = gensim.models.KeyedVectors.load_word2vec_format(PATH_TO_WORD2VEC, binary=True)

## Glove - WordEmbedding

In [61]:
from gensim.scripts.glove2word2vec import glove2word2vec
PATH_TO_GLOVE = os.path.expanduser("data/glove.42B.300d.txt")
tmp_file = "data/glove.42B.300d.w2v.txt"
glove2word2vec(PATH_TO_GLOVE, tmp_file)
glove = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)

- add weight to word embeddings

In [7]:
import csv

PATH_TO_FREQUENCIES_FILE = "data/frequencies.tsv"
PATH_TO_DOC_FREQUENCIES_FILE = "data/doc_frequencies.tsv"

def read_tsv(f):
    frequencies = {}
    with open(f) as tsv:
        tsv_reader = csv.reader(tsv, delimiter="\t")
        for row in tsv_reader: 
            frequencies[row[0]] = int(row[1])
        
    return frequencies
        
frequencies = read_tsv(PATH_TO_FREQUENCIES_FILE)
doc_frequencies = read_tsv(PATH_TO_DOC_FREQUENCIES_FILE)
doc_frequencies["NUM_DOCS"] = 1288431

## Similarity computation

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import math
import numpy as np

def run_avg_benchmark(sentences1, sentences2, model=None, use_stoplist=False, doc_freqs=None): 

    if doc_freqs is not None:
        N = doc_freqs["NUM_DOCS"]
    
    tokens1 = sentences1.tokens_without_stop if use_stoplist else sentences1.tokens
    tokens2 = sentences2.tokens_without_stop if use_stoplist else sentences2.tokens

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]

#     if len(tokens1) == 0 or len(tokens2) == 0:
#         sims.append(0)
#         continue

    tokfreqs1 = Counter(tokens1)
    tokfreqs2 = Counter(tokens2)

    weights1 = [tokfreqs1[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                for token in tokfreqs1] if doc_freqs else None
    weights2 = [tokfreqs2[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                for token in tokfreqs2] if doc_freqs else None

    embedding1 = np.average([model[token] for token in tokfreqs1], axis=0, weights=weights1).reshape(1, -1)
    embedding2 = np.average([model[token] for token in tokfreqs2], axis=0, weights=weights2).reshape(1, -1)
    sim = cosine_similarity(embedding1, embedding2)[0][0]

    return sim

def run_wmd_benchmark(sent1, sent2, model, use_stoplist=False):

    tokens1 = sent1.tokens_without_stop if use_stoplist else sent1.tokens
    tokens2 = sent2.tokens_without_stop if use_stoplist else sent2.tokens

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]

    if len(tokens1) == 0 or len(tokens2) == 0:
        tokens1 = [token for token in sent1.tokens if token in model]
        tokens2 = [token for token in sent2.tokens if token in model]

    sim = model.wmdistance(tokens1, tokens2)
        
    return sim

from sklearn.decomposition import TruncatedSVD

def remove_first_principal_component(X):
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_
    XX = X - X.dot(pc.transpose()) * pc
    return XX


def run_sif_benchmark(sent1, sent2, model, freqs={}, use_stoplist=False, a=0.001): 
    total_freq = sum(freqs.values())
    embeddings = []
    # SIF requires us to first collect all sentence embeddings and then perform 
    # common component analysis.
    tokens1 = sent1.tokens_without_stop if use_stoplist else sent1.tokens
    tokens2 = sent2.tokens_without_stop if use_stoplist else sent2.tokens

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]

    weights1 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens1]
    weights2 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens2]

    embedding1 = np.average([model[token] for token in tokens1], axis=0, weights=weights1)
    embedding2 = np.average([model[token] for token in tokens2], axis=0, weights=weights2)

    embeddings.append(embedding1)
    embeddings.append(embedding2)
        
    embeddings = remove_first_principal_component(np.array(embeddings))
    sim = [cosine_similarity(embeddings[idx*2].reshape(1, -1), 
                              embeddings[idx*2+1].reshape(1, -1))[0][0] 
            for idx in range(int(len(embeddings)/2))][0]

    return sim

## Loading data

In [66]:
import pandas as pd
data = pd.read_pickle("baseline.pkl")

## Compute similarity

In [51]:
def return_similarity(data,company_one,company_two,text_type, webm_model,stop_flag,sim_type):
    com_des_one =  data[data["name"] == company_one]["description"].values[0]
#     com_peo_des_one =  data[data["name"] == company_one]["p_description"].values[0]
    com_sh_des_one =  data[data["name"] == company_one]["short_description"].values[0]
    com_des_two =  data[data["name"] == company_two]["description"].values[0]
#     com_peo_des_two =  data[data["name"] == company_two]["p_description"].values[0]
    com_sh_des_two =  data[data["name"] == company_two]["short_description"].values[0]
    com_des_one = com_des_one + str(com_sh_des_one)
#     print(str(com_des_two))
    com_des_two = str(com_des_two) + str(com_sh_des_two)

    if text_type == "only_des":
        if sim_type == "cos":
            similarity = run_avg_benchmark(Sentence(com_des_one),Sentence(com_des_two),model = webm_model, use_stoplist=stop_flag)
        elif sim_type == "wmd":
            similarity = run_wmd_benchmark(Sentence(com_des_one),Sentence(com_des_two),model = webm_model, use_stoplist=stop_flag)
    return similarity

In [64]:
return_similarity(data,"Airbnb","Brightcove","only_des", glove,False,sim_type = "cos")

0.9412049

## Experiment

### 1.1 verify feasibility 

In [76]:
name_ls_finance = data[data.apply(lambda x: "Cloud Computing" in x["category_list"],axis = 1)]["name"]
name_ls_travel = data[data.apply(lambda x: "Travel" in x["category_list"],axis = 1)]["uuid","name"]
name_ls_cloud = data[data.apply(lambda x: "Cloud Computing" in x["category_list"],axis = 1)]["name"]

In [75]:
data["category_list"] = data["category_list"].fillna("No category")

In [78]:
finance_companies_ls = list(name_ls_finance)
travel_companies_ls = list(name_ls_travel)
cloud_companies_ls = list(name_ls_cloud)

In [79]:
for name in finance_companies_ls[:10]:
    sim_ls = []
#     print(name)
    sim = return_similarity(data,"Airbnb",name,"only_des", word2vec,False,sim_type = "cos")
    sim_ls.append(sim)
    print(name,sim)

Zoho 0.76162076
PBworks 0.8265405
Box 0.8388816
Oracle-NetSuite 0.749099
Big Bang Ventures 0.5666262
LongJump 0.82523376
Brightcove 0.7676357
Limelight Networks 0.7802206
Nirvanix 0.78128
INgage Networks 0.7725506


In [80]:
for name in cloud_companies_ls[:10]:
    sim_ls = []
#     print(name)
    sim = return_similarity(data,"Airbnb",name,"only_des", word2vec,False,sim_type = "cos")
    sim_ls.append(sim)
    print(name,sim)

Zoho 0.76162076
PBworks 0.8265405
Box 0.8388816
Oracle-NetSuite 0.749099
Big Bang Ventures 0.5666262
LongJump 0.82523376
Brightcove 0.7676357
Limelight Networks 0.7802206
Nirvanix 0.78128
INgage Networks 0.7725506


In [82]:
for name in travel_companies_ls[:10]:
    sim_ls = []
#     print(name)
    sim = return_similarity(data,"Airbnb",name,"only_des", word2vec,False,sim_type = "cos")
    sim_ls.append(sim)
    print(name,sim)

TripUp 0.8351637
SideStep 0.8549029
Farecast 0.846826
Yapta 0.81168526
TripHub 0.824854
TVtrip 0.8520464
Hotelicopter 0.85233414
Sulake 0.87833744
TCV 0.7949606
lastminute.com group 0.8319692


In [103]:
name_ls_travel = data[data.apply(lambda x: "Travel" in x["category_list"],axis = 1)]

In [111]:
travel_companies_ls = name_ls_travel[["uuid","name"]]

## Database

In [183]:
import pymongo
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['startups']
collection = db.similarities

In [None]:
pair = {}
for travel_company in travel_companies_ls[:100]:
    sim = return_similarity(data,"Airbnb",travel_company[1],"only_des", word2vec,False,sim_type = "cos")
    pair["Airbnb"] = {}
    pair["uuid"] = travel_company[0]
    pair["name"] = 

In [169]:
company_ls = data[["uuid","name"]].values

In [175]:
company_ls = data[data["name"] == "Airbnb"][["uuid","name"]].values

In [177]:
for company_a in company_ls:
    count = 0
    for travel_company in travel_companies_ls.values:
        print("here is the count {}".format(count))
        pair = {}
        try:
            sim = return_similarity(data,company_a[1],travel_company[1],"only_des", word2vec,False,sim_type = "cos")
        except:
            pass
        pair["id_a"] = company_a[0]
        pair["id_b"] = travel_company[0]
        pair["name_a"] = company_a[1]
        pair["name_b"] = travel_company[1]
        pair["similarity"] = float(sim)
        collection.insert_one(pair)
        count+=1

here is the count 0
here is the count 1
here is the count 2
here is the count 3
here is the count 4
here is the count 5
here is the count 6
here is the count 7
here is the count 8
here is the count 9
here is the count 10
here is the count 11
here is the count 12
here is the count 13
here is the count 14
here is the count 15
here is the count 16
here is the count 17
here is the count 18
here is the count 19
here is the count 20
here is the count 21
here is the count 22
here is the count 23
here is the count 24
here is the count 25
here is the count 26
here is the count 27
here is the count 28
here is the count 29
here is the count 30
here is the count 31
here is the count 32
here is the count 33
here is the count 34
here is the count 35
here is the count 36
here is the count 37
here is the count 38
here is the count 39
here is the count 40
here is the count 41
here is the count 42
here is the count 43
here is the count 44
here is the count 45
here is the count 46
here is the count 47
he

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


here is the count 2527
here is the count 2528
here is the count 2529
here is the count 2530
here is the count 2531
here is the count 2532
here is the count 2533
here is the count 2534
here is the count 2535
here is the count 2536
here is the count 2537
here is the count 2538
here is the count 2539
here is the count 2540
here is the count 2541
here is the count 2542
here is the count 2543
here is the count 2544
here is the count 2545
here is the count 2546
here is the count 2547
here is the count 2548
here is the count 2549
here is the count 2550
here is the count 2551
here is the count 2552
here is the count 2553
here is the count 2554
here is the count 2555
here is the count 2556
here is the count 2557
here is the count 2558
here is the count 2559
here is the count 2560
here is the count 2561
here is the count 2562
here is the count 2563
here is the count 2564
here is the count 2565
here is the count 2566
here is the count 2567
here is the count 2568
here is the count 2569
here is the

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


here is the count 11298
here is the count 11299
here is the count 11300
here is the count 11301
here is the count 11302
here is the count 11303
here is the count 11304
here is the count 11305
here is the count 11306
here is the count 11307
here is the count 11308
here is the count 11309
here is the count 11310
here is the count 11311
here is the count 11312
here is the count 11313
here is the count 11314
here is the count 11315
here is the count 11316
here is the count 11317
here is the count 11318
here is the count 11319
here is the count 11320
here is the count 11321
here is the count 11322
here is the count 11323
here is the count 11324
here is the count 11325
here is the count 11326
here is the count 11327
here is the count 11328
here is the count 11329
here is the count 11330
here is the count 11331
here is the count 11332
here is the count 11333
here is the count 11334
here is the count 11335
here is the count 11336
here is the count 11337
here is the count 11338
here is the coun

In [174]:
data[data["name"] == "Airbnb"][["uuid","name"]].values

array([['bcb617c3-9e43-d5b0-1d14-82b795f2642f', 'Airbnb']], dtype=object)

In [159]:
travel_companies_ls.values[2526]

array(['20cf3f7e-c5fd-aa4b-d81d-2b0bc4f355d6', 'PixieWorks'], dtype=object)

In [184]:
results = collection.find({"name_a":"Airbnb"})
ls = list(results)

In [191]:
result_data = pd.DataFrame(sorted(ls, key=lambda x: x.get("similarity"), reverse = True))

In [190]:
new_data = data[["name","description","p_description"]]

In [194]:
new_data.columns = ["name_b","description","p_description"]

In [197]:
merge = pd.merge(result_data,new_data,how = "left",on = ["name_b"])

In [201]:
merge.head(20).to_excel("baseline.xlsx",index=False)

In [152]:
ls = list(results)

In [154]:
pd.DataFrame(ls)

Unnamed: 0,_id,id_a,id_b,name_a,name_b,similarity
0,60f44c1d55de66abdf6659c4,e1393508-30ea-8a36-3f96-dd3226033abd,2471be1a-bd57-ef58-22b8-e7dc7266039e,Wetpaint,TripUp,0.77527
1,60f44c1d55de66abdf6659c5,e1393508-30ea-8a36-3f96-dd3226033abd,b62feb13-991c-2ae7-8faa-0b96d9a69ec6,Wetpaint,SideStep,0.876184
2,60f44c1d55de66abdf6659c6,e1393508-30ea-8a36-3f96-dd3226033abd,5c6f4d9e-dc72-e018-ee21-71cd23f698ad,Wetpaint,Farecast,0.841621
3,60f44c1d55de66abdf6659c7,e1393508-30ea-8a36-3f96-dd3226033abd,b5f351ca-6fc1-3661-e3fb-a498ce4b1a47,Wetpaint,Yapta,0.743773
4,60f44c1e55de66abdf6659c8,e1393508-30ea-8a36-3f96-dd3226033abd,c37e4ed6-f214-08ab-338a-33f329cffe3f,Wetpaint,TripHub,0.847277
5,60f44c1e55de66abdf6659c9,e1393508-30ea-8a36-3f96-dd3226033abd,a0f5e737-d973-703c-7f66-0fc31d0e4b5b,Wetpaint,TVtrip,0.840842
6,60f44c1e55de66abdf6659ca,e1393508-30ea-8a36-3f96-dd3226033abd,31f9d866-3660-5367-8c52-028a7610e441,Wetpaint,Hotelicopter,0.866531
7,60f44c1e55de66abdf6659cb,e1393508-30ea-8a36-3f96-dd3226033abd,9149ba81-8e61-1ffb-1ef8-66573435e720,Wetpaint,Sulake,0.878306
8,60f44c1e55de66abdf6659cc,e1393508-30ea-8a36-3f96-dd3226033abd,b915e540-3377-6a2a-651e-6fd7c0787e26,Wetpaint,TCV,0.829159
9,60f44c1e55de66abdf6659cd,e1393508-30ea-8a36-3f96-dd3226033abd,d2712549-89f1-0f55-4ac9-c220f4772ff8,Wetpaint,lastminute.com group,0.811847
