## Data preprocessing

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
STOP = set(nltk.corpus.stopwords.words("english"))

class Sentence:
    def __init__(self, sentence):
        self.raw = sentence
        normalized_sentence = sentence.replace("‘", "'").replace("’", "'").replace(",","").replace(".","").replace("'","")
        self.tokens = [t.lower() for t in nltk.word_tokenize(normalized_sentence)]
        self.tokens_without_stop = [t for t in self.tokens if t not in STOP]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\70473\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\70473\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading Word2vec

In [2]:
import gensim
from gensim.models import Word2Vec
import os
PATH_TO_WORD2VEC = os.path.expanduser("data/GoogleNews-vectors-negative300.bin")
word2vec = gensim.models.KeyedVectors.load_word2vec_format(PATH_TO_WORD2VEC, binary=True)

## Word Freqency(word weight)

In [3]:
import csv

PATH_TO_FREQUENCIES_FILE = "data/frequencies.tsv"
PATH_TO_DOC_FREQUENCIES_FILE = "data/doc_frequencies.tsv"

def read_tsv(f):
    frequencies = {}
    with open(f) as tsv:
        tsv_reader = csv.reader(tsv, delimiter="\t")
        for row in tsv_reader: 
            frequencies[row[0]] = int(row[1])
        
    return frequencies
        
frequencies = read_tsv(PATH_TO_FREQUENCIES_FILE)
doc_frequencies = read_tsv(PATH_TO_DOC_FREQUENCIES_FILE)
doc_frequencies["NUM_DOCS"] = 1288431

## Similarity measurement-weighted cosine similarity

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import math
import numpy as np

def run_avg_benchmark(sentences1, sentences2, model=None, use_stoplist=False, doc_freqs=None): 

    if doc_freqs is not None:
        N = doc_freqs["NUM_DOCS"]
    
    tokens1 = sentences1.tokens_without_stop if use_stoplist else sentences1.tokens
    tokens2 = sentences2.tokens_without_stop if use_stoplist else sentences2.tokens

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]

#     if len(tokens1) == 0 or len(tokens2) == 0:
#         sims.append(0)
#         continue

    tokfreqs1 = Counter(tokens1)
    tokfreqs2 = Counter(tokens2)

    weights1 = [tokfreqs1[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                for token in tokfreqs1] if doc_freqs else None
    weights2 = [tokfreqs2[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                for token in tokfreqs2] if doc_freqs else None

    embedding1 = np.average([model[token] for token in tokfreqs1], axis=0, weights=weights1).reshape(1, -1)
    embedding2 = np.average([model[token] for token in tokfreqs2], axis=0, weights=weights2).reshape(1, -1)
    sim = cosine_similarity(embedding1, embedding2)[0][0]

    return sim

def run_wmd_benchmark(sent1, sent2, model, use_stoplist=False):

    tokens1 = sent1.tokens_without_stop if use_stoplist else sent1.tokens
    tokens2 = sent2.tokens_without_stop if use_stoplist else sent2.tokens

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]

    if len(tokens1) == 0 or len(tokens2) == 0:
        tokens1 = [token for token in sent1.tokens if token in model]
        tokens2 = [token for token in sent2.tokens if token in model]

    sim = model.wmdistance(tokens1, tokens2)
        
    return sim

from sklearn.decomposition import TruncatedSVD

def remove_first_principal_component(X):
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_
    XX = X - X.dot(pc.transpose()) * pc
    return XX


def run_sif_benchmark(sent1, sent2, model, freqs={}, use_stoplist=False, a=0.001): 
    total_freq = sum(freqs.values())
    embeddings = []
    # SIF requires us to first collect all sentence embeddings and then perform 
    # common component analysis.
    tokens1 = sent1.tokens_without_stop if use_stoplist else sent1.tokens
    tokens2 = sent2.tokens_without_stop if use_stoplist else sent2.tokens

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]

    weights1 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens1]
    weights2 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens2]

    embedding1 = np.average([model[token] for token in tokens1], axis=0, weights=weights1)
    embedding2 = np.average([model[token] for token in tokens2], axis=0, weights=weights2)

    embeddings.append(embedding1)
    embeddings.append(embedding2)
        
    embeddings = remove_first_principal_component(np.array(embeddings))
    sim = [cosine_similarity(embeddings[idx*2].reshape(1, -1), 
                              embeddings[idx*2+1].reshape(1, -1))[0][0] 
            for idx in range(int(len(embeddings)/2))][0]

    return sim

## Load data

In [5]:
import pandas as pd
data = pd.read_pickle("baseline.pkl")

## Load Database

In [6]:
import sys
import pandas as pd
import json
from pymongo import MongoClient
class my_mongodb:
    import pandas as pd
    def __init__(self, hostname='localhost', db_port=27017):

        self.hostname = hostname
        self.db_port = db_port
        self.my_mongo_client = MongoClient(self.hostname, self.db_port)
        
    def df2mongo(self, df_data, db_name, form_name):


        def df2bson(df):

            data = json.loads(df.T.to_json()).values()
            return data

        my_db = self.my_mongo_client[db_name]
        bson_data = df2bson(df_data)
        my_posts = my_db[form_name]
        result = my_posts.insert_many(bson_data)
        return result

    def collection2df(self, db_name, collection_name, query={}, no_id=True):

        """查询数据库，导出DataFrame类型数据
        （db_name：数据库名 collection_name：集合名 
         query：查询条件式 no_id：不显示ID,默认为不显示ID）"""

        db = self.my_mongo_client[db_name]
        collection = self.my_mongo_client[db_name][collection_name]
        cursor = collection.find(query)
        df = pd.DataFrame(list(cursor))
        if no_id:
            del df['_id']
        return df

In [7]:
data.columns

Index(['uuid', 'name', 'rank', 'roles', 'status', 'short_description',
       'category_list', 'category_groups_list', 'num_funding_rounds',
       'employee_count', 'founded_on', 'description', 'p_uuid', 'p_name',
       'gender', 'featured_job_title', 'p_description', 'd_uuid', 'd_name',
       'ins_uuid', 'ins_name', 'degree_type', 'subject', 'category_coding',
       'acquired', 'closed', 'ipo', 'operating'],
      dtype='object')

## Sentence Embedding Data

In [8]:
def basic_data(data,company):
    data["p_description"].fillna("No people description",inplace=True)
    data["short_description"].fillna("No short description",inplace=True)
    com_des =  data[data["name"] == company]["description"].values[0]
    com_sh_des =  data[data["name"] == company]["short_description"].values[0]
    com_p_des = data[data["name"] == company]["p_description"].values[0]
    com_des = com_des + com_sh_des
    com_all_des = com_des + com_p_des
    return com_des,com_p_des,com_all_des

## Computing Text Similarity

In [105]:
def return_similarity(data,company_one,company_two,webm_model,stop_flag,sim_type):

    com_des_one,com_p_des_one,com_all_des_one = basic_data(data,company_one)
    com_des_two,com_p_des_two,com_all_des_two = basic_data(data,company_two)
    if sim_type == "cos":
        similarity = run_avg_benchmark(Sentence(com_des_one),Sentence(com_des_two),model = webm_model, use_stoplist=stop_flag)
    elif sim_type == "wmd":
        similarity = run_wmd_benchmark(Sentence(com_des_one),Sentence(com_des_two),model = webm_model, use_stoplist=stop_flag)
    return similarity,com_des_one,com_des_two

In [106]:
return_similarity(data,"Airbnb","Booking.com",word2vec,False,"cos")

(0.9017618,
 'Airbnb is a community marketplace for people to list, discover, and book unique spaces around the world through mobile phones or the internet. The company connects travelers seeking authentic experiences with hosts offering unique, inspiring spaces around the world. Whether the available space is a castle for a night, a sailboat for a week, or an apartment for a month, Airbnb is the easiest way for people to showcase these distinctive spaces to an audience of millions. By facilitating bookings and financial transactions, Airbnb makes the process of listing or booking a space effortless and efficient. With 4,500,000 listings in over 65,000 cities in 191 countries, the company offers the widest variety of unique spaces for everyone, at any price point around the globe.  The company was co-founded in August 2008 by Brian Chesky and Joe Gebbia, and is currently headquartered in San Francisco, California.Airbnb is an online community marketplace for people to list, discover, a

## Computing Category similarity

In [93]:
from sklearn.metrics.pairwise import cosine_similarity, paired_distances
def return_cat_sim(data,company_one,company_two):
    cat_code_one = data[data["name"] == company_one]["category_coding"].values[0]
    cat_code_two = data[data["name"] == company_two]["category_coding"].values[0]
    a = set(cat_code_one)
    b = set(cat_code_two)
    sim = len(a.intersection(b))/(np.sqrt(len(a))*np.sqrt(len(b)))
    return sim

In [94]:
set(data[data["name"] == "eBay"]["category_coding"].values[0])

{43, 206, 368, 419, 568, 589}

In [95]:
return_cat_sim(data,"Airbnb","Crosslink Capital")

0.15811388300841897

## Status Encoding

In [13]:
status_unique = [i for i in data["status"].unique()]
def status_trans(status, one_hot_set):
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(one_hot_set)
    a = le.transform(status.split())
    return a[0]
data["status_coding"] = data["status"].apply(lambda x:status_trans(x,status_unique))

In [14]:
status_trans("ipo",status_unique)

2

In [15]:
data["status_coding"].unique()

array([0, 3, 2, 1], dtype=int64)

## Ranking by size

In [16]:
ls = [0,-1,1,2,-2,3,-3]
rule = {0:0,-1:1,1:1,-2:2,2:2,3:3,-3:3}
newList = sorted(ls, key=lambda x:rule[x])
print(newList)

[0, -1, 1, 2, -2, 3, -3]


## Result storage and display

In [17]:
evaluation_com_ls = ["Airbnb","Coinbase","Deliveroo","Revolut","Darktrace"]

In [18]:
data[data["name"] == "Deliveroo"]

Unnamed: 0,uuid,name,rank,roles,status,short_description,category_list,category_groups_list,num_funding_rounds,employee_count,...,ins_uuid,ins_name,degree_type,subject,category_coding,acquired,closed,ipo,operating,status_coding
173735,a40d0a1f-f32c-a1e9-1bbd-a10bb0eca2e7,Deliveroo,77.0,company,operating,Deliveroo owns and operates an online food del...,"Delivery,Food and Beverage,Food Delivery,Resta...","Administrative Services,Food and Beverage,Tran...",10.0,1001-5000,...,169df302-c4ed-4f59-fc5d-763f86e6c36f,King's College London,unknown,Mathematics & Computer Science,"[184, 277, 274, 567, 585]",0,0,0,1,3


## narrow search range

In [19]:
def Search_coms(x,set_be_seached_com):
    flag = any(i in set_be_seached_com for i in x)
    return flag

In [20]:
set_a = data[data["name"] == "Deliveroo"]["category_coding"].values[0]

In [21]:
search_data = data[data["category_coding"].apply(lambda x:Search_coms(x,set_a))]

In [22]:
search_data["name"].values

array(['Ikan', 'Zvents', 'General Mills', ..., "'Merican Mule",
       'Just Add Honey', 'Lula'], dtype=object)

In [23]:
client = my_mongodb().my_mongo_client
database = client["dissertation"]

In [25]:
collection = database["Airbnb"] 

In [179]:
def first_return_searh_result(query_company,database,data,wem_model):
    set_a = data[data["name"] == query_company]["category_coding"].values[0]
    search_data = data[data["category_coding"].apply(lambda x:Search_coms(x,set_a))]
    collection = database[query_company]
    print("Please wait for generating all the competitors")
    for search_company in search_data["name"].values[:10]:
        pair = {}
        try:
            sim, com_des_one,com_des_two = return_similarity(data,query_company,search_company,wem_model,False,"cos")
#             print(query_company,search_company)
            cat_sim = return_cat_sim(data,query_company,search_company)
            size_diff = data[data["name"] == query_company]["status_coding"].values[0] - data[data["name"] == search_company]["status_coding"].values[0]
        except:
            pass
        pair["name_query"] = query_company
        pair["name_result"] = search_company
        pair["query_description"] = com_des_one
        pair["result_description"] = com_des_two
        pair["text_similarity"] = float(sim)
        pair["cat_similarity"] = float(cat_sim)
        pair["size_difference"] = int(size_diff)
        collection.insert_one(pair)
    print("************Data is prepared**************")

In [139]:
for query_company in evaluation_com_ls[:1]:
    count = 0
    set_a = data[data["name"] == query_company]["category_coding"].values[0]
    search_data = data[data["category_coding"].apply(lambda x:Search_coms(x,set_a))]
    collection = database[query_company]
    for search_company in search_data["name"].values[:10]:
        print("here is the count {}".format(count))
        pair = {}
        try:
            sim, com_des_one,com_des_two = return_similarity(data,query_company,search_company,word2vec,False,"cos")
            print(query_company,search_company)
            cat_sim = return_cat_sim(data,query_company,search_company)
            size_diff = data[data["name"] == query_company]["status_coding"].values[0] - data[data["name"] == search_company]["status_coding"].values[0]
        except:
            print(query_company,search_company,"errorxxxxxxxxxxxx")
            pass
        pair["name_query"] = query_company
        pair["name_result"] = search_company
        pair["query_description"] = com_des_one
        pair["result_description"] = com_des_two
        pair["text_similarity"] = float(sim)
        pair["cat_similarity"] = float(cat_sim)
        pair["size_difference"] = int(size_diff)
        collection.insert_one(pair)
        count+=1

here is the count 0
Airbnb eBay
here is the count 1
Airbnb Spark Capital
here is the count 2
Airbnb Bessemer Venture Partners
here is the count 3
Airbnb TripUp
here is the count 4
Airbnb SideStep
here is the count 5
Airbnb Farecast
here is the count 6
Airbnb Yapta
here is the count 7
Airbnb TripHub
here is the count 8
Airbnb TVtrip
here is the count 9
Airbnb Crosslink Capital
Data is prepared


In [103]:
collist = database.list_collection_names()
# collist = mydb.collection_names()
if "Airbnb" in collist: 
    print("集合已存在！")

集合已存在！


In [153]:
result_df.columns

Index(['name_query', 'name_result', 'query_description', 'result_description',
       'text_similarity', 'cat_similarity', 'size_difference'],
      dtype='object')

## search engine back-end algorithm

In [181]:
def startup_competitor_search_engine(db_name,data,word2vec):
    query_company = input("please input query company name in the Crunchdatabase.e.g. Airbnb, Coinbase"+"\n" +"You are searching for competitors of ")
    query_company = query_company.lower().capitalize()
    mango = my_mongodb()
    client = mango.my_mongo_client
    database = client[db_name]
    collist = database.list_collection_names()
    collection = database[query_company]
    if query_company in collist: 
        print("This is not the first query for this company, results come out soon......"+"\n")
        result_df = mango.collection2df( "dissertation", query_company, query={}, no_id=True)
        final_result = pd.DataFrame(sorted(result_df.values, key=lambda x: 0.8*x[4]+0.2*x[5], reverse = True),columns=['name_query', 'name_result', 'query_description', 'result_description',
       'text_similarity', 'cat_similarity', 'size_difference'])
        return final_result
    else:
        print("This is the first query for this company......"+"\n")
        first_return_searh_result(query_company,database,data,word2vec)
        result_df = mango.collection2df( "dissertation", query_company, query={}, no_id=True)
        final_result = pd.DataFrame(sorted(result_df.values, key=lambda x: 0.8*x[4]+0.2*x[5], reverse = True),columns=['name_query', 'name_result', 'query_description', 'result_description',
       'text_similarity', 'cat_similarity', 'size_difference'])
        return final_result

In [183]:
startup_competitor_search_engine("Baseline",data,word2vec)

please input query company name in the Crunchdatabase.e.g. Airbnb, Coinbase
You are searching for competitors of  Coinbase


This is the first query for this company......

Please wait for generating all the competitors
************Data is prepared**************


Unnamed: 0,name_query,name_result,query_description,result_description,text_similarity,cat_similarity,size_difference
0,Coinbase,Prosper Marketplace,Coinbase is an online platform that allows mer...,Prosper is an online lending platform that con...,0.853501,0.288675,0
1,Coinbase,Sparter,Coinbase is an online platform that allows mer...,A company put together by Bessemer Venture Par...,0.829517,0.353553,2
2,Coinbase,eBay,Coinbase is an online platform that allows mer...,eBay is an online marketplace. The platform c...,0.874301,0.144338,1
3,Coinbase,STRANDS,Coinbase is an online platform that allows mer...,Strands is a FinTech partner that develops dig...,0.819017,0.25,3
4,Coinbase,Omnidrive,Coinbase is an online platform that allows mer...,"Currently in public beta, Omnidrive makes it e...",0.79295,0.144338,2
5,Coinbase,Mobius Venture Capital,Coinbase is an online platform that allows mer...,Mobius Venture Capital is an early-stage ventu...,0.767534,0.204124,0
6,Coinbase,Wesabe,Coinbase is an online platform that allows mer...,Personal finance management platform.Wesabe is...,0.745831,0.204124,2
7,Coinbase,Bessemer Venture Partners,Coinbase is an online platform that allows mer...,Bessemer Venture Partners is a $4B venture cap...,0.737495,0.223607,0
8,Coinbase,Spark Capital,Coinbase is an online platform that allows mer...,"We are Spark Capital, investors in products we...",0.674145,0.158114,0
9,Coinbase,The Tornante Company,Coinbase is an online platform that allows mer...,The Tornante Company is a principal investment...,0.605256,0.204124,0
