In [1]:
import pandas as pd
import numpy as np
import scipy
import math
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

def load_sts_dataset(filename):
    # Loads a subset of the STS dataset into a DataFrame. In particular both
    # sentences and their human rated similarity score.
    sent_pairs = []
    with tf.gfile.GFile(filename, "r") as f:
        for line in f:
            ts = line.strip().split("\t")
            sent_pairs.append((ts[5], ts[6], float(ts[4])))
    return pd.DataFrame(sent_pairs, columns=["sent_1", "sent_2", "sim"])


def download_and_load_sts_data():
    sts_dataset = tf.keras.utils.get_file(
        fname="Stsbenchmark.tar.gz",
        origin="http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz",
        extract=True)

    sts_dev = load_sts_dataset(os.path.join(os.path.dirname(sts_dataset), "stsbenchmark", "sts-dev.csv"))
    sts_test = load_sts_dataset(os.path.join(os.path.dirname(sts_dataset), "stsbenchmark", "sts-test.csv"))

    return sts_dev, sts_test

sts_dev, sts_test = download_and_load_sts_data()

ModuleNotFoundError: No module named 'tensorflow'

In [3]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.5.0-cp38-cp38-macosx_10_11_x86_64.whl (195.7 MB)
[K     |▎                               | 1.8 MB 85 kB/s eta 0:37:587^C

[31mERROR: Operation cancelled by user[0m
[?25h

# tokenization

In [23]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
STOP = set(nltk.corpus.stopwords.words("english"))

class Sentence:
    
    def __init__(self, sentence):
        self.raw = sentence
        normalized_sentence = sentence.replace("‘", "'").replace("’", "'").replace(",","").replace(".","").replace("'","")
        self.tokens = [t.lower() for t in nltk.word_tokenize(normalized_sentence)]
        self.tokens_without_stop = [t for t in self.tokens if t not in STOP]

[nltk_data] Error loading stopwords: <urlopen error [Errno 61]
[nltk_data]     Connection refused>
[nltk_data] Error loading punkt: <urlopen error [Errno 61] Connection
[nltk_data]     refused>


In [24]:
st = Sentence("Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and monetize audiences for digital publishers. Wetpaint’s own online property, Wetpaint Entertainment, an entertainment news site that attracts more than 12 million unique visitors monthly and has over 2 million Facebook fans, is a proof point to the company’s success in building and engaging audiences. Media companies can license Wetpaint’s platform which includes a dynamic playbook tailored to their individual needs and comprehensive training. Founded by Internet pioneer Ben Elowitz, and with offices in New York and Seattle, Wetpaint is backed by Accel Partners, the investors behind Facebook.")

In [25]:
st.tokens_without_stop

['wetpaint',
 'technology',
 'platform',
 'company',
 'uses',
 'proprietary',
 'state-of-the-art',
 'technology',
 'expertise',
 'social',
 'media',
 'build',
 'monetize',
 'audiences',
 'digital',
 'publishers',
 'wetpaints',
 'online',
 'property',
 'wetpaint',
 'entertainment',
 'entertainment',
 'news',
 'site',
 'attracts',
 '12',
 'million',
 'unique',
 'visitors',
 'monthly',
 '2',
 'million',
 'facebook',
 'fans',
 'proof',
 'point',
 'companys',
 'success',
 'building',
 'engaging',
 'audiences',
 'media',
 'companies',
 'license',
 'wetpaints',
 'platform',
 'includes',
 'dynamic',
 'playbook',
 'tailored',
 'individual',
 'needs',
 'comprehensive',
 'training',
 'founded',
 'internet',
 'pioneer',
 'ben',
 'elowitz',
 'offices',
 'new',
 'york',
 'seattle',
 'wetpaint',
 'backed',
 'accel',
 'partners',
 'investors',
 'behind',
 'facebook']

In [31]:
import gensim

from gensim.models import Word2Vec

In [32]:
import os
PATH_TO_WORD2VEC = os.path.expanduser("data/GoogleNews-vectors-negative300.bin")

In [33]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format(PATH_TO_WORD2VEC, binary=True)

In [101]:
import csv

PATH_TO_FREQUENCIES_FILE = "data/frequencies.tsv"
PATH_TO_DOC_FREQUENCIES_FILE = "data/doc_frequencies.tsv"

def read_tsv(f):
    frequencies = {}
    with open(f) as tsv:
        tsv_reader = csv.reader(tsv, delimiter="\t")
        for row in tsv_reader: 
            frequencies[row[0]] = int(row[1])
        
    return frequencies
        
frequencies = read_tsv(PATH_TO_FREQUENCIES_FILE)
doc_frequencies = read_tsv(PATH_TO_DOC_FREQUENCIES_FILE)
doc_frequencies["NUM_DOCS"] = 1288431

In [358]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import math
import numpy as np

def run_avg_benchmark(sentences1, sentences2, model=None, use_stoplist=False, doc_freqs=None): 

    if doc_freqs is not None:
        N = doc_freqs["NUM_DOCS"]
    
    tokens1 = sentences1.tokens_without_stop if use_stoplist else sentences1.tokens
    tokens2 = sentences2.tokens_without_stop if use_stoplist else sentences2.tokens

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]

#     if len(tokens1) == 0 or len(tokens2) == 0:
#         sims.append(0)
#         continue

    tokfreqs1 = Counter(tokens1)
    tokfreqs2 = Counter(tokens2)

    weights1 = [tokfreqs1[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                for token in tokfreqs1] if doc_freqs else None
    weights2 = [tokfreqs2[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                for token in tokfreqs2] if doc_freqs else None

    embedding1 = np.average([model[token] for token in tokfreqs1], axis=0, weights=weights1).reshape(1, -1)
    embedding2 = np.average([model[token] for token in tokfreqs2], axis=0, weights=weights2).reshape(1, -1)
    sim = cosine_similarity(embedding1, embedding2)[0][0]

    return sim

def run_wmd_benchmark(sent1, sent2, model, use_stoplist=False):

    tokens1 = sent1.tokens_without_stop if use_stoplist else sent1.tokens
    tokens2 = sent2.tokens_without_stop if use_stoplist else sent2.tokens

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]

    if len(tokens1) == 0 or len(tokens2) == 0:
        tokens1 = [token for token in sent1.tokens if token in model]
        tokens2 = [token for token in sent2.tokens if token in model]

    sim = -model.wmdistance(tokens1, tokens2)
        
    return sim

from sklearn.decomposition import TruncatedSVD

def remove_first_principal_component(X):
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_
    XX = X - X.dot(pc.transpose()) * pc
    return XX


def run_sif_benchmark(sent1, sent2, model, freqs={}, use_stoplist=False, a=0.001): 
    total_freq = sum(freqs.values())
    embeddings = []
    # SIF requires us to first collect all sentence embeddings and then perform 
    # common component analysis.
    tokens1 = sent1.tokens_without_stop if use_stoplist else sent1.tokens
    tokens2 = sent2.tokens_without_stop if use_stoplist else sent2.tokens

    tokens1 = [token for token in tokens1 if token in model]
    tokens2 = [token for token in tokens2 if token in model]

    weights1 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens1]
    weights2 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens2]

    embedding1 = np.average([model[token] for token in tokens1], axis=0, weights=weights1)
    embedding2 = np.average([model[token] for token in tokens2], axis=0, weights=weights2)

    embeddings.append(embedding1)
    embeddings.append(embedding2)
        
    embeddings = remove_first_principal_component(np.array(embeddings))
    sim = [cosine_similarity(embeddings[idx*2].reshape(1, -1), 
                              embeddings[idx*2+1].reshape(1, -1))[0][0] 
            for idx in range(int(len(embeddings)/2))][0]

    return sim

In [10]:
import pandas as pd                                                               
data = pd.read_csv("data/organization_descriptions.csv")                               
facebook = data[data["name"] == "Facebook"]["description"]                        
google = data[data["name"] == "Google"]["description"]                            
amazon = data[data["name"] == "Amazon"]["description"]                            

In [178]:
facebook = data[data["name"] == "Facebook"]["description"]                        
google = data[data["name"] == "Google"]["description"]                            
amazon = data[data["name"] == "Amazon"]["description"]

In [179]:
data[data["name"] == "Facebook"]["p_description"] 

148    Mark Zuckerberg is the founder and CEO of Face...
Name: p_description, dtype: object

In [162]:
final_data = pd.read_csv("final_data.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [173]:
data = data.drop_duplicates(["uuid","name"])

In [177]:
data

Unnamed: 0,uuid,name,rank,roles,status,short_description,category_list,category_groups_list,num_funding_rounds,employee_count,...,p_name,gender,featured_job_title,p_description,d_uuid,d_name,ins_uuid,ins_name,degree_type,subject
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,131437.0,company,acquired,Wetpaint offers an online social publishing pl...,"Publishing,Social Media,Social Media Management","Content and Publishing,Internet Services,Media...",3.0,51-100,...,,,,,,,,,,
1,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,7682.0,"investor,company",operating,"Zoho offers a suite of business, collaboration...","Cloud Computing,Collaboration,CRM,Developer To...","Administrative Services,Information Technology...",,1001-5000,...,Raju Vegesna,male,Chief Evangelist,Raju is an evangelist for Zoho and is one of t...,,,,,,
35,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,13152.0,company,acquired,Digg Inc. operates a website that enables its ...,"Internet,Social Media,Social Network","Internet Services,Media and Entertainment",6.0,51-100,...,Joshua Auerbach,male,Board Member,Venture Partner at Betaworks Studio. Formerl...,967b2ae6-5842-ca27-59b9-acd60b7b4c42,BA Economics @ Harvard University,d8b57c0e-9f0f-4dcb-d207-a12a90c64a2d,Harvard University,BA,Economics
38,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,1757.0,investor,operating,Omidyar Network is an investment firm.,"Enterprise Software,Financial Services,Venture...","Financial Services,Lending and Investments,Sof...",,101-250,...,Salvatore Giambanco,male,Partner of Human Capital & Operations Functions,Sal leads the human capital and operations fun...,49ae5af2-25fe-b1d1-e19e-11f45a04a4c4,B.A. Economics & Political Science @ Columbia...,8d297d1e-3e3d-fadd-2963-57e0a40bff2a,Columbia University,B.A.,Economics & Political Science
148,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,5.0,"investor,company",ipo,Facebook is an online social networking servic...,"Mobile Apps,Social,Social Media,Social Network...","Apps,Community and Lifestyle,Content and Publi...",17.0,10000+,...,Mark Zuckerberg,male,Founder & CEO,Mark Zuckerberg is the founder and CEO of Face...,e75e1434-2ace-9255-2da8-3943f5bbae7c,Dropped Out Computer Science @ Harvard Univer...,d8b57c0e-9f0f-4dcb-d207-a12a90c64a2d,Harvard University,Dropped Out,Computer Science
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1718514,09c8e895-e62e-46f0-93f9-45e088ede007,AngelNV,,investor,operating,Early stage investor school and investment fun...,,,,unknown,...,,,,,,,,,,
1718515,9b7c50d5-8d96-4af4-9b0e-af72705184e9,Science Delivered,,company,operating,Science Delivered promote quality science educ...,Education,Education,,1-10,...,,,,,,,,,,
1718516,98589112-e90c-434b-a4e7-ffaa12eee602,OmniPanel,,company,operating,OmniPanel is an analytics platform that surfac...,,,1.0,unknown,...,,,,,,,,,,
1718517,90418618-ce0c-49b2-8c0b-1f741a3d313e,Schafer Logistics Inc,,company,operating,Schafer Logistics Inc provides integrate exper...,Transportation,Transportation,,51-100,...,,,,,,,,,,


In [195]:
say = data[data["name"] == "Say"]["p_description"].values[0]
wetpaint = data[data["name"] == "Wetpaint"]["p_description"].values[0]
google = data[data["name"] == "Google"]["p_description"].values[0]
facebook = data[data["name"] == "Facebook"]["p_description"].values[0]

In [165]:
data = pd.read_pickle("foo.pkl")

In [182]:
airbnb = data[data["name"] == "Airbnb"]["p_description"].values[0]

In [188]:
data[data["name"] == "Airbnb"]["p_description"].values[0]

'Aristotle â€œAriâ€\x9d Balogh is currently the VP of Storage Infrastructure Products at Google.   Prior to Google, Balogh was Chief Technology Officer at Yahoo. He was responsible for company-wide product development which includes optimizing resources, speeding innovation, and ensuring the quality of Yahooâ€™s products and services. He was focused on establishing a common architecture and building blocks to drive development aligned with corporate strategy and on improving the overall effectiveness of Yahooâ€™s engineering efforts. All of Yahooâ€™s engineering functions, including technical operations, infrastructure, and internal IT support groups, reported into Balogh.   Prior to Yahoo, Balogh was Executive Vice President, Chief Technology Officer and Head of Global Product Design for VeriSign, Inc., where he led all internal and external technology functions. During his nearly ten years at VeriSign, he held numerous technology leadership positions that focused on aligning technica

In [192]:
data[data["name"] == "Expedia"]["p_description"].values[0]

'Jonathan L. Dolgen has been a director of Expedia since completion of the IAC/Expedia Spin-Off. From July 2004 until April 2010, Dolgen was a Senior Advisor to Viacom, Inc. (“Old Viacom”), a worldwide entertainment and media company, where he provided advisory services to the chief executive officer of Old Viacom, or others designated by him, on an as-requested basis.  Effective December 31, 2005, Old Viacom was separated into two publicly traded companies, Viacom Inc. (“New Viacom”) and CBS Corporation. From the separation of Old Viacom, Dolgen provided advisory services to the chief executive officer of New Viacom, or others designated by him, on an as-requested basis. Since July 2004, Dolgen has been a private investor, and since September 2004, Dolgen has been the principal of Wood River Ventures, LLC, a private entity that seeks investment and other opportunities and provides consulting services, primarily in the media sector. From April 2005 until April 2013, Dolgen, through Woo

In [199]:
booking = data[data["name"] == "Booking.com"]["description"].values[0]
expedia = data[data["name"] == "Expedia"]["p_description"].values[0]
kayak = data[data["name"] == "KAYAK"]["p_description"].values[0]
Agoda = data[data["name"] == "Trip.com"]["p_description"].values[0]

In [223]:
na=np.nan


In [232]:
pd.isnull(data[data["name"] == "Booking.com"]["p_description"].values[0])

True

In [140]:
Agoda = """Trip.com is a Chinese travel agency offering comprehensive services including hotel reservations, air ticketing, packaged tours, high-speed rail ticket booking, and corporate travel management.Trip.com offers over 1,000 packaged tours to both domestic Chinese and overseas destinations, departing from six major Chinese cities (Beijing, Shanghai,Guangzhou, Shenzhen, Hangzhou, and Chengdu) and serving over 10,000 individual travelers monthly. Trip.com Group is a provider of travel services including accommodation reservation, transportation ticketing, packaged tours, and corporate travel management.
The company owns and operates the Trip.com, Skyscanner, and Ctrip.com, all of which are online travel agencies. It enables local partners and travelers to make informed and cost-effective bookings for travel products and services, through the aggregation of comprehensive travel-related information and resources, and an advanced transaction platform consisting of mobile apps, internet websites, and 24/7 customer service centers. Founded in 1999 and listed on Nasdaq in 2003, Trip.com Group was established in Shanghai, China by co-founders Neil Shen, Min Fan, Qi Ji, and James Liang.

Founded in 1999, Ctrip was listed on NASDAQ in December 2003. It has successfully integrated high-tech industries and traditional travel industry to serve over 40 million registered members. The Ctrip group includes members like lvping, Xingcheng ezTravel, csshotel, and sozhen.com. Its major competitors are elong, aoyou.com, and mangocity.com."""

In [204]:
run_avg_benchmark(Sentence(google),Sentence(airbnb),model = word2vec, use_stoplist=False)

0.85570425

In [159]:
run_wmd_benchmark(Sentence(airbnb),Sentence(expedia),model = word2vec, use_stoplist=True)

-1.1296013761334525

In [26]:
Sentence(say)

<__main__.Sentence at 0x7fbce786fbe0>

In [103]:
run_sif_benchmark(Sentence(airbnb),Sentence(booking),model = word2vec, freqs=frequencies, use_stoplist=False, a=0.001)

-1.0

In [233]:
data.columns

Index(['uuid', 'name', 'rank', 'roles', 'status', 'short_description',
       'category_list', 'category_groups_list', 'num_funding_rounds',
       'employee_count', 'description', 'p_uuid', 'p_name', 'gender',
       'featured_job_title', 'p_description', 'd_uuid', 'd_name', 'ins_uuid',
       'ins_name', 'degree_type', 'subject'],
      dtype='object')

In [325]:
def return_similarity(data,company_one,company_two,text_type, webm_model,stop_flag):
    com_des_one =  data[data["name"] == company_one]["description"].values[0]
    com_peo_des_one =  data[data["name"] == company_one]["p_description"].values[0]
    com_sh_des_one =  data[data["name"] == company_one]["short_description"].values[0]
    com_des_two =  data[data["name"] == company_two]["description"].values[0]
    com_peo_des_two =  data[data["name"] == company_two]["p_description"].values[0]
    com_sh_des_two =  data[data["name"] == company_two]["short_description"].values[0]
    com_des_one = com_des_one + str(com_sh_des_one)
#     print(str(com_des_two))
    com_des_two = str(com_des_two) + str(com_sh_des_two)

    if text_type == "only_des":
        similarity = run_avg_benchmark(Sentence(com_des_one),Sentence(com_des_two),model = webm_model, use_stoplist=stop_flag)
    return similarity

In [None]:
name_ls_finance = data[data.apply(lambda x: "Cloud Computing" in x["category_list"],axis = 1)]["name"]
name_ls_travel = data[data.apply(lambda x: "Travel" in x["category_list"],axis = 1)]["name"]
name_ls_cloud = data[data.apply(lambda x: "Cloud Computing" in x["category_list"],axis = 1)]["name"]

In [342]:
finance_companies_ls = list(name_ls_finance)
travel_companies_ls = list(name_ls_travel)
cloud_companies_ls = list(name_ls_cloud)

In [365]:
for name in finance_companies_ls[:10]:
    sim_ls = []
#     print(name)
    sim = return_similarity(data,"Airbnb",name,"only_des", word2vec,False)
    sim_ls.append(sim)
    print(name,sim)

Omidyar Network 0.77989084
Meritech Capital Partners 0.7644329
Centennial Ventures 0.69306344
Austin Ventures 0.6186713
Acadia Woods Partners 0.65494436
Foundation Capital 0.78947747
Mobius Venture Capital 0.7800758
Inflexion Partners 0.4088419
Village Ventures 0.61379933
First Round Capital 0.81887305


In [367]:
for name in cloud_companies_ls[:10]:
    sim_ls = []
#     print(name)
    sim = return_similarity(data,"Airbnb",name,"only_des", word2vec,False)
    sim_ls.append(sim)
    print(name,sim)

Zoho 0.76162076
PBworks 0.82654047
Box 0.8388816
Oracle-NetSuite 0.749099
Big Bang Ventures 0.5666262
LongJump 0.82523376
Brightcove 0.7676357
Limelight Networks 0.78022057
Nirvanix 0.78128
INgage Networks 0.7725506


In [368]:
for name in travel_companies_ls[:10]:
    sim_ls = []
#     print(name)
    sim = return_similarity(data,"Airbnb",name,"only_des", word2vec,False)
    sim_ls.append(sim)
    print(name,sim)

Booking.com 0.90176183
TripUp 0.8351637
SideStep 0.854903
Farecast 0.846826
KAYAK 0.8667915
PAR Capital Management 0.5284684
Yapta 0.8116853
TripHub 0.824854
TVtrip 0.8520464
Hotelicopter 0.8523342


In [362]:
data.shape

(1216445, 22)