# Text similarity and strategy score using word2vec

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import pathlib
import random
import math

Download the pretrained model. After doing this for the first time, you have to restart the runtime.

In [None]:
# Shell command: download the en_core_web_lg spaCy model package.
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


Mount Google Drive to read the files. For this to work, go to your Google Drive, find the shared folder, right-click on it and select "Add shortcut to Drive".

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load every deal record from the shared Stata file.
deals_file = "drive/My Drive/Capstone Shared Docs/data/all_deals.dta"
df = pd.read_stata(deals_file)

# One row per portfolio company: the first occurrence of each id is kept.
df_unique = df.drop_duplicates(subset=["portfoliocompanyid"], keep="first")

In [None]:
# Shell command: list the contents of the shared "result" folder.
!ls drive/My\ Drive/Capstone\ Shared\ Docs/result

'EC2 full run Oct14 result'  'Local Oct 6 result'
'EC2 Oct 13 result'	     'terminal log_oct5_am.txt'
'Error Log Resume.xlsx'      'terminal log_oct5.txt'
'Local Oct 13 result'


In [None]:
# Result set to analyse: the "Local Oct 13 result" folder (path relative to the working directory).
path = pathlib.Path("drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result")

In [None]:
# Keep only the sub-directories that sit directly under `path`.
folders_with_data = [entry for entry in path.glob('*') if entry.is_dir()]
folders_with_data

[PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result_251-500')]

In [None]:
# Gather every .txt file from each data folder into one flat list.
all_paths = []
for folder in folders_with_data:
  all_paths.extend(folder.glob('*.txt'))
all_paths[:5]

[PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250/61058_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250/61319_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250/61345_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250/61791_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250/61896_2011.txt')]

In [None]:
# Map each file-name prefix (the part before the first "_") to its cleaned text.
# Files sharing the same prefix overwrite each other: the last one read wins.
all_text = {}
for p in all_paths:
    n = p.name.split("_")[0]
    # The context manager closes the file even if reading raises.
    with open(p) as file:
        raw_text = file.read()
    # Drop the "***///***" separators, then remove the first two and the last
    # characters, because of ascii encoding.
    text = raw_text.replace("***///***", "")[2:-1]
    # Collapse runs of spaces: splitting on " " yields "" between consecutive
    # spaces, and those empty pieces are discarded before re-joining.
    all_text[n] = " ".join(x for x in text.split(" ") if x != "")

print(len(all_text))

347


The following are some dicts to go from code to indices, website and so on.

In [None]:
# Lookup tables keyed by the integer company id parsed from each file name.
code_to_name = {}
code_to_website = {}

for p in all_paths:
  n = int(p.name.split("_")[0])
  # First matching row of the de-duplicated deals table for this company id.
  company_row = df_unique[df_unique.portfoliocompanyid == n].iloc[0]
  code_to_name[n] = company_row.portfoliocompany
  code_to_website[n] = company_row.website

print(list(code_to_name.values())[:5], list(code_to_website.values())[:5])

['Kiip Inc.', 'SocialFlow, Inc.', 'Alteryx, Inc.', 'Velocita Inc.', 'Linkable Networks, Inc.'] ['www.kiip.me', 'www.socialflow.com', 'www.alteryx.com', 'www.shopsocially.com', 'www.linkablenetworks.com']


In [None]:
# Company ids in ascending order; a company's position in this list is its index.
codes_list = sorted(code_to_name)
index_to_code = dict(enumerate(codes_list))
code_to_index = {code: position for position, code in index_to_code.items()}

In [None]:
# Load the en_core_web_lg pipeline with the tagger, parser and NER components disabled.
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner"])

Go from documents to vectors using the pretrained model. 

The pipeline has a limit on the maximum number of characters it accepts in a single text. If a text is longer than that limit, I split it into chunks, compute one vector per chunk, and then take the average of the chunk vectors weighted by chunk length.


In [None]:
# spaCy rejects texts longer than nlp.max_length (1,000,000 characters by
# default), so longer documents are processed in chunks of at most this size.
max_characters = 1000000
docs_train = []

for i in index_to_code:
  doc_text = all_text[ str(index_to_code[i]) ]
  # v accumulates the length-weighted sum of chunk vectors, w the total length.
  v = np.zeros((300,1))
  w = 0
  for start in range(0, len(doc_text), max_characters):
    small_text = doc_text[start:start + max_characters]
    # reshape(-1, 1) turns the vector into a column without referring to `vec`
    # itself; the previous code read vec.shape before vec was ever assigned,
    # which raises NameError on a fresh kernel.
    vec = nlp(small_text).vector.reshape(-1, 1)
    v += vec*len(small_text)
    w += len(small_text)
  # An empty document has no chunks; keep the zero vector instead of computing 0/0.
  if w > 0:
    v /= w
  docs_train.append(v)



(104100, 1)

In [None]:
# Stack the column vectors as rows: one row per document.
document_rows = [d.reshape(1, -1) for d in docs_train]
X_train = np.concatenate(document_rows, axis=0)
X_train.shape

(347, 300)

In [None]:
def cos_sim(v1,v2):
    """Cosine similarity of two vectors: their dot product over the product of their norms."""
    dot_product = np.dot(v1.T, v2)
    norm_v1 = np.sqrt(np.sum(np.dot(v1.T, v1)))
    norm_v2 = np.sqrt(np.sum(np.dot(v2.T, v2)))
    return dot_product / (norm_v1 * norm_v2)

def similarity_to_all(vec):
    """Return a column with the cosine similarity of ``vec`` to every row of ``X_train``."""
    row_count = X_train.shape[0]
    scores = np.asarray([cos_sim(vec, X_train[row]) for row in range(row_count)])
    return scores.reshape(scores.shape[0], 1)

def get_most_similar(sim_matrix_v, startup_index, how_many=1):
  """Return the entries most similar to ``startup_index``, best first.

  Args:
    sim_matrix_v: 2-D array of similarities; row ``startup_index`` is read and
      its entry at column ``startup_index`` (the item itself) is excluded.
    startup_index: row/column position of the reference item.
    how_many: number of neighbours wanted.

  Returns:
    ``(most_similar_indices, similarity_list)``: two lists of equal length,
    ordered from most to least similar. At most ``n - 1`` entries are returned,
    where ``n`` is the row length, so an item is never reported twice and the
    reference item is never reported.
  """
  # Copy only the requested row; the caller's matrix is never modified.
  row = np.array(sim_matrix_v[startup_index, :], dtype=float)
  # Mask the item itself with -inf rather than 0, so it cannot win even when
  # every other similarity is zero or negative.
  row[startup_index] = -np.inf
  k = max(0, min(how_many, row.shape[0] - 1))
  # A stable sort keeps the lowest index first among ties, like repeated argmax.
  order = np.argsort(-row, kind="stable")[:k]
  most_similar_indices = [int(ind) for ind in order]
  similarity_list = [float(row[ind]) for ind in order]
  return most_similar_indices, similarity_list

In [None]:
# Sanity check: cosine similarity between the first two document vectors.
cos_sim(X_train[0],X_train[1])

0.9701763996232649

In [None]:
# Sanity check: the result should be a column with one entry per document.
similarity_to_all(X_train[0]).shape

(347, 1)

This is the matrix with the similarity between all startups.

In [None]:
# Column i holds the similarity of every document to document i.
sim_vect = [similarity_to_all(X_train[i]) for i in range(X_train.shape[0])]
sim_matrix = np.hstack(sim_vect)
sim_matrix

array([[1.        , 0.9701764 , 0.9527012 , ..., 0.94450052, 0.9469321 ,
        0.91789354],
       [0.9701764 , 1.        , 0.97579093, ..., 0.95779241, 0.95242794,
        0.94785295],
       [0.9527012 , 0.97579093, 1.        , ..., 0.95789746, 0.95234001,
        0.97478235],
       ...,
       [0.94450052, 0.95779241, 0.95789746, ..., 1.        , 0.92986277,
        0.93278403],
       [0.9469321 , 0.95242794, 0.95234001, ..., 0.92986277, 1.        ,
        0.91541858],
       [0.91789354, 0.94785295, 0.97478235, ..., 0.93278403, 0.91541858,
        1.        ]])

In [None]:
def website_from_index(ind):
  """Return the website recorded for the company at position ``ind``."""
  company_code = index_to_code[ind]
  return code_to_website[company_code]

def index_to_backgroud(ind):
  """Return the ``background`` field of the company at position ``ind``."""
  company_code = int(index_to_code[ind])
  matching_rows = df_unique[df_unique.portfoliocompanyid == company_code]
  return matching_rows.iloc[0].background

## Run an example to see the most similar startups of a company

In [None]:
# Pick a random company and show its n nearest neighbours by similarity.
n = 5
example_index = random.randrange(len(codes_list))
most_similar_index, most_similar_similarity = get_most_similar(sim_matrix, example_index, n)

print(f"Example startup: {website_from_index(example_index)}")
print(index_to_backgroud(example_index))
print("-----------------------------------------------------------------")
print("Similar startups:")
for i in range(n):
  neighbour = most_similar_index[i]
  neighbour_similarity = most_similar_similarity[i]
  print(f"{i+1}): {website_from_index(neighbour)}  |   Similarity: {neighbour_similarity}")
  print(index_to_backgroud(neighbour), end="\n\n")


Example startup: www.submittable.com
Founded in 2010 and based in Montana, US, Submittable Holdings Inc. operates as a developer of cloud-based online submission marketplace for publishers and organizations. It allows users to submit their creative content and grant applications to be screened and selected by organizations.
-----------------------------------------------------------------
Similar startups:
1): www.simplymeasured.com  |   Similarity: 0.9880040822610014
Established in 2010 and based in Washington, US, Simply Measured, Inc. is a provider of social media analytic solutions which allows marketing, public relations and social media professionals to create excel based reports.

2): www.insightsquared.com  |   Similarity: 0.9854135896836758
Founded in 2010 and based in Massachusetts, US, InsightSquared Inc. operates as a provider of data intelligence software designed specifically for small and medium-sized businesses. The company's software helps users to manage pipelines and

# Strategy score

These functions should compute the strategy score used in the paper.

In [None]:
def compute_strategy_score(similarity_list):
  """Return the mean distance, where each distance is ``1 - similarity``."""
  distances = []
  for similarity in similarity_list:
    distances.append(1 - similarity)
  weight = 1.0 / len(distances)
  return weight * np.sum(distances)

def get_similarity_score(ind, how_many=5):
  """Return the strategy score of the company at position ``ind``.

  The score is ``compute_strategy_score`` applied to the similarities of the
  ``how_many`` most similar companies found in ``sim_matrix``.

  Args:
    ind: position of the company in ``sim_matrix``.
    how_many: number of nearest neighbours to average over (defaults to 5,
      the value that was previously hard-coded).
  """
  _, ms_similarity = get_most_similar(sim_matrix, ind, how_many)
  return compute_strategy_score(ms_similarity)

In [None]:
# (company code, score) pairs, ordered from the lowest score to the highest.
score_pairs = [(index_to_code[i], get_similarity_score(i)) for i in index_to_code]
all_scores = sorted(score_pairs, key=lambda pair: pair[1])

## Companies with the best score

In [None]:
# Show the first n entries of the sorted score list with their descriptions.
n = 10
for i in range(n):
  company_code, company_score = all_scores[i]
  print(f"{i+1}): {code_to_website[company_code]}  |   Score: {company_score}")
  print(index_to_backgroud(code_to_index[company_code]), end="\n\n")

1): www.educreations.com  |   Score: 0.004449631700104662
Founded in 2010, based in Sunnyvale, California, Educreations, Inc. operate educreations.com, an Internet community that allows users to teach and learn.

2): www.familyid.com  |   Score: 0.0050636840651746345
Founded in 2010 and based in Massachusetts, US, FamilyID, Inc. operates an online platform that enables online student registration for educational institutions.

3): www.ifeelgoods.com  |   Score: 0.005135826039561842
Founded in 2010 and based in California, US, Ifeelgoods, Inc. is an online promotion platform provider that offers gifts, rewards and incentives to their audiences real-time.

4): www.astrolome.com  |   Score: 0.005171245911951661
Founded in 2010 and based in California, US, Astrolome Inc. operates an online platform for customized astrology forecasting services.

5): www.jooraccess.com  |   Score: 0.00556618386581036
Founded in 2010 and based in New York, US, JOOR, Inc. operates as an online fashion marketp