In [1]:
import math
import pathlib
import random

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy.spatial.distance
import spacy
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Mount Google Drive at /content/drive; the data paths used in later cells
# ("drive/My Drive/...") are relative paths that go through this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the deals table from the shared Drive folder.
df = pd.read_stata("drive/My Drive/Capstone Shared Docs/data/all_deals.dta")

# One row per portfolio company: the first occurrence of each id is kept.
df_unique = df.drop_duplicates(subset=["portfoliocompanyid"], keep="first")

In [4]:
# Root result folder; its sub-directories are listed and searched for *.txt files in the next cells.
path = pathlib.Path("drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result")

In [None]:
# Keep only the sub-directories directly under `path`; plain files are ignored.
folders_with_data = [entry for entry in path.glob('*') if entry.is_dir()]
folders_with_data

[PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result_251-500')]

In [None]:
# Flatten the *.txt files of every data folder into one list (folder order preserved).
all_paths = [txt_path for folder in folders_with_data for txt_path in folder.glob('*.txt')]
all_paths[:5]

[PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250/61058_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250/61319_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250/61345_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250/61791_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/Local Oct 13 result/Local Oct 13 result 1-250/61896_2011.txt')]

In [None]:
# Map company code (the part of the file name before the first "_", kept as a
# string) to the cleaned text of its file. If several files share a code, the
# last one read wins.
all_text = {}
for p in all_paths:
    n = p.name.split("_")[0]
    # `with` closes the handle even if reading raises.
    with open(p) as file:
        raw = file.read()
    # Remove first two and last characters, because of ascii encoding
    # (presumably the files hold a bytes-literal repr such as b'...' — TODO confirm).
    text = raw.replace("***///***","")[2:-1]
    # Drop the empty tokens produced by consecutive spaces, i.e. collapse runs of spaces.
    all_text[n] = " ".join(x for x in text.split(" ") if x != "")

print(len(all_text))

347


In [None]:
# Lookup tables from integer company code to company name / website,
# filled in the order the codes are first met in `all_paths`.
code_to_name, code_to_website = {}, {}

for p in all_paths:
  n = int(p.name.split("_")[0])
  # First df_unique row whose portfoliocompanyid equals this code
  # (IndexError if the code is absent from the table).
  company_row = df_unique[df_unique.portfoliocompanyid==n].iloc[0]
  code_to_name[n] = company_row.portfoliocompany
  code_to_website[n] = company_row.website

print(list(code_to_name.values())[:5], list(code_to_website.values())[:5])

['Kiip Inc.', 'SocialFlow, Inc.', 'Alteryx, Inc.', 'Velocita Inc.', 'Linkable Networks, Inc.'] ['www.kiip.me', 'www.socialflow.com', 'www.alteryx.com', 'www.shopsocially.com', 'www.linkablenetworks.com']


In [None]:
# Ascending list of company codes plus the two-way mapping between a code
# and its position in that list.
codes_list = sorted(code_to_name.keys())
code_to_index = {code: position for position, code in enumerate(codes_list)}
index_to_code = dict(enumerate(codes_list))

In [None]:
# English stop words from NLTK. No cell in this notebook downloads the corpus,
# so on a fresh runtime the lookup raises LookupError; fetch it once and retry.
try:
    stopWords = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stopWords = set(stopwords.words("english"))

In [None]:
# l1: company codes (strings), l2: the matching document texts.
# The rows of the TF-IDF matrix follow the order of l2, while index_to_code maps
# row numbers to the *numerically sorted* company codes. Order the corpus by
# integer code as well, so row i really belongs to index_to_code[i] regardless
# of the order in which the files were globbed.
_ordered_docs = sorted(all_text.items(), key=lambda item: int(item[0]))
l1 = [code for code, _ in _ordered_docs]
l2 = [text for _, text in _ordered_docs]

In [None]:
# Preview only: the first 200 characters of the first 5 documents, instead of
# dumping the entire corpus into the notebook output.
[doc[:200] for doc in l2[:5]]

In [None]:
# TF-IDF representation of the corpus: one sparse row per document in l2.
# stop_words is passed as a list because recent scikit-learn releases validate
# the parameter and accept only 'english', a list, or None (a set is rejected);
# sorting keeps the argument deterministic.
vectorizer = TfidfVectorizer(stop_words=sorted(stopWords))
X = vectorizer.fit_transform(l2)

In [None]:
def cos_sim(v1,v2):
    """Return the cosine similarity (1 - cosine distance) of two vectors.

    Parameters
    ----------
    v1, v2 : sparse row or array-like
        Either an object exposing ``toarray()`` (e.g. one row of the TF-IDF
        matrix) or a dense array-like. Both are flattened to 1-D, because
        ``scipy.spatial.distance.cosine`` is defined for 1-D vectors and
        ``toarray()`` on a single sparse row yields a 2-D ``(1, n)`` array.

    Returns
    -------
    float
        The cosine similarity; ``0.0`` when either vector is all zeros (the
        cosine is undefined there and would otherwise come out as NaN).
    """
    a, b = (
        np.asarray(v.toarray() if hasattr(v, "toarray") else v, dtype=float).ravel()
        for v in (v1, v2)
    )
    # A zero vector has no direction; treat it as sharing nothing with anything.
    if not a.any() or not b.any():
        return 0.0
    return 1 - scipy.spatial.distance.cosine(a, b)

def similarity_to_all(vec):
    """Compare ``vec`` against every row of the global TF-IDF matrix ``X``.

    Returns a NumPy array of shape ``(X.shape[0], 1)`` whose i-th entry is
    ``cos_sim(vec, X[i])``.
    """
    row_count = X.shape[0]
    scores = np.asarray([cos_sim(vec, X[row]) for row in range(row_count)])
    # Turn the flat score vector into a single column.
    return scores.reshape(scores.shape[0], 1)

def get_most_similar(sim_matrix_v, startup_index, how_many=1):
  """Return the entries most similar to ``startup_index``.

  Parameters
  ----------
  sim_matrix_v : 2-D array
      Square similarity matrix; row ``i`` holds the similarities of entry
      ``i`` to every entry. It is not modified.
  startup_index : int
      Row of the query entry. The query itself is never returned.
  how_many : int, default 1
      Number of neighbours wanted. Capped at the number of other entries,
      so the result never contains duplicates.

  Returns
  -------
  (most_similar_indices, similarity_list) : tuple of lists
      Neighbour indices ordered from most to least similar (ties go to the
      lower index) and their similarities, in the same order.
  """
  # Copy only the row that is needed, not the whole matrix.
  row = np.array(sim_matrix_v[startup_index, :], dtype=float).ravel()
  # -inf (rather than 0) excludes the query even when the remaining
  # similarities are zero or negative.
  row[startup_index] = -np.inf
  how_many = max(0, min(how_many, row.size - 1))
  # A stable sort on the negated row gives descending similarity with
  # first-index tie-breaking, matching repeated argmax.
  order = np.argsort(-row, kind="stable")[:how_many]
  most_similar_indices = list(order)
  similarity_list = list(row[order])
  return most_similar_indices, similarity_list

In [None]:
import scipy

In [None]:
# Sanity check: cosine similarity between the first two TF-IDF rows.
# NOTE(review): the saved output of this cell (0.9856) equals 1 - 0.0144, where
# 0.0144 is the sim_matrix[0, 1] value displayed further down, so the saved
# output looks stale (from a version that returned distance) — re-run to confirm.
cos_sim(X[0],X[1])

0.9855801677800838

In [None]:
# Dense (n_docs, n_docs) matrix of pairwise cosine similarities between the
# TF-IDF rows. A single vectorised call replaces the n^2 Python-level cos_sim
# calls, each of which converted two sparse rows to dense arrays.
sim_matrix = cosine_similarity(X)
sim_matrix

array([[1.        , 0.01441983, 0.00312452, ..., 0.00967522, 0.02391946,
        0.01389271],
       [0.01441983, 1.        , 0.01671133, ..., 0.02512715, 0.01731169,
        0.01637092],
       [0.00312452, 0.01671133, 1.        , ..., 0.01200658, 0.03574773,
        0.02788   ],
       ...,
       [0.00967522, 0.02512715, 0.01200658, ..., 1.        , 0.01706869,
        0.00497433],
       [0.02391946, 0.01731169, 0.03574773, ..., 0.01706869, 1.        ,
        0.0363301 ],
       [0.01389271, 0.01637092, 0.02788   , ..., 0.00497433, 0.0363301 ,
        1.        ]])

In [None]:
def website_from_index(ind):
  """Return the website stored for the company at matrix index ``ind``.

  The index is translated to a company code through ``index_to_code`` and the
  code is looked up in ``code_to_website``.
  """
  company_code = index_to_code[ind]
  return code_to_website[company_code]

def index_to_backgroud(ind):
  """Return the ``background`` field of the company at matrix index ``ind``.

  The index is mapped to a company code via ``index_to_code``; the first
  ``df_unique`` row with that ``portfoliocompanyid`` supplies the value.
  """
  company_code = int(index_to_code[ind])
  matching_rows = df_unique[df_unique["portfoliocompanyid"] == company_code]
  first_row = matching_rows.iloc[0]
  return first_row.background

In [None]:
def background_text(t, line_len,line_num):
  """Hard-wrap ``t`` into at most ``line_num`` lines of ``line_len`` characters.

  The text is cut at fixed character offsets (words may be split), empty
  pieces are dropped, and the pieces are joined with newlines. "..." is
  appended when ``t`` is longer than ``line_len * line_num`` characters.
  """
  pieces = []
  for k in range(line_num):
    piece = t[k*line_len:(k+1)*line_len]
    if piece != "":
      pieces.append(piece)
  wrapped = "\n".join(pieces)
  was_truncated = len(t) > line_len*line_num
  return wrapped + "..." if was_truncated else wrapped

In [None]:
# Pick a random company and print it together with its n most similar companies.
example_index = random.randint(0,len(codes_list)-1)
n = 5
# Layout of the printed background excerpts: line_num lines of line_len characters.
line_len = 100
line_num = 4
most_similar_index,most_similar_similarity = get_most_similar(sim_matrix, example_index, n)

example_background = index_to_backgroud(example_index)
print(f"Example startup: {website_from_index(example_index)}")
print(background_text(example_background,line_len,line_num))
print("-----------------------------------------------------------------")
print("Similar startups:")
for i in range(n):
  neighbour = most_similar_index[i]
  print(f"{i+1}): {website_from_index(neighbour)}  |   Similarity: {most_similar_similarity[i]}")
  background = background_text(index_to_backgroud(neighbour), line_len, line_num)
  print(background)
  print("")


Example startup: www.popdust.com
Founded in 2010, Popdust, Inc. operates a music editorial website focused on mainstream artists and 
pop music culture. It provides audiences with music news, reviews.
-----------------------------------------------------------------
Similar startups:
1): www.cetas.net  |   Similarity: 0.37931138217309823
Founded in 2010 and headquartered in Palo Alto, California, Cetas Software provides real-time big da
ta analytics solutions to extract actionable insights for online businesses and enterprises to get i
nstant recommendations, summarizations, segmentations and predictions from behavioral, social, locat
ional and mobile data.

2): www.luxurygaragesale.com  |   Similarity: 0.22682465417712583
Established in 2010 and based in Chicago, Illinois, Luxury Garage Sale operates as boutique retail a
nd digital store that sells new and used designer clothing and accessories.

3): www.plumdistrict.com  |   Similarity: 0.21053130467574566
Founded in 2010 and based i

In [None]:
def compute_strategy_score(similarity_list):
  """Return the mean of ``1 - similarity`` over ``similarity_list``.

  Raises ZeroDivisionError for an empty list.
  """
  entry_count = len(similarity_list)
  total_distance = np.sum([1-s for s in similarity_list])
  return (1.0/entry_count)*total_distance

def get_similarity_score(ind, how_many=5):
  """Score the company at matrix index ``ind`` by its nearest neighbours.

  Takes the ``how_many`` most similar companies from the global ``sim_matrix``
  (default 5, the value that used to be hard-coded) and returns the mean of
  ``1 - similarity`` over them, so a lower score means closer neighbours.
  """
  _, ms_similarity = get_most_similar(sim_matrix, ind, how_many)
  return compute_strategy_score(ms_similarity)

In [None]:
# (company code, score) pairs for every company, sorted by ascending score.
all_scores = sorted(
    ((index_to_code[i], get_similarity_score(i)) for i in index_to_code),
    key=lambda pair: pair[1],
)

In [None]:
# Print the n companies with the lowest scores, each followed by its full background text.
n = 10
for i in range(n):
  code, score = all_scores[i]
  print(f"{i+1}): {code_to_website[code]}  |   Score: {score}")
  print(index_to_backgroud(code_to_index[code]), end="\n\n")

1): www.narrativescience.com  |   Score: 0.173984480039841
Founded in 2010 and based in Illinois, US, Narrative Science operates as a provider of natural language generation platform using artificial intelligence.

2): www.sproutsocial.com  |   Score: 0.173984480039841
Founded in 2010 and based in Illinois, US, Sprout Social, Inc. operates as a provider of social media management software and solutions designed to allow businesses to efficiently and effectively manage and grow their social presence across multiple channels. The company's platform offers tools for streaming social media content, assessing customer metrics, and managing brands to find and interact with social audiences.

3): www.vivino.com  |   Score: 0.173984480039841
Founded in 2010 and based in California, US, Vivino, Inc. operates an online marketplace which offers wines.

4): www.ziprecruiter.com  |   Score: 0.173984480039841
Founded in 2010 and based in California, US, ZipRecruiter, Inc. operates an online job sear

In [None]:
# Table of the n lowest-scoring companies.
# The rows are collected first and the frame is built once: DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row is quadratic.
n = 10
top_records = [
    {
        "startup": code_to_website[code],
        "score": score,
        "background": index_to_backgroud(code_to_index[code]),
    }
    for code, score in all_scores[:n]
]
top_10 = pd.DataFrame(top_records, columns=["startup", "score", "background"])

top_10

Unnamed: 0,startup,score,background
0,www.narrativescience.com,0.173984,"Founded in 2010 and based in Illinois, US, Nar..."
1,www.sproutsocial.com,0.173984,"Founded in 2010 and based in Illinois, US, Spr..."
2,www.vivino.com,0.173984,"Founded in 2010 and based in California, US, V..."
3,www.ziprecruiter.com,0.173984,"Founded in 2010 and based in California, US, Z..."
4,www.fractyl.com,0.173984,"Founded in 2010 and based in Massachusetts, US..."
5,www.educreations.com,0.540863,"Founded in 2010, based in Sunnyvale, Californi..."
6,www.aerofs.com,0.551374,"Founded in 2010 and based in California, US, A..."
7,www.astrolome.com,0.559608,"Founded in 2010 and based in California, US, A..."
8,www.propertybase.com,0.570922,"Founded in 2010 and based in Massachusetts, US..."
9,www.smartwires.com,0.570922,"Founded in 2010 and based in California, US, S..."
