# Text similarity and strategy score using word2vec

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import pathlib
import random
import math
import re

In [2]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Download the pretrained model. After doing this for the first time, you have to restart the runtime.

In [3]:
# Download the en_core_web_lg spaCy model that is loaded later with spacy.load
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


Mount Google Drive to read the files. For this to work, go into your Google Drive, find the shared folder, right-click on it and select "Add shortcut to Drive".

In [4]:
# Colab-only: mount Google Drive so the relative "drive/My Drive/..." paths
# used by the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Startup deals table; keep only the first row for each portfoliocompanyid.
df_startups = pd.read_stata("drive/My Drive/Capstone Shared Docs/data/all_deals.dta")
df_startups_unique = df_startups.drop_duplicates(subset="portfoliocompanyid")

# Public firms table; keep only the first row for each "ÿþmark" code.
# NOTE(review): the "ÿþ" prefix looks like a byte-order mark that leaked into
# the column name when the file was created - confirm before renaming it.
df_public = pd.read_stata("drive/My Drive/Capstone Shared Docs/data/all_public_firms.dta")
df_public_unique = df_public.drop_duplicates(subset="ÿþmark")

In [6]:
# List the result folders available on the shared drive
!ls drive/My\ Drive/Capstone\ Shared\ Docs/result

'Colab Nov1 startup 2000'  'Colab Nov3 Public 2013'
'Colab Nov1 startup 2001'  'Colab Nov3 Public 2014'
'Colab Nov1 startup 2002'  'Colab Oct30 startup 2009'
'Colab Nov1 startup 2003'  'Colab Oct30 startup 2011'
'Colab Nov1 startup 2004'  'Colab Oct30 startup 2017'
'Colab Nov1 startup 2005'  'Colab Oct30 startup 2018'
'Colab Nov1 startup 2006'  'Colab Oct30 startup 2019'
'Colab Nov1 startup 2007'  'EC2 full run Oct14 result'
'Colab Nov1 startup 2008'  'EC2 Oct 13 result'
'Colab Nov1 startup 2012'  'EC2 Oct26 public data'
'Colab Nov1 startup 2013'  'Error Log Resume.xlsx'
'Colab Nov1 startup 2014'  'Local Oct 13 result'
'Colab Nov1 startup 2015'  'Local Oct 6 result'
'Colab Nov1 startup 2016'  'terminal log_oct5_am.txt'
'Colab Nov3 Public 2012'   'terminal log_oct5.txt'


## Loading startup data

In [7]:
# Folder holding one scraped-text file per startup (see the listing above)
path_startup = pathlib.Path("drive/My Drive/Capstone Shared Docs/result/EC2 full run Oct14 result")

In [8]:
# Every *.txt file directly inside the startup folder.
# NOTE(review): glob order follows the file system and is presumably not
# sorted, so row order downstream may vary between runs - confirm.
startup_paths = list(path_startup.glob('*.txt'))
startup_paths[:5]

[PosixPath('drive/My Drive/Capstone Shared Docs/result/EC2 full run Oct14 result/73501_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/EC2 full run Oct14 result/160418_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/EC2 full run Oct14 result/161261_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/EC2 full run Oct14 result/129092_2011.txt'),
 PosixPath('drive/My Drive/Capstone Shared Docs/result/EC2 full run Oct14 result/77179_2011.txt')]

In [9]:
# Map each startup code (the part of the file name before the first "_",
# e.g. "73501" for 73501_2011.txt) to its whitespace-normalised page text.
startups_text = {}
for p in startup_paths:
    n = p.name.split("_")[0]
    # The context manager closes the handle even when reading fails
    with open(p) as file:
        # Remove first two and last characters, because of ascii encoding
        text = file.read().replace("***///***","\n")[2:-1]
    # Splitting on a single space yields "" for every run of consecutive
    # spaces; dropping those collapses the runs into one space.
    text = " ".join(x for x in text.split(" ") if x != "")
    startups_text[n] = text

print(len(startups_text))

430


In [10]:
# One row per startup code (index) with the raw text and a company-type flag.
startup_frame = pd.DataFrame.from_dict(startups_text, orient='index')
startup_frame.columns = ['text']
startup_frame = startup_frame.assign(comp_type='S')
stop_words_l=stopwords.words('english')

# Set membership gives the same answer as the list but without scanning the
# whole stop-word list for every token.
stop_words_lookup = set(stop_words_l)

# For each whitespace-separated token: replace every non-letter with a space,
# lower-case it, and keep it unless the result is an English stop word.
# The substitution is computed once per token instead of twice.
startup_frame['text_cleaned'] = startup_frame.text.apply(
    lambda x: " ".join(
        cleaned
        for cleaned in (re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split())
        if cleaned not in stop_words_lookup
    )
)
startup_frame.head()

Unnamed: 0,text,comp_type,text_cleaned
73501,JOOR Skip navigation Member Log In Forgot pass...,S,joor skip navigation member log forgot passwor...
160418,Patients Physicians Employers Login Crossover ...,S,patients physicians employers login crossover ...
161261,Home | Recent questions | Directories | Feedba...,S,home recent questions directories feedba...
129092,Nanotronics Home News Products What is nSpec ?...,S,nanotronics home news products nspec system ...
77179,Home About Us Our Approach Technology Our Cust...,S,home us approach technology customers contact ...


In [11]:
# Move the company code from the index into a "compid" column.
# Guarded: once the index has been reset, running this cell again would
# otherwise overwrite the real codes with the positional index 0..n-1.
if "compid" not in startup_frame.columns:
    startup_frame["compid"] = startup_frame.index
    startup_frame = startup_frame.reset_index(drop=True)

## Loading public companies data

In [12]:
# Folder holding one scraped-text file per public company
path_public = pathlib.Path("drive/My Drive/Capstone Shared Docs/result/EC2 Oct26 public data")

In [13]:
# Every *.txt file directly inside the public-company folder
public_paths =list(path_public.glob('*.txt'))

In [14]:
# Map each public-company code (the part of the file name before the first
# "_") to its whitespace-normalised page text.
public_text = {}
for p in public_paths:
    n = p.name.split("_")[0]
    # The context manager closes the handle even when reading fails
    with open(p) as file:
        # Remove first two and last characters, because of ascii encoding
        text = file.read().replace("***///***","\n")[2:-1]
    # Splitting on a single space yields "" for every run of consecutive
    # spaces; dropping those collapses the runs into one space.
    text = " ".join(x for x in text.split(" ") if x != "")
    public_text[n] = text

print(len(public_text))

536


In [15]:
# One row per public-company code (index) with the raw text and a type flag.
public_frame = pd.DataFrame.from_dict(public_text, orient='index')
public_frame.columns = ['text']
public_frame = public_frame.assign(comp_type='P')

# Set membership gives the same answer as the stop-word list but without
# scanning the whole list for every token.
stop_words_lookup = set(stop_words_l)

# For each whitespace-separated token: replace every non-letter with a space,
# lower-case it, and keep it unless the result is an English stop word.
# The substitution is computed once per token instead of twice.
public_frame['text_cleaned'] = public_frame.text.apply(
    lambda x: " ".join(
        cleaned
        for cleaned in (re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split())
        if cleaned not in stop_words_lookup
    )
)
public_frame.head()


Unnamed: 0,text,comp_type,text_cleaned
151,CVR Energy is an independent petroleum refiner...,P,cvr energy independent petroleum refiner marke...
1658,COMPANY Executive Profiles History Company Val...,P,company executive profiles history company val...
5503,Home | About Us | Locations | ATM Locations | ...,P,home us locations atm locations contac...
2901,Home | Site Map | Contact Us Information is be...,P,home site map contact us information colle...
1510,Search: Corporate Info | Contact GAF | News & ...,P,search corporate info contact gaf news ...


In [16]:
# Move the company code from the index into a "compid" column.
# Guarded: once the index has been reset, running this cell again would
# otherwise overwrite the real codes with the positional index 0..n-1.
if "compid" not in public_frame.columns:
    public_frame["compid"] = public_frame.index
    public_frame = public_frame.reset_index(drop=True)

## Language issues

In [17]:
# Stack startups on top of public companies under a fresh 0..n-1 index.
all_frame = pd.concat([startup_frame, public_frame], ignore_index=True)
print(all_frame.shape)
all_frame.head()

(966, 4)


Unnamed: 0,text,comp_type,text_cleaned,compid
0,JOOR Skip navigation Member Log In Forgot pass...,S,joor skip navigation member log forgot passwor...,73501
1,Patients Physicians Employers Login Crossover ...,S,patients physicians employers login crossover ...,160418
2,Home | Recent questions | Directories | Feedba...,S,home recent questions directories feedba...,161261
3,Nanotronics Home News Products What is nSpec ?...,S,nanotronics home news products nspec system ...,129092
4,Home About Us Our Approach Technology Our Cust...,S,home us approach technology customers contact ...,77179


In [57]:
# Rows whose raw text repeats the text of an earlier row
all_frame[all_frame.duplicated("text")]

Unnamed: 0,text,comp_type,text_cleaned,compid,language,Rela_language
85,503 Service Unavailable No server is available...,S,service unavailable server available handl...,165407,en,True
95,503 Service Unavailable No server is available...,S,service unavailable server available handl...,73277,en,True
181,503 Service Unavailable No server is available...,S,service unavailable server available handl...,74117,en,True
198,503 Service Unavailable No server is available...,S,service unavailable server available handl...,71518,en,True
255,503 Service Unavailable No server is available...,S,service unavailable server available handl...,71519,en,True
261,"2011 Marchex Sales, Inc. | Terms Of Use | Priv...",S,marchex sales inc terms use privacy...,180156,en,True
343,banned interdit verboden vietato prohibido ver...,S,banned interdit verboden vietato prohibido ver...,129019,fr,False
419,Your browser does not support frames. We recom...,S,browser support frames recommend upgrading br...,135700,en,False
423,503 Service Unavailable No server is available...,S,service unavailable server available handl...,73304,en,True
554,About BlackRock Contact United States Change C...,P,blackrock contact united states change country...,4058,en,True


In [58]:
# Full text of the first duplicated row
all_frame[all_frame.duplicated("text")].iloc[0].text

'503 Service Unavailable No server is available to handle this request.'

In [61]:
# Keep only the first row for every distinct raw text.
all_frame = all_frame.drop_duplicates(subset="text", keep="first")

In [62]:
# Check: no duplicated texts should remain after the drop above
all_frame[all_frame.duplicated("text")]

Unnamed: 0,text,comp_type,text_cleaned,compid,language,Rela_language


In [21]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install gcld3



In [22]:
# Imported here rather than at the top because the package is installed by the
# previous cell.
import gcld3
# Language identifier used below to label every cleaned text.
# NOTE(review): min_num_bytes / max_num_bytes presumably bound how much of each
# text the detector looks at - confirm against the gcld3 documentation.
detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=10000)

In [63]:
# Detect the language of every cleaned text. The two lists stay parallel to
# the rows of all_frame: detected language code and the reliability flag.
lan, rel_lan = [], []
for cleaned_text in all_frame.text_cleaned:
  detection = detector.FindLanguage(text=str(cleaned_text))
  lan.append(detection.language)
  rel_lan.append(detection.is_reliable)

In [64]:
# Attach the detection results, row by row, as two new columns.
all_frame = all_frame.assign(language=lan, Rela_language=rel_lan)
all_frame.head()

Unnamed: 0,text,comp_type,text_cleaned,compid,language,Rela_language
0,JOOR Skip navigation Member Log In Forgot pass...,S,joor skip navigation member log forgot passwor...,73501,en,True
1,Patients Physicians Employers Login Crossover ...,S,patients physicians employers login crossover ...,160418,en,True
2,Home | Recent questions | Directories | Feedba...,S,home recent questions directories feedba...,161261,en,True
3,Nanotronics Home News Products What is nSpec ?...,S,nanotronics home news products nspec system ...,129092,en,True
4,Home About Us Our Approach Technology Our Cust...,S,home us approach technology customers contact ...,77179,en,True


In [65]:
# Rows whose detected language is not English
all_frame[all_frame.language!="en"]

Unnamed: 0,text,comp_type,text_cleaned,compid,language,Rela_language
17,Index of / Name Last modified Size Description...,S,index name last modified size description fa...,79115,it,True
27,Bienvenido al Nuevo Mundo de Nativo Network . ...,S,bienvenido al nuevo mundo de nativo network ...,112162,es,True
33,Presentacin Asesores / Colaboradores Com desca...,S,presentacin asesores colaboradores com desca...,230198,es,True
55,banned interdit verboden vietato prohibido ver...,S,banned interdit verboden vietato prohibido ver...,93431,fr,False
66,Who wants another funny behind the scenes vid ...,S,wants another funny behind scenes vid announce...,178116,mg,False
71,The Domain SOSH.COM was Successfully Registere...,S,domain sosh com successfully registered joker ...,97785,no,False
74,movista.com Algunos sitios relacionados con: m...,S,movista com algunos sitios relacionados con m...,297244,es,True
118,Moda Operandi puts you front row at the runway...,S,moda operandi puts front row runway comfort p...,72780,sn,False
125,A stealth mode San Francisco startup ContextLo...,S,stealth mode san francisco startup contextlogi...,81046,gl,True
169,Microsoft JET Database Engine error '80040e07'...,S,microsoft jet database engine error e ...,74020,zu,False


In [66]:
# Keep the rows detected as English (regardless of the reliability flag) as an
# independent copy with a fresh 0..n-1 index.
is_english = all_frame.language == 'en'
all_frame_en = all_frame[is_english].copy().reset_index(drop=True)

In [67]:
# Size of every cleaned text in characters and in whitespace-separated words;
# then show the longest documents first.
cleaned_texts = all_frame_en.text_cleaned
all_frame_en["text_len"] = cleaned_texts.map(len)
all_frame_en["words_len"] = cleaned_texts.map(lambda doc: len(doc.split()))
all_frame_en.sort_values("text_len", ascending=False).head()

Unnamed: 0,text,comp_type,text_cleaned,compid,language,Rela_language,text_len,words_len
592,Home Suppliers Site Map Contact Us Automotive ...,P,home suppliers site map contact us automotive ...,143,en,True,16065268,4806695
886,Subscriptions Log in to FlowSelex Select Langu...,P,subscriptions log flowselex select language en...,604,en,True,10026805,2897832
709,LSI.com is no longer optimized for IE6. Please...,P,lsi com longer optimized ie please upgrade b...,861,en,True,7854029,2289538
660,"RSS Feb 1, 2011 Silver Dragon Reports ~10.2M o...",P,rss feb silver dragon reports m o...,7678,en,True,5834470,1749186
797,Home | Ag Services | Farm Brokerage | Customer...,P,home ag services farm brokerage customer...,2268,en,True,5508864,1598096


## Informative columns

The following dictionaries map each company code to its name, website and background description.

In [68]:
# Build code -> name / website dictionaries restricted to the companies that
# have a scraped text file. The source tables are indexed by company code once,
# instead of filtering the whole frame twice for every file.
startup_names = df_startups_unique.set_index("portfoliocompanyid").portfoliocompany.to_dict()
startup_websites = df_startups_unique.set_index("portfoliocompanyid").website.to_dict()
public_names = df_public_unique.set_index("ÿþmark").companyname.to_dict()
public_websites = df_public_unique.set_index("ÿþmark").websiteaddress.to_dict()

startup_code_to_name = {}
startup_code_to_website = {}
public_code_to_name = {}
public_code_to_website = {}

# Startup codes are compared as integers; a code that is missing from the
# table raises KeyError.
for p in startup_paths:
  n = int(p.name.split("_")[0])
  startup_code_to_name[n] = startup_names[n]
  startup_code_to_website[n] = startup_websites[n]

# Public-company codes are compared as strings.
for p in public_paths:
  n = str(p.name.split("_")[0])
  public_code_to_name[n] = public_names[n]
  public_code_to_website[n] = public_websites[n]


In [69]:
# Code -> free-text description for every company in the source tables.
startup_code_to_info = dict(
    zip(df_startups_unique["portfoliocompanyid"], df_startups_unique["background"])
)
public_code_to_info = dict(
    zip(df_public_unique["ÿþmark"], df_public_unique["descriptionandhistory"])
)

In [70]:
# Fill website / name / info for each company type. Startup codes are looked
# up as integers, public-company codes as strings.
lookups_by_type = {
    "S": (int, startup_code_to_website, startup_code_to_name, startup_code_to_info),
    "P": (str, public_code_to_website, public_code_to_name, public_code_to_info),
}
for company_type, (code_type, websites, names, infos) in lookups_by_type.items():
  of_this_type = all_frame_en.comp_type == company_type
  codes = all_frame_en[of_this_type].compid.astype(code_type)
  all_frame_en.loc[of_this_type, "website"] = codes.map(websites)
  all_frame_en.loc[of_this_type, "name"] = codes.map(names)
  all_frame_en.loc[of_this_type, "info"] = codes.map(infos)
all_frame_en.head()

Unnamed: 0,text,comp_type,text_cleaned,compid,language,Rela_language,text_len,words_len,website,name,info
0,JOOR Skip navigation Member Log In Forgot pass...,S,joor skip navigation member log forgot passwor...,73501,en,True,27748,3614,www.jooraccess.com,"JOOR, Inc.","Founded in 2010 and based in New York, US, JOO..."
1,Patients Physicians Employers Login Crossover ...,S,patients physicians employers login crossover ...,160418,en,True,489,66,www.crossoverhealth.com,"Crossover Health Management Services, Inc.","Founded in 2010 and based in California, US, C..."
2,Home | Recent questions | Directories | Feedba...,S,home recent questions directories feedba...,161261,en,True,656,92,www.medwhat.com,"Medwhat.com, Inc.","Established in 2010, based in California, US, ..."
3,Nanotronics Home News Products What is nSpec ?...,S,nanotronics home news products nspec system ...,129092,en,True,19264,2337,www.nanotronicsimaging.com,"Nanotronics Imaging, Inc.","Founded in 2010 and based in New York, US, Nan..."
4,Home About Us Our Approach Technology Our Cust...,S,home us approach technology customers contact ...,77179,en,True,1326,167,www.hello.getsidecar.com,"Sidecar Interactive, Inc.","Established in 2010 and based in Philadelphia,..."


## NLP with word2vec

In [71]:
# Load the pretrained model with the tagger, parser and NER components
# disabled; the cells below only read the document vectors.
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner"])

Go from documents to vectors using the pretrained model. 

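The spaCy model that provides the word vectors limits the number of characters it accepts in a single text. If a text is longer than that limit, I split it into chunks, compute one vector per chunk, and then take the average of the chunk vectors weighted by chunk length.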


In [72]:
# word2vec has a max of 1000000 characters
max_characters = 1000000
docs_train = []

# One (300, 1) vector per document. Long texts are cut into chunks of at most
# max_characters characters; the document vector is the average of the chunk
# vectors weighted by the chunk length in characters.
for doc_text in all_frame_en.text_cleaned:
  weighted_sum = np.zeros((300,1))
  total_chars = 0
  for start in range(0, len(doc_text), max_characters):
    small_text = doc_text[start:start+max_characters]
    vec = nlp(small_text).vector
    weighted_sum += vec.reshape(vec.shape[0],1)*len(small_text)
    total_chars += len(small_text)
  # An empty text has no chunks: keep the zero vector instead of computing
  # 0/0, which would put NaNs into every similarity involving this row.
  if total_chars > 0:
    weighted_sum /= total_chars
  docs_train.append(weighted_sum)

In [73]:
# Lay every (300, 1) document vector down as one row of the feature matrix.
X_train = np.concatenate([d.reshape(1, -1) for d in docs_train], axis=0)
X_train.shape

(902, 300)

In [74]:
def cos_sim(v1,v2):
    """Cosine similarity of two vectors.

    The inner product of ``v1`` and ``v2`` is divided by the product of their
    Euclidean norms. For 1-D arrays the result is a scalar.
    """
    inner_product = np.dot(v1.T, v2)
    norm_v1 = np.sqrt(np.sum(np.dot(v1.T, v1)))
    norm_v2 = np.sqrt(np.sum(np.dot(v2.T, v2)))
    return inner_product / (norm_v1 * norm_v2)

def similarity_to_all(vec):
    """Cosine similarity between ``vec`` and every row of the global ``X_train``.

    Returns an array of shape (number of rows in X_train, 1).
    """
    scores = []
    for row in X_train:
        scores.append(cos_sim(vec, row))
    scores = np.asarray(scores)
    return scores.reshape(scores.shape[0], 1)

def get_most_similar(sim_matrix_v, startup_index, how_many=1):
  """Find the entries most similar to the one at ``startup_index``.

  Parameters:
    sim_matrix_v: square similarity matrix; row ``i`` holds the similarity of
      entry ``i`` to every entry. It is not modified.
    startup_index: row of the entry to find neighbours for.
    how_many: number of neighbours wanted. At most ``n - 1`` are returned for
      a row of length ``n``, because the entry itself is never a neighbour.

  Returns:
    (indices, similarities): two lists of equal length, ordered from the most
    to the least similar neighbour.
  """
  # Copy only the requested row (as float) rather than the whole matrix
  row = np.array(sim_matrix_v[startup_index], dtype=float)
  # Mask with -inf, not 0: similarities can be zero or negative, and a 0 mask
  # would then let the entry itself or an already-picked one win argmax.
  row[startup_index] = -np.inf
  how_many = min(how_many, row.size - 1)
  most_similar_indices = []
  similarity_list = []
  for _ in range(how_many):
    ind = np.argmax(row)
    most_similar_indices.append(ind)
    similarity_list.append(row[ind])
    row[ind] = -np.inf
  return most_similar_indices, similarity_list

In [75]:
# Sanity check: similarity between the first two documents
cos_sim(X_train[0],X_train[1])

0.9026528357083603

In [76]:
# Sanity check: one similarity per document, as a column vector
similarity_to_all(X_train[0]).shape

(902, 1)

This is the matrix with the pairwise similarity between all companies (startups and public firms).

In [77]:
# Column i holds the similarity of document i to every document.
sim_vect = [similarity_to_all(X_train[i]) for i in range(X_train.shape[0])]
sim_matrix = np.hstack(sim_vect)
sim_matrix

array([[1.        , 0.90265284, 0.92210691, ..., 0.88866732, 0.88491462,
        0.83882375],
       [0.90265284, 1.        , 0.89521577, ..., 0.83928689, 0.86952002,
        0.8244992 ],
       [0.92210691, 0.89521577, 1.        , ..., 0.85628396, 0.83364919,
        0.79175202],
       ...,
       [0.88866732, 0.83928689, 0.85628396, ..., 1.        , 0.84989794,
        0.85579738],
       [0.88491462, 0.86952002, 0.83364919, ..., 0.84989794, 1.        ,
        0.8696521 ],
       [0.83882375, 0.8244992 , 0.79175202, ..., 0.85579738, 0.8696521 ,
        1.        ]])

In [78]:
# Mean and minimum similarity of the first document to all documents
np.mean(sim_matrix[0]),np.min(sim_matrix[0])

(0.8336655301845008, 0.10433832217566776)

## Run an example to see the most similar startups of a company

In [79]:
def background_text(t, line_len,line_num):
  """Wrap ``t`` into at most ``line_num`` lines of ``line_len`` characters.

  The lines are joined with newlines; "..." is appended when ``t`` is longer
  than ``line_len * line_num`` characters and was therefore cut.
  """
  lines = []
  for k in range(line_num):
    piece = t[k*line_len:(k+1)*line_len]
    if piece != "":
      lines.append(piece)
  wrapped = "\n".join(lines)
  was_cut = len(t) > line_len*line_num
  return wrapped + "..." if was_cut else wrapped

In [80]:
# Pick a random company and print it together with its n nearest neighbours.
example_index = random.randint(0,all_frame_en.shape[0]-1)
n = 5
line_len = 100
line_num = 4
most_similar_index,most_similar_similarity = get_most_similar(sim_matrix, example_index, n)

example_row = all_frame_en.loc[example_index]
print(f"Example company ({example_row.comp_type}): {example_row.website}")
print(background_text(example_row.info,line_len,line_num))
print("-----------------------------------------------------------------")
print("Similar companies:")
for rank, (neighbour, similarity) in enumerate(zip(most_similar_index, most_similar_similarity), start=1):
  neighbour_row = all_frame_en.loc[neighbour]
  print(f"{rank}): {neighbour_row.website} ({neighbour_row.comp_type})  |   Similarity: {similarity}")
  print(background_text(neighbour_row.info, line_len, line_num))
  print("")


Example company (P): www.aeroflex.com
Aeroflex Holding Corp. (Aeroflex Holding), incorporated on May 9, 2005, is a provider of radio frequ
ency (RF) and microwave integrated circuits, components and systems used in the design, development 
and maintenance of wireless communication systems. The Company's solutions include microelectronic c
omponents and test and measurement equipment used by companies in the space, avionics and defense; c...
-----------------------------------------------------------------
Similar companies:
1): www.cifc.com (P)  |   Similarity: 0.9553968220863334
The Company is a real estate investment trust.

2): www.alliedwaste.com (P)  |   Similarity: 0.9494804556386659
Allied Waste Industries, Inc. (Allied), incorporated in 1989, is a non-hazardous, solid waste manage
ment company. The Company provides collection, transfer, recycling and disposal services for more th
an eight million residential, commercial and industrial customers. Allied serves its customers thro

# Strategy score

These functions should compute the strategy score used in the paper.

In [81]:
def compute_strategy_score(similarity_list):
  """Average distance, where each distance is ``1 - similarity``."""
  distances = []
  for similarity in similarity_list:
    distances.append(1 - similarity)
  total_distance = np.sum(distances)
  return (1.0 / len(distances)) * total_distance

def get_similarity_score(ind):
  """Strategy score of the company at row ``ind`` of the global ``sim_matrix``.

  The score is computed from the similarities to its 5 most similar companies.
  """
  _, nearest_similarities = get_most_similar(sim_matrix, ind, 5)
  score = compute_strategy_score(nearest_similarities)
  return score

In [97]:
# Score every company (index labels double as row positions in sim_matrix),
# rank all companies by score, then keep the startups only.
all_frame_en["score"] = [get_similarity_score(row_label) for row_label in all_frame_en.index]
all_frame_en = all_frame_en.sort_values("score", ascending=False)
startups_score = all_frame_en[all_frame_en.comp_type == "S"]

In [98]:
# Startups with the highest scores
startups_score.head()

Unnamed: 0,text,comp_type,text_cleaned,compid,language,Rela_language,text_len,words_len,website,name,info,score
20,"rel8tion Coming Soon copyright 2010 rel8tion, ...",S,rel tion coming soon copyright rel tion ...,75381,en,True,62,10,www.rel8tion.com,Rel8tion,"Founded in 2010 and is based in Seattle, Washi...",0.290578
289,Please enter your E-mail Save Close Welcome ba...,S,please enter e mail save close welcome back vi...,103883,en,False,175872,22722,www.vivino.com,"Vivino, Inc.","Founded in 2010 and based in California, US, V...",0.247564
45,| | | Sitemap 10Cookies 68 PT FAQ PT Copyright...,S,sitemap cookies pt faq pt copyright...,172978,en,True,101,13,www.ptmind.com,Ptmind Inc.,"Founded in 2010 and based in Washington, US, P...",0.241406
370,Mobile Login Register for Free Invite friends ...,S,mobile login register free invite friends beta...,120176,en,False,7868,1020,www.letshum.com,Hum,"Founded in 2010 and based in California, US, H...",0.238755
299,DIRT BIKES Dirt Bike Gear Dirt Bike Parts Top ...,S,dirt bikes dirt bike gear dirt bike parts top ...,112592,en,True,346,49,www.jumpbikes.com,"Social Bicycles, Inc.","Founded in 2010 and based in New York, US, Soc...",0.233409


## Startups with the best score

In [101]:
# Print the n highest-scoring startups with a wrapped background description.
n =10
for rank in range(1, n + 1):
  startup_row = startups_score.iloc[rank - 1]
  print(f"{rank}): {startup_row.website} |   Score: {startup_row.score}")
  print(background_text(startup_row.info, line_len, line_num), end="\n\n")

1): www.rel8tion.com |   Score: 0.2905777420454519
Founded in 2010 and is based in Seattle, Washington, Rel8tion is a mobile advertising company, provi
des hyperlocal mobile advertising technology.

2): www.vivino.com |   Score: 0.24756376950653555
Founded in 2010 and based in California, US, Vivino, Inc. operates an online marketplace which offer
s wines.

3): www.ptmind.com |   Score: 0.2414055139165946
Founded in 2010 and based in Washington, US, Ptmind Inc. develops and owns a heatmap and web analyti
cs platform. The company's products are Ptengine and DataDeck that help businesses share, organize, 
optimize and visualize their data in real-time.

4): www.letshum.com |   Score: 0.23875489117615364
Founded in 2010 and based in California, US, Hum operates as a email and chat application focused on
 real-time conversation and presence.

5): www.jumpbikes.com |   Score: 0.23340909626911388
Founded in 2010 and based in New York, US, Social Bicycles, Inc., doing business as JUMP Bikes, 

In [100]:
# Table of the n highest-scoring startups, built in one step.
# DataFrame.append was removed in pandas 2.0 and copied the frame on every row.
n =10
top_10 = (
    startups_score.head(n)[["website", "score", "info"]]
    .rename(columns={"website": "startup", "info": "background"})
    .reset_index(drop=True)
)

top_10

Unnamed: 0,startup,score,background
0,www.rel8tion.com,0.290578,"Founded in 2010 and is based in Seattle, Washi..."
1,www.vivino.com,0.247564,"Founded in 2010 and based in California, US, V..."
2,www.ptmind.com,0.241406,"Founded in 2010 and based in Washington, US, P..."
3,www.letshum.com,0.238755,"Founded in 2010 and based in California, US, H..."
4,www.jumpbikes.com,0.233409,"Founded in 2010 and based in New York, US, Soc..."
5,www.bifflabs.com,0.220084,"Founded in 2010, Spindle Labs, formerly knwon ..."
6,www.4me.com,0.203664,"Founded in 2010 and based in California, US, 4..."
7,www.yesware.com,0.193809,"Founded in 2010 and based in Massachusetts, US..."
8,www.bigswitch.com,0.19177,"Founded in 2010 and based in California, US, B..."
9,www.treatfeed.com,0.191492,"Founded in 2010, based in Los Angeles, Califor..."
