# **Package installation**

In [None]:
pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in /usr/local/lib/python3.6/dist-packages (0.3.8)


In [None]:
pip install gcld3



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import pathlib
import random
import math
import re
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sentence_transformers import SentenceTransformer

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import gcld3

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls '/content/drive/My Drive/Capstone Shared Docs/result'

'EC2 full run Oct14 result'  'Local Oct 13 result'
'EC2 Oct 13 result'	     'Local Oct 6 result'
'EC2 Oct26 public data'      'terminal log_oct5_am.txt'
'Error Log Resume.xlsx'      'terminal log_oct5.txt'


# **Loading dta files**

In [None]:
# Read the startup deals table and keep the first row seen for each portfolio company.
startup_deals_file = "drive/My Drive/Capstone Shared Docs/data/all_deals.dta"
df_startup = pd.read_stata(startup_deals_file)
df_startup_unique = df_startup.drop_duplicates(
    subset=["portfoliocompanyid"],
    keep="first",
)

In [None]:
# Read the public-firms table and keep the first row seen for each value of the key column.
# NOTE(review): the key column name starts with "ÿþ", which looks like a byte-order
# mark that leaked into the header -- confirm against the source .dta file.
public_firms_file = "drive/My Drive/Capstone Shared Docs/data/all_public_firms.dta"
df_public = pd.read_stata(public_firms_file)
df_public_unique = df_public.drop_duplicates(
    subset=["ÿþmark"],
    keep="first",
)

# **Loading txt Data Startups**

In [None]:
path_startup = pathlib.Path("drive/My Drive/Capstone Shared Docs/result/EC2 full run Oct14 result")

In [None]:
# Path.glob yields matches in arbitrary, filesystem-dependent order; sorting makes the
# row order of the frames built from this list reproducible across runs.
txt_startup = sorted(path_startup.glob('*.txt'))

In [None]:
# Maps the file-name prefix before the first "_" (used as the company id) to the
# scraped text. A later file with the same prefix overwrites an earlier one.
all_text_startup = {}

for p in txt_startup:
    n = p.name.split("_")[0]
    # The context manager closes the handle even if read() raises.
    with open(p) as file:
        raw_text = file.read()
    # Drop the "***///***" markers, then the first two and the last character
    # (original author's note: needed "because of ascii encoding").
    text = raw_text.replace("***///***", "")[2:-1]
    # Collapse runs of spaces by discarding the empty pieces produced by split(" ").
    text = " ".join(piece for piece in text.split(" ") if piece != "")
    all_text_startup[n] = text

print(len(all_text_startup))

430


In [None]:
# One row per startup: the dict keys become `companyid`, the dict values become `Text`,
# and every row is tagged with comp_type 'S'.
Txt_startup_frame = pd.DataFrame.from_dict(all_text_startup, orient='index')
Txt_startup_frame.columns = ['Text']
Txt_startup_frame = Txt_startup_frame.assign(
    companyid=Txt_startup_frame.index,
    comp_type='S',
)
# Replace the id-based index with positions 0..n-1.
Txt_startup_frame.index = np.arange(0, len(Txt_startup_frame.companyid))
Txt_startup_frame = Txt_startup_frame[['companyid', 'comp_type', 'Text']]

In [None]:
stop_words_l = stopwords.words('english')
# Set for constant-time membership tests; stop_words_l itself stays a list.
stop_words_set = set(stop_words_l)
non_letter = re.compile(r'[^a-zA-Z]')

# For every whitespace-separated token: replace each non-letter character with a space,
# lowercase it, and drop the token if the result is an English stop word.
# Each token is normalised once (the substitution used to run twice per token).
Txt_startup_frame['Text_cleaned'] = Txt_startup_frame.Text.apply(
    lambda x: " ".join(
        token
        for token in (non_letter.sub(' ', w).lower() for w in x.split())
        if token not in stop_words_set
    )
)

In [None]:
Txt_startup_frame.head()

Unnamed: 0,companyid,comp_type,Text,Text_cleaned
0,73501,S,JOOR Skip navigation Member Log In Forgot pass...,joor skip navigation member log forgot passwor...
1,160418,S,Patients Physicians Employers Login Crossover ...,patients physicians employers login crossover ...
2,161261,S,Home | Recent questions | Directories | Feedba...,home recent questions directories feedba...
3,129092,S,Nanotronics Home News Products What is nSpec ?...,nanotronics home news products nspec system ...
4,77179,S,Home About Us Our Approach Technology Our Cust...,home us approach technology customers contact ...


# **Loading txt Data Public Companies**

In [None]:
path_public = pathlib.Path("drive/My Drive/Capstone Shared Docs/result/EC2 Oct26 public data")

In [None]:
# Path.glob yields matches in arbitrary, filesystem-dependent order; sorting makes the
# row order of the frames built from this list reproducible across runs.
txt_public = sorted(path_public.glob('*.txt'))

In [None]:
# Maps the file-name prefix before the first "_" (used as the company id) to the
# scraped text. A later file with the same prefix overwrites an earlier one.
all_text_public = {}

for p in txt_public:
    n = p.name.split("_")[0]
    # The context manager closes the handle even if read() raises.
    with open(p) as file:
        raw_text = file.read()
    # Drop the "***///***" markers, then the first two and the last character
    # (original author's note: needed "because of ascii encoding").
    text = raw_text.replace("***///***", "")[2:-1]
    # Collapse runs of spaces by discarding the empty pieces produced by split(" ").
    text = " ".join(piece for piece in text.split(" ") if piece != "")
    all_text_public[n] = text

print(len(all_text_public))

536


In [None]:
# One row per public company: the dict keys become `companyid`, the dict values become
# `Text`, and every row is tagged with comp_type 'P'.
Txt_public_frame = pd.DataFrame.from_dict(all_text_public, orient='index')
Txt_public_frame.columns = ['Text']
Txt_public_frame = Txt_public_frame.assign(
    companyid=Txt_public_frame.index,
    comp_type='P',
)
# Replace the id-based index with positions 0..n-1.
Txt_public_frame.index = np.arange(0, len(Txt_public_frame.companyid))
Txt_public_frame = Txt_public_frame[['companyid', 'comp_type', 'Text']]

In [None]:
stop_words_l = stopwords.words('english')
# Set for constant-time membership tests; stop_words_l itself stays a list.
stop_words_set = set(stop_words_l)
non_letter = re.compile(r'[^a-zA-Z]')

# For every whitespace-separated token: replace each non-letter character with a space,
# lowercase it, and drop the token if the result is an English stop word.
# Each token is normalised once (the substitution used to run twice per token).
Txt_public_frame['Text_cleaned'] = Txt_public_frame.Text.apply(
    lambda x: " ".join(
        token
        for token in (non_letter.sub(' ', w).lower() for w in x.split())
        if token not in stop_words_set
    )
)

In [None]:
# Preview the first rows (only the last expression of a cell is displayed, so the
# bare frame expression that used to precede this line had no effect).
Txt_public_frame.head()

Unnamed: 0,companyid,comp_type,Text,Text_cleaned
0,151,P,CVR Energy is an independent petroleum refiner...,cvr energy independent petroleum refiner marke...
1,1658,P,COMPANY Executive Profiles History Company Val...,company executive profiles history company val...
2,5503,P,Home | About Us | Locations | ATM Locations | ...,home us locations atm locations contac...
3,2901,P,Home | Site Map | Contact Us Information is be...,home site map contact us information colle...
4,1510,P,Search: Corporate Info | Contact GAF | News & ...,search corporate info contact gaf news ...


# **Concatenate Public and Startup DB**

In [None]:
# Stack startups on top of public companies; ignore_index renumbers the rows 0..n-1
# in one step instead of overwriting the index afterwards.
Txt_all_frame = pd.concat([Txt_startup_frame, Txt_public_frame], ignore_index=True)

In [None]:
Txt_all_frame.head()

Unnamed: 0,companyid,comp_type,Text,Text_cleaned
0,73501,S,JOOR Skip navigation Member Log In Forgot pass...,joor skip navigation member log forgot passwor...
1,160418,S,Patients Physicians Employers Login Crossover ...,patients physicians employers login crossover ...
2,161261,S,Home | Recent questions | Directories | Feedba...,home recent questions directories feedba...
3,129092,S,Nanotronics Home News Products What is nSpec ?...,nanotronics home news products nspec system ...
4,77179,S,Home About Us Our Approach Technology Our Cust...,home us approach technology customers contact ...


# **Solving some language issues**

In [None]:
Txt_all_frame.iloc[33:34,:]

Unnamed: 0,companyid,comp_type,Text,Text_cleaned
33,230198,S,Presentacin Asesores / Colaboradores Com desca...,presentacin asesores colaboradores com desca...


In [None]:
lan = []      # detected language code, one entry per row
rel_lan = []  # detector's is_reliable flag, one entry per row
detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=10000)

# Iterate over the column values directly so the loop does not depend on the frame's
# index labels being exactly 0..n-1.
for t in Txt_all_frame['Text_cleaned']:
  result = detector.FindLanguage(text=str(t))
  lan.append(result.language)
  rel_lan.append(result.is_reliable)


In [None]:
# Attach the detection results as two new columns, in this order.
for column_name, column_values in (('language', lan), ('Rela_language', rel_lan)):
  Txt_all_frame[column_name] = column_values

In [None]:
Txt_all_frame.language.unique()

array(['en', 'it', 'es', 'fr', 'mg', 'no', 'sn', 'gl', 'zu', 'eo', 'da',
       'fy', 'lb', 'de', 'sr', 'nl', 'fi', 'vi', 'hu', 'su', 'jv', 'pt',
       'ru-Latn'], dtype=object)

In [None]:
Txt_all_frame[Txt_all_frame.language=='es']

Unnamed: 0,companyid,comp_type,Text,Text_cleaned,language,Rela_language
27,112162,S,Bienvenido al Nuevo Mundo de Nativo Network . ...,bienvenido al nuevo mundo de nativo network ...,es,True
33,230198,S,Presentacin Asesores / Colaboradores Com desca...,presentacin asesores colaboradores com desca...,es,True
74,297244,S,movista.com Algunos sitios relacionados con: m...,movista com algunos sitios relacionados con m...,es,True
355,150133,S,Error 503 Service Unavailable Service Unavaila...,error service unavailable service unavaila...,es,True
508,4691,P,Call Us 24/7 1-800-786-7235 Why Bank Stem Cell...,call us bank stem cells v...,es,True
773,4640,P,Welcome to the Worlds Healthy Coffee Company M...,welcome worlds healthy coffee company manufact...,es,True
803,51,P,-------- Including dojo Configurations -------...,including dojo configurations ...,es,False


In [None]:
Txt_all_frame[Txt_all_frame.language=='fr']

Unnamed: 0,companyid,comp_type,Text,Text_cleaned,language,Rela_language
55,93431,S,banned interdit verboden vietato prohibido ver...,banned interdit verboden vietato prohibido ver...,fr,False
257,139205,S,Recherche de noms de domaine : www.indom.com L...,recherche de noms de domaine www indom com l...,fr,True
316,107417,S,Index of / cgi-bin/ dev/ Apache/2.2.15 (Unix) ...,index cgi bin dev apache unix mod...,fr,True
343,129019,S,banned interdit verboden vietato prohibido ver...,banned interdit verboden vietato prohibido ver...,fr,False
491,5208,P,Alpha Innotech is now part of Cell Biosciences...,alpha innotech part cell biosciences visit ww...,fr,True
650,5107,P,Forbidden You don't have permission to access ...,forbidden don t permission access server ap...,fr,True
733,3106,P,Skip Navigation Country UK (English) France (F...,skip navigation country uk english france f...,fr,True


In [None]:
# Keep only the rows detected as English and renumber them 0..n-1.
# reset_index returns a new, independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
# A disabled alternative also kept rows with language != 'en' combined with a
# `language == False` test (presumably meant to be Rela_language == False -- confirm
# before re-enabling).
Txt_all_frame_en = Txt_all_frame[Txt_all_frame.language == 'en'].reset_index(drop=True)

# **Length of text**

In [None]:
# Word count and character count of each cleaned text.
word_len = [len(i.split()) for i in Txt_all_frame_en.Text_cleaned]
text_len = [len(i) for i in Txt_all_frame_en.Text_cleaned]
# assign() returns a new frame with both columns added, which avoids the
# SettingWithCopyWarning raised by writing columns into a filtered slice.
Txt_all_frame_en = Txt_all_frame_en.assign(Word_len=word_len, Text_len=text_len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
Txt_all_frame_en.head()

Unnamed: 0,companyid,comp_type,Text,Text_cleaned,language,Rela_language,Word_len,Text_len
0,73501,S,JOOR Skip navigation Member Log In Forgot pass...,joor skip navigation member log forgot passwor...,en,True,3614,27748
1,160418,S,Patients Physicians Employers Login Crossover ...,patients physicians employers login crossover ...,en,True,66,489
2,161261,S,Home | Recent questions | Directories | Feedba...,home recent questions directories feedba...,en,True,92,656
3,129092,S,Nanotronics Home News Products What is nSpec ?...,nanotronics home news products nspec system ...,en,True,2337,19264
4,77179,S,Home About Us Our Approach Technology Our Cust...,home us approach technology customers contact ...,en,True,167,1326


In [None]:
Txt_all_frame_en.Word_len.mean()

30677.095135135136

In [None]:
Txt_all_frame_en.Text_len.mean()

133178.79027027026

In [None]:
Txt_all_frame_en.Word_len.max()

4806695

In [None]:
Txt_all_frame_en.Text_len.max()

16065268

In [None]:
np.percentile(Txt_all_frame_en.Word_len, 95)

58185.59999999997

In [None]:
np.percentile(Txt_all_frame_en.Text_len, 95)

353595.39999999985

In [None]:
np.percentile(Txt_all_frame_en.Word_len, 99)

565369.6799999988

In [None]:
np.percentile(Txt_all_frame_en.Text_len, 99)

2461761.719999997

# **BERT Implementation**

### **Example 1**

In [None]:
# Three short example sentences; embedded in the examples that follow.
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string.',
             'The quick brown fox jumps over the lazy dog.']

In [None]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
checkpoint = "sentence-transformers/roberta-large-nli-stsb-mean-tokens"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)


In [None]:
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

In [None]:
with torch.no_grad():
    model_output = model(**encoded_input)

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average each sequence's token embeddings, counting only unmasked positions.

    Args:
        model_output: indexable whose element 0 holds the token embeddings.
        attention_mask: mask tensor; it is expanded along a trailing dimension to the
            size of the token embeddings and cast to float before use.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask sum over dimension 1.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_total = (embeddings * mask).sum(1)
    # Clamp so that a fully masked sequence divides by 1e-9 instead of by zero.
    mask_total = mask.sum(1).clamp(min=1e-9)
    return masked_total / mask_total

In [None]:
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [None]:
sentence_embeddings

tensor([[ 0.6307, -0.2880,  0.0535,  ...,  0.2687, -2.2384,  0.2251],
        [ 0.2207, -0.8046,  0.1843,  ...,  0.6993, -1.7670,  0.1126],
        [-0.1782,  0.0875, -0.7615,  ..., -0.6984, -0.1316, -0.1113]])

In [None]:
sentence_embeddings.shape

torch.Size([3, 1024])

### **Example 2**

In [None]:
sbert_model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

In [None]:
sentence_embeddings = sbert_model.encode(sentences)

In [None]:
sentence_embeddings

array([[ 0.6306941 , -0.2880172 ,  0.05346152, ...,  0.26867366,
        -2.2383592 ,  0.22513251],
       [ 0.22073454, -0.8045986 ,  0.18434067, ...,  0.6992715 ,
        -1.7669749 ,  0.11263788],
       [-0.1781616 ,  0.08751994, -0.76151556, ..., -0.69839   ,
        -0.13163872, -0.11128177]], dtype=float32)

In [None]:
sentence_embeddings.shape

(3, 1024)

### **Example Long Text**

In [None]:
# Pick the cleaned text of row 81 by column name instead of positional column 3, so the
# selection keeps working if columns are added or reordered.
webtext = Txt_all_frame_en['Text_cleaned'].iloc[81]
len(webtext.split())

80751

In [None]:
%%time
encoded_input = tokenizer(webtext, padding=True, truncation=True, max_length=128, return_tensors='pt')

with torch.no_grad():
    model_output = model(**encoded_input)

v = mean_pooling(model_output, encoded_input['attention_mask'])

v.shape

CPU times: user 53.3 s, sys: 930 ms, total: 54.2 s
Wall time: 54.3 s


In [None]:
%%time

v = sbert_model.encode(webtext)

v.shape

CPU times: user 52.9 s, sys: 952 ms, total: 53.8 s
Wall time: 53.9 s


In [None]:
# Pick the cleaned text of row 82 by column name instead of positional column 3, so the
# selection keeps working if columns are added or reordered.
webtext2 = Txt_all_frame_en['Text_cleaned'].iloc[82]
len(webtext2.split())

448150

In [None]:
#%%time
#encoded_input = tokenizer(webtext2, padding=True, truncation=True, max_length=128, return_tensors='pt')

#with torch.no_grad():
    #model_output = model(**encoded_input)

#v = mean_pooling(model_output, encoded_input['attention_mask'])

#v.shape

CPU times: user 29min 35s, sys: 34.1 s, total: 30min 9s
Wall time: 30min 13s


# **BERT implementation with truncation at the ~95th length percentile**

In [None]:
sbert_model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

In [None]:
# Work on an independent copy: a plain assignment would only alias Txt_all_frame_en,
# so the result columns added to Complete_frame later would also appear on it.
Complete_frame = Txt_all_frame_en.copy()
Complete_frame.head(9)

Unnamed: 0,companyid,comp_type,Text,Text_cleaned,language,Rela_language,Word_len,Text_len
0,73501,S,JOOR Skip navigation Member Log In Forgot pass...,joor skip navigation member log forgot passwor...,en,True,3614,27748
1,160418,S,Patients Physicians Employers Login Crossover ...,patients physicians employers login crossover ...,en,True,66,489
2,161261,S,Home | Recent questions | Directories | Feedba...,home recent questions directories feedba...,en,True,92,656
3,129092,S,Nanotronics Home News Products What is nSpec ?...,nanotronics home news products nspec system ...,en,True,2337,19264
4,77179,S,Home About Us Our Approach Technology Our Cust...,home us approach technology customers contact ...,en,True,167,1326
5,89064,S,StyleSaint Something incredible is happening i...,stylesaint something incredible happening fash...,en,True,8,68
6,76857,S,Client Login Signup Free Consultation Services...,client login signup free consultation services...,en,True,6487,49653
7,77461,S,Connection Engine beta Have an account? Sign i...,connection engine beta account sign username ...,en,True,107,873
8,88411,S,Blog Features About Management Team Investors ...,blog features management team investors adviso...,en,True,4664,35884


In [None]:
%%time
docs_train = []

pct = int(np.percentile(Txt_all_frame_en.Text_len, 95))

for i in range(len(Complete_frame)):

  if len(Complete_frame['Text_cleaned'][i]) > pct :
    v = sbert_model.encode(Complete_frame['Text_cleaned'][i][:pct])
    docs_train.append(v)
  
  else:
    v = sbert_model.encode(Complete_frame['Text_cleaned'][i])
    docs_train.append(v)


CPU times: user 1h 12min 55s, sys: 1min 2s, total: 1h 13min 58s
Wall time: 1h 14min 4s


In [None]:
# Stack the per-document embeddings row-wise into a single matrix.
embedding_rows = [d.T for d in docs_train]
X_train = np.vstack(embedding_rows)
X_train.shape

(925, 1024)

In [None]:
# Persist the embedding matrix to Drive in NumPy's .npy format.
bert_matrix_file = '/content/drive/My Drive/Capstone Shared Docs/BERT_matrix.npy'
with open(bert_matrix_file, mode='wb') as out_file:
  np.save(out_file, X_train)

In [None]:
pairwise_similarities=cosine_similarity(X_train)


In [None]:
def get_most_similar(sim_matrix_v, startup_index, how_many=1):
  """Return the entries most similar to the item at `startup_index`.

  Args:
    sim_matrix_v: 2-D array of pairwise similarities, where row `startup_index`
      holds that item's similarity to every item. It is not modified.
    startup_index: position of the item whose nearest neighbours are wanted.
    how_many: number of neighbours to return; capped at the number of other items.

  Returns:
    (most_similar_indices, similarity_list): two parallel lists ordered from the
    most to the least similar neighbour. The item itself is never returned.
  """
  # Copy only the requested row instead of the whole matrix on every call.
  v = np.array(sim_matrix_v[startup_index], copy=True)
  if not np.issubdtype(v.dtype, np.floating):
    v = v.astype(float)  # the -inf mask below needs a floating dtype
  # Mask with -inf, not 0: similarities may be zero or negative, and a 0 mask would
  # let the item itself (or an already chosen neighbour) win the argmax again.
  v[startup_index] = -np.inf
  how_many = max(0, min(how_many, v.size - 1))
  most_similar_indices = []
  similarity_list = []
  for _ in range(how_many):
    ind = np.argmax(v)
    most_similar_indices.append(ind)
    similarity_list.append(v[ind])
    v[ind] = -np.inf
  return most_similar_indices, similarity_list

In [None]:
Dict_similarity = []  # per company: {neighbour companyid: cosine similarity}
Strat_score = []      # per company: mean of (1 - similarity) over its neighbours
n = 5                 # number of nearest neighbours per company

for i in range(len(Complete_frame.companyid)):

  most_similar_index, most_similar_similarity = get_most_similar(pairwise_similarities, i, n)
  # The indices are positions in the similarity matrix, so look the ids up by position
  # (.iloc) rather than by index label.
  IDs = [Complete_frame.companyid.iloc[j] for j in most_similar_index]
  # zip pairs each id with its similarity without reusing the loop variable `i`.
  # Neighbours that share a companyid collapse into a single key.
  dict_sim = dict(zip(IDs, most_similar_similarity))
  strategy_score = np.mean(1 - np.array(most_similar_similarity))

  Dict_similarity.append(dict_sim)
  Strat_score.append(strategy_score)

In [None]:
# Attach the neighbour dictionaries and the scores as two new columns, in this order.
for column_name, column_values in (('Dict_similarity', Dict_similarity), ('Strat_score', Strat_score)):
  Complete_frame[column_name] = column_values

In [None]:
Complete_frame.to_csv('/content/drive/My Drive/Capstone Shared Docs/BERT_Results.csv')

In [None]:
Complete_frame.head()

Unnamed: 0,companyid,comp_type,Text,Text_cleaned,language,Rela_language,Word_len,Text_len,Dict_similarity,Strat_score
0,73501,S,JOOR Skip navigation Member Log In Forgot pass...,joor skip navigation member log forgot passwor...,en,True,3614,27748,"{'87812': 0.75534654, '2187': 0.7421633, '9555...",0.265543
1,160418,S,Patients Physicians Employers Login Crossover ...,patients physicians employers login crossover ...,en,True,66,489,"{'3652': 0.6156608, '1028': 0.60842973, '415':...",0.401242
2,161261,S,Home | Recent questions | Directories | Feedba...,home recent questions directories feedba...,en,True,92,656,"{'4178': 0.71706617, '3972': 0.6322659, '28515...",0.35979
3,129092,S,Nanotronics Home News Products What is nSpec ?...,nanotronics home news products nspec system ...,en,True,2337,19264,"{'4760': 0.728186, '4006': 0.67883325, '3322':...",0.323905
4,77179,S,Home About Us Our Approach Technology Our Cust...,home us approach technology customers contact ...,en,True,167,1326,"{'96097': 0.7260119, '71320': 0.72508353, '722...",0.282603
