###Package installation


In [None]:
!pip install -U sentence-transformers
!pip install gcld3

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import pathlib
import random
import math
import re
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import gcld3

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner"])

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Loading all Data**


In [10]:
Txt_all_frame = pd.read_csv("drive/My Drive/Capstone Shared Docs/result/complete_df/year_2013.csv")

In [12]:
Txt_all_frame.drop_duplicates("website",keep = "first",inplace=True)

In [None]:
Txt_all_frame.head()

In [20]:
Txt_all_frame_en = Txt_all_frame[(Txt_all_frame.language=='en') & (Txt_all_frame.words_len>25)] #| ( (Txt_all_frame.language!='en') & (Txt_all_frame.language==False) )
Txt_all_frame_en.index = np.arange(0,len(Txt_all_frame_en.compid))

# **embeddings**

### embedding for word2vec

In [None]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner"])

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
max_characters = 1000000
docs_train = []

for i,r in Txt_all_frame_en.iterrows():
  doc_text = r.text_cleaned
  text_len = len(doc_text)
  num_of_vectors = math.ceil(text_len/max_characters)
  v = np.zeros((300,1))
  w = 0
  for j in range(num_of_vectors):
    small_text = doc_text[max_characters*(j):max_characters*(j+1)]
    vec = nlp(small_text).vector
    vec = vec.reshape(vec.shape[0],1)
    v = np.sum( np.hstack((v,vec*len(small_text))),axis=1)
    v = v.reshape((v.shape[0],1))
    w += len(small_text)
  v /= w
  docs_train.append(v)
X_train = np.vstack([d.T for d in docs_train])

In [None]:
X_train.shape


(11434, 300)

### embedding for tf-idf






In [None]:
!pip install gcld3
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import gcld3

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
stopWords = set(stopwords.words("english"))
vectorizer = TfidfVectorizer(stop_words = stopWords)

In [None]:
X_train = vectorizer.fit_transform(Txt_all_frame_en['text_cleaned'])

### embedding for Bert


In [None]:
sbert_model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

In [None]:
%%time
docs_train = []


pct = int(np.percentile(Txt_all_frame_en.text_len, 95))

for i in range(len(Txt_all_frame_en)):

  if len(Txt_all_frame_en['text_cleaned'][i]) > pct :
    v = sbert_model.encode(Txt_all_frame_en['text_cleaned'][i][:pct])
    docs_train.append(v)
  
  else:
    v = sbert_model.encode(Txt_all_frame_en['text_cleaned'][i])
    docs_train.append(v)
X_train = np.vstack([d.T for d in docs_train])

# **similarity**

In [None]:
pairwise_similarities=cosine_similarity(X_train)

In [None]:
pairwise_similarities.shape

(11434, 11434)

In [None]:
def get_most_similar(sim_matrix_v, startup_index, how_many=1):
  sim_matrix_copy = sim_matrix_v.copy()
  v = sim_matrix_copy[startup_index,:]
  v[startup_index] = 0
  most_similar_indices = []
  similarity_list = []
  for i in range(how_many):
    ind = np.argmax(v)
    similarity = np.max(v)
    v[ind] = 0
    most_similar_indices.append(ind)
    similarity_list.append(similarity)
  return most_similar_indices, similarity_list

In [None]:
rows_startup = Txt_all_frame_en[Txt_all_frame_en.comp_type == 'S'].index.tolist()

In [None]:
Txt_all_frame_en

In [None]:
Dict_similarity = []
Strat_score = []
n = 5

for i in rows_startup:

  most_similar_index,most_similar_similarity = get_most_similar(pairwise_similarities, i, n)
  #print(most_similar_index)
  IDs = [Txt_all_frame_en.companyid[j] for j in most_similar_index]
  dict_sim = {IDs[i]: most_similar_similarity[i] for i in range(len(IDs))} 
  strategy_score = np.mean(1-np.array(most_similar_similarity))

  Dict_similarity.append(dict_sim)
  Strat_score.append(strategy_score)

In [None]:
startups = Txt_all_frame_en[Txt_all_frame_en.comp_type == 'S']
startups['Dict_similarity'] = Dict_similarity
startups['Strat_score'] = Strat_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
startups.Strat_score.var()

0.0016775297493768068

# **Store the Strategy Score**

In [None]:
startups.to_csv("drive/My Drive/Capstone Shared Docs/strategy score/word2_vec_2013.csv",index=False)