# **Data Cleaning**

In [1]:
# !pip install selenium
# !apt-get update
# !apt-get install -y wget
# !wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# !dpkg -i google-chrome-stable_current_amd64.deb
# !google-chrome-stable --version
# !pip install openai
# !pip install chromedriver-binary==113.0.5672.24.0

**Import needed libraries**

In [2]:
# Connect to server
import smtplib 
# To manipulate arrays
import numpy as np
# Manipulate DataFrames
import pandas as pd
# Google Spreadsheets client
import gspread
# Google Spreadsheets client to export dataframes to G Sheets
import gspread_dataframe as gd
# Authenticate with Google
from google.auth import default
#Get the credentials of GoogleDrive
from google.oauth2.credentials import Credentials
#Make the request of the credentials
from google.auth.transport.requests import Request
#idk
from google_auth_oauthlib.flow import InstalledAppFlow

#Needed to use the API of ChatGPT
import openai

#For scraping LinkedIn
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

#To get information of the page we are scraping
from bs4 import BeautifulSoup
import getpass
#To pause the code
import time
#To count the occurences in a collection
from collections import Counter



**Connection to Google Drive**

In [3]:
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']
creds = None
if not creds or not creds.valid:
    flow = InstalledAppFlow.from_client_secrets_file(
        'client_secret.json', SCOPES)
    creds = flow.run_local_server(port=0)
gc = gspread.authorize(creds)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=580396607120-4akf277ns01rtnf7o8umikt8ek3dgurq.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A58231%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fspreadsheets&state=5UBNsPAeyaxahJYbl9mzEb7RyDFYaT&access_type=offline


**Link of the spreadsheet**

In [4]:
spreadsheet_url = "https://docs.google.com/spreadsheets/d/11PZqjOByAbXv7qYan6t1j2P1vu7uE_vkK9rHjyifHHM/edit#gid=0"

In [5]:
list_df = list()

## **BioValley**

In [6]:
#Defining the columns to keep and naming them
biovalley_new_columns = ["name",
                      "link",
                      "address_line_1",
                      "address_line_2",
                      "address_line_3",
                      "contact",
                      "summary"]

In [7]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "Biovalley"  # Replace with the name of the sheet you want to access

worksheet = gc.open_by_url(spreadsheet_url).worksheet(worksheet_name)

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()
# Extract the 7 columns we need
columns_to_keep = [ 2, 3, 4, 5, 6, 7, 9]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[5:]]

# Convert to a DataFrame
df_biovalley = pd.DataFrame.from_records(selected_rows,
                                         columns=biovalley_new_columns)

# Replace missing data and "-" with NaN
df_biovalley.replace({'': np.nan, '-': np.nan}, inplace=True)

**Concatenating "address" columns**

In [8]:
# Concatenate the address columns and create a new column
df_biovalley['address'] = df_biovalley['address_line_1'] + ', ' + df_biovalley['address_line_2'] + ', ' + df_biovalley['address_line_3']

# Drop the original address columns
df_biovalley.drop(['address_line_1', 'address_line_2', 'address_line_3'], axis=1, inplace=True)

In [9]:
list_df.append(df_biovalley)
df_biovalley.head()

Unnamed: 0,name,link,contact,summary,address
0,A&O Pharmadienstleistungen,www.aopharma.de,07628 95 03 119,Arzneimittelfreigaben klinische Prüfpräparate ...,"Am Sattel 17, 79588, Efringen-Kirchen"
1,Albert-Ludwigs-Universität,https://informatik.uni-freiburg.de,0761 203-7461,Arcondis ist eine Unternehmensberatung für das...,"Institut für Informatik, 79110, Freiburg"
2,Arcondis AG Schweiz,www.arcondis.com,0041 61 717 8200,Wir sind ein kleines unabhängiges Institut für...,"Christoph Merian-Ring 31A, 0, Reinach"
3,ATG biosynthetics GmbH,www.atg-biosynthetics.com,0761 888 94 24,Die BioCopy AG ist ein junges Biotech-Startup ...,"Weberstr. 40, 79249, Merzhausen"
4,BBI Solutions,www.diarect.com,0761 47979 0,BioCopy’s vielfach ausgezeichnetes Team von me...,"Bötzinger Str. 29b, 79111, Freiburg"


## **Biolago**

In [10]:
#Defining the columns to keep and naming them
biolago_new_columns = ["name",
                      "field",
                      "summary",
                      "address"]

In [11]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "Biolago"  # Replace with the name of the sheet you want to access

worksheet = gc.open_by_url(spreadsheet_url).worksheet(worksheet_name)

# Get the values from the sheet
rows = worksheet.get_all_values()

columns_to_keep = [1]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[3:]]

# Convert to a DataFrame
df_biolago = pd.DataFrame.from_records(rows)

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()

# Convert to a DataFrame
df_biolago = pd.DataFrame.from_records(selected_rows)

# Transpose the DataFrame
df_biolago = pd.DataFrame(np.reshape(df_biolago.values, (len(df_biolago) // 4, 4)), columns=biolago_new_columns)

# Replace missing data and "-" with NaN
df_biolago.replace({'': np.nan}, inplace=True)

In [12]:
df_biolago.head()

Unnamed: 0,name,field,summary,address
0,1LIMS,Data,"Laboratory Transformation, LIMS, Process Optim...",CH-Märstetten
1,abiotec AG,"Devices, Drugs & Therapy, Data","Biotechnology, Medical devices, Pharmacy, Qual...",CH-Rheinfelden
2,AID Diagnostika GmbH,Diagnostics,"Diagnostics (In-vitro), Microbiology, Virology...",DE-Straßberg
3,AIRAmed GmbH,"Diagnostics, Devices, Drugs & Therapy, Data","Neuroradiology, Software, Artificial Intellige...",DE-Tübingen
4,alcare AG,Data,,DE-Wil


## **Biorn**

In [13]:
#Defining the columns to keep and naming them
biorn_new_columns = ["name",
                     "to_delete",
                      "field",
                      "second field"]

In [14]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "Biorn"  # Replace with the name of the sheet you want to access

worksheet = gc.open_by_url(spreadsheet_url).worksheet(worksheet_name)

# Get the values from the sheet
rows = worksheet.get_all_values()

columns_to_keep = [1]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[6:]]

# Convert to a DataFrame
df_biorn = pd.DataFrame.from_records(selected_rows)

# delete the last three rows
df_biorn = df_biorn.drop(index=[len(df_biorn)-3, len(df_biorn)-2, len(df_biorn)-1])

# Transpose the DataFrame
df_biorn = pd.DataFrame(np.reshape(df_biorn.values, (len(df_biorn) // 4, 4)), columns=biorn_new_columns)

#Delete the "to_delete column"
df_biorn.drop(["to_delete"], axis=1, inplace=True)

# Replace missing data with NaN
df_biorn.replace({'': np.nan}, inplace=True)

In [15]:
# Concatenate the address columns and create a new column
df_biorn['field'] = df_biorn['field'] + ', ' + df_biorn['second field']

# Drop the original address columns
df_biorn.drop('second field', axis=1, inplace=True)

In [16]:
list_df.append(df_biorn)
df_biorn.head()

Unnamed: 0,name,field
0,10x Genomics B.V.,"Biotechnology - R&D Services, Analytical servi..."
1,AaviGen GmbH,"Biotechnology - Therapeutics and Diagnostics, ..."
2,AbbVie Deutschland GmbH & Co. KG,
3,AcademicLabs,"Professional Services and Consulting, Informat..."
4,Affimed GmbH,"Biotechnology - Therapeutics and Diagnostics, ..."


## **Biowin**

In [17]:
#Defining the columns to keep and naming them
biowin_new_columns = ["name",
                     "summary"]

In [18]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "Biowin"  # Replace with the name of the sheet you want to access

worksheet = gc.open_by_url(spreadsheet_url).worksheet(worksheet_name)

# Get the values from the sheet
rows = worksheet.get_all_values()

columns_to_keep = [1, 2]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[3:]]

# Convert to a DataFrame
df_biowin = pd.DataFrame.from_records(selected_rows,
                                      columns=biowin_new_columns)

# Replace missing data with NaN
df_biowin.replace({'': np.nan}, inplace=True)

# Delete all rows containing Nan
df_biowin.dropna(inplace=True)


In [19]:
list_df.append(df_biowin)
df_biowin.head()

Unnamed: 0,name,summary
0,Trialzen,Clinical trials constitute a critical componen...
1,2 Bridge,2 Bridge is a Belgian-based company that provi...
3,ABL,From development to market ABL provides GMP vi...
4,Abscint,ABSCINT is a molecular imaging company. ABSCIN...
6,Ad Hoc Clinical BVBA,Ad Hoc Clinical is a privately owned CRO rende...


## **ci3**

In [20]:
ci3_new_columns = ["name"]

In [21]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "ci3"  # Replace with the name of the sheet you want to access

worksheet = gc.open_by_url(spreadsheet_url).worksheet(worksheet_name)

# Get the values from the sheet
rows = worksheet.get_all_values()

columns_to_keep = [0]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[4:]]

# Convert to a DataFrame
df_ci3 = pd.DataFrame.from_records(selected_rows,
                                      columns=ci3_new_columns)

In [22]:
list_df.append(df_ci3)
df_ci3.head()

Unnamed: 0,name
0,Abbott GmbH & Co. KG
1,AbbVie Deutschland GmbH & Co.KG
2,AESKU Diagnostics GmbH & Co. KG
3,Amp-Lab GmbH
4,Baerkraft GmbH


## **Atlantapole**

In [23]:
atlantapole_new_columns = ["name","summary"]

In [24]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "Atlantapole"  # Replace with the name of the sheet you want to access

worksheet = gc.open_by_url(spreadsheet_url).worksheet(worksheet_name)

# Get the values from the sheet
rows = worksheet.get_all_values()

columns_to_keep = [1, 3]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[3:]]

# Convert to a DataFrame
df_atlantapole = pd.DataFrame.from_records(selected_rows,
                                      columns=atlantapole_new_columns)

# Replace missing data with NaN
df_atlantapole.replace({'': np.nan}, inplace=True)

In [25]:
list_df.append(df_atlantapole)
df_atlantapole.head()

Unnamed: 0,name,summary
0,ABYS MEDICAL,
1,AI4R,AI4R offers a very high performance system in ...
2,ALISON MUNRO CORPORATE LANGUAGE,
3,APERSY,Apersy provide expert advisory services for Co...
4,APO TECH CARE,Our mission with Apo Tech Care is to be the fa...


## **BioM**

In [26]:
biom_new_columns = ["field","summary","name","address","contact"]

In [27]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "BioM"  # Replace with the name of the sheet you want to access

worksheet = gc.open_by_url(spreadsheet_url).worksheet(worksheet_name)

# Get the values from the sheet
rows = worksheet.get_all_values()

columns_to_keep = [0,1,2,3,4]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[3:]]

# Convert to a DataFrame
df_biom = pd.DataFrame.from_records(selected_rows,
                                      columns=biom_new_columns)

# Replace missing data with NaN
df_biom.replace({'': np.nan}, inplace=True)

In [28]:
list_df.append(df_biom)
df_biom.head()

Unnamed: 0,field,summary,name,address,contact
0,Category: Technology Transfer,"Keywords: Infectious Diseases, Vaccines, Mole...",21Dx GmbH,Kreillerstrasse 210 Deutschland-81825 München,Telefon: +49 (0) 163 874 79 75 +4915128601726...
1,Category: Biotech DNA/Protein Analytics,"Keywords: Drug Development, Peptide/Protein, ...",2bind GmbH,Am BioPark 11 BioPark II Deutschland-93053 Reg...,Telefon: +49 (0) 941 20000890http://www.2bind...
2,Category: Biotech Therapeutics & Diagnostics,"Keywords: Drug Development, Small Molecules, ...",4SC AG,Fraunhoferstraße 22 Deutschland-82152 Martinsried,Telefon: +49 (0) 89 700763-0 Fax: +49 (0) 89 ...
3,Category: Pharma supplier and trade,abf diagnostics GmbH,Raiffeisenstraße 34 Deutschland-85402 Kranzberg,Telefon: +49 8166 9986 130 Fax: +49 8166 9986...,
4,Category: CRO,"Keywords: Analytical Chemistry, Biomarker Dis...",ABF GmbH,Semmelweisstraße 5 Deutschland-82152 Planegg-S...,Telefon: +49 89 535395http://www.abf-lab.comm...


## **BioNow**

In [29]:
bionow_new_columns = ["name","address","contact","link","summary"]

In [30]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "Bionow"  # Replace with the name of the sheet you want to access

worksheet = gc.open_by_url(spreadsheet_url).worksheet(worksheet_name)

# Get the values from the sheet
rows = worksheet.get_all_values()

columns_to_keep = [0,1,2,3,4]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[3:]]

# Convert to a DataFrame
df_bionow = pd.DataFrame.from_records(selected_rows,columns=bionow_new_columns)

# Replace missing data with NaN
df_bionow.replace({'': np.nan}, inplace=True)

In [31]:
list_df.append(df_bionow)
df_bionow.head()

Unnamed: 0,name,address,contact,link,summary
0,3M Buckley Innovation Centre,"Firth Street, Huddersfield, West Yorkshire, H...",1484505601,http://www.3mbic.com,3M BIC is a centre for enterprise and innovati...
1,Alderley Park,"Alderley Park, Congleton Road, Macclesfield, ...",0161 233 7877,https://www.alderleypark.co.uk/,Alderley Park a development by Bruntwood SciTe...
2,Ambit,"The Boathouse, Clarence Mill, Clarence Road, ...",1625562201,https://ambit.careers,Ambit works with business founders and hiring ...
3,AstraZeneca,"Alderley Park, Macclesfield, Cheshire, SK10 4TG",0800 032 0501,http://www.astrazeneca.co.uk,AstraZeneca is one of the world’s leading phar...
4,AstraZeneca Liverpool,"5 Renaissance Way, Speke, Liverpool, L24 9JW",0151 4857700,http://www.medimmune.com,"MedImmune, the worldwide biologics business fo..."


## **Biopartner**

In [32]:
biopartner_new_columns = ["name","summary1","summary2","summary3","summary4","summary5","summary6","summary7","to_delete","link","contact"]

In [33]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "Biopartner"  # Replace with the name of the sheet you want to access

worksheet = gc.open_by_url(spreadsheet_url).worksheet(worksheet_name)

# Get the values from the sheet
rows = worksheet.get_all_values()

columns_to_keep = [0,1,2,3,4,5,6,7,8,9,10]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[3:]]

# Convert to a DataFrame
df_biopartner = pd.DataFrame.from_records(selected_rows,columns=biopartner_new_columns)

# Replace missing data with NaN
df_biopartner.replace({'': np.nan}, inplace=True)

**Cleaning**

In [34]:
# Concatenate summary columns into 'Summary'
df_biopartner['summary'] = df_biopartner['summary1'].fillna('') + \
                            df_biopartner['summary2'].fillna('') + \
                            df_biopartner['summary3'].fillna('') + \
                            df_biopartner['summary4'].fillna('') + \
                            df_biopartner['summary5'].fillna('') + \
                            df_biopartner['summary6'].fillna('') + \
                            df_biopartner['summary7'].fillna('')

# Drop the individual summary columns if needed
df_biopartner = df_biopartner.drop(columns=['to_delete','summary1', 'summary2', 'summary3', 'summary4', 'summary5', 'summary6', 'summary7'])

In [35]:
list_df.append(df_biopartner)
df_biopartner

Unnamed: 0,name,link,contact,summary
0,ACROBiosystems,www.acrobiosystems.com,@AcrobiosystemsE,"ACROBiosystems Group, founded in 2010 and list..."
1,Albumedix,http://www.albumedix.com,@Albumedix,Albumedix - Dedicated to Better HealthAlbumedi...
2,Alderley Park,https://www.alderleypark.co.uk/,@AlderleyPark,Alderley Park is a place where world leading s...
3,Almac Discovery,https://www.almacgroup.com/discovery/,@AlmacGroup,Almac Discovery is a research driven drug disc...
4,Almac Group,https://www.almacgroup.com,@AlmacGroup,The Almac Group is an established contract dev...
...,...,...,...,...
56,UK BioIndustry Association (BIA),https://viramal.com/,@SygnatureDiscov,"Established in 1989, the BIA (BioIndustry Asso..."
57,Viramal,https://www.worldwide.com/,@BIA_UK,Viramal is a Specialty Pharmaceutical Company ...
58,Worldwide Clinical Trials,,@worldwidetrials,Worldwide Clinical Trials provides drug develo...
59,,,,


## **Bioregio**

In [36]:
bioregio_new_columns = ["name"]

In [37]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "Bioregio"  # Replace with the name of the sheet you want to access

worksheet = gc.open_by_url(spreadsheet_url).worksheet(worksheet_name)

# Get the values from the sheet
rows = worksheet.get_all_values()

columns_to_keep = [0]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[3:]]

# Convert to a DataFrame
df_bioregio = pd.DataFrame.from_records(selected_rows,columns=bioregio_new_columns)

# Replace missing data with NaN
df_bioregio.replace({'': np.nan}, inplace=True)

In [38]:
list_df.append(df_bioregio)
df_bioregio

Unnamed: 0,name
0,biohymed network
1,BioMan4R2
2,BSTART
3,Codex4SMEs
4,Einschnitte - Einblicke
...,...
841,ZSE Tübingen - Behandlungs- und Forschungszent...
842,ZSE Tübingen - Behandlungs- und Forschungszent...
843,Privacy Statement
844,Imprint


## **EU Startups**

In [39]:
eustartup_new_columns = ["name"]

In [40]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "EU_startups"  # Replace with the name of the sheet you want to access

worksheet = gc.open_by_url(spreadsheet_url).worksheet(worksheet_name)

# Get the values from the sheet
rows = worksheet.get_all_values()

columns_to_keep = [2]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[1:]]

# Convert to a DataFrame
df_eustartups = pd.DataFrame.from_records(selected_rows,columns=eustartup_new_columns)

# Replace missing data with NaN
df_eustartups.replace({'': np.nan}, inplace=True)

In [41]:
list_df.append(df_eustartups)
df_eustartups

Unnamed: 0,name
0,Cropler
1,AllesHealth
2,simpli.codes
3,Newel Health
4,La Ruche Health
...,...
372,Sleepio
373,Kaiku Health
374,Remente
375,Zava


# **Merge**

In [42]:
df_merged = pd.concat(list_df, sort=False)
#Delete all duplicates
df_merged = df_merged.loc[~df_merged.name.duplicated(keep='first')]
df_merged.reset_index(drop=True)

Unnamed: 0,name,link,contact,summary,address,field
0,A&O Pharmadienstleistungen,www.aopharma.de,07628 95 03 119,Arzneimittelfreigaben klinische Prüfpräparate ...,"Am Sattel 17, 79588, Efringen-Kirchen",
1,Albert-Ludwigs-Universität,https://informatik.uni-freiburg.de,0761 203-7461,Arcondis ist eine Unternehmensberatung für das...,"Institut für Informatik, 79110, Freiburg",
2,Arcondis AG Schweiz,www.arcondis.com,0041 61 717 8200,Wir sind ein kleines unabhängiges Institut für...,"Christoph Merian-Ring 31A, 0, Reinach",
3,ATG biosynthetics GmbH,www.atg-biosynthetics.com,0761 888 94 24,Die BioCopy AG ist ein junges Biotech-Startup ...,"Weberstr. 40, 79249, Merzhausen",
4,BBI Solutions,www.diarect.com,0761 47979 0,BioCopy’s vielfach ausgezeichnetes Team von me...,"Bötzinger Str. 29b, 79111, Freiburg",
...,...,...,...,...,...,...
2126,Sleepio,,,,,
2127,Kaiku Health,,,,,
2128,Remente,,,,,
2129,Zava,,,,,


# **Load**

In [43]:
#Export the dataframe into a google sheets

#Open a new sheet
worksheet = gc.open_by_url("https://docs.google.com/spreadsheets/d/1-i88MzUP_Fq6TBLCM_IZckeEJe3z0kLOkBHqCnX3E0A/edit#gid=0").sheet1
#Export our "df_partner_linked" dataframe into our worksheet
gd.set_with_dataframe(worksheet, df_merged)


print('Data uploaded to Google Sheets!')

Data uploaded to Google Sheets!


# **Data Enrichment**

**Defining needed functions**

In [44]:
def cleaning_list(list_text):

    cleaned_list = []
    for item in list_text:
        if item.strip():  # if the item is not just whitespace
            if not item.startswith('          '):  # if the item is not a subheading
                cleaned_list.append(item.strip())
            elif item.startswith('          '):
                for i in range(len(item.strip())):
                    if item.strip()[i] != ' ':
                        new_item = item.strip()[i:]
                        cleaned_list.append(new_item)
                        break  
    return cleaned_list

def list_to_dict(cleaned_list):
    keywords = ['Website','Phone','Industry','Company size','Headquarters','Founded','Specialties','name']
    index = 0
    full_summary = ""
    result = {}
    for i in range(len(cleaned_list)):
        if cleaned_list[i] == "Website":
            index = i
            break
        if cleaned_list[i] != 'Overview':
            full_summary += f" {cleaned_list[i]}"
    result['Overview'] = full_summary

    for j, d in enumerate(cleaned_list):
        if j >= index:
            if d in keywords:
                result[d] = cleaned_list[j+1] 
    return result


def string_similarity(s1, s2):
    # Count the occurrences of each character in the strings
    count1 = Counter(str(s1).lower())
    count2 = Counter(str(s2).lower())

    # Calculate the total number of characters in the two strings
    total_chars = sum(count1.values()) + sum(count2.values())

    # Calculate the number of characters that appear in both strings
    common_chars = sum((count1 & count2).values())

    # Calculate the similarity as a percentage
    similarity = common_chars / total_chars * 200

    return similarity

**Define the options of the Selenium webdriver**

In [45]:
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-gpu')
options.add_argument('--disable-browser-side-navigation')
options.add_argument('--disable-infobars')
options.add_argument('--disable-extensions')
options.add_argument('--disable-features=VizDisplayCompositor')
options.add_argument('--remote-debugging-port=9222')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_argument('--disable-notifications')
#options.add_argument('--headless') # to hide the window
options.add_argument('--disable-default-apps')
options.add_argument('--disable-translate')
options.add_argument('--disable-background-timer-throttling')
options.add_argument('--mute-audio')
options.add_argument('--disable-popup-blocking')
options.add_argument('--disable-web-security')
options.add_argument('--disable-backgrounding-occluded-windows')
options.add_argument('--safebrowsing-disable-auto-update')
options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')

**Search companies on LinkedIn**

In [46]:
def data_enrichment(df_merged, scraped_companies):
    driver = webdriver.Chrome(options=options)
    driver.get("https://www.linkedin.com/login")

    new_info = list()

    # Wait for the email input field to be visible
    eml = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, "username")))
    eml.send_keys("thesinou02@gmail.com")

    # Wait for the password input field to be visible
    passwd = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, "password")))
    passwd.send_keys("Taon3nos!")  # need to hide it

    # Wait for the login button to be clickable
    loginbutton = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//*[@id=\"organic-div\"]/form/div[3]/button")))
    loginbutton.click()

    for index, company in enumerate(df_merged["name"]):
        if index > len(scraped_companies):
            try:
                # Wait for the search bar to be visible
                research = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "search-global-typeahead__input")))
                research.clear()
                research.send_keys(company)
                research.send_keys(Keys.ENTER)
                research.clear()
            except:
                driver.get("https://www.linkedin.com")
                research = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "search-global-typeahead__input")))
                research.clear()
                research.send_keys(company)
                research.send_keys(Keys.ENTER)
                research.clear()
            try:
                time.sleep(2)
                # Get the URL of the search
                current_url = driver.current_url
                # Replace "all" with "companies" in the current URL
                new_url = current_url.replace("all", "companies")
                # Go to the "companies" tab
                driver.get(new_url)
            except:
                pass

            # Wait for the company element to be visible
            try:
                company_element = WebDriverWait(driver, 3).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".entity-result__title-line a, .entity-result__title-line a:hover, .entity-result__title-line a:hover:visited, .entity-result__title-line a:visited")))
                company_text = company_element.text
                if string_similarity(company_text, company) >= 50:
                    try:
                        company_link = company_element.get_attribute('href')
                        driver.get(company_link + "about/")
                    except:
                        pass
            except:
                pass

            try:
                source = BeautifulSoup(driver.page_source)
                section = source.find_all('section', class_='artdeco-card p5 artdeco-card mb4')

                for div in section:
                    text = div.get_text()
                    list_text = text.split('\n')
                    list_text.append('name')
                    list_text.append(company)
                    new_info.append(list_text)
                    all_dict.append(list_to_dict(cleaning_list(list_text)))
            except:
                pass

    return new_info

**Scrape startups on Biocat**

In [47]:
def biocat():
    # Open the website Biocat
    driver = webdriver.Chrome(options=options)
    driver.get("https://biocat.my.site.com/Catalonialifesciencesdatabase/s/")
    
    # Open select bar to select a sector
    sector_input_element = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="ms-input-48"]')))
    sector_input_element.click()
    
    # Select 'DigitalHealth'
    sector_digital_health = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="ms-list-item-66"]')))
    sector_digital_health.click()
    
    # Click on the search button
    search_button = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="filters-10"]/div[5]/button[2]')))
    search_button.click()
    
    # Define the dictionnary to fill
    all_startups = list()
    
    #Select "More details" buttons
    button_elements = list()
    
    page = 1
    
    while True:
        print("Page ", page)
        try:
            button_elements = list()
            for i in range(20):
                button_xpath = '//*[@id="CustomerPortalTemplate"]/div[1]/div/div[1]/div/div[2]/c-search-landing/div/section[3]/div[2]/c-company-card[{}]/button/div[3]/p'.format(i+1)
                button_element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, button_xpath)))
                button_elements.append(button_element)

            for i, button in enumerate(button_elements):
                button.click()

                element = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[3]/div[1]/div/div[1]/div/div[2]/c-search-landing/c-card-modal/section/div/div')))

                # Get all child elements
                child_elements = element.find_elements(By.XPATH, ".//*")

                html_doc = ""
                # For each child element, add its attributes to html_doc
                for child in child_elements:
                    html_doc += child.get_attribute('outerHTML')



                # Parse the HTML with BeautifulSoup
                soup = BeautifulSoup(html_doc, 'html.parser')

                # Initialize an empty dictionary to store the information
                info = {}

                # Find the title of the company
                name = soup.find(class_='card-title')
                if name:
                    info['name'] = name.get_text(strip=True)

                # Find the address, phone, contact person, year founded and description
                rest_info = soup.find_all(class_='section-desc')
                if rest_info:
                    rest_info = [element.get_text(strip=True) for element in rest_info]
                    info['address'] = ''
                    address = rest_info[:3]
                    for k in address:
                        info['address'] += k

                    # Check all possible formats of the scraped list
                    if rest_info[3] == "":
                        info['contact_person'] = ""
                        info['phone'] = ""  
                        info['year_founded'] = rest_info[4]
                        info['description'] = rest_info[5]
                    elif rest_info[3].startswith('+') or rest_info[3].startswith('('):
                        info['contact_person'] = ""
                        info['phone'] = rest_info[3]  
                        info['year_founded'] = rest_info[4]
                        info['description'] = rest_info[5]
                    elif not (rest_info[3].startswith('+') or rest_info[3].startswith('(')) and (rest_info[4].startswith('+') or rest_info[4].startswith('(')):
                        info['contact_person'] = rest_info[3]
                        info['phone'] = rest_info[4]  
                        info['year_founded'] = rest_info[5]
                        info['description'] = rest_info[6]
                    else:
                        info['contact_person'] = rest_info[3]
                        info['phone'] = ""  
                        info['year_founded'] = rest_info[5]
                        info['description'] = rest_info[6]

                # Find the email
                email = soup.find(class_='section-mail')
                if email:
                    info['email'] = email.get_text(strip=True)

                # Find the website
                website = soup.find(class_='section-link')
                if website:
                    info['website'] = website.get('href')

                # Find the main sector
                main_sector = soup.find(class_='main-sector-cat')
                if main_sector:
                    info['main_sector'] = main_sector.get_text(strip=True)

                # Find the subsector
                subsector = soup.find(class_='subsector-cat')
                if subsector:
                    info['subsector'] = subsector.get_text(strip=True)

                # Add the scraped information into "all_startups"
                all_startups.append(info)

                time.sleep(1)

                # Find close button and click on it
                close_button = driver.find_element(By.XPATH, '/html/body/div[3]/div[1]/div/div[1]/div/div[2]/c-search-landing/c-card-modal/section/div/lightning-button-icon/button')
                close_button.click()

                # Change page
                if (i+1) % 20 == 0:
                    if page == 1:
                        next_page_button = driver.find_element(By.XPATH, '/html/body/div[3]/div[1]/div/div[1]/div/div[2]/c-search-landing/div/section[3]/div[3]/lightning-button')
                    else :
                        next_page_button = driver.find_element(By.XPATH, '/html/body/div[3]/div[1]/div/div[1]/div/div[2]/c-search-landing/div/section[3]/div[3]/lightning-button[2]')
                    next_page_button.click()
                    page += 1
                    time.sleep(0.5)               
        except:
            print("Finished !")
            return all_startups

In [48]:
df_biocat = pd.DataFrame(biocat())

Page  1
Page  2
Page  3
Page  4
Page  5
Page  6
Page  7
Page  8
Page  9
Page  10
Page  11
Finished !


In [49]:
df_biocat['phone'] = df_biocat['phone'].apply(lambda x: "'" + x)

In [50]:
new_columns = ["name","location","contact_person","phone","founded","summary","email","website","sector","specialisation"]
df_biocat.columns = new_columns

In [51]:
df_biocat

Unnamed: 0,name,location,contact_person,phone,founded,summary,email,website,sector,specialisation
0,3D Tech Omega Zeta,"C/ Valencia nº1, Complejo Roma 2000, Planta -1...","Contacte Corporatiu - 3D Tech Omega Zeta, Cont...",',2013,3D Tech Omega Zeta specializes in developing a...,info@3dtoz.com,http://www.3dtechomegazeta.com,Digital Health,Online Health Communities
1,3D-Shaper Medical,"Carrer de París, 17908036 Barcelona BarcelonaE...",,',2020,3D-Shaper Medical is a medical imaging softwar...,,https://www.3d-shaper.com/en/index.html,Digital Health,Medical Big Data & analytics
2,3dthinks,Carrer de la Blanqueria 1308003 Barcelona Barc...,,',2022,Startup that helps people with disabilites to ...,,https://3dthinks.com/,Digital Health,
3,3DVisiomedicavirtual,"Apartat de Correus, 1008171 Sant Cugat del Val...","Santiago Pellicer Pérez, Assessor",'+34 934 525 411,2007,3DVisiomedicavirtual is a company with more th...,info@visionmedicavirtual.com,http://www.visionmedicavirtual.com,Digital Health,E-Medical Record / E-health Record
4,8Wires,"Carrer Pau Claris, 100, 608009 Barcelona Barce...","Anxo Armada Fernández, CEO & Founder",'+34 630 83 19 59,2016,8Wires is specialized in data analysis with bi...,8wires@8wires.io,http://8wires.io,Digital Health,Medical Big Data & analytics
...,...,...,...,...,...,...,...,...,...,...
195,TripMedic,"VIlafranca, 708024 Barcelona BarcelonaEspanya","Filippo Meloni, CEO",',2019,TripMedic is a multilingual medical service pr...,info@tripmedic.com,https://tripmedic.com/,Digital Health,Health Services Search
196,UhDa Health (Universal Health Digital Access),Carrer de Luis Antúnez 608006 Barcelona Barcel...,,',2007,Spin-off company of Universal Doctor. We work ...,,https://www.uhda.health/,Digital Health,
197,Universal Doctor,"Carrer Lluis Antúnez, 6 (Kubik)08006 Barcelona...","Jordi Serrano Pons, Founder & CEO",'+34 68 6 6 07 819,2007,UniversalDoctor Project facilitates multilingu...,info@universaldoctor.com,http://www.universaldoctor.com/,Digital Health,Healthcare Mobile Communication
198,Up2Smart,"Avinguda dels Països Catalans, 1843007 Tarrago...","Pau Puig, CEO",',2019,UP2Smart focuses on the area of computer visio...,uptwosmart@gmail.com,https://www.up2smart.com/,Digital Health,Medical Big Data & analytics


In [52]:
#Export the dataframe into a google sheets

#Open a new sheet
sheet1 = gc.open_by_url("https://docs.google.com/spreadsheets/d/11PZqjOByAbXv7qYan6t1j2P1vu7uE_vkK9rHjyifHHM/edit#gid=1793959200")
sheet_name1 = "Biocat"
worksheet = sheet1.worksheet(sheet_name1)
#Export our "df_partner_linked" dataframe into our worksheet
gd.set_with_dataframe(worksheet, df_biocat)


print('Data uploaded to Google Sheets!')

Data uploaded to Google Sheets!


**Define df_scraped and all_dict**

In [53]:
# List of dictionnaries containing the scraped data on Linkedin
all_dict = list()

#Define columns of the dataframe
new_columns_dict = {'Présentation':"summary",
               "Site web":"website", 
               "Téléphone":"phone",
               "Secteur":"sector",
               'Taille de l’entreprise':"size",
               'Siège social':"location",
               'Fondée en':"founded",
               'Spécialisations':"specialisation",
               'name':"name"}
new_columns = ["summary","website", "phone","sector","size","location","founded","specialisation","name"]
#Define the new order of columns
new_columns_order = ["name","summary","website", "phone","sector","size","location","founded","specialisation"]

**Get the already scraped data**

In [54]:
# Open the spreadsheet and get the specific sheet
worksheet_name = "Sheet1"

worksheet = gc.open_by_url("https://docs.google.com/spreadsheets/d/15FWNNLaJW-QU9OKtN3KmrWEcGf7ZeWAAm4_05leRz9I/edit#gid=0").worksheet(worksheet_name)

# Get the values from the sheet
rows = worksheet.get_all_values()

columns_to_keep = [0,1,2,3,4,5,6,7,8]
selected_rows = [[row[i] for i in columns_to_keep] for row in rows[1:]]

# Convert to a DataFrame
df_scraped = pd.DataFrame.from_records(selected_rows,columns=new_columns_order)

**Create "scraped_companies" : list of tuples (index, company) -> to keep track of which companies have already been searched on Linkedin**

In [55]:
scraped_companies = list()
for index, company in enumerate(df_merged["name"]):
    if company == df_scraped['name'].tail(1).values[0]:
        break
    else:
        scraped_companies.append((index,company))

IndexError: index 0 is out of bounds for axis 0 with size 0

 **Scrape the data and load it in a dataframe (df_scraped)**

In [56]:
new_info = data_enrichment(df_merged, scraped_companies)

**Add new data to df_scraped**

In [78]:
if len(new_info) > 0:
    # Create a DataFrame from all_dict
    new_data = pd.DataFrame(all_dict)
    new_data.columns=new_columns
    new_data = new_data[new_columns_order]
    new_data['phone'] = new_data['phone'].astype(str)
    new_data['phone'] = new_data['phone'].apply(lambda x: "'" + x)
    new_data['phone'] = new_data['phone'].replace("'nan", np.nan)

    # Concatenate df_scraped and new_data
    df_scraped = pd.concat([df_scraped, new_data], ignore_index=True).drop_duplicates(subset='name')
    df_scraped.reset_index(inplace=True)
    df_scraped = df_scraped.drop('index', axis=1)
df_scraped

Unnamed: 0,name,summary,website,phone,sector,size,location,founded,specialisation
0,Albert-Ludwigs-Universität,Contact: info(at)uni-freiburg.de Imprint: htt...,http://www.uni-freiburg.de/,'+49761203-0,Research Services,"5,001-10,000 employees","Freiburg, Baden-Württemberg",1457,"Wissenschaft, Forschung, Lehre, Technologietra..."
1,ATG biosynthetics GmbH,ATG is a synthetic biology and biotech compan...,https://www.atg-biologics.com/,'+49 7618889424,Biotechnology Research,2-10 employees,"Merzhausen, Baden Wurttemberg",2001,"gene design and synthesis, biopeptide expressi..."
2,BBI Solutions,BBI Solutions - Serving the Science of Diagno...,https://www.bbisolutions.com,'+44 (0)1495 363000,Biotechnology Research,201-500 employees,"Crumlin, Gwent",1986,"Gold Nanoparticles, Assay Reagents, Contract L..."
3,BioCopy Gmbh,BioCopy is a multinational company headquarte...,http://www.biocopy.com,,Biotechnology Research,11-50 employees,"Emmendingen, Baden-Württemberg",,"Binding Kinetics, Microarrays, and Screening"
4,BioFluidix GmbH,LOW VOLUME LIQUID HANDLING AUTOMATION BioFlui...,http://www.biofluidix.com/,'0761 4589380,Biotechnology,11-50 employees,"Freiburg im Breisgau, Baden-Württemberg",2005,"LifeScience, Dispensing Technologie, Non-Conta..."
...,...,...,...,...,...,...,...,...,...
1507,Sleepio,Sleepio is the digital sleep improvement prog...,http://www.sleepio.com,,Mental Health Care,2-10 employees,,,
1508,Kaiku Health,Kaiku Health is a digital health company aimi...,http://www.kaikuhealth.com,,Medical Equipment Manufacturing,51-200 employees,"Helsinki, Uusimaa",2012,"cancer care, digital health, ehealth, occupati..."
1509,Remente,Remente is a holistic wellbeing system for in...,https://remente.com/,,Wellness and Fitness Services,2-10 employees,,2011,"Software Development, Quantified Self, Interne..."
1510,Zava,Remente is a holistic wellbeing system for in...,https://remente.com/,,Wellness and Fitness Services,2-10 employees,,2011,"Software Development, Quantified Self, Interne..."


**Merge df_scraped with df_biocat and clean**

In [79]:
df_scraped = pd.concat([df_scraped, df_biocat], ignore_index=True).drop_duplicates(subset='name')
df_scraped['founded'] = df_scraped['founded'].replace('', np.nan)

In [80]:
df_scraped

Unnamed: 0,name,summary,website,phone,sector,size,location,founded,specialisation,contact_person,email
0,Albert-Ludwigs-Universität,Contact: info(at)uni-freiburg.de Imprint: htt...,http://www.uni-freiburg.de/,'+49761203-0,Research Services,"5,001-10,000 employees","Freiburg, Baden-Württemberg",1457,"Wissenschaft, Forschung, Lehre, Technologietra...",,
1,ATG biosynthetics GmbH,ATG is a synthetic biology and biotech compan...,https://www.atg-biologics.com/,'+49 7618889424,Biotechnology Research,2-10 employees,"Merzhausen, Baden Wurttemberg",2001,"gene design and synthesis, biopeptide expressi...",,
2,BBI Solutions,BBI Solutions - Serving the Science of Diagno...,https://www.bbisolutions.com,'+44 (0)1495 363000,Biotechnology Research,201-500 employees,"Crumlin, Gwent",1986,"Gold Nanoparticles, Assay Reagents, Contract L...",,
3,BioCopy Gmbh,BioCopy is a multinational company headquarte...,http://www.biocopy.com,,Biotechnology Research,11-50 employees,"Emmendingen, Baden-Württemberg",,"Binding Kinetics, Microarrays, and Screening",,
4,BioFluidix GmbH,LOW VOLUME LIQUID HANDLING AUTOMATION BioFlui...,http://www.biofluidix.com/,'0761 4589380,Biotechnology,11-50 employees,"Freiburg im Breisgau, Baden-Württemberg",2005,"LifeScience, Dispensing Technologie, Non-Conta...",,
...,...,...,...,...,...,...,...,...,...,...,...
1707,TripMedic,TripMedic is a multilingual medical service pr...,https://tripmedic.com/,',Digital Health,,"VIlafranca, 708024 Barcelona BarcelonaEspanya",2019,Health Services Search,"Filippo Meloni, CEO",info@tripmedic.com
1708,UhDa Health (Universal Health Digital Access),Spin-off company of Universal Doctor. We work ...,https://www.uhda.health/,',Digital Health,,Carrer de Luis Antúnez 608006 Barcelona Barcel...,2007,,,
1709,Universal Doctor,UniversalDoctor Project facilitates multilingu...,http://www.universaldoctor.com/,'+34 68 6 6 07 819,Digital Health,,"Carrer Lluis Antúnez, 6 (Kubik)08006 Barcelona...",2007,Healthcare Mobile Communication,"Jordi Serrano Pons, Founder & CEO",info@universaldoctor.com
1710,Up2Smart,UP2Smart focuses on the area of computer visio...,https://www.up2smart.com/,',Digital Health,,"Avinguda dels Països Catalans, 1843007 Tarrago...",2019,Medical Big Data & analytics,"Pau Puig, CEO",uptwosmart@gmail.com


**Loading the scraped data into a spreadsheet**

In [81]:
#Export the dataframe into a google sheets

#Open a new sheet
worksheet = gc.open_by_url("https://docs.google.com/spreadsheets/d/15FWNNLaJW-QU9OKtN3KmrWEcGf7ZeWAAm4_05leRz9I/edit#gid=0").sheet1
#Export our "df_partner_linked" dataframe into our worksheet
gd.set_with_dataframe(worksheet, df_scraped)

print('Data uploaded to Google Sheets!')

Data uploaded to Google Sheets!


# **Data filtering**

**Filtering the data to keep only e-health startups**

In [82]:
# Convert 'founded' column to datetime format
df_scraped['founded'] = df_scraped['founded'].fillna(0).astype(int)

# Filter the dataframe to keep only companies founded after 2016
df_filtered = df_scraped[(df_scraped['founded'].isnull()) | (df_scraped['founded'] >= 2017)]

# Filter the dataframe to keep only rows with specific 'size' values
df_filtered['size'] = df_filtered['size'].fillna('missing')
df_filtered = df_filtered[df_filtered['size'].str.contains('0-1 employees|2-10 employees|11-50 employees|missing')]

df_filtered = df_filtered.reset_index()
df_filtered = df_filtered.drop("index",axis=1)
df_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['size'] = df_filtered['size'].fillna('missing')


Unnamed: 0,name,summary,website,phone,sector,size,location,founded,specialisation,contact_person,email
0,BioLabs Heidelberg,BioLabs Heidelberg is designed with life scie...,https://www.biolabs.io/heidelberg,,Biotechnology Research,11-50 employees,,2021,,,
1,CORAT Therapeutics GmbH,CORAT develops phage-display derived fully hu...,http://corat-therapeutics.com/,'+49 1522 4047488,Biotechnology Research,2-10 employees,"Brunswick, Lower Saxony",2020,,,
2,Docuply,Die GxP-Dokumenten­management- und Kollabo­ra...,https://docuply.io,,IT Services and IT Consulting,2-10 employees,,2022,,,
3,Evotec International GmbH,CEBINA Bridge Capital has entered into a coll...,https://www.cebinabridgecapital.com/,,Biotechnology Research,2-10 employees,"Gibraltar, Gibraltar",2021,,,
4,Genius Search,Genius Search is one of the leading personnel...,http://www.genius-search.com,,Staffing and Recruiting,2-10 employees,"Heidelberg, Deutschland",2018,"Executive Search, Board Services, Talent Pipel...",,
...,...,...,...,...,...,...,...,...,...,...,...
395,Trackyourmed,An innovative digital technology providing a n...,http://www.trackyourmed.com/,',Digital Health,missing,"Suissa, 9-1108023 Barcelona BarcelonaEspanya",2018,Patient Engagement,"Oriol Bestard Matamoros, CEO & Founder",corporate@trackyourmed.com
396,Treat,Treat is a Health Community segmented by topic...,https://www.treathealthcare.com.au/treat-app,',Digital Health,missing,"Pau Claris 108, 4-208009 Barcelona BarcelonaEs...",2017,Mobile Fitness / Health Apps,,
397,Trialing,Trialing helps physicians to easily find the m...,https://www.trialing.org/,',Digital Health,missing,Carrer de la Diputació 4808015 Barcelona Barce...,2021,Health Services Search,,
398,TripMedic,TripMedic is a multilingual medical service pr...,https://tripmedic.com/,',Digital Health,missing,"VIlafranca, 708024 Barcelona BarcelonaEspanya",2019,Health Services Search,"Filippo Meloni, CEO",info@tripmedic.com


**ChatGPT API**

In [83]:
openai.organization = None
#API key for ChatGPT
openai.api_key = "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"

In [84]:
#Setting up the context for ChatGPT
messages = [
    {"role": "system", "content": "You will only answer with 'True' or 'False' to my questions"},
]

In [85]:
df_gpt = df_filtered.copy()
df_gpt['is_startup'] = pd.Series(np.nan, dtype='string', index=df_gpt.index)

**Filtering the startups using ChatGPT**

In [90]:
#ChatGPT will read each startup's summary and tell if its an e-health startup
for i, summary in enumerate(df_gpt['summary']):
    if pd.isna(df_gpt["is_startup"][i]):
        try:
            print(i)
            messages = []
            #Create the message
            message = f"{summary}. \nAccording to this paragraph, is this company/institution a relatively new startup in e-Health? Respond by only 'True' or 'False' and without punctuation. If you don't know, respond by 'NaN'"
            messages.append(
                {"role": "user", "content": message},
            )
            #Generate the response of ChatGPT
            chat = openai.ChatCompletion.create(
                model="gpt-3.5-turbo", messages=messages
            )

            chatgpt_response = chat.choices[0].message.content
            messages.append({"role": "assistant", "content": chatgpt_response})

            #Add the response of ChatGPT (True or False) to the  dataframe
            df_gpt["is_startup"][i] = chatgpt_response
        except:
            print("Error")
     

**Delete non startups**

In [92]:
# Drop rows where 'is_startup' is equal to 'False'
df_gpt = df_gpt[df_gpt['is_startup'] == 'True']
df_gpt = df_gpt.reset_index(drop=True)

In [93]:
df_gpt

Unnamed: 0,name,summary,website,phone,sector,size,location,founded,specialisation,contact_person,email,is_startup
0,PixelBiotech GmbH ​,PixelBiotech’s mission is to deliver cutting-...,https://www.pixelbiotech.com,,Biotechnology,2-10 employees,"Heidelberg, Baden-Württemberg",2018,"single molecule FISH, genetic testing, artific...",,,True
1,Tcelltech GmbH,"Tcelltech GmbH, a spin-out of the German Canc...",https://www.tcelltech.eu/,,Biotechnology Research,2-10 employees,"Mannheim, Baden-Württemberg",2022,"Cell Therapy, Cancer Vaccines, Non-viral Gene ...",,,True
2,Trialflow GmbH,"Clinical trials need to be made faster, smart...",http://www.trialflow.io,'+49 25159064698,IT Services and IT Consulting,2-10 employees,"Münster, Nordrhein-Westfalen",2020,,,,True
3,Trialzen,Clinical trials constitute a critical compone...,http://www.trialzen.com,,Software Development,2-10 employees,,2021,,,,True
4,Connected-Pathology,"Founded in 2021, C-Path, or Connected-Patholo...",https://www.connected-pathology.com/,,Biotechnology,2-10 employees,,2021,"pathology, omics, digitalization, preclinical,...",,,True
...,...,...,...,...,...,...,...,...,...,...,...,...
258,Trackyourmed,An innovative digital technology providing a n...,http://www.trackyourmed.com/,',Digital Health,missing,"Suissa, 9-1108023 Barcelona BarcelonaEspanya",2018,Patient Engagement,"Oriol Bestard Matamoros, CEO & Founder",corporate@trackyourmed.com,True
259,Treat,Treat is a Health Community segmented by topic...,https://www.treathealthcare.com.au/treat-app,',Digital Health,missing,"Pau Claris 108, 4-208009 Barcelona BarcelonaEs...",2017,Mobile Fitness / Health Apps,,,True
260,Trialing,Trialing helps physicians to easily find the m...,https://www.trialing.org/,',Digital Health,missing,Carrer de la Diputació 4808015 Barcelona Barce...,2021,Health Services Search,,,True
261,TripMedic,TripMedic is a multilingual medical service pr...,https://tripmedic.com/,',Digital Health,missing,"VIlafranca, 708024 Barcelona BarcelonaEspanya",2019,Health Services Search,"Filippo Meloni, CEO",info@tripmedic.com,True


In [94]:
#Export the dataframe into a google sheets

#Open a new sheet
worksheet = gc.open_by_url("https://docs.google.com/spreadsheets/d/1ouEg77SMB8RhD0AmgdbpD0sXlxTBcxknuzvyGKSvQOU/edit#gid=0").sheet1
#Export our "df_partner_linked" dataframe into our worksheet
gd.set_with_dataframe(worksheet, df_gpt)

print('Data uploaded to Google Sheets!')

Data uploaded to Google Sheets!
