In [4]:
import os, random, sys, time, shutil, re, csv, datetime, pickle, math 

import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.webdriver import Options

from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup

In [2]:
chrome_options = Options()
#chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--log-level=3")  # fatal
chrome_options.add_argument("--start-maximized")

In [None]:
browser = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

In [4]:
companies_path = 'CSVs/clutch_companies.csv'

In [5]:
# Clutch IT Firms URL
clutch_url = 'https://clutch.co/it-services/outsourcing' 

In [6]:
#HTML Div class references (not using yet)

firms_number_html='tabs-info'
firms_list_html='company_logotype'
firm_name_html='header-company--title'
firm_score_html='ratingValue'
firm_location_html='location-name'
firm_clients_html='field-name-clients'
firm_client_name='field c'

In [7]:
def get_companies_csv():
    try: return pd.read_csv(companies_path)
    except: return pd.DataFrame()

In [8]:
def get_total_pages_at(browser):
    soup = BeautifulSoup(browser.page_source, features='html.parser')
    firms_tab_text = soup.find("div",attrs={"class" : "tabs-info"}, recursive=True).text
    firms_number = firms_tab_text.replace(',', '').replace('Firms', '').strip()
    return math.ceil(int(firms_number) / 19)

In [9]:
def find_url_references_at(browser):
    soup = BeautifulSoup(browser.page_source, features='html.parser')
    url_list = soup.find_all("div",attrs={"class" : "company_logotype"}, recursive=True)
    return [url.find('a')['href'] for url in url_list]

In [10]:
def collect_data_from(ref_list):
    df = get_companies_csv()
    for ref in ref_list:
        print('Collecting data from:', ref)
        data = colect_infos_at(ref)
        df = df.append(data, ignore_index=True)
    return df

In [11]:
def colect_infos_at(reference):
    browser.get(f"https://clutch.co{reference}")
    time.sleep(2)
    
    soup = BeautifulSoup(browser.page_source, features='html.parser')

    try:
        name = soup.find("h1", attrs={"class" : "header-company--title"}).text.replace('\n', '').strip()
    except:
        name = 'not found'
    
    try:
        score = soup.find("span", attrs={"itemprop" : "ratingValue"}).text.replace('\n', '').strip()
    except:
        score = 'not found' 
        
    try:
        location = soup.find("span", attrs={"class" : "location-name"}).text.replace('\n', '').strip()
    except:
        location = 'not found'
        
    try:
        main_div = soup.find("div", attrs={"class" : "module-list"}) 
        main_div.find_all("span")
        main_div = [i.text for i in main_div.find_all("span")]
        
        valuation = main_div[0]
        price_hour = main_div[1]
        employees = main_div[2]
    
    except:
        valuation = 'not found'
        price_hour = 'not found'
        employees = 'not found'
    
    clients = ''
    try:
        clients = soup.find("div", attrs={"class" : "field-name-clients"})
        clients =  ' '.join([p.text.replace('\n', ' ').strip() for p in clients.find_all('p')])
    except:
        try:
            clients = soup.find("div", attrs={"class" : "field c"})
            clients = ' '.join([p.text.replace('\n', ' ').strip() for p in clients.find_all('p')])
        except:
            print(f"CLIENTES NÃO ENCONTRADOS EM {name}")
            
    return {'name': name,
            'valuation': valuation,
            'hour_price': price_hour,
            'employees': employees,
            'score': score,
            'location': location,
            'clients': clients}

In [12]:
def collect_companies(lim_search: int = 1000):
    
    print('Accessing: ', clutch_url)
    browser.get(clutch_url)
    time.sleep(2)
    
    pages_limit = int(lim_search/20)
    pages_founded = get_total_pages_at(browser)
    pages = min(pages_founded,pages_limit)
    
    print('Get: ', str(pages), 'pages')
    
    for i in range(1,pages):
        
        print('Accessing page: ', i)
        browser.get(f"https://clutch.co/it-services/outsourcing?page={i}")
        time.sleep(1)
        
        reference_list = find_url_references_at(browser)
        print('Find ', len(reference_list), 'references')
        
        clutch_df = collect_data_from(reference_list)
        clutch_df.to_csv(companies_path, index=False)
    
    return clutch_df

In [13]:
collect_companies()

get_companies_csv()

Accessing:  https://clutch.co/it-services/outsourcing
Get:  50 pages
Accessing page:  1
Find  20 references
Collecting data from: /profile/clickit-smart-technologies
Collecting data from: /profile/delaplex
CLIENTES NÃO ENCONTRADOS EM delaPlex
Collecting data from: /profile/sysbee
Collecting data from: /profile/n-ix
Collecting data from: /profile/avenga
CLIENTES NÃO ENCONTRADOS EM Avenga
Collecting data from: /profile/uplers
Collecting data from: /profile/grape-2
Collecting data from: /profile/techmagic
CLIENTES NÃO ENCONTRADOS EM TechMagic
Collecting data from: /profile/ibexlabs
Collecting data from: /profile/opinov8-technology-services
Collecting data from: /profile/binariks
Collecting data from: /profile/unicsoft
Collecting data from: /profile/chimera-prime
Collecting data from: /profile/mutual-mobile
Collecting data from: /profile/forte-group
Collecting data from: /profile/icreon
Collecting data from: /profile/experion-technologies
Collecting data from: /profile/devcom
Collecting da

Collecting data from: /profile/dysnix
Collecting data from: /profile/maven-wave-partners
Collecting data from: /profile/marlabs
Collecting data from: /profile/locuz
CLIENTES NÃO ENCONTRADOS EM Locuz
Collecting data from: /profile/braineering-it-solutions
CLIENTES NÃO ENCONTRADOS EM Braineering IT Solutions
Collecting data from: /profile/instinctools
Accessing page:  9
Find  20 references
Collecting data from: /profile/lcloud-sp-z-oo
Collecting data from: /profile/vates
Collecting data from: /profile/it-svit
CLIENTES NÃO ENCONTRADOS EM IT Svit
Collecting data from: /profile/iscg
Collecting data from: /profile/softserve
Collecting data from: /profile/kyos
CLIENTES NÃO ENCONTRADOS EM Kyos
Collecting data from: /profile/your-server-doctor
Collecting data from: /profile/predica
Collecting data from: /profile/netsmartz
Collecting data from: /profile/maxitech
Collecting data from: /profile/code-inspiration
CLIENTES NÃO ENCONTRADOS EM Code Inspiration
Collecting data from: /profile/adastra
Col

Collecting data from: /profile/ronwell-digital
Accessing page:  16
Find  20 references
Collecting data from: /profile/andes-digital
CLIENTES NÃO ENCONTRADOS EM Andes Digital
Collecting data from: /profile/112hub
CLIENTES NÃO ENCONTRADOS EM 112HUB
Collecting data from: /profile/centling-technologies
Collecting data from: /profile/epsilon
CLIENTES NÃO ENCONTRADOS EM Epsilon
Collecting data from: /profile/ascendix-technologies
Collecting data from: /profile/chetu
Collecting data from: /profile/objectivity
Collecting data from: /profile/cygnet-infotech
Collecting data from: /profile/boreddevops
CLIENTES NÃO ENCONTRADOS EM boredDevOps
Collecting data from: /profile/venturedive
Collecting data from: /profile/scalex-technology-solutions
CLIENTES NÃO ENCONTRADOS EM Scalex Technology Solutions
Collecting data from: /profile/profile-software-services
Collecting data from: /profile/grupo-oruss
Collecting data from: /profile/maruti-techlabs
Collecting data from: /profile/brevitaz-systems
CLIENTES 

CLIENTES NÃO ENCONTRADOS EM The One Technologies
Collecting data from: /profile/teonite
Collecting data from: /profile/focusteck
Collecting data from: /profile/kairos-0
Collecting data from: /profile/cimpleo
Collecting data from: /profile/surekha-technologies
Accessing page:  23
Find  20 references
Collecting data from: /profile/zimmic
CLIENTES NÃO ENCONTRADOS EM Zimmic
Collecting data from: /profile/brightgrove
Collecting data from: /profile/shalb
Collecting data from: /profile/7edge
Collecting data from: /profile/tripleoak
Collecting data from: /profile/libnamic
Collecting data from: /profile/opgk-software
CLIENTES NÃO ENCONTRADOS EM OPGK Software
Collecting data from: /profile/100-crm
CLIENTES NÃO ENCONTRADOS EM 100 CRM
Collecting data from: /profile/techstern
Collecting data from: /profile/mazaj
Collecting data from: /profile/mellivora-software
Collecting data from: /profile/alignminds-technologies
Collecting data from: /profile/magedirect
Collecting data from: /profile/further-dig

Accessing page:  30
Find  20 references
Collecting data from: /profile/vrp-consulting
Collecting data from: /profile/vti-cloud
CLIENTES NÃO ENCONTRADOS EM VTI Cloud
Collecting data from: /profile/evolpe-consulting-group
Collecting data from: /profile/cartesian-consulting
Collecting data from: /profile/gtm-plus
Collecting data from: /profile/sword-software-n-technologies
Collecting data from: /profile/direction-software-solutions
Collecting data from: /profile/qubit-labs
Collecting data from: /profile/astrea-it-services
Collecting data from: /profile/1950labs
Collecting data from: /profile/addact-technologies
Collecting data from: /profile/grip-software-solutions
CLIENTES NÃO ENCONTRADOS EM GRIP Software Solutions
Collecting data from: /profile/groupbwt
Collecting data from: /profile/logix-technology
CLIENTES NÃO ENCONTRADOS EM Logix Technology
Collecting data from: /profile/highsolutions
Collecting data from: /profile/geek-solutions
CLIENTES NÃO ENCONTRADOS EM Geek-Solutions
Collecting

Collecting data from: /profile/pricewise
Collecting data from: /profile/right-information
Collecting data from: /profile/yameo
Collecting data from: /profile/intellisys-technology
Collecting data from: /profile/appscrip
Collecting data from: /profile/polytech-software
Collecting data from: /profile/basquare
Accessing page:  37
Find  20 references
Collecting data from: /profile/prolitus-technologies
CLIENTES NÃO ENCONTRADOS EM Prolitus Technologies
Collecting data from: /profile/galliot
Collecting data from: /profile/e-lietuva
CLIENTES NÃO ENCONTRADOS EM e-Lietuva
Collecting data from: /profile/eternal-web
CLIENTES NÃO ENCONTRADOS EM Eternal Web Pvt. Ltd.
Collecting data from: /profile/plusinfosys
Collecting data from: /profile/latentview-analytics
Collecting data from: /profile/efficiencie
Collecting data from: /profile/datalabs
Collecting data from: /profile/baltic-amadeus
CLIENTES NÃO ENCONTRADOS EM Baltic Amadeus
Collecting data from: /profile/develoop-software
Collecting data from:

CLIENTES NÃO ENCONTRADOS EM ISK Global
Collecting data from: /profile/relevant-software
Collecting data from: /profile/iwebservices
Collecting data from: /profile/revinfotech
Collecting data from: /profile/coobers
Collecting data from: /profile/devodrome
Collecting data from: /profile/parafernalia-interativa
Collecting data from: /profile/pupa-clic-technologies
Collecting data from: /profile/zeus-group
Collecting data from: /profile/nullgravity
Collecting data from: /profile/freeport-metrics
Collecting data from: /profile/alphosys-technologies
CLIENTES NÃO ENCONTRADOS EM Alphosys Technologies Pvt Ltd
Collecting data from: /profile/neufango-labs
Collecting data from: /profile/professional-staff-recruitment
Collecting data from: /profile/clurgo
Accessing page:  44
Find  20 references
Collecting data from: /profile/deepinspire
CLIENTES NÃO ENCONTRADOS EM DeepInspire
Collecting data from: /profile/chainzilla
Collecting data from: /profile/weblanddesign-agency
Collecting data from: /profile

CLIENTES NÃO ENCONTRADOS EM Synapse Team


Unnamed: 0,clients,employees,hour_price,location,name,score,valuation
0,Ticketbooth Boatsetter Trumedia Blade IBSS Cu...,10 - 49,$25 - $49 / hr,"Saltillo, Mexico",ClickIT Smart Technologies,4.9,"$5,000+"
1,,250 - 999,$25 - $49 / hr,"Atlanta, GA",delaPlex,5.0,"$25,000+"
2,"RTL Hrvatska, Pro-bit, Dudubags, Veliki Odmor",2 - 9,$50 - $99 / hr,"Pula, Croatia",Sysbee,5.0,"$1,000+"
3,"Fortune 500 companies, Gogo, Fluke, TuneIn, H...","1,000 - 9,999",$50 - $99 / hr,"Lviv, Ukraine",N-iX,4.9,"$100,000+"
4,,"1,000 - 9,999",$50 - $99 / hr,"Rochelle Park, NJ",Avenga,4.8,"$50,000+"
...,...,...,...,...,...,...,...
1195,,10 - 49,$25 - $49 / hr,"Kharkiv, Ukraine",Space Whale,5.0,"$25,000+"
1196,"TD Ameritrade, Bridgestone Firestone, Dell EM...",10 - 49,$100 - $149 / hr,"New York, NY",impakt Advisors,5.0,"$10,000+"
1197,,10 - 49,$25 - $49 / hr,"Stockholm, Sweden",Chimplie,4.5,"$5,000+"
1198,,2 - 9,< $25 / hr,"Jaipur, India",Kaptcha Softwares,5.0,"$1,000+"


In [18]:
df_clean = get_companies_csv().drop_duplicates(subset=['name'])
df_clean.to_csv(companies_path, index=False) 
df_clean

Unnamed: 0,clients,employees,hour_price,location,name,score,valuation
0,Ticketbooth Boatsetter Trumedia Blade IBSS Cu...,10 - 49,$25 - $49 / hr,"Saltillo, Mexico",ClickIT Smart Technologies,4.9,"$5,000+"
1,,250 - 999,$25 - $49 / hr,"Atlanta, GA",delaPlex,5.0,"$25,000+"
2,"RTL Hrvatska, Pro-bit, Dudubags, Veliki Odmor",2 - 9,$50 - $99 / hr,"Pula, Croatia",Sysbee,5.0,"$1,000+"
3,"Fortune 500 companies, Gogo, Fluke, TuneIn, H...","1,000 - 9,999",$50 - $99 / hr,"Lviv, Ukraine",N-iX,4.9,"$100,000+"
4,,"1,000 - 9,999",$50 - $99 / hr,"Rochelle Park, NJ",Avenga,4.8,"$50,000+"
...,...,...,...,...,...,...,...
1195,,10 - 49,$25 - $49 / hr,"Kharkiv, Ukraine",Space Whale,5.0,"$25,000+"
1196,"TD Ameritrade, Bridgestone Firestone, Dell EM...",10 - 49,$100 - $149 / hr,"New York, NY",impakt Advisors,5.0,"$10,000+"
1197,,10 - 49,$25 - $49 / hr,"Stockholm, Sweden",Chimplie,4.5,"$5,000+"
1198,,2 - 9,< $25 / hr,"Jaipur, India",Kaptcha Softwares,5.0,"$1,000+"


In [19]:
df_clean.sort_values(by=['score'], ascending=False)

Unnamed: 0,clients,employees,hour_price,location,name,score,valuation
373,"Nimble inc, GoodData, Open source",2 - 9,$25 - $49 / hr,"Shlapanice, Brno-Country District, Czech Republic",abstractR,not found,"$1,000+"
955,,"1,000 - 9,999",$25 - $49 / hr,"Sofia, Bulgaria",BULPROS,not found,"$5,000+"
854,,50 - 249,$150 - $199 / hr,"Madrid, Spain",Intelligence Partner,not found,Undisclosed
460,"Qwintry, Citrus, StilyoApps",10 - 49,< $25 / hr,"Kyiv, Ukraine",SYSTEM ADMINS PRO,5.0,"$1,000+"
546,XEQUALS,2 - 9,$150 - $199 / hr,"Wellington, New Zealand",CLVR Cloud Native Solutions,5.0,"$10,000+"
...,...,...,...,...,...,...,...
924,"Center for Action and Contemplation, MTOPO Pa...",50 - 249,$25 - $49 / hr,"Coral Springs, FL",Prakash Software Solutions,4.0,"$10,000+"
525,"Foot Locker, Wild Birds Unlimited, Brookstone...","1,000 - 9,999",Undisclosed,"Plantation, FL","Chetu, Inc.",3.8,"$10,000+"
901,"CloudOn, Dropbox, Audi, USAID, Bueno, Microso...",50 - 249,$50 - $99 / hr,"Milpitas, CA",IQVIS,3.5,"$5,000+"
657,BNP Paribas; Airbus; Seoullimited; Kaesa; Giv...,2 - 9,$150 - $199 / hr,"Seoul, South Korea",Kairos,3.5,"$1,000+"


In [22]:
df_clean.sort_values(by=['valuation'], ascending=True)

Unnamed: 0,clients,employees,hour_price,location,name,score,valuation
946,"ABInbev, UN, US State Gov't, Bank of India",10 - 49,< $25 / hr,"Kampala, Uganda",Efficiencie,5.0,"$1,000+"
676,"ECOMPLY.io, Coding Blog, Stakeholder Club Inc...",2 - 9,$25 - $49 / hr,"Munich, Germany",Connect Platform,4.8,"$1,000+"
400,,10 - 49,$50 - $99 / hr,"Wrocław, Poland",Fly On The Cloud,5.0,"$1,000+"
401,"Third Wave Network, Real Trade Imobiliare",2 - 9,$25 - $49 / hr,"Brasov, Romania",BKHosting,5.0,"$1,000+"
1108,,2 - 9,< $25 / hr,"Ahmedabad, India",Tech Mitraa,5.0,"$1,000+"
...,...,...,...,...,...,...,...
463,"selectspecs.com, ab.ua, autocentre.ua",10 - 49,$25 - $49 / hr,"Irpin, Ukraine",CoreQ,5.0,Undisclosed
1080,,10 - 49,$25 - $49 / hr,"L'viv, Ukraine",DeepInspire,5.0,Undisclosed
456,"EY (former Ernst&Young), Tele2, Effective Cov...",50 - 249,$25 - $49 / hr,"New York City, NY",Azoft,5.0,Undisclosed
1070,- http://www.olbg.com/ - http://www.viihealth...,10 - 49,$25 - $49 / hr,"Bucharest, Romania",Devodrome,4.6,Undisclosed


In [21]:
df_clean.sort_values(by=['employees'], ascending=False)

Unnamed: 0,clients,employees,hour_price,location,name,score,valuation
835,"12Labels Inc, Primetime Ventures Pty Ltd, Eyw...",Freelancer,< $25 / hr,"Solapur, India",CloudLab.in,4.9,"$1,000+"
846,I will add this later,Freelancer,$25 - $49 / hr,"Kolkata, India",ELOGICSOFT.COM,5.0,"$1,000+"
1170,"Investment Banking, Capital Markets, Financial...",Freelancer,$100 - $149 / hr,"London, United Kingdom",The London Software Testing Company,5.0,"$100,000+"
744,,Freelancer,$50 - $99 / hr,"Warsaw, Poland",Varstadt,5.0,"$1,000+"
1051,,50 - 249,Undisclosed,"Atlanta, GA",Kratikal Tech Pvt Ltd,4.9,Undisclosed
...,...,...,...,...,...,...,...
388,"Samsung, Unite Private Networks, Windstream, C...","1,000 - 9,999",$50 - $99 / hr,"Pittsford, NY",Netsmartz LLC,4.9,"$25,000+"
384,"Panasonic, Henry Schein Practice Solutions, B...","1,000 - 9,999",$100 - $149 / hr,"Austin, TX",SoftServe,4.8,"$50,000+"
955,,"1,000 - 9,999",$25 - $49 / hr,"Sofia, Bulgaria",BULPROS,not found,"$5,000+"
376,"Marlabs works with global enterprises, mid-si...","1,000 - 9,999",Undisclosed,"Piscataway, NJ",Marlabs Inc,4.0,"$100,000+"
