In [1]:
import os, random, sys, time, shutil, re, csv, datetime, pickle, math 

import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.webdriver import Options

from bs4 import BeautifulSoup

In [2]:
chrome_options = Options()
#chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--log-level=3")  # fatal
chrome_options.add_argument("--start-maximized")

In [3]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

browser = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324






[WDM] - Driver [/home/carlos.cayres/.wdm/drivers/chromedriver/linux64/88.0.4324.96/chromedriver] found in cache


In [4]:
companies_path = 'CSVs/clutch_companies.csv'

In [5]:
# Clutch IT Firms URL

clutch_url = 'https://clutch.co/it-services/outsourcing' 

In [6]:
#HTML Div class references (not using yet)

firms_number_html='tabs-info'
firms_list_html='company_logotype'
firm_name_html='header-company--title'
firm_score_html='ratingValue'
firm_location_html='location-name'
firm_clients_html='field-name-clients'
firm_client_name='field c'

In [7]:
def get_total_pages_at(browser):
    soup = BeautifulSoup(browser.page_source, features='html.parser')
    firms_tab_text = soup.find("div",attrs={"class" : "tabs-info"}, recursive=True).text
    firms_number = firms_tab_text.replace(',', '').replace('Firms', '').strip()
    return math.ceil(int(firms_number) / 19)

In [8]:
def find_url_references_at(browser):
    soup = BeautifulSoup(browser.page_source, features='html.parser')
    url_list = soup.find_all("div",attrs={"class" : "company_logotype"}, recursive=True)
    return [url.find('a')['href'] for url in url_list]

In [9]:
def get_companies_csv():
    try: return pd.read_csv(companies_path)
    except: return pd.DataFrame()

In [10]:
def collect_data_from(ref_list):
    df = get_companies_csv()
    for ref in ref_list:
        print('Collecting data from:', ref)
        data = colect_infos_at(ref)
        df = df.append(data, ignore_index=True)
    return df

In [11]:
def colect_infos_at(reference):
    browser.get(f"https://clutch.co{reference}")
    time.sleep(2)
    
    soup = BeautifulSoup(browser.page_source, features='html.parser')

    name = soup.find("h1", attrs={"class" : "header-company--title"}).text.replace('\n', '').strip()
    score = soup.find("span", attrs={"itemprop" : "ratingValue"}).text.replace('\n', '').strip()
    location = soup.find("span", attrs={"class" : "location-name"}).text.replace('\n', '').strip()
    
    main_div = soup.find("div", attrs={"class" : "module-list"}) 
    main_div.find_all("span")

    main_div = [i.text for i in main_div.find_all("span")]
    valuation = main_div[0]
    price_hour = main_div[1]
    employees = main_div[2]
    
    clients = ''
    try:
        clients = soup.find("div", attrs={"class" : "field-name-clients"})
        clients =  ' '.join([p.text.replace('\n', ' ').strip() for p in clients.find_all('p')])
    except:
        try:
            clients = soup.find("div", attrs={"class" : "field c"})
            clients = ' '.join([p.text.replace('\n', ' ').strip() for p in clients.find_all('p')])
        except:
            print(f"CLIENTES NÃO ENCONTRADOS EM {name}")
            
    return {'name': name,
            'valuation': valuation,
            'hour_price': price_hour,
            'employees': employees,
            'score': score,
            'location': location,
            'clients': clients}

In [12]:
def collect_companies():
    
    print('Accessing: ', clutch_url)
    browser.get(clutch_url)
    time.sleep(2)
    
    pages = 5 #get_total_pages_at(browser)
    print('Get: ', str(pages), 'pages')
    
    for i in range(1,pages):
        
        print('Accessing page: ', i)
        browser.get(f"https://clutch.co/it-services/outsourcing?page={i}")
        time.sleep(1)
        
        reference_list = find_url_references_at(browser)
        print('Find ', len(reference_list), 'references')
        
        clutch_df = collect_data_from(reference_list)
        clutch_df.to_csv(companies_path, index=False)    

In [13]:
collect_companies()

get_companies_csv()

Accessing:  https://clutch.co/it-services/outsourcing
Get:  5 pages
Accessing page:  1
Find  20 references
Collecting data from: /profile/clickit-smart-technologies
Collecting data from: /profile/delaplex
CLIENTES NÃO ENCONTRADOS EM delaPlex
Collecting data from: /profile/sysbee
Collecting data from: /profile/n-ix
Collecting data from: /profile/avenga
CLIENTES NÃO ENCONTRADOS EM Avenga
Collecting data from: /profile/uplers
Collecting data from: /profile/grape-2
Collecting data from: /profile/techmagic
CLIENTES NÃO ENCONTRADOS EM TechMagic
Collecting data from: /profile/ibexlabs
Collecting data from: /profile/opinov8-technology-services
Collecting data from: /profile/binariks
Collecting data from: /profile/unicsoft
Collecting data from: /profile/chimera-prime
Collecting data from: /profile/mutual-mobile
Collecting data from: /profile/forte-group
Collecting data from: /profile/icreon
Collecting data from: /profile/experion-technologies
Collecting data from: /profile/devcom
Collecting dat

Unnamed: 0,clients,employees,hour_price,location,name,score,valuation
0,Ticketbooth Boatsetter Trumedia Blade IBSS Cu...,10 - 49,$25 - $49 / hr,"Saltillo, Mexico",ClickIT Smart Technologies,4.9,"$5,000+"
1,,250 - 999,$25 - $49 / hr,"Atlanta, GA",delaPlex,5.0,"$25,000+"
2,"RTL Hrvatska, Pro-bit, Dudubags, Veliki Odmor",2 - 9,$50 - $99 / hr,"Pula, Croatia",Sysbee,5.0,"$1,000+"
3,"Fortune 500 companies, Gogo, Fluke, TuneIn, H...","1,000 - 9,999",$50 - $99 / hr,"Lviv, Ukraine",N-iX,4.9,"$100,000+"
4,,"1,000 - 9,999",$50 - $99 / hr,"Rochelle Park, NJ",Avenga,4.8,"$50,000+"
...,...,...,...,...,...,...,...
75,"PwC, Beckman Coulter, Grundfos, AIDS Healthca...",250 - 999,$25 - $49 / hr,"Richardson, TX",Impiger Technologies,4.7,"$25,000+"
76,,10 - 49,$50 - $99 / hr,"Warsaw, Poland",Evojam,4.9,"$10,000+"
77,300+ midsize to large corporations including:...,50 - 249,$50 - $99 / hr,"Krakow, Poland",AMC TECH,5.0,"$10,000+"
78,"GetCompliant, Homer, Rule, FundedByMe, Trailh...",50 - 249,$25 - $49 / hr,"Kharkov, Ukraine",Vilmate LLC,5.0,"$25,000+"


In [14]:
df_clean = get_companies_csv().drop_duplicates(subset=['name'])

In [15]:
df_clean.sort_values(by=['score'], ascending=False)

Unnamed: 0,clients,employees,hour_price,location,name,score,valuation
38,"Google/QwikLabs, Teladoc/Livongo, PaloAltoNet...",250 - 999,< $25 / hr,"Campbell, CA",Tudip Technologies Pvt. Ltd.,5.0,"$1,000+"
28,"Redbull, Hilton, Sony Music, Boy Scouts of Am...",50 - 249,$25 - $49 / hr,"Orlando, FL",Simform,5.0,"$10,000+"
26,"Gotham Web, Telappliant, Shred Labs, Neopix, ...",50 - 249,$50 - $99 / hr,"Belgrade, Serbia",SuperAdmins,5.0,"$1,000+"
25,"Coca-Cola, ebay, Vodafone, JLL, SKF, Brightco...",250 - 999,$25 - $49 / hr,"Sunnyvale, CA",Algoworks,5.0,"$5,000+"
24,,10 - 49,$50 - $99 / hr,"Kyiv, Ukraine",Alpacked,5.0,"$5,000+"
...,...,...,...,...,...,...,...
75,"PwC, Beckman Coulter, Grundfos, AIDS Healthca...",250 - 999,$25 - $49 / hr,"Richardson, TX",Impiger Technologies,4.7,"$25,000+"
37,"First Data, Generali, IGT, Hewlett Packard, R...",250 - 999,$50 - $99 / hr,"Warsaw, Poland",Britenet,4.7,"$1,000+"
59,"Our clients include BioMensio, Ensto Oy, The ...",50 - 249,$50 - $99 / hr,"Poznań, Poland",Espeo Software,4.7,"$10,000+"
58,"LifeTime Fitness, Daikin, WEX Health, MTS Sys...","1,000 - 9,999",$25 - $49 / hr,"Minneapolis, MN",Coherent Solutions,4.6,"$50,000+"


In [16]:
df_clean.sort_values(by=['valuation'], ascending=False)

Unnamed: 0,clients,employees,hour_price,location,name,score,valuation
36,"Stratasys, UASC, HLB International, CMG Financ...",50 - 249,$50 - $99 / hr,"Minsk, Belarus",Brimit,5.0,Undisclosed
61,"VES Environmental Solutions, Milk Specialties ...",10 - 49,$50 - $99 / hr,"Montevideo, Uruguay",Octobot,4.9,Undisclosed
20,"Willis Towers Watson, Bridgestone, Bank al Eti...",50 - 249,$25 - $49 / hr,"Kyiv, Ukraine",Diceus,4.8,"$50,000+"
15,"IMG Models, National Geographic Channel, FOX ...",250 - 999,$50 - $99 / hr,"New York, NY",Icreon,5.0,"$50,000+"
14,"BMO Harris Bank, Serta Simmons Bedding, CNH I...",250 - 999,$50 - $99 / hr,"Chicago, IL",Forte Group,4.9,"$50,000+"
...,...,...,...,...,...,...,...
38,"Google/QwikLabs, Teladoc/Livongo, PaloAltoNet...",250 - 999,< $25 / hr,"Campbell, CA",Tudip Technologies Pvt. Ltd.,5.0,"$1,000+"
37,"First Data, Generali, IGT, Hewlett Packard, R...",250 - 999,$50 - $99 / hr,"Warsaw, Poland",Britenet,4.7,"$1,000+"
56,"Content Analytics, AlgoTrader, Podcast Host, ...",50 - 249,$50 - $99 / hr,"Kyiv, Ukraine",OpsWorks Co.,5.0,"$1,000+"
26,"Gotham Web, Telappliant, Shred Labs, Neopix, ...",50 - 249,$50 - $99 / hr,"Belgrade, Serbia",SuperAdmins,5.0,"$1,000+"


In [17]:
df_clean.sort_values(by=['employees'], ascending=False)

Unnamed: 0,clients,employees,hour_price,location,name,score,valuation
29,"Pacific Life Global, Cox Communications, Roya...",50 - 249,$25 - $49 / hr,"Malvern, PA",AllianceTek,4.8,"$5,000+"
17,DevCom provides complex and cost-effective ye...,50 - 249,$25 - $49 / hr,"Lviv, Ukraine",DevCom,4.9,"$25,000+"
31,Charles Schwab IMRF Credible Behavioral Healt...,50 - 249,$50 - $99 / hr,"Columbia, MD",Inflection Point,4.9,"$25,000+"
72,Nybble Group - Our Services,50 - 249,$25 - $49 / hr,"Miami, FL",Nybble Group,4.9,"$5,000+"
59,"Our clients include BioMensio, Ensto Oy, The ...",50 - 249,$50 - $99 / hr,"Poznań, Poland",Espeo Software,4.7,"$10,000+"
...,...,...,...,...,...,...,...
68,"BMW, Bosch, Claro, Corvalent, Electrolux, Mad...","1,000 - 9,999",$50 - $99 / hr,"Miami, FL",CINQ Technologies,4.8,"$10,000+"
45,Over the past 18 years we collaborated with F...,"1,000 - 9,999",$50 - $99 / hr,"Kyiv, Ukraine",Sigma Software,4.7,"$50,000+"
4,,"1,000 - 9,999",$50 - $99 / hr,"Rochelle Park, NJ",Avenga,4.8,"$50,000+"
3,"Fortune 500 companies, Gogo, Fluke, TuneIn, H...","1,000 - 9,999",$50 - $99 / hr,"Lviv, Ukraine",N-iX,4.9,"$100,000+"
