# AWS Client Analysis


### Objective: 
- Scrape AWS case study webpage to get 2020 AWS clients

#### Functions in this notebook:
- Similar to the `collect_data` notebook, but it pulls the data from HTML using the Selenium library

In [1]:
import os
import time
import pprint
import pickle
import requests
import pandas as pd

<font color=red>__Please note that the code cell _below_ was used in getting the data__. I did this on my local machine to retrieve the data. All other cells should run as I have included the retrieved data in my data folder.</font> 

In [2]:
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Project root on the JupyterHub instance where this notebook was written.
# The relative "data/..." paths used further down are resolved against the
# working directory, so we switch to the project root when it is available.
# On any other machine the directory will not exist; instead of aborting the
# whole notebook with FileNotFoundError we keep the current working directory.
PROJECT_DIR = "/Commjhub/jupyterhub/comm318_fall2019/daniellegin/comm318_F20/comm318_F20_Final_Project"
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
else:
    print("Project directory {} not found; staying in {}".format(PROJECT_DIR, os.getcwd()))

In [4]:
class Company:
    """Plain record holding one company's name, year and sector.

    The three constructor arguments are stored unchanged as the attributes
    ``name``, ``year`` and ``sector``.
    """

    def __init__(self, name, year, sector):
        self.name, self.year, self.sector = name, year, sector

    def __str__(self):
        """Return ``Year: ..``, ``Name: ..`` and ``Sector: ..`` joined by tabs."""
        labelled_fields = (
            ("Year", self.year),
            ("Name", self.name),
            ("Sector", self.sector),
        )
        return "\t".join(
            "{}: {}".format(label, value) for label, value in labelled_fields
        )

In [5]:
def get_driver(startup_url):
    """Start a Chrome WebDriver session and optionally open a page.

    Args:
        startup_url: URL to load right after the browser starts. Any falsy
            value (``None``, ``""``) skips the initial navigation.

    Returns:
        The Chrome driver object created by ``webdriver.Chrome``.

    Raises:
        ImportError: if ``selenium`` or ``webdriver_manager`` is not installed.
    """
    # Imported here rather than at the top of the notebook: the notebook-level
    # Selenium imports are commented out so the other cells run without
    # Selenium installed, which previously left this function failing with
    # NameError whenever it was called.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from webdriver_manager.chrome import ChromeDriverManager

    options = Options()
    options.add_argument("window-size={},{}".format(1280, 1000))
    # NOTE(review): passing the driver path positionally is the Selenium 3
    # calling convention; newer Selenium releases expect a Service object
    # instead - confirm against the installed version before re-scraping.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    if startup_url:
        driver.get(startup_url)
    return driver

In [6]:
def get_year(web_element):
    """Return the text of the first ``m-info-txt`` element inside a card.

    Args:
        web_element: object exposing ``find_elements_by_class_name``.

    Returns:
        The ``.text`` of the first match, or ``None`` when the card has no
        such element (the same convention ``get_name`` uses for a missing
        image), instead of raising IndexError and aborting the whole scrape.
    """
    matches = web_element.find_elements_by_class_name("m-info-txt")
    if not matches:
        return None
    return matches[0].text

In [7]:
def get_name(web_element):
    """Return the ``alt`` attribute of the first ``<img>`` inside a card.

    Args:
        web_element: object exposing ``find_elements_by_tag_name``.

    Returns:
        The ``alt`` attribute of the first image, or ``None`` when the card
        contains no image at all.
    """
    images = web_element.find_elements_by_tag_name("img")
    return images[0].get_attribute("alt") if images else None

In [8]:
def get_sector(web_element):
    """Return the text of the first ``m-category`` element inside a card.

    Args:
        web_element: object exposing ``find_elements_by_class_name``.

    Returns:
        The ``.text`` of the first match, or ``None`` when the card has no
        such element (the same convention ``get_name`` uses for a missing
        image), instead of raising IndexError and aborting the whole scrape.
    """
    matches = web_element.find_elements_by_class_name("m-category")
    if not matches:
        return None
    return matches[0].text

In [9]:
def get_company_list():
    """Scrape the AWS case-study listing and return the companies found.

    Opens the listing in a browser via ``get_driver``, reads every
    ``m-card-container`` element on the current page, then clicks the first
    ``m-icon-angle-right`` element to move on. The loop ends as soon as that
    element cannot be found or clicked.

    Returns:
        A list of ``Company`` objects, one per card for which ``get_name``
        returned a value; cards without a name are skipped.
    """
    driver = get_driver(startup_url="https://aws.amazon.com/solutions/case-studies/?customer-references-cards.sort-by=item.additionalFields.publishedDate&customer-references-cards.sort-order=desc&awsf.customer-references-location=location%23americas&customer-references-cards.q=2020&customer-references-cards.q_operator=AND&awsm.page-customer-references-cards=1")

    page_count = 1
    company_list = []
    try:
        while True:
            # Short pause before reading the cards.
            # NOTE(review): presumably this gives the page time to render - confirm.
            time.sleep(.5)

            div_list = driver.find_elements_by_class_name("m-card-container")
            print("Reading Page {}: {} articles found!".format(page_count, len(div_list)))
            page_count += 1

            for div in div_list:
                year = get_year(div)
                name = get_name(div)
                sector = get_sector(div)
                if name is not None:
                    company_list.append(Company(name, year, sector))

            try:
                driver.find_elements_by_class_name("m-icon-angle-right")[0].click()
            except Exception:
                # Best effort: a missing or unclickable "next" control ends the
                # crawl. Catching Exception rather than using a bare except
                # lets Ctrl-C (KeyboardInterrupt) still interrupt the run.
                break
    finally:
        # Always shut the browser down, even when scraping fails midway, so
        # no Chrome / chromedriver process is left running.
        driver.quit()
    return company_list

In [10]:
def save_companies(company_list, save_dir="data/company_data"):
    """Pickle each company to ``<save_dir>/company_<index>.p``.

    Files that already exist are left untouched, so re-running never
    overwrites previously saved data.

    Args:
        company_list: iterable of picklable objects; the position in the
            iterable becomes the index in the file name.
        save_dir: target directory. Created (including parents) when missing.
            Defaults to the notebook's original ``data/company_data``.
    """
    # Create the directory up front; without this the first write fails with
    # FileNotFoundError on a fresh checkout.
    os.makedirs(save_dir, exist_ok=True)
    for i, company in enumerate(company_list):
        print("Attempting to save {}... ".format(company), end="")
        save_file_path = "{}/company_{}.p".format(save_dir, i)
        if not os.path.exists(save_file_path):
            # The context manager closes (and flushes) the file deterministically.
            with open(save_file_path, "wb") as out_file:
                pickle.dump(company, out_file)
            print("saved!")
        else:
            print("exists!")

In [11]:
def get_saved_companies(source_dir):
    """Load every non-empty ``*.p`` pickle found directly in ``source_dir``.

    Args:
        source_dir: directory to scan (not recursive).

    Returns:
        A list of the unpickled objects, ordered by file name so the result
        is the same on every run and platform (``os.listdir`` itself returns
        entries in arbitrary order).
    """
    company_list = []
    for file_name in sorted(os.listdir(source_dir)):
        file_path = os.path.join(source_dir, file_name)
        # Zero-byte files are skipped because unpickling them raises EOFError.
        if file_name.endswith(".p") and os.path.getsize(file_path) > 0:
            # SECURITY: pickle.load can execute arbitrary code; only point
            # this at files this project wrote itself.
            with open(file_path, "rb") as in_file:
                company_list.append(pickle.load(in_file))
    return company_list

In [12]:
def main():
    """Print a per-sector summary of the saved companies.

    Loads the pickled companies from ``data/company_data``, groups them by
    their ``sector`` attribute, orders the groups from largest to smallest
    and prints each sector with its size. For the first three sectors the
    member company names are printed as well.
    """
    # company_list = get_company_list()
    # save_companies(company_list)

    companies = get_saved_companies(source_dir="data/company_data")

    # Bucket the companies by sector, keeping first-seen order of the sectors.
    by_sector = {}
    for entry in companies:
        by_sector.setdefault(entry.sector, []).append(entry)

    # Ascending stable sort by group size, then flipped end-to-end, so the
    # largest sectors come first (ties end up in reverse first-seen order).
    smallest_first = sorted(by_sector.items(), key=lambda pair: len(pair[1]))
    by_sector = dict(smallest_first[::-1])

    print("Number of sectors: {}".format(len(by_sector)))
    for rank, (sector, members) in enumerate(by_sector.items()):
        print("\t{}: {} in sector".format(sector, len(members)))
        if rank < 3:
            for entry in members:
                print("\t\t{}".format(entry.name))

# 2020 AWS Clients And Their Sectors


In [13]:
# main()