In [15]:
import pandas as pd
import datetime
import time
import os
import requests
import codecs
import pdb
import sys
import filter_downloaded_html as fdh
import downloaded_6_20 as downloaded
import numpy as np
from bs4 import BeautifulSoup
import pprint as pp

Read the data

In [4]:
# Master list of startups (one row per company, keyed by `entityid`).
url_list = pd.read_csv('data/startup_url_list.csv')

# Each timestamp file name is assumed to be "<entityid>" followed by a fixed
# 15-character suffix -- TODO confirm against the files in data/optimal-timestamps.
TIMESTAMP_SUFFIX_LEN = 15

# Entity ids that have a timestamp file on disk.
timestamp_list = [
    int(file[:-TIMESTAMP_SUFFIX_LEN])
    for file in os.listdir('data/optimal-timestamps')
    # Hidden files (e.g. .DS_Store) carry no entity id and would make int() raise.
    if not file.startswith('.')
]

Find the companies whose HTML has already been downloaded, and keep only those (that also have a timestamp file)

(This is hardcoded for now to prevent any bugs)

In [5]:
# Entity ids that already have a download directory under data/html
# (hidden entries such as .DS_Store are ignored).
already_finished_crude = [int(f) for f in os.listdir('data/html') if not f.startswith('.')]

# Keep only companies that have both a timestamp file and downloaded HTML.
has_timestamps = url_list.entityid.isin(timestamp_list)
is_downloaded = url_list.entityid.isin(already_finished_crude)

# .copy() makes the result an independent frame, so later cells can add
# columns to it without pandas raising SettingWithCopyWarning.
url_list = url_list.loc[has_timestamps & is_downloaded].copy()

In [6]:
# Show the companies that remain after the filter above.
url_list

Unnamed: 0,entityid,weburl,pb_companyid,startdate,lastVC,ownershipstatus,exit_date
1490,58591,www.baylogics.com,,1987-06-01,2002-10-29,Acquired/Merged,2002-10-29
1946,60590,www.btspartners.com,,1998-07-01,2004-09-01,Out of Business,
1961,60654,www.bullant.com,,1995-01-01,2001-03-28,Out of Business,
1963,60659,www.bulldogit.com,,2000-12-01,2004-11-30,Private & Independent,
1965,60662,www.bullhorn.com,,1999-08-04,2012-06-14,Acquired/Merged,2012-06-14
...,...,...,...,...,...,...,...
71088,1049403881,www.kirilys.com,,2021-03-01,2023-01-25,Private & Independent,
71089,1049403944,www.prologue.xyz,,,2022-01-25,Private & Independent,
71101,1049406914,www.tea.xyz,,2021-01-01,2022-11-08,Private & Independent,
71157,1049417039,www.quadrata.com,,2021-01-01,2022-07-12,Private & Independent,


Select a founding year. Let's see which years have a lot of companies

In [7]:
# The founding year is the first four characters of the `startdate` string.
url_list['startyear'] = url_list['startdate'].str[:4]

# Number of companies per founding year, most common first.
url_list['startyear'].value_counts()

2020    229
2021    169
2019    155
2018     71
2017     59
2016     30
2015     30
2014     11
2013      8
2022      7
2012      7
2011      6
2007      6
2010      6
1998      5
2003      5
2006      5
2009      4
2005      4
1999      3
1976      2
2008      2
1992      2
1995      2
2001      2
2004      2
2000      2
1987      1
1989      1
1986      1
2002      1
1981      1
1990      1
1978      1
1997      1
1996      1
1967      1
1994      1
Name: startyear, dtype: int64

2015 has a fair number of companies, so we'll use that year.

In [8]:
# Restrict the working set to companies founded in 2015
# (`startyear` holds strings, hence the quoted year).
founded_in_2015 = url_list['startyear'].eq('2015')
url_list = url_list[founded_in_2015]

Now, let's create a bunch of statistics that describes this data. Some ideas:

1. How big is the website?
2. How much bigger/smaller is it than the original website?
3. How much bigger/smaller is it than the previous version of the website?
4. How many `<a>` tags does the website have?
5. What are the words within each `<a>` tag?
* Using BeautifulSoup, have a column containing an array of the innerText for each `<a>` tag within the website
6. Of the `<a>` tags, what proportion of the links are internal?
* This will also use beautiful soup.
7. Does it have a `<meta>` tag with name "description"? If so, what is its "content"?
8. Does it have a `<meta>` tag with name "keywords"? If so, what is its "content" (in array form)?
9. Look for copyright information...how outdated is it?

Two function definitions that will be useful:

But first, create the empty DataFrame that will hold the results

In [45]:
# Column layout of the per-capture metadata table, one name per line.
columns = [
    'entityid',
    'domain',
    'capture_yr',
    'capture_m',
    'time_from_start_m',
    'website_size_kb',
    'title',
    'num_a_tags',
    'a_innertext',
    'meta_description',
    'meta_keywords',
]

In [46]:
# Empty results table with the column layout defined in `columns`;
# `get_htmls` adds one row per captured page to this global frame.
webpage_metadata = pd.DataFrame(columns=columns)

In [47]:
def get_htmls(entityid, base_path):
    global webpage_metadata
    htmls = []
    co_directory = os.path.join(base_path, entityid)
    years = [f for f in os.listdir(co_directory) if not f.startswith('.')]
    for year in years:
            yr_directory = os.path.join(co_directory, year)
            for month in os.listdir(yr_directory):
                index_path = os.path.join(yr_directory, month, "index.html")
                if os.path.isfile(index_path):
                    #Get file size (kb)
                    website_size_kb = os.stat(index_path).st_size / 1024
                    with open(index_path, 'r') as file:
                        html_content = file.read()
                        soup = BeautifulSoup(html_content, 'html.parser')
                        # Call a bunch of functions that give this more data
                        a_tags = soup.find_all('a')
                        num_a_tags = len(a_tags)
                        a_inner_texts = []
                        for a_tag in a_tags:
                            at_txt = str(a_tag.text)
                            at_txt = at_txt.replace("\n", "")
                            if at_txt != "":
                                a_inner_texts.append(at_txt)
                        page_title = soup.title.text
                        htmls.append(soup)
                        has_meta_description_tag = (soup.find('meta', attrs={'name': 'description'}) is not None)
                        has_meta_keywords_tag = (soup.find('meta', attrs={'name': 'keywords'}) is not None)
                        data_to_add = {
                            'entityid': entityid,
                            'capture_yr': year,
                            'capture_m': month,
                            'time_from_start_m': None,
                            'website_size_kb': website_size_kb,
                            'title': page_title,
                            'num_a_tags': num_a_tags,
                            'a_innertext': a_inner_texts,
                            'meta_description': int(has_meta_description_tag),
                            'meta_keywords': int(has_meta_keywords_tag)
                        }
                        print("Adding:", pp.pprint(data_to_add))
                        webpage_metadata = webpage_metadata.append(data_to_add, ignore_index=True)
    return htmls

Execution

In [52]:
# Root directory of the downloaded captures (one sub-directory per company).
base_path = "data/html"

# Directory names are strings, so convert the ids before walking them.
companies = url_list["entityid"].astype(str)

for company in companies:
    # Iterate through HTMLs...basically fill in all the columns
    html_list = get_htmls(company, base_path)

{'a_innertext': ["We're making work meaningful everywhere with our expanded "
                 'international presence!Learn more',
                 'Watch exclusive RfH Virtual sessions featuring Adam Grant '
                 'and more!Watch On-demand',
                 'Lattice in Raconteur: Beat the Great Resignation with a '
                 'thriving cultureRead now â\x86\x92',
                 'Lattice OverviewInvest in your people by turning people '
                 'strategy into business strategy',
                 'App IntegrationsSeamless integrations with your favorite '
                 'software',
                 'PerformanceReviews, 1:1s, Updates, Feedback, and Praise that '
                 'make continuous performance a reality',
                 'EngagementSurveys, Pulses, and eNPS to understand and act on '
                 'how your employees feel about work',
                 'GoalsOKR and goal management that drives performance',
                 'GrowCompetenci

{'a_innertext': ['Request a demo',
                 'PerformanceContinuous performance management suite',
                 'ReviewsRun a feedback cycle',
                 'FeedbackGive helpful feedback',
                 'GoalsSet expectations & drive performance',
                 'PraiseCelebrate wins in public',
                 '1:1sStructure meetings',
                 'UpdatesAnswer status questions',
                 'EngagementSurveys combined with performance attributes',
                 'Discover the power of Lattice Analytics',
                 'People Management LibraryActionable advice for HR, managers, '
                 'employees & execs',
                 'ArticlesExplore articles',
                 'BooksExplore books',
                 'InterviewsExplore interviews',
                 'WebinarsExplore webinars',
                 'Resources for HumansSlack Community of People Operations '
                 'leaders',
                 'Sign In',
                 'Custom

{'a_innertext': ['Lattice',
                 'Products',
                 'PerformanceContinuous performance management solution',
                 'ReviewsRun a feedback cycle',
                 'GoalsSet expectations',
                 'FeedbackGive helpful feedback',
                 'PraiseCelebrate wins in public',
                 '1:1sStructure meetings',
                 'UpdatesAnswer status questions',
                 'Resources',
                 'LibraryBooks and videos on performance management',
                 'InterviewsConversations with people operations experts',
                 'CommunityJoin our Slack community to learn with industry '
                 'peers',
                 'Help CenterLearn how to use Lattice products',
                 'Customers',
                 'Blog',
                 'Sign in',
                 'Request a demo',
                 'Request a demo',
                 'Lattice',
                 'Request Demo',
                 'Products'

AttributeError: 'NoneType' object has no attribute 'text'

In [53]:
# Entity ids (as strings) that the execution loop iterates over.
companies

28081        674030
29645        683889
47102    1016894171
47716    1017906536
52729    1026782984
52894    1026822557
53099    1026877610
53428    1027782074
59321    1043793110
59539    1043829200
59751    1043860124
59762    1043861753
60938    1044326531
61120    1044869132
61144    1045002818
61505    1045432973
61514    1045435169
61843    1045507583
62274    1045851707
62357    1045865054
63321    1046221364
63745    1046313749
64078    1046400122
65205    1046714078
66141    1047046979
66177    1047055511
66379    1047101330
68604    1048396475
69320    1048935305
70320    1049178170
Name: entityid, dtype: object

In [49]:
# Inspect the metadata rows collected so far.
webpage_metadata

Unnamed: 0,entityid,domain,capture_yr,capture_m,time_from_start_m,website_size_kb,title,num_a_tags,a_innertext,meta_description,meta_keywords
0,674030,,2022,2,,123.238281,People Management | Performance and Engagement...,106,[We're making work meaningful everywhere with ...,1,0
1,674030,,2022,8,,176.769531,People Success Platform | Performance and Enga...,119,[US: SEP 21 Â | Â EMEA: OCT 06Virtual Confere...,1,0
2,674030,,2023,2,,198.650391,People Success Platform | Performance and Enga...,134,"[Products, Performance ManagementReviews, 1:1s...",1,0
3,674030,,2023,6,,201.636719,People Success Platform | Performance and Enga...,137,"[Products, Performance ManagementReviews, 1:1s...",1,0
4,674030,,2017,8,,44.780273,Lattice | Performance Management Software,70,"[Lattice, Products, ReviewsRun performance rev...",1,1
5,674030,,2019,8,,56.37793,People Management | Performance and Engagement...,88,"[Request a demo, PerformanceContinuous perform...",1,0
6,674030,,2019,1,,53.999023,People Management: Performance & Engagement So...,84,"[Request a demo, PerformanceContinuous perform...",1,0
7,674030,,2021,8,,91.258789,People Management | Performance and Engagement...,97,[ Join us Sept. 22 to hear Serena Wil...,1,0
8,674030,,2021,1,,80.78125,People Management | Performance and Engagement...,96,[\t\t\t\t\t Employee Development for the New W...,1,0
9,674030,,2020,2,,61.589844,People Management | Performance and Engagement...,114,[PerformanceContinuous Performance ManagementD...,1,0
