In [200]:
import pandas as pd
import os
import matplotlib.pyplot as plt

# Table of Contents

1. [Load data](#Load-data)
2. [Sample Selection Summary](#Summary-of-the-"sample-selection")
3. [Subdivisions](#Subdivisions)

## Load data

In [201]:
# Read the webpage-metadata CSV into the working frame `html`.
html = pd.read_csv(filepath_or_buffer='../data/2_final_webpage_metadata.csv')

How many HTML captures have we successfully parsed? Filter out all the ones we couldn't parse.

In [202]:
# Keep only rows where is_429 == 0, is_parsable == 1 and file_exists == 1,
# then report how many rows remain.
parsed_ok = (
    html['is_429'].eq(0)
    & html['is_parsable'].eq(1)
    & html['file_exists'].eq(1)
)
html = html.loc[parsed_ok]
print(html.shape[0])

644439


Drop those columns now...

In [203]:
# The three flag columns were only needed for the filter above; remove them.
html = html.drop(columns=['is_429', 'is_parsable', 'file_exists'])

In [204]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
html = html.drop('Unnamed: 0', axis='columns')

How many companies are we working with?

In [205]:
# Number of distinct entityid values among the remaining rows.
company_count = html['entityid'].nunique()
print(company_count)

44110


Combine data with the input file

In [206]:
# Join the input file onto the capture rows by entityid.
# how='inner' is pandas' default, spelled out here: rows whose entityid is
# missing from either frame are dropped.
# NOTE(review): presumably the input file has one row per entityid — confirm,
# since duplicate keys there would multiply capture rows.
in_file = pd.read_csv(filepath_or_buffer='../data/startup_url_list.csv')
html = pd.merge(html, in_file, how='inner', on='entityid')

In [208]:
# Preview the first five rows of the merged frame.
html.head(n=5)

Unnamed: 0,entityid,capture_yr,capture_m,file_path,website_size_kb,careers,blog,login,contact,team,...,form_count,script_count,embedded_js,external_js,weburl,pb_companyid,startdate,lastVC,ownershipstatus,exit_date
0,1006526630,2001,3,data/html/1006526630/2001/03/index.html,4.592773,0,0,0,0,0,...,1,3,True,True,www.usclaims.com,,1996-06-01,2020-01-01,Acquired/Merged,
1,1006526630,2001,5,data/html/1006526630/2001/05/index.html,4.822266,0,0,0,0,0,...,1,3,True,True,www.usclaims.com,,1996-06-01,2020-01-01,Acquired/Merged,
2,1006526630,2002,5,data/html/1006526630/2002/05/index.html,2.606445,0,0,0,0,0,...,0,3,True,True,www.usclaims.com,,1996-06-01,2020-01-01,Acquired/Merged,
3,1006526630,2002,11,data/html/1006526630/2002/11/index.html,2.604492,0,0,0,0,0,...,0,3,True,True,www.usclaims.com,,1996-06-01,2020-01-01,Acquired/Merged,
4,1006526630,2003,6,data/html/1006526630/2003/06/index.html,2.595703,0,0,0,0,0,...,0,3,True,True,www.usclaims.com,,1996-06-01,2020-01-01,Acquired/Merged,


In [209]:
# Column name -> dtype mapping for the merged frame.
{column: dtype for column, dtype in html.dtypes.items()}

{'entityid': dtype('int64'),
 'capture_yr': dtype('int64'),
 'capture_m': dtype('int64'),
 'file_path': dtype('O'),
 'website_size_kb': dtype('float64'),
 'careers': dtype('int64'),
 'blog': dtype('int64'),
 'login': dtype('int64'),
 'contact': dtype('int64'),
 'team': dtype('int64'),
 'about': dtype('int64'),
 'news': dtype('int64'),
 'faq': dtype('int64'),
 'call_to_action': dtype('int64'),
 'testimonial': dtype('int64'),
 'title': dtype('O'),
 'description': dtype('O'),
 'keywords': dtype('O'),
 'author': dtype('O'),
 'language': dtype('O'),
 'p_count': dtype('int64'),
 'h_count': dtype('int64'),
 'img_count': dtype('int64'),
 'a_count': dtype('int64'),
 'table_count': dtype('int64'),
 'form_count': dtype('int64'),
 'script_count': dtype('int64'),
 'embedded_js': dtype('O'),
 'external_js': dtype('O'),
 'weburl': dtype('O'),
 'pb_companyid': dtype('O'),
 'startdate': dtype('O'),
 'lastVC': dtype('O'),
 'ownershipstatus': dtype('O'),
 'exit_date': dtype('O')}

## Summary of the "sample selection"

## Subdivisions

For all companies that have survived at least *n* years, divide them into exited and non-exited companies...

Let's start with 5. And then move to 10

Let's add some columns that will enable us to do this more easily

In [211]:
# Derive per-row survival fields from the raw date strings.
#   has_exit : 1 when exit_date is present, else 0
#   end_yr   : year of exit_date, or CENSOR_YEAR when there is no exit_date
#   start_yr : year of startdate, falling back to the year of lastVC
#   lifespan : end_yr - start_yr (difference of calendar years)
# A year is taken as the first four characters of the date string.
CENSOR_YEAR = 2023  # end year assigned to rows with no recorded exit_date

exit_yr = html["exit_date"].str.slice(start=0, stop=4)
founding_yr = html["startdate"].str.slice(start=0, stop=4)
last_vc_yr = html["lastVC"].str.slice(start=0, stop=4)

html["has_exit"] = html["exit_date"].notna().astype('int8')

# Deliberately no errors='ignore' on astype: with it, a value that cannot be
# converted makes astype hand back the original string column unchanged, and
# the problem only shows up later as a confusing error in the subtraction.
# Letting astype raise points directly at the malformed date.
html["end_yr"] = exit_yr.astype('float').fillna(CENSOR_YEAR)
html["start_yr"] = founding_yr.fillna(last_vc_yr).astype('float')
html["lifespan"] = html["end_yr"] - html["start_yr"]

In [212]:
# Rows whose lifespan is strictly greater than 5.
# NOTE(review): lifespan is a difference of calendar years, so '> 5' keeps
# differences of 6 or more; the narrative says "at least" five years —
# confirm whether '>=' was intended.
survived_5 = html.loc[html["lifespan"].gt(5)]

Of the companies that survived at least five years, how *many* had an exit? How many didn't?

In [213]:
# Distinct companies in survived_5, first those with an exit (has_exit == 1),
# then those without (has_exit == 0).
for exit_flag in (1, 0):
    cohort_ids = survived_5.loc[survived_5["has_exit"] == exit_flag, 'entityid']
    print(cohort_ids.nunique())

4862
23434


How long did it take for the exited companies to exit?

In [215]:
# First five rows of the numeric columns, with the entityid identifier removed.
# errors='ignore' keeps this a no-op if entityid is not among the numeric columns.
x = (
    html
    .select_dtypes(include='number')
    .head()
    .drop(columns=['entityid'], errors='ignore')
)
x.head()

Unnamed: 0,capture_yr,capture_m,website_size_kb,careers,blog,login,contact,team,about,news,...,h_count,img_count,a_count,table_count,form_count,script_count,has_exit,end_yr,start_yr,lifespan
0,2001,3,4.592773,0,0,0,0,0,0,0,...,0,1,4,6,1,3,0,2023.0,1996.0,27.0
1,2001,5,4.822266,0,0,0,0,0,0,0,...,0,1,5,6,1,3,0,2023.0,1996.0,27.0
2,2002,5,2.606445,0,0,0,0,0,0,0,...,0,0,1,2,0,3,0,2023.0,1996.0,27.0
3,2002,11,2.604492,0,0,0,0,0,0,0,...,0,0,1,2,0,3,0,2023.0,1996.0,27.0
4,2003,6,2.595703,0,0,0,0,0,0,0,...,0,0,1,2,0,3,0,2023.0,1996.0,27.0


For companies that have survived at least *n* years, divide them into winners (exited) and "losers". How do their trajectories change?