In [1]:
import pandas as pd
import os

In [2]:
url_list = pd.read_csv('data/startup_url_list.csv')

In [3]:
downloaded_jsons = os.listdir('data/raw-json')
downloaded_jsons = [int(file[:-4]) for file in downloaded_jsons]

#### How many JSONs did we not download?

In [4]:
missed_jsons = url_list[url_list.entityid.isin(downloaded_jsons) == False]
len(missed_jsons.index)

405

#### Why couldn't we download them?

In [5]:
json_log_report = pd.read_csv('log-reports/archive/json-log-report.csv')
missed_json_reason = json_log_report[json_log_report.entityid.isin(missed_jsons.entityid)].reason_for_failure
missed_json_reason.value_counts()

NoJSON                                                                                                                204
ConnectionError                                                                                                       118
403 Client Error: FORBIDDEN for url: https://web.archive.org/web/timemap/json?url=www.parting.com/                      1
403 Client Error: FORBIDDEN for url: https://web.archive.org/web/timemap/json?url=www.shadowbox.com/                    1
403 Client Error: FORBIDDEN for url: https://web.archive.org/web/timemap/json?url=www.dock.io/                          1
                                                                                                                     ... 
403 Client Error: FORBIDDEN for url: https://web.archive.org/web/timemap/json?url=www.handy.com/                        1
403 Client Error: FORBIDDEN for url: https://web.archive.org/web/timemap/json?url=www.confection.io/                    1
429 Client Error: Too Ma

#### How may timestamps have we not calculate (that we can calculate)?

In [6]:
downloaded_timestamps = os.listdir('data/optimal-timestamps')
downloaded_timestamps = [int(file[:-15]) for file in downloaded_timestamps]
missed_timestamps = url_list[url_list.entityid.isin(downloaded_timestamps) == False]
missed_timestamps

Unnamed: 0,entityid,weburl,pb_companyid,startdate,lastVC,ownershipstatus,exit_date
3,53140,www.gmswireless.com,,1997-08-01,2000-04-12,Acquired/Merged,2000-04-12
4,53141,www.motion-inc.com,,1997-11-01,2000-02-10,Acquired/Merged,2000-02-10
6,53144,www.atoutcome.com,,1997-01-01,1999-06-23,Out of Business,
7,53146,www.atroad.com,,1996-07-01,2000-09-29,Publicly-held,2000-09-29
14,53165,www.10charge.com,,2004-01-01,2008-12-31,Acquired/Merged,2008-12-31
...,...,...,...,...,...,...,...
78321,1052271875,www.xyonetx.com,,2021-01-01,2022-10-01,Private & Independent,
78323,1052274143,www.digitalex.io,,,2023-04-04,Private & Independent,
78324,1052276555,www.solmslanding.com,,2019-01-14,2023-03-30,Private & Independent,
78327,1052277599,www.cammie.com,,,2023-01-01,Private & Independent,


In [7]:
timestamp_log_report = pd.read_csv('log-reports/archive/optimal_timestamp_log_report_6-13.csv')
timestamp_log_report = timestamp_log_report[timestamp_log_report.entityid.isin(missed_timestamps.entityid)]
timestamp_log_report

Unnamed: 0,entityid,domain,time_of_run,earliest_screenshot,start_date_with_buffer,latest_screenshot,end_date_with_buffer,file_path,failed,reason_for_failure
3,53140,www.gmswireless.com,2023-06-13 03:13:01,,1997-02-02 00:00:00,,2000-10-09 00:00:00,,1,No timestamps within company life span
4,53141,www.motion-inc.com,2023-06-13 03:13:01,,1997-05-05 00:00:00,,2000-08-08 00:00:00,,1,No timestamps within company life span
6,53144,www.atoutcome.com,2023-06-13 03:13:01,,1996-07-05 00:00:00,,2023-06-06 00:00:00,,1,No timestamps within company life span
7,53146,www.atroad.com,2023-06-13 03:13:01,,1996-01-03 00:00:00,,2001-03-28 00:00:00,,1,No timestamps within company life span
14,53165,www.10charge.com,2023-06-13 03:13:01,,2003-07-05 00:00:00,,2009-06-29 00:00:00,,1,No timestamps within company life span
...,...,...,...,...,...,...,...,...,...,...
78321,1052271875,www.xyonetx.com,2023-06-13 04:17:32,,2021-01-01,,,,1,No JSON data
78323,1052274143,www.digitalex.io,2023-06-13 04:17:32,,,,,,1,No JSON data
78324,1052276555,www.solmslanding.com,2023-06-13 04:17:32,,2019-01-14,,,,1,No JSON data
78327,1052277599,www.cammie.com,2023-06-13 04:17:32,,,,,,1,No JSON data


All 10,808 companies with missed timestamps have logs, which is good.

Let's see why we couldn't get their timestamps:

In [8]:
timestamp_log_report.reason_for_failure.value_counts()

No timestamps within company life span    8667
No JSON data                              1484
Missing start date                         481
Name: reason_for_failure, dtype: int64

...this was run a few days ago, so we now have more JSON files. Let's see how many of those _now_ have JSON files

In [9]:
missing_jsons = timestamp_log_report[timestamp_log_report.reason_for_failure == "No JSON data"].entityid
missing_jsons_executable = missing_jsons[missing_jsons.isin(downloaded_jsons)]
missing_jsons

162           53739
282           54072
347           54274
451           54640
754           55716
            ...    
78321    1052271875
78323    1052274143
78324    1052276555
78327    1052277599
78329    1052277806
Name: entityid, Length: 1484, dtype: int64

We can also add the JSONs with missing start dates since we can use their LastVC date as a start date

In [10]:
missing_jsons_executable = missing_jsons_executable.append(timestamp_log_report[timestamp_log_report.reason_for_failure == "Missing start date"].entityid)

In [11]:
missing_jsons_executable_urls = url_list[url_list.entityid.isin(missing_jsons_executable)]
missing_jsons_executable_urls.to_csv('data/missing_jsons_executable.csv', index=False)

Ok, just ran missing_jsons_executable_urls into a timestamp. Let's look at the log report

In [28]:
msje_le = pd.read_csv('log-reports/optimal_timestamp_log_report_6-14.csv')
msje_le_f = msje_le[msje_le.failed == 1]

We can add these, as well as the previously failed, to a list that we can call msjne ("missing jsons *not* executable")

So first we filter timestamp_log_report to have only "No timestamps within company life span"

In [29]:
mjne = timestamp_log_report[timestamp_log_report.reason_for_failure == "No timestamps within company life span"]

Merge with most recent log report failed co's (all failed b/c no timestamps within company life span)

In [30]:
mjne = pd.concat([mjne, msje_le_f], axis=0)
mjne

Unnamed: 0,entityid,domain,time_of_run,earliest_screenshot,start_date_with_buffer,latest_screenshot,end_date_with_buffer,file_path,failed,reason_for_failure
3,53140,www.gmswireless.com,2023-06-13 03:13:01,,1997-02-02 00:00:00,,2000-10-09 00:00:00,,1,No timestamps within company life span
4,53141,www.motion-inc.com,2023-06-13 03:13:01,,1997-05-05 00:00:00,,2000-08-08 00:00:00,,1,No timestamps within company life span
6,53144,www.atoutcome.com,2023-06-13 03:13:01,,1996-07-05 00:00:00,,2023-06-06 00:00:00,,1,No timestamps within company life span
7,53146,www.atroad.com,2023-06-13 03:13:01,,1996-01-03 00:00:00,,2001-03-28 00:00:00,,1,No timestamps within company life span
14,53165,www.10charge.com,2023-06-13 03:13:01,,2003-07-05 00:00:00,,2009-06-29 00:00:00,,1,No timestamps within company life span
...,...,...,...,...,...,...,...,...,...,...
1731,1052271875,www.xyonetx.com,2023-06-14 14:57:21,,2020-07-05,,2023-06-06,,1,No timestamps within company life span
1732,1052274143,www.digitalex.io,2023-06-14 14:57:21,,2022-10-06,,2023-06-06,,1,No timestamps within company life span
1733,1052276555,www.solmslanding.com,2023-06-14 14:57:21,,2018-07-18,,2023-06-06,,1,No timestamps within company life span
1734,1052277599,www.cammie.com,2023-06-14 14:57:21,,2022-07-05,,2023-06-06,,1,No timestamps within company life span


Make sure all this stuff checks out.

No duplicates?

In [32]:
len(pd.unique(mjne.entityid))
len(mjne.index)

10227

Does len(mjne.index) + number of timestamps = number of jsons?

In [34]:
len(mjne.index) + len(downloaded_timestamps) == len(downloaded_jsons)

True

Add in all of the Output this to a CSV in "data". Add "reason_for_failure" to the CSV

In [44]:
mjne_cos = url_list[url_list.entityid.isin(mjne.entityid)]
mjne_cos = mjne_cos.merge(mjne[['entityid', 'reason_for_failure']], on='entityid', how='left')
mjne_cos.to_csv("data/failed_cos/no_timestamps.csv", index=False)

Now do the same for the missing jsons

In [57]:
missed_jsons = url_list[url_list.entityid.isin(downloaded_jsons) == False]
missed_json_reason = json_log_report[json_log_report.entityid.isin(missed_jsons.entityid)]
missed_jsons = missed_jsons.merge(missed_json_reason[['entityid', 'reason_for_failure']], on='entityid', how='left')
missed_jsons.to_csv("data/failed_cos/no_json.csv", index=False)