-
Notifications
You must be signed in to change notification settings - Fork 4
/
angellist_plot.py
181 lines (167 loc) · 7.85 KB
/
angellist_plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import bisect, contextlib, datetime, json, lifelines, numpy, os, scipy.stats, seaborn, time
from matplotlib import pyplot, ticker
def parse_date(d):
return datetime.datetime.utcfromtimestamp(time.mktime(time.strptime(d, '%b %d, %Y')))
data = []
for fn in os.listdir('cache'):
if fn.startswith('companies'):
stages = json.load(open(os.path.join('cache', fn)))
seeds = [stage for stage in stages if stage['stage'] == 'Seed' or stage['stage'].startswith('Series')]
exits = [stage for stage in stages if stage['stage'] == 'IPO' or stage['stage'].startswith('Acquired')]
if not len(seeds):
continue
try:
seeded = min(parse_date(stage['date']) for stage in seeds)
if exits:
exited = min(parse_date(stage['date']) for stage in exits)
if exited < seeded:
continue
else:
exited = None
data.append((seeded, exited))
except OverflowError:
continue
except TypeError:
continue
print(len(data))
print(sum(1 for seeded, exited in data if exited is not None))
YEAR = 365.25 * 24 * 60 * 60
years = list(range(2008, 2016))
@contextlib.contextmanager
def shared_plot_setup(y_fmt=None, x_label=None, y_label=None, title=None, legend_loc='upper right', output_fn=None, xlim=None, ylim=None):
fig = pyplot.figure(1, (9, 6))
ax = fig.add_subplot(1, 1, 1)
yticks = ticker.FormatStrFormatter(y_fmt)
ax.yaxis.set_major_formatter(yticks)
yield
pyplot.legend(loc=legend_loc)
pyplot.xlabel(x_label)
pyplot.ylabel(y_label)
pyplot.title(title)
if xlim:
pyplot.xlim(xlim)
if ylim:
pyplot.ylim(ylim)
if output_fn:
pyplot.savefig(output_fn)
pyplot.show()
# Plot conversion by year
with shared_plot_setup(y_fmt='%.0f%%',
x_label='Year company was started',
y_label='Fraction exited',
title='Fraction of companies exited by year started',
output_fn='conversion_by_year.png'):
n_by_year, k_by_year = {}, {}
for seeded, exited in data:
n_by_year[seeded.year] = n_by_year.get(seeded.year, 0) + 1
if exited:
k_by_year[seeded.year] = k_by_year.get(seeded.year, 0) + 1
ns = numpy.array([n_by_year.get(year, 0) for year in years])
ks = numpy.array([k_by_year.get(year, 0) for year in years])
pyplot.plot(years, 100.*ks/ns, linestyle=':', marker='o', markersize=10, label='Exit %')
pyplot.fill_between(years,
100.*scipy.stats.beta.ppf(0.05, ks+1, ns-ks+1),
100.*scipy.stats.beta.ppf(0.95, ks+1, ns-ks+1),
alpha=0.2,
label='Confidence interval')
# Plot time to conversion
with shared_plot_setup(y_fmt='%d',
x_label='Year company was started',
y_label='Number of years until exit',
title='Time to exit by year started',
output_fn='time_to_conversion_by_year.png'):
t_by_year = {}
for seeded, exited in data:
if exited:
t_by_year.setdefault(seeded.year, []).append((exited - seeded).total_seconds()/YEAR)
ts = [t_by_year.get(year, []) for year in years]
pyplot.plot(years, [numpy.median(t) for t in ts], linestyle=':', color=(1, 0.2, 0.2), marker='o', markersize=10, label='Median time to exit')
pyplot.plot(years, [numpy.percentile(t, 5) for t in ts], linestyle=':', color=(1, 0.7, 0.7), markersize=10, label='5th percentile')
pyplot.plot(years, [numpy.percentile(t, 95) for t in ts], linestyle=':', color=(1, 0.7, 0.7), markersize=10, label='95th percentile')
def get_grouped_data(group_years):
now = datetime.datetime.now()
TE_by_group = {}
if group_years:
split_year = years[int(len(years)/2)]
group_lo = '%d-%d' % (years[0], split_year-1)
group_hi = '%d-%d' % (split_year, years[-1])
groups = [group_lo, group_hi]
else:
groups = years
for seeded, exited in data:
if not years[0] <= seeded.year <= years[-1]:
continue
if group_years:
group = seeded.year < split_year and group_lo or group_hi
else:
group = seeded.year
T, E = TE_by_group.setdefault(group, ([], []))
if exited:
T.append((exited - seeded).total_seconds()/YEAR)
E.append(True)
else:
T.append((now - seeded).total_seconds()/YEAR)
E.append(False)
return groups, TE_by_group
# Plot cohort conversion rates
for with_confidence_interval, group_years, output_fn in [(False, False, 'cohort_plot_all_years.png'), (True, True, 'cohort_plot_grouped.png')]:
with shared_plot_setup(y_fmt='%.0f%%',
x_label='Years after start',
y_label='Fraction of companies exited',
title='Fraction of companies exited by time after start',
legend_loc='upper left',
output_fn=output_fn,
xlim=[0, 5.2],
ylim=[0, 25]):
groups, TE_by_group = get_grouped_data(group_years)
colors = seaborn.color_palette('hls', len(groups))
for group, color in zip(groups, colors):
T, E = TE_by_group[group]
group_max_t = min(t for t, e in zip(T, E) if not e)
exits = sorted(t for t, e in zip(T, E) if e and t < group_max_t)
ts = [0] + exits
ks = numpy.arange(len(exits) + 1)
pyplot.plot(ts, 100. * ks / len(T), label='Started in %s' % group, color=color)
if with_confidence_interval:
pyplot.fill_between(ts,
100.*scipy.stats.beta.ppf(0.05, ks+1, len(T)-ks+1),
100.*scipy.stats.beta.ppf(0.95, ks+1, len(T)-ks+1),
color=color, alpha=0.2)
# Plot Kaplan-Meier estimate
for with_confidence_interval, group_years, output_fn in [(False, False, 'kaplan_meier_all_years.png'), (True, True, 'kaplan_meier_grouped.png')]:
with shared_plot_setup(y_fmt='%.0f%%',
x_label='Years after start',
y_label='Fraction of companies exited',
title='Fraction of companies exited by time after start - Kaplan-Meier',
legend_loc='upper left',
output_fn=output_fn,
xlim=[0, 5.2],
ylim=[0, 25]):
groups, TE_by_group = get_grouped_data(group_years)
colors = seaborn.color_palette('hls', len(groups))
for group, color in zip(groups, colors):
# Compute Kaplan-Meier using lifelines
T, E = TE_by_group[group]
kmf = lifelines.KaplanMeierFitter()
kmf.fit(T, event_observed=E)
ts = kmf.survival_function_.index.values
# max_i = bisect.bisect_left(ts, 5.0)
ps = 1.0 - kmf.survival_function_['KM_estimate'].values
ps_hi = 1.0 - kmf.confidence_interval_['KM_estimate_lower_0.95'].values
ps_lo = 1.0 - kmf.confidence_interval_['KM_estimate_upper_0.95'].values
pyplot.plot(ts, 100. * ps, label='Started in %s' % group, color=color)
if with_confidence_interval:
pyplot.fill_between(ts, 100. * ps_lo, 100. * ps_hi, color=color, alpha=0.2)
continue
# Compute it manually
te = sorted(zip(*TE_by_group[group]))
n, k = len(te), 0
ts, ys = [], []
p = 1.0
for t, e in te:
if e:
p *= (n-1) / n
n -= 1
ts.append(t)
ys.append(100. * (1-p))
pyplot.plot(ts, ys, 'b')