In [1]:
import os

import pandas as pd
pd.options.mode.chained_assignment = None
from plotnine import *
# Load the applications table and parse the application date into a real
# datetime column so date arithmetic works in the cells below.
data = pd.read_csv("jobs.csv").assign(
    date_opened=lambda frame: pd.to_datetime(frame["date_opened"])
)
print(data.dtypes)

title                  object
company                object
location               object
date_opened    datetime64[ns]
offer                    bool
rejected                 bool
remote                   bool
dtype: object


In [2]:
# Number of applications per company, most-applied-to company first.
# Result columns: "company", "count"; the index is a fresh 0..n-1 range.
company_counts = (
    data.groupby(["company"])["company"]
    .count()
    .to_frame()
    .rename(columns={"company": "count"})
    .reset_index()  # group key: index -> column
    .sort_values(["count"], ascending=False, ignore_index=True)
)

In [3]:
# Number of applications per location, most common location first.
# Result columns: "location", "count"; the index is a fresh 0..n-1 range.
location_counts = (
    data.groupby(["location"])["location"]
    .count()
    .to_frame()
    .rename(columns={"location": "count"})
    .reset_index()  # group key: index -> column
    .sort_values(["count"], ascending=False, ignore_index=True)
)

In [4]:
# Date of the first application; later cells use it to turn row offsets back
# into calendar dates.
EARLIEST_DATE = data["date_opened"].min()

# Whole days elapsed between each application and the earliest one.
data["days_in"] = (data["date_opened"] - EARLIEST_DATE).dt.days

# Number of calendar days covered, first and last day both included.
date_range = (data["date_opened"].max() - EARLIEST_DATE).days + 1

# Applications per day offset. Valid offsets are 0 .. date_range - 1, so
# reindexing over range(date_range) gives exactly one row per covered day:
# days without applications are filled with 0, and no extra empty row is
# produced past the latest application date.
day_counts = (
    data["days_in"]
    .value_counts()
    .reindex(range(date_range), fill_value=0)
)

# keeps track of days applied (per-day count and running total)
days_raw = day_counts.tolist()
days_cumu = day_counts.cumsum().tolist()

day_applied_df = pd.DataFrame({"day_cnt_raw": days_raw,
                               "day_cnt_cumu": days_cumu})
day_applied_df.head(5)

Unnamed: 0,day_cnt_raw,day_cnt_cumu
0,14,14
1,8,22
2,4,26
3,0,26
4,0,26


In [5]:
# Roll the daily counts up into consecutive 7-day buckets counted from the
# first day (rows 0-6 -> week 0, rows 7-13 -> week 1, ...). The final bucket
# may cover fewer than 7 days; it is still summed rather than dropped.
# Relies on day_applied_df having its default 0..n-1 index.
week_counts = (
    day_applied_df["day_cnt_raw"]
    .groupby(day_applied_df.index // 7)
    .sum()
)

weeks_raw = week_counts.tolist()
weeks_cumu = week_counts.cumsum().tolist()  # running total across weeks

weeks_applied_df = pd.DataFrame({"week_cnt_raw": weeks_raw,
                                 "week_cnt_cumu": weeks_cumu})

# Each weekly row is dated with the first day of its 7-day bucket.
weeks_applied_df["date"] = EARLIEST_DATE + pd.to_timedelta(
    weeks_applied_df.index * 7, unit="D"
)

# Each daily row is dated by its offset from the earliest application.
day_applied_df["date"] = EARLIEST_DATE + pd.to_timedelta(
    day_applied_df.index, unit="D"
)

day_applied_df.head(5)

Unnamed: 0,day_cnt_raw,day_cnt_cumu,date
0,14,14,2022-08-24
1,8,22,2022-08-25
2,4,26,2022-08-26
3,0,26,2022-08-27
4,0,26,2022-08-28


In [6]:
# Preview the first five weekly rows: raw count, cumulative count and the
# start date of each 7-day bucket.
weeks_applied_df.head(5)

Unnamed: 0,week_cnt_raw,week_cnt_cumu,date
0,35,35,2022-08-24
1,17,52,2022-08-31
2,19,71,2022-09-07
3,30,101,2022-09-14
4,8,109,2022-09-21


In [7]:
# Persist every summary table as a CSV next to the notebook.
# Insertion order of the mapping is the write order.
csv_exports = {
    "company_cnt.csv": company_counts,
    "location_cnt.csv": location_counts,
    "day_cnt.csv": day_applied_df,
    "week_cnt.csv": weeks_applied_df,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name)

In [8]:
# current dataframes
# data
# location_counts
# company_counts
# day_applied_df
# weeks_applied_df

## plots

software engineer job search

In [9]:
# plotting
import plotly.express as px
import plotly.graph_objects as go

In [10]:
# Sanity check: list the daily table's column names before they are mapped to
# display labels for plotting.
day_applied_df.columns

Index(['day_cnt_raw', 'day_cnt_cumu', 'date'], dtype='object')

In [11]:
# Sanity check: list the weekly table's column names.
weeks_applied_df.columns

Index(['week_cnt_raw', 'week_cnt_cumu', 'date'], dtype='object')

In [12]:
# plot prep
# Build a separate frame with human-readable column labels for the chart.
# rename() without inplace returns a new DataFrame, so day_applied_df keeps
# its original column names (a plain `=` would only alias it and the rename
# would leak back, breaking a re-run of the cells above).
day_applied_df_temp = day_applied_df.rename(columns={
    "date": "Date",
    "day_cnt_raw": "Day Count",
    "day_cnt_cumu": "Cumulative Count"})

# Applications that led to an offer; these are highlighted on the chart.
# .copy() makes this an independent frame, so the column assignments below
# never write into (a view of) `data`.
offers = data[data["offer"]].copy()
offers["title"] = offers["title"].str.title()
offers["company"] = offers["company"].str.title()

# Attach the daily / cumulative counts of the day each offer-application was
# opened, so the marker can sit on the cumulative line at that date.
offers = offers.merge(right=day_applied_df_temp,
                      how="left",
                      left_on="date_opened",
                      right_on="Date")

In [13]:
# Cumulative applications over time (line), with daily application counts as
# bars and the applications that led to offers as hollow orange markers.
fig = px.line(
    day_applied_df_temp,
    x="Date",
    y="Cumulative Count",
    title="Cumulative Applications Over Time",
    hover_data={"Cumulative Count": True,
                "Date": False})

fig.add_bar(
    x=day_applied_df_temp["Date"],
    y=day_applied_df_temp["Day Count"],
    name="Daily Apps",
    marker_color="green")

# Transparent fill + orange outline draws a ring around the point on the
# cumulative line; hover is disabled so the unified hover box stays clean.
fig.add_scatter(
    hoverinfo='none',
    mode="markers",
    x=offers["date_opened"],
    y=offers["Cumulative Count"],
    name="Applications that led to offers",
    marker=dict(color="rgba(0,0,0,0)",
                size=10,
                line=dict(color="orange",
                          width=3))
)

fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Number of Applications",
    legend=dict(  # raw {} doesn't work
        orientation="h",
        yanchor="bottom",
        y=-.2,
        xanchor="right",
        x=1
    ),
    hovermode="x unified"
)

fig.show(config={
    'modeBarButtonsToRemove': ['lasso2d', 'pan']
})

# The exports below write into output/; create it first so a fresh checkout
# does not fail with FileNotFoundError.
os.makedirs("output", exist_ok=True)

fig.write_html("output/cumulative_app_chart.html")
fig.write_json("output/cumulative_app_chart.json")
fig.write_image("output/cumulative_app_chart.png", engine="kaleido", height=400, width=600)

# day_applied_df_temp is deliberately left defined: deleting it here made this
# cell raise NameError when run a second time.




In [14]:
# Share of applications by outcome.
# "ghost" = neither rejected nor offered, i.e. no recorded outcome.
n_applications = len(data)

ghosted = (data["rejected"] == False) & (data["offer"] == False)
rejected = data["rejected"] == True
offered = data["offer"] == True

ghost_prop = len(data[ghosted]) / n_applications
rejected_prop = len(data[rejected]) / n_applications
offer_prop = len(data[offered]) / n_applications

print(f"{ghost_prop=}")
print(f"{rejected_prop=}")
print(f"{offer_prop=}")

ghost_prop=0.6714801444043321
rejected_prop=0.3140794223826715
offer_prop=0.01444043321299639
