In [None]:
import pandas as pd


# Relative location of the observations CSV export read by this notebook.
path = '../data/observations-580259.csv'

# Load the full export; the cells below read from and add columns to `df`.
df = pd.read_csv(path)

# Preview only the first rows rather than rendering the whole frame as output.
df.head()

In [None]:
# Convert the `observed_on` column to pandas datetimes so that the `.dt` accessor
# and the date-based resampling used further down can operate on it.
df['observed_on'] = df['observed_on'].pipe(pd.to_datetime)

In [None]:
# Break the observation date into separate `year`, `month` and `day` columns.
for calendar_part in ('year', 'month', 'day'):
    df[calendar_part] = getattr(df['observed_on'].dt, calendar_part)

In [None]:
# Daily series: how many rows of `df` share each `observed_on` value.
number_of_observations_series = (
    df
    .groupby('observed_on')
    .size()
)
number_of_observations_series

In [None]:
# Daily series: how many different `user_id` values appear on each `observed_on` value.
number_of_distinct_users_per_day = (
    df
    .groupby('observed_on')['user_id']
    .nunique()
)
number_of_distinct_users_per_day

In [None]:
import matplotlib.pyplot as plt

# Observation counts are additive, so the daily series can simply be summed
# into calendar-year bins ('YE' = year-end frequency).
obs_per_year = number_of_observations_series.resample('YE').sum()

# Distinct users are NOT additive across days: summing the daily distinct counts
# would count a user once for every day they were active. Count the unique
# `user_id` values inside each calendar year directly from `df` instead.
users_per_year = df.groupby(pd.Grouper(key='observed_on', freq='YE'))['user_id'].nunique()

# Observations per distinct user in each year. Years with no observations
# evaluate to 0/0 = NaN and show up as gaps in the line.
ratio_per_year = obs_per_year / users_per_year

fig, axs = plt.subplots(2, 1, figsize=(10, 8), sharex=True)

# Top panel: total observations and distinct users per year
axs[0].plot(obs_per_year.index.year, obs_per_year.values, marker='o', label='Observations')
axs[0].plot(users_per_year.index.year, users_per_year.values, marker='s', label='Distinct Users')
axs[0].set_ylabel('Count')
axs[0].set_title('Yearly Observations and Distinct Users')
axs[0].legend()
axs[0].grid(True)

# Bottom panel: ratio of the two series above
axs[1].plot(ratio_per_year.index.year, ratio_per_year.values, marker='d', color='purple')
axs[1].set_ylabel('Avg Observations per User')
axs[1].set_xlabel('Year')
axs[1].set_title('Average Observations per User per Year')
axs[1].grid(True)

# Start the x-axis at 2000 (the panels share it via sharex=True) and place a
# tick every second year up to the last year present in the data.
first_year_shown = 2000
year_ticks = range(first_year_shown, ratio_per_year.index.year.max() + 1, 2)
axs[1].set_xlim(left=first_year_shown)
axs[1].set_xticks(year_ticks)
axs[1].set_xticklabels(year_ticks, rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Goal: for each day, the number of users who had not made an observation on any
# earlier day, i.e. the number of users whose first observation falls on that day.
#
# De-duplicating user_ids inside a single day's group cannot see earlier days
# (every id in a group is trivially contained in that group's own de-duplicated
# ids), so that approach only yields the distinct users per day. Instead, take
# each user's earliest observation date over the whole frame and count how many
# users share each date.
first_observation_date = df.groupby('user_id')['observed_on'].min()

# Every day that has at least one observation, in chronological order, so that
# days without any first-time user are reported as 0 rather than being absent.
observed_days = pd.Index(df['observed_on'].dropna().unique(), name='observed_on').sort_values()

first_observations_per_day = (
    first_observation_date
    .value_counts()
    .reindex(observed_days, fill_value=0)
    .rename_axis('observed_on')
    .rename('user_id')  # keep the series name the previous implementation produced
)
first_observations_per_day


In [None]:
# Sum the daily `first_observations_per_day` series into calendar-year bins
# ('YE' = year-end frequency) and draw the yearly totals as a line with markers.
(
    first_observations_per_day
    .resample('YE')
    .sum()
    .plot(marker='o', figsize=(12, 6))
)