In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
import sqlalchemy as sqla
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Graph settings: seaborn "notebook" context with the whitegrid style, and a
# 16x9 default size for every matplotlib figure created below.
sns.set(context='notebook',style="whitegrid")
plt.rc("figure", figsize=(16,9)) # global figsize

# Load the DB URI secret from a .env file (if present) into the environment.
load_dotenv()
DB_URI = os.getenv("DB_URI")
# os.getenv returns None when the variable is missing; fail here with a clear
# message instead of handing None to create_engine.
if not DB_URI:
    raise RuntimeError(
        "DB_URI is not set; define it in the environment or in a .env file"
    )

# One engine and one open connection shared by the rest of the notebook.
engine = sqla.create_engine(DB_URI)
CONNECTION = engine.connect()
# NOTE(review): MetaData(bind=...) is believed to be deprecated in SQLAlchemy 1.4
# and removed in 2.0 -- confirm against the pinned SQLAlchemy version.
metadata = sqla.MetaData(bind=CONNECTION)

In [None]:
# Read every row and column of `user_stats` into a DataFrame over the shared
# connection, then display it as the cell output.
users = pd.read_sql(sql='SELECT * FROM user_stats', con=CONNECTION)
# Optional local CSV round-trip, left disabled:
# users.to_csv('users.csv')
# users = pd.read_csv('users.csv').drop(columns=0)
users

In [None]:
# Scatter of count_total (x) against count_replies_to_others (y) for `users`,
# with the x-axis switched to a logarithmic scale.
g = sns.scatterplot(
    x="count_total",
    y="count_replies_to_others",
    data=users,
)
g.set_xscale(value="log")

In [None]:
# Summary statistics for `users`, with percentile rows every 5% from 0% to 95%
# (the 0.99 stop value is exclusive, so 0.95 is the last step generated).
users.describe(percentiles=np.arange(start=0.0, stop=0.99, step=0.05))

In [None]:
# Joint distribution of count_total and count_replies_to_others, with the
# plotted window restricted to x in [3, 2000] and y in [1, 300].
sns.jointplot(
    data=users,
    x="count_total",
    y="count_replies_to_others",
    xlim=[3,2000],
    ylim=[1, 300],
)

In [None]:
# Same scatter as before, but with explicit axis windows:
# log-scaled x limited to 1..2000, linear y limited to 0..1000.
ax = sns.scatterplot(
    x="count_total",
    y="count_replies_to_others",
    data=users,
)
ax.set(xscale="log", xlim=(1, 2000))
ax.set_ylim(0, 1000)

In [None]:
# 101 evenly spaced quantile levels 0.00, 0.01, ..., 1.00. linspace states the
# endpoint and count directly, so no fudge stop value (1.000001) is needed to
# make a float-step arange include 1.0.
quantiles = np.linspace(0.0, 1.0, 101)

# Per-column quantiles of `users`, indexed by quantile level.
# - user_id and top_department_count are dropped up front so no quantiles are
#   computed for them.
# - numeric_only=True is passed explicitly: older pandas defaulted to it, while
#   pandas >= 2.0 defaults to False and raises on non-numeric columns.
#   NOTE(review): this assumes every column of interest is numeric -- confirm
#   against the user_stats schema.
percentiles = (
    users
    .drop(columns=['user_id', 'top_department_count'])
    .quantile(q=quantiles, numeric_only=True)
)
percentiles

In [None]:
from matplotlib.ticker import ScalarFormatter

plt.figure(figsize=(16, 9))

# One line per remaining column of `percentiles`, x = quantile level.
ax = sns.lineplot(data=percentiles.drop(columns=['count_answers_received_self', 'count_top_level']))
ax.set_xticks(np.arange(0.0,1.05,0.05))
ax.set_yscale("log")
# A log axis cannot represent 0, so the tick list starts at 1 and only the
# upper limit is pinned; matplotlib ignores a non-positive lower limit on a log
# axis (with a warning). If zero must be visible, a "symlog" scale is the
# alternative to consider.
ax.set_yticks([1, 2, 3, 4, 5, 10, 50, 100, 1000])
ax.set_ylim(top=1000)
ax.set_xlim(0.3, 1.0)

# Mark the first quantile level at which both count_replies_to_others and
# count_answers_received are at least 10. Skip the marker when no level
# qualifies instead of failing on an empty index.
reaches_threshold = (
    (percentiles['count_replies_to_others'] >= 10)
    & (percentiles['count_answers_received'] >= 10)
)
if reaches_threshold.any():
    ax.axvline(color='g', x=percentiles.index[reaches_threshold][0])

# Plain numbers (1, 10, 100, ...) on the log axis rather than powers of ten.
ax.yaxis.set_major_formatter(ScalarFormatter())

In [None]:
# All columns of `percentiles` via the pandas plotting API, on a log y-axis
# limited to 1..1000 with x ticks every 0.05.
# ScalarFormatter is the name imported by the earlier plotting cell.
ax = percentiles.plot()
ax.set_xticks(np.arange(0.0, 1.05, 0.05))
ax.set(yscale="log", ylim=(1, 1000))
ax.yaxis.set_major_formatter(ScalarFormatter())

In [None]:
# Single series: count_answers_received from `percentiles`, plotted against the
# `quantiles` array on a linear y-axis (the log scale is left switched off).
ax = sns.lineplot(
    x=quantiles,
    y="count_answers_received",
    data=percentiles,
)
#ax.set_yscale("log")

In [None]:
# Histogram of count_total restricted to the 10..100 range, in bins of width 10,
# with bar heights as raw counts.
sns.histplot(
    x="count_total",
    data=users,
    binwidth=10,
    binrange=(10,100),
    stat="count",
)