This notebook runs exploratory analysis on the CloudQuery database in a code-based (and, crucially, unit-testable) way.

In [None]:
import sqlalchemy as sa

import psycopg2 as ps
import pandas as pd

# Open one psycopg2 connection to the database.
# The '???' values look like placeholders — presumably to be replaced with
# real connection details before the notebook is run.
conn = ps.connect(
    host='???',
    port='???',
    dbname='???',
    user='???',
    password='???')

# Build a SQLAlchemy engine on top of that connection: the URL carries no
# host or credentials, and `creator` hands the engine the already-open `conn`.
# NOTE(review): `creator` returns the same connection object every time it is
# called — confirm that sharing a single connection through the engine is intended.
engine = sa.create_engine('postgresql://', creator=lambda: conn)

def select(table_name: str, columns: list[str], db_engine=engine) -> pd.DataFrame:
    """Load the requested columns of a database table into a DataFrame.

    Args:
        table_name: Name of the table to read.
        columns: Column names to include in the result.
        db_engine: Connectable handed to pandas as ``con``; defaults to the
            notebook-level ``engine``.

    Returns:
        The DataFrame produced by ``pd.read_sql_table``.
    """
    frame = pd.read_sql_table(table_name, con=db_engine, columns=columns)
    return frame

In [None]:

# Read the production-status table and keep its 'status' values as a plain
# list; `topics_list` is what later cells consume (rule input, widget options).
# NOTE: `topics_df` is reassigned in a later cell, so only `topics_list`
# carries this table's data forward.
topics_df = select("guardian_production_status", ['status', 'priority'])
topics_list = topics_df['status'].tolist()
# Print the list that was just built instead of converting the column twice.
print(topics_list)

In [None]:

# Team names read from the guardian_non_p_and_e_github_teams table.
non_pe_teams_list = select('guardian_non_p_and_e_github_teams', ['team_name'])['team_name'].tolist()
# Repository names with their topics, and team names with their slugs.
topics_df = select("github_repositories", ['full_name', 'topics'])
teams_df = select('github_teams', ['name', 'slug'])
#select function doesn't work on views, so we have to use read_sql_query
# Query through the SQLAlchemy engine rather than the raw psycopg2 connection:
# pandas only officially supports SQLAlchemy connectables (or sqlite3) here and
# emits a warning for other DBAPI connections.
ownership_df = pd.read_sql_query(
    "select repo_name, github_team_name, github_team_id from view_repo_ownership",
    con=engine,
)
# Left-join on team name so every ownership row is kept, then keep only the
# repo name, team name and team slug columns.
new_df = ownership_df.merge(
    teams_df, how='left', left_on='github_team_name', right_on='name'
)[['repo_name', 'github_team_name', 'slug']]


In [None]:
from rules import github_06

# Apply the github_06 rule to the ownership, repository-topic, status and
# non-P&E team data prepared in the earlier cells.
topic_rule_df = github_06(new_df, topics_df, topics_list, non_pe_teams_list)

# Count how often each value occurs in the rule's 'github_06' column and show
# the split as a pie chart.
freq = topic_rule_df['github_06'].value_counts()
freq.plot.pie(
    subplots=True,
    figsize=(11, 6),
    title='Repo has an appropriate topic or is owned by a non-P&E team',
)

In [None]:
import ipywidgets as widgets

# Multi-select widget whose options are the entries of `topics_list`.
selected_topics = widgets.SelectMultiple(
    options=topics_list,
    description='Topics',
)

# Bare expression as the last line of the cell so the notebook renders the widget.
selected_topics

In [None]:
# Count how many ownership rows each GitHub team name appears in.
# NOTE: this rebinds `freq`, which an earlier cell also assigns.
team_names = ownership_df['github_team_name']
freq = team_names.value_counts()
# Trailing bare expression so the notebook displays the counts.
freq

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the first 15 entries of `freq`.
# NOTE(review): presumably `freq` is the team-name count from the previous
# cell (the chart title suggests so) — this depends on cell execution order.
first_fifteen = freq.head(15)
first_fifteen.plot.bar(title='Frequency Count of team name', xlabel='team name', ylabel='Count')
plt.show()

In [None]:
# Show the current selection of the `selected_topics` widget as a plain list.
list(selected_topics.value)