In [None]:
from constants.abs_paths import AbsDirPath
import pandas as pd

# Directory that is scanned below for matched-wiki ``*.csv`` files.
file_dir = AbsDirPath.KEYWORDS / "matched_wikis"

def load_all_csvs(dir):
    """Load every ``*.csv`` file directly inside ``dir`` into one DataFrame.

    Files are read in sorted path order so that the row order of the result
    (and anything derived from it, such as "first row per group"
    de-duplication) does not depend on the filesystem's listing order.
    Each loaded frame gets an ``fname`` column with the path of the file it
    came from, stored as a plain string so the column can be serialised
    (e.g. written to Parquet).

    Parameters
    ----------
    dir : pathlib.Path
        Directory to scan (non-recursively). The name shadows the ``dir``
        builtin; it is kept so existing keyword callers keep working.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all successfully loaded files. The per-file index
        labels are kept as they are.

    Raises
    ------
    ValueError
        If the directory contains no CSV file that could be loaded.
    """
    dfs = []
    for file_path in sorted(dir.glob("*.csv")):
        try:
            file = pd.read_csv(file_path)
        except Exception as exc:
            # Best effort: report the reason and continue with the other
            # files. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt / SystemExit propagate.
            print(f"Error while loading {file_path}: {exc!r}")
            continue
        file['fname'] = str(file_path)
        dfs.append(file)
        print(f"Loaded {file_path}")

    if not dfs:
        # pd.concat([]) would raise a ValueError that does not say where
        # the data was expected; raise the same type with context instead.
        raise ValueError(f"No CSV files could be loaded from {dir}")
    return pd.concat(dfs)


# Combine all CSV files found in file_dir into a single frame.
df = load_all_csvs(file_dir)

In [None]:
# Directory that is scanned below for ``*.parquet`` files.
# Note: this rebinds the file_dir name assigned in the first cell.
file_dir = AbsDirPath.KEYWORDS / "matched_wikis_pq"

def load_all_csvs(dir):
    """Load every ``*.parquet`` file directly inside ``dir`` into one DataFrame.

    NOTE: despite its name this function reads Parquet files, and defining
    it rebinds the CSV loader of the same name from the first cell. The name
    is kept so existing calls keep working.

    Files are read in sorted path order so that the row order of the result
    (and anything derived from it, such as "first row per group"
    de-duplication) does not depend on the filesystem's listing order.
    Each loaded frame gets an ``fname`` column with the path of the file it
    came from, stored as a plain string so the column can be written back
    to Parquet.

    Parameters
    ----------
    dir : pathlib.Path
        Directory to scan (non-recursively). The name shadows the ``dir``
        builtin; it is kept so existing keyword callers keep working.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all successfully loaded files. The per-file index
        labels are kept as they are.

    Raises
    ------
    ValueError
        If the directory contains no Parquet file that could be loaded.
    """
    dfs = []
    for file_path in sorted(dir.glob("*.parquet")):
        try:
            file = pd.read_parquet(file_path)
        except Exception as exc:
            # Best effort: report the reason and continue with the other
            # files. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt / SystemExit propagate.
            print(f"Error while loading {file_path}: {exc!r}")
            continue
        file['fname'] = str(file_path)
        dfs.append(file)
        print(f"Loaded {file_path}")

    if not dfs:
        # pd.concat([]) would raise a ValueError that does not say where
        # the data was expected; raise the same type with context instead.
        raise ValueError(f"No Parquet files could be loaded from {dir}")
    return pd.concat(dfs)


# Combine all Parquet files found in file_dir; df2 is the frame used by the remaining cells.
df2 = load_all_csvs(file_dir)

In [None]:
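# Optional interactive inspection of the CSV frame (needs the third-party `pandasgui` package); left disabled.
# from pandasgui import show
# show(df)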

In [None]:
# Peek at the first rows of the combined Parquet frame.
df2.head()

In [None]:
# Number of ids per (sentence, keyword) pair, split by quality attribute, with "All" totals.
df2.pivot_table(
    index=["sentence", "keyword"],
    columns=["quality_attribute"],
    values="id",
    aggfunc="count",
    fill_value=0,
    margins=True,
)

In [None]:
# Number of ids per quality attribute, split by (repo, source); most frequent attributes first.
(
    df2.pivot_table(
        index=["quality_attribute"],
        columns=["repo", "source"],
        values="id",
        aggfunc="count",
        fill_value=0,
        margins=True,
    )
    .sort_values(by="All", ascending=False)
)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim — likely a leftover duplicate.
df2.pivot_table(index=["quality_attribute"], columns=["repo", "source"], values="id", aggfunc="count", fill_value=0, margins=True).sort_values(by="All", ascending=False)

In [None]:
# Rows of the "scanpy" repo whose quality attribute is "energy efficiency".
df2.loc[
    (df2["repo"] == "scanpy")
    & (df2["quality_attribute"] == "energy efficiency")
]

In [None]:
# How often each (sentence, source, keyword) combination occurs, most frequent first.
(
    df2.groupby(["sentence", "source", "keyword"])["id"]
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Attach to every row the number of non-null ids in its (sentence, source, keyword) group.
df2['sentence_counts'] = (
    df2.groupby(["sentence", "source", "keyword"])['id']
    .transform('count')
)

In [None]:
# Sentences repeated more than 100 times outside the "root" repo, most repeated first.
df2[(df2['sentence_counts'] > 100) & (df2['repo'] != "root")].sort_values(
    by=['sentence_counts', "sentence", "source", "keyword"],
    ascending=False,
)

In [None]:
import hashlib

def quick_hash(x):
    """Return the hexadecimal MD5 digest of the string ``x`` (encoded as UTF-8).

    The digest is used as a compact fingerprint of the text. ``x`` must be a
    ``str``; any other value (e.g. a missing value) raises ``AttributeError``
    because it has no ``encode`` method.
    """
    return hashlib.md5(x.encode('utf-8')).hexdigest()

# Fingerprint every sentence; the hash is later embedded in real_id.
# quick_hash calls .encode() on each value, so a non-string sentence (e.g. a missing value) would raise here.
df2['text_hash'] = df2['sentence'].apply(quick_hash)

In [None]:
# Composite, dot-separated key used to detect duplicate matches.
# NOTE(review): a missing value in any component presumably makes the whole real_id missing — confirm none are null.
df2['real_id'] = (
    df2['repo']
    + "." + df2['author']
    + "." + df2['source']
    + "." + df2['quality_attribute']
    + "." + df2['text_hash']
    + "." + df2['keyword'].str.lower()
    + "." + df2['matched_word'].str.lower()
)

In [None]:
# Number of rows sharing each real_id, most duplicated first.
(
    df2.groupby(["real_id"])['id']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Number of distinct real_id groups. The per-group counts and their sort order
# are irrelevant for this figure, so the group count is read directly.
df2.groupby(["real_id"]).ngroups

In [None]:
# Number of rows with a non-null id.
df2['id'].count()

In [None]:
# Share of distinct real_ids among all rows with an id.
# Computed from the data instead of the previously hard-coded 942631 / 2375956,
# which presumably were copied from the outputs of the two preceding cells.
df2.groupby(["real_id"]).ngroups / df2['id'].count()

In [None]:
# Store on every row how many non-null ids share its real_id (done before duplicates are collapsed below).
df2['similar_count'] = df2.groupby(["real_id"])['id'].transform('count')

In [None]:
# Collapse duplicates to one row per real_id; df2 is rebound to the reduced frame.
# NOTE(review): GroupBy.first() takes the first non-null value of each column, so a result
# row may combine values from several source rows — confirm this is intended.
# NOTE(review): real_id presumably becomes the index of the result rather than a column.
df2 = df2.groupby(['real_id']).first()

In [None]:
# Peek at the first rows of the de-duplicated frame.
df2.head()

In [None]:
# Rows whose url starts with neither the ROOT nor the scanpy documentation prefix.
df2[
    ~df2['url'].str.startswith("https://root.cern/root/")
    & ~df2['url'].str.startswith("https://scanpy.readthedocs.io/en/stable/")
]

In [None]:
import itertools

# Write one Parquet file per (author, repo, version, source) combination.
output_dir = AbsDirPath.OPTIMIZED_KEYWORDS
output_dir.mkdir(parents=True, exist_ok=True)

# File names use only the part of `source` before the first "."; the same
# shortened key is used for filtering so file name and content always agree.
source_key = df2['source'].str.split(".").str[0]
# drop_duplicates keeps first-appearance order and avoids writing the same
# file twice when several sources share a prefix.
sources = source_key.drop_duplicates().tolist()
repos = df2[['repo', 'version', 'author']].drop_duplicates().values.tolist()


def _matches(column, value):
    """Null-safe equality mask: a missing `value` selects the missing entries of `column`."""
    return column.isna() if pd.isna(value) else column == value


for source, (repo, version, author) in itertools.product(sources, repos):
    output_file = output_dir / f"{author}.{repo}.{version}.{source}.parquet"
    # Filter on every component that appears in the file name; filtering on
    # repo and source alone would put identical rows into the files of every
    # version/author of the same repo.
    subset = df2[
        _matches(source_key, source)
        & _matches(df2['repo'], repo)
        & _matches(df2['version'], version)
        & _matches(df2['author'], author)
    ]
    # index=False: the frame's index is not written to the file.
    subset.to_parquet(output_file, engine='pyarrow', compression='snappy', index=False)
    print(f"Saved {output_file}")

In [None]:
# Display the final frame.
df2