In [73]:
# !pip install pandas

# First you must use https://github.com/Tyrrrz/DiscordChatExporter to export the chat logs

# dotnet DiscordChatExporter.Cli.dll exportguild -g <server-id> -t "bot-token" -f PlainText --parallel 20

## Extracting and Grouping Data

In [74]:
import re
import os
import pandas as pd
from urllib.parse import urlparse
from collections import defaultdict

# Set the input folder and output folder
input_folder = "chats"
output_folder = "extracted_links"

# Initialize a dictionary to store extracted links grouped by domain
links_by_domain = defaultdict(list)

# Walk through the directory tree
for root, dirs, files in os.walk(input_folder):
    for filename in files:
        # Check if the file has a .txt extension
        if filename.endswith(".txt"):
            file_path = os.path.join(root, filename)
            
            # Read the input file
            with open(file_path, "r") as file:
                file_contents = file.read()

            # Extract links using regular expressions
            url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
            links = re.findall(url_pattern, file_contents)
            
            # Add extracted links to the dictionary
            for link in links:
                domain = urlparse(link).netloc
                if domain not in links_by_domain:
                    links_by_domain[domain] = set()
                links_by_domain[domain].add(link)

# Function to group domains
def group_domains(domain):
    if domain in ['twitter.com', 't.co']:
        return 'twitter'
    return domain

# Group specific domains and create a 'Misc.txt' file for domains with less than 5 results
grouped_links = defaultdict(list)
misc_links = []

for domain, links in links_by_domain.items():
    group = group_domains(domain)
    
    if len(links) < 5:
        misc_links.extend(links)
    else:
        grouped_links[group].extend(links)

if misc_links:
    grouped_links["Misc"].extend(misc_links)

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Save the grouped links to separate .txt files
for group, links in grouped_links.items():
    output_file = os.path.join(output_folder, f"{group}.txt")
    
    with open(output_file, "w") as file:
        for link in links:
            file.write(f"{link}\n")

print(f"Extracted and saved links to the {output_folder} directory.")


Extracted and saved links to the extracted_links directory.


## Helper Functions

In [75]:
import toml
import os
import re
import glob
from urllib.parse import urlsplit

def get_toml_path(toml_base_name, base_path):
    for root, dirs, files in os.walk(base_path):
        for file in files:
            if file == toml_base_name:
                return os.path.join(root, file)
    return None

def is_valid_url(url):
    try:
        result = urlsplit(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

def trim_url(url):
    """Trim a GitHub URL to the base organization or repository URL."""
    url = url.strip()
    url = re.sub(r"\?.*$", "", url)  # Remove query parameters
    url = re.sub(r"#.*$", "", url)  # Remove fragments
    url = url.rstrip("/")
    if "/pull/" in url:
        url = url[: url.index("/pull/")]
    if "/wiki" in url:
        url = url[: url.index("/wiki")]
    match = re.search(r"(https?://github\.com/[^/]+/[^/]+)", url)
    url = re.sub(r"[\)\.\*\"]+$", "", url)
    if match:
        return match.group(1)
    else:
        return re.sub(r"[\)\.\"]+$", "", url)
    
def remove_special_chars(url):
    return re.sub(r'[\*\)]+$', '', url)


    
output_toml = "../../../crypto-ecosystems/data/ecosystems/e/ergo-developer-tooling.toml"
ergo_toml = "../../../crypto-ecosystems/data/ecosystems/e/ergo.toml"
# Load the ergo.toml file
with open(ergo_toml, "r") as file:
    ergo_data = toml.load(file)

# Load the github.com.txt file
with open("extracted_links/github.com.txt", "r") as file:
    github_links = file.readlines()


## Get all subecosystem .toml files

In [76]:

# Get the sub_ecosystems list
sub_ecosystems = ergo_data["sub_ecosystems"]

# Define the base directory for searching .toml files
base_dir = "/Users/m/Documents/GitHub/crypto-ecosystems/data/ecosystems/"

# Collect .toml files for each sub_ecosystem
sub_ecosystem_toml_files = []

# Iterate over the sub_ecosystems list
for sub_ecosystem in sub_ecosystems:
    toml_base_name = sub_ecosystem.lower().replace(" ", "-") + ".toml"
    toml_path = get_toml_path(toml_base_name, base_dir)
    if toml_path:
        sub_ecosystem_toml_files.append(toml_path)
    else:
        print(f"Could not find {toml_base_name}")

# Print the .toml files corresponding to the sub_ecosystems
print(len(sub_ecosystem_toml_files), 'sub ecosystem .toml files found:')

# A set to keep track of existing organizations and repositories
existing_orgs_and_repos = set()

# A set to keep track of existing URLs
existing_urls_set = set()


41 sub ecosystem .toml files found:


## Process the sub ecosystem .toml files

In [77]:

# Process each sub ecosystem .toml file
for toml_file_pattern in sub_ecosystem_toml_files:
    for toml_file in glob.glob(toml_file_pattern):
        # Load the .toml file
        with open(toml_file, "r") as file:
            data = toml.load(file)

        # Extract existing URLs from the .toml file
        existing_urls = []
        if "github_organizations" in data:
            existing_urls += data["github_organizations"]
        if "repo" in data:
            existing_urls += [repo["url"] for repo in data["repo"]]

        # Extract organization and repository names from existing URLs
        for url in existing_urls:
            #print(url)
            trimmed_url = trim_url(url)
            
            if is_valid_url(trimmed_url):
                # Extract the organization and repository names from the URL
                match = re.search(r"github.com/([^/]+)(?:/([^/]+))?", trimmed_url)

                # If the URL is a valid GitHub URL, add it to the set
                if match:
                    org_and_repo = match.groups()
                    existing_orgs_and_repos.add(org_and_repo)
                    existing_urls_set.add(trimmed_url)
                

                # Add the organization to the set as well, if it is in github_organizations
                for org in data.get("github_organizations", []):
                    if trimmed_url.startswith(org):
                        org_name = urlsplit(org).path.strip('/')
                        existing_orgs_and_repos.add((org_name, None))
                        #print(org_name, 'Added!')

print(len(existing_urls_set), "existing URLs found")
print(len(existing_orgs_and_repos), "existing organizations and repositories found")

584 existing URLs found
584 existing organizations and repositories found


## Find missing URLs

In [78]:
# Specify organizations to ignore
ignore_orgs = {
            "fusesource", 'cardano-community', 'DefiLlama', 
            'halsafar', 'Arman92', 'electric-capital', 'IndeedMiners', 
            'firstcontributions', 'Xilinx', 'ethereum-optimism', 'ExpediaGroup',
            'trustwallet', 'BLAKE3-team', 'ZSLP', 'bitcoin', 'maticnetwork',
            'etclabscore', 'coinfoundry', 'Lolliedieb', 'lustefaniak',
            'Uniswap', 'trexminer', 'minershive', 'NebuTech', 'doktor83',
            'arnabk', 'YouMinerDev', 'freebsd', 'scalameta', 'GetScatter',
            'todxx', 'oliverw', 'jpg-store', 'paritytech', 'dogecoin',
            '42wim', 'RainbowMiner', 'rainbowminer', 'doktor83', 'hexresearch',
            'bruno-garcia', 'ZeroSync', 'advisories', 'RavenCommunity',
            'trezor', 'SChernykh', 'ElementsProject', 'honungsburk',
            'chadouming', 'tvanepps', 'i1skn', 'nanopool', 
            'brave', '045bkp', 'trezor', 'twitter', 'WinterTFG0',
            'ethereum', 'TremendouslyHighFrequency', 'menonsamir',
            'certusone', 'coreybutler', 'certusone', 'bzminer',
            'ScorexFoundation', 'ergoMixer', 'rust-bitcoin', 'non', 'WyvernTKC',
            'JohnLaw2', 'nervosnetwork', 's-nomp', 'sp-hash', 'zone117x',
            'ShiftLeftSecurity', 'MrMaxweII', 'forknote', 'AtomicLoans',
            'Dav-Git', 'starkware-libs', 'reach-sh', 'reflexer-labs',
            'aeternity', 'alt3', 'NixOS', 'OpenAPITools', 'Mikerah',
            'BySergeyDev', 'sangria-graphql', 'soullesscomputerboy',
            'TwiN', 'SundaeSwap-finance', 'minernl', 'WinterTFG',
            'JulianKemmerer', 'kyuupichan', 'sdaveas', 'lightbend',
            'AlphaX-Projects', 'ma-ha', 'rsmmnt', 'arduino', 'electron',
            'C4K3', 'BLAKE2', 'minio', 'ethereum-mining', 'kadena-io',
            'wavesplatform', 'plebbit', 'dcSpark', 'john-light', 'JetBrains',
            'Eliovp', 'libp2p', 'simerplaha', 'ossu', 'dashevo', 'ethereumclassic',
            'ergoplatform', 'btclinux', 'orgs', 'ghostdogpr', 'yuriy0803', 'portable-scala', 'KomodoPlatform',
            'tiangolo', 'bwbush', 'CLRX', 'OhGodPet', 'YfryTchsGD', 'Comcast', 'atomiclabs', 'pikvm',
            'PyO3', 'Astodialo', 'SpaceXpanse', 'sbt', 'alephium', 'emeraldpay', 'spantaleev', 'japgolly',
            'FgForrest', 'jrbender', 'berry-pool', 'Gravity-Tech', 'obolflip', 'SuSy-One', 'OhGodAPet', 'Balbin-Labs',
            '.insteadOf', 'hyperledger-labs', 'spaceswapio', 'search', 'prometheus', 'DevSCNinja', 'aragogi', 'bitcoincashorg', 'bcgit', 'gemlink',
            'robkorn', 'sininen-taivas', 'ergo', 'zawy12', 'rooooooooob', 'ergomixer', 'akyo8', 'arcnet', 'adventurersdao',
            'gsblabsio'
            
            }

In [79]:

# Compare the github.com.txt file and create a list of missing links
missing_links = set()

for link in github_links:
    link = remove_special_chars(link.strip())
    trimmed_link = trim_url(link)
    if is_valid_url(trimmed_link):
        match = re.search(r"github.com/([^/]+)(?:/([^/]+))?", trimmed_link)

        # if the URL is a valid GitHub URL, check if it is already in the list of existing organizations and repositories
        if match:
            # Extract the organization and repository names from the URL
            org_and_repo = match.groups()
            # Check if the organization is in the ignore list
            if org_and_repo[0] not in ignore_orgs:

                # Check if the organization exists in the existing_orgs_and_repos set
                org_exists = any(existing_org == org_and_repo[0] and existing_repo is None for existing_org, existing_repo in existing_orgs_and_repos)
                
                # If the organization doesn't exist, and the URL is not in the existing_urls_set, add it to the missing_links
                if not org_exists and trimmed_link not in existing_urls_set:
                    print("Adding to missing_links", trimmed_link)
                    missing_links.add(trimmed_link)
                    existing_orgs_and_repos.add(org_and_repo)
                    #print(f"Added {org_and_repo} to existing_orgs_and_repos")
# Convert the missing_links set back to a list
missing_links = list(missing_links)


print(len(existing_orgs_and_repos), "existing organizations and repositories found")
print(len(missing_links), "missing links found")

Adding to missing_links https://github.com/mhssamadani/ErgoStratumServer.
Adding to missing_links https://github.com/ergop
Adding to missing_links https://github.com/anon-br/ledger-ergo-js.
Adding to missing_links https://github.com/zkastn/ergo-raffle-bot,
Adding to missing_links https://github.com/Emurgo/yoroi-frontend
Adding to missing_links https://github.com/scalahub/ErgoScriptCompiler
Adding to missing_links https://github.com/Emurgo
Adding to missing_links https://github.com/scalahub/ErgoScriptCompiler
Adding to missing_links https://github.com/nirvanush/whale-alerts-twit...
Adding to missing_links https://github.com/mhssamadani/ErgoStratumServer>
Adding to missing_links https://github.com/ThierryM1212/SAFEW>
Adding to missing_links https://github.com/capt-nemo429/nautilus-wallet>
Adding to missing_links https://github.com/abchrisxyz/ergowatch>
Adding to missing_links https://github.com/mhssamadani/Autolykos2_NV_Miner.
Adding to missing_links https://github.com/andrehafner/my.erg

## Export

In [80]:
# Load the ergo.toml file
with open(output_toml, "r") as file:
    ergo_data = toml.load(file)

# Add the missing links to ergo.toml
for link in missing_links:
    if is_valid_url(link):
        ergo_data["repo"].append({"url": link})

# Save the changes in the existing ergo.toml file
with open(output_toml, "w") as file:
    toml.dump(ergo_data, file)

In [82]:
import toml
import re

with open(output_toml, "r") as file:
    data = toml.load(file)

data["repo"] = [d for d in data["repo"] if re.match(r"^https?://[\w\-\.~:\#\?@\[\]\!\$&'\(\)\*\+,;=/]+$", d["url"])]

with open(output_toml, "w") as file:
    toml.dump(data, file)



import glob
import toml

sub_ecosystem_toml_files = glob.glob("../../../crypto-ecosystems/data/ecosystems/*/ecosystem.toml")

existing_orgs_and_repos = set()

for toml_file in sub_ecosystem_toml_files:
    with open(toml_file, "r") as file:
        data = toml.load(file)

        if "github_organizations" in data:
            for org in data["github_organizations"]:
                existing_orgs_and_repos.add((org.strip("/").split("/")[-1], None))

        if "repo" in data:
            for repo in data["repo"]:
                url = repo.get("url")
                if url:
                    trimmed_url = url.strip("/").replace("https://github.com/", "")
                    org_and_repo = tuple(trimmed_url.split("/")[:2])
                    existing_orgs_and_repos.add(org_and_repo)

with open("../../../crypto-ecosystems/data/ecosystems/e/ergo.toml", "r") as file:
    ergo_data = toml.load(file)

new_repos = []
for repo in ergo_data.get("repo", []):
    url = repo.get("url")
    if url:
        trimmed_url = url.strip("/").replace("https://github.com/", "")
        org_and_repo = tuple(trimmed_url.split("/")[:2])
        if org_and_repo not in existing_orgs_and_repos:
            new_repos.append(repo)

ergo_data["repo"] = new_repos

with open("../../../crypto-ecosystems/data/ecosystems/e/ergo.toml", "w") as file:
    toml.dump(ergo_data, file)

def get_toml_path(toml_base_name, base_path):
    for root, dirs, files in os.walk(base_path):
        for file in files:
            if file == toml_base_name:
                return os.path.join(root, file)
    return None



import toml
import glob
import os


# Load ergo.toml
with open("../../../crypto-ecosystems/data/ecosystems/e/ergo.toml", "r") as f:
    ergo_data = toml.load(f)

# Get sub-ecosystems
sub_ecosystems = ergo_data["sub_ecosystems"]

# Get repo URLs
existing_urls_set = set()
for repo in ergo_data.get("repo", []):
    existing_urls_set.add(repo.get("url"))

# Define the base directory for searching .toml files
base_dir = "../../../crypto-ecosystems/data/ecosystems/"

# Check each sub-ecosystem for duplicate URLs
for sub_ecosystem in sub_ecosystems:
    toml_base_name = sub_ecosystem.lower().replace(" ", "_") + ".toml"
    toml_path = get_toml_path(toml_base_name, base_dir)
    print(sub_ecosystem)
    print(toml_path)
    if toml_path:
        with open(toml_path, "r") as f:
            data = toml.load(f)
        if "repo" in data:
            for repo in data["repo"]:
                if repo.get("url") in existing_urls_set:
                    # Remove the duplicate URL from the repo section of ergo.toml
                    ergo_data["repo"] = [r for r in ergo_data.get("repo", []) if r.get("url") != repo.get("url")]


# Save the changes to ergo.toml
with open("output.toml", "w") as f:
    toml.dump(ergo_data, f)
