# Data Fetching

In [5]:
import os
import time

import pandas as pd
import requests

def fetch_top_5000_coins(pages=20, per_page=250, delay=16, timeout=30, session=None):
    """Fetch meme-token coins ordered by market cap from the /coins/markets endpoint.

    The request is filtered to the "meme-token" category, so the result holds
    at most ``pages * per_page`` coins (5000 with the defaults) and fewer when
    the category is smaller than that.

    Args:
        pages: Maximum number of result pages to request.
        per_page: Number of coins requested per page.
        delay: Seconds to wait between consecutive requests (rate limiting).
        timeout: Per-request timeout in seconds, so a stalled connection
            raises instead of hanging the notebook.
        session: Optional object exposing a requests-compatible ``get``;
            the ``requests`` module is used when omitted.

    Returns:
        list of coin dicts exactly as returned by the API, in page order.
        Pages answered with a non-200 status are reported and skipped.
    """
    url = "https://api.coingecko.com/api/v3/coins/markets"
    http = requests if session is None else session
    all_coins = []

    for page in range(1, pages + 1):
        params = {
            "vs_currency": "usd",
            "order": "market_cap_desc",
            "category": "meme-token",
            "per_page": per_page,
            "page": page
        }
        response = http.get(url, params=params, timeout=timeout)
        if response.status_code == 200:
            batch = response.json()
            if not batch:
                # An empty page means the listing is exhausted; further
                # requests would only burn time on sleeps.
                break
            all_coins.extend(batch)
        else:
            print(f"Failed to fetch page {page}: {response.status_code}")
        if page < pages:
            time.sleep(delay)  # Sleep to avoid hitting rate limits

    return all_coins

# Step 1: Download the market listing (one dict per coin)
top_5000_coins = fetch_top_5000_coins()

# Step 2: Collect the CoinGecko id of every listed coin, keeping listing order
coin_ids = [entry['id'] for entry in top_5000_coins]

# Adding Descriptions for Classification

In [None]:
# The CoinGecko Pro API key is read from the COINGECKO_API_KEY environment
# variable so the secret never lives in the notebook. Any key that was
# previously committed in this file should be treated as leaked and rotated.
headers = {'x-cg-pro-api-key': os.environ.get("COINGECKO_API_KEY", "")}
if not headers['x-cg-pro-api-key']:
    print("Warning: COINGECKO_API_KEY is not set; CoinGecko Pro requests will be rejected.")
def fetch_coin_descriptions(coin_ids, delay=0.5, timeout=30, session=None):
    """Fetch the English description of each coin using the /coins/{id} endpoint.

    Args:
        coin_ids: Iterable of CoinGecko coin id strings.
        delay: Seconds to wait after each request (rate limiting).
        timeout: Per-request timeout in seconds, so a stalled connection
            raises instead of hanging the notebook.
        session: Optional object exposing a requests-compatible ``get``;
            the ``requests`` module is used when omitted.

    Returns:
        dict mapping coin id -> description text. A coin whose response has
        no English description maps to "". Coins whose request returned a
        non-200 status are reported and left out of the dict.
    """
    http = requests if session is None else session
    descriptions = {}

    for coin_id in coin_ids:
        if coin_id in descriptions:
            # Already fetched successfully; a repeated id would only cost
            # another request and produce the same entry.
            continue
        response = http.get(
            f"https://pro-api.coingecko.com/api/v3/coins/{coin_id}",
            headers=headers,
            timeout=timeout,
        )
        if response.status_code == 200:
            data = response.json()
            # Tolerate a missing or null description block / "en" entry.
            descriptions[coin_id] = (data.get('description') or {}).get('en') or ""
        else:
            print(f"Failed to fetch description for {coin_id}: {response.status_code}")
        time.sleep(delay)  # Sleep to avoid rate limits

    return descriptions
# Step 3: Look up a description for every collected coin id
coin_descriptions = fetch_coin_descriptions(coin_ids)

# Step 4: Attach each description to its market-data record
# (coins without a fetched description get an empty string)
for entry in top_5000_coins:
    entry_id = entry['id']
    entry['description'] = coin_descriptions[entry_id] if entry_id in coin_descriptions else ""

# Step 5: Build the table, order it by market cap (largest first) and write it out
df = pd.DataFrame(top_5000_coins).sort_values(by="market_cap", ascending=False)
df.to_csv("top_5000_meme_coins.csv", index=False)

print("Saved top 5000 coins with descriptions to CSV.")

Saved top 5000 coins with descriptions to CSV.


# Project

In [1]:
import pandas as pd
import requests
import json
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import nltk
from nltk.corpus import stopwords
import re
from collections import defaultdict, Counter

: 

# Process Data

In [20]:

# Reload the table written by the data-fetching section
df = pd.read_csv("top_5000_meme_coins.csv")

# Make sure the NLTK stop-word corpus is present, then keep the English
# stop words in a set for fast membership tests
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalise a coin description for keyword counting and embedding.

    The text is lower-cased, every character other than a-z and whitespace
    is removed, and words found in the module-level ``stop_words`` set are
    dropped.

    Args:
        text: The raw description. Non-string values are accepted because
            empty descriptions saved to CSV are read back by pandas as NaN
            (a float), not as "".

    Returns:
        The remaining words joined by single spaces; '' for non-string input.
    """
    if not isinstance(text, str):
        # Missing description (NaN / None): nothing to process.
        return ''
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)
# Clean every description and keep the result next to the raw text
df['processed_description'] = list(map(preprocess_text, df['description']))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Build the Model

In [15]:
# Embed the cleaned descriptions with a pretrained sentence-transformer.
# The input is taken from df['processed_description'] (built in the
# "Process Data" cell); the previously referenced name
# `preprocessed_descriptions` is not defined anywhere in this notebook and
# raised NameError on a fresh kernel.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['processed_description'].tolist())
# Rescale the embedding rows with sklearn's normalize (default settings)
embeddings = normalize(embeddings)

In [24]:
from sklearn.decomposition import PCA

# Project the embeddings onto 10 principal components before clustering;
# random_state is fixed so the projection is reproducible between runs.
pca = PCA(n_components=10, random_state=42)  # reduce dimensionality to decrease the influence of common words
reduced_embeddings = pca.fit_transform(embeddings)

In [29]:
# Partition the reduced embeddings into a fixed number of groups;
# the seed is pinned so cluster ids are reproducible between runs.
n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(reduced_embeddings)
labels = kmeans.labels_  # cluster id assigned to each coin, in row order

# Auto Assign Labels

In [30]:
# Record which cluster each coin landed in
df['cluster'] = labels

clustered_coins = df.groupby('cluster')

# For every cluster, tally the words of all its cleaned descriptions and
# keep the five most frequent ones as that cluster's keywords
cluster_keywords = {}
for label, group in clustered_coins:
    word_counts = Counter()
    for description in group['processed_description']:
        word_counts.update(description.split())
    cluster_keywords[label] = [word for word, _ in word_counts.most_common(5)]

# Report the keywords and, in the same pass, build the label text for each
# cluster (its keywords joined by spaces)
print("clusters")
auto_meme_types = {}
for label, keywords in cluster_keywords.items():
    print(f"cluster {label}: {', '.join(keywords)}")
    auto_meme_types[label] = ' '.join(keywords)

def get_meme_type(label):
    """Return the keyword label stored for a cluster id, or "Unknown" if there is none."""
    if label in auto_meme_types:
        return auto_meme_types[label]
    return "Unknown"

# Translate each row's cluster id into its keyword label
df['meme_type'] = [get_meme_type(cluster_id) for cluster_id in df['cluster']]

clusters
cluster 0: memecoin, meme, solana, community, cat
cluster 1: blockchain, project, decentralized, token, community
cluster 2: 
cluster 3: meme, community, token, romeo, memes
cluster 4: community, crypto, coin, world, token
cluster 5: token, meme, community, pepe, coin
cluster 6: dog, cat, solana, doge, meme
cluster 7: loading, ily, community, solana, pepe
cluster 8: community, token, meme, solana, project
cluster 9: dog, doge, meme, token, inu
cluster 10: meme, coin, memecoin, crypto, token
cluster 11: meme, community, crypto, token, coin
cluster 12: community, project, meme, memecoin, token
cluster 13: token, community, project, doge, blockchain
cluster 14: blockchain, cat, community, crypto, solana
cluster 15: meme, community, cat, token, project
cluster 16: ai, agent, project, agents, users
cluster 17: community, project, token, ecosystem, meme
cluster 18: token, tokens, community, holders, liquidity
cluster 19: kitty, keeps, spinning, community, pepe
