# Data Fetching

In [14]:
import requests
import pandas as pd
import time

def fetch_top_5000_coins():
    """Fetch the top 5000 coins by market cap using the /coins/markets endpoint."""
    url = "https://api.coingecko.com/api/v3/coins/markets"
    all_coins = []
    
    for page in range(1, 21):  # 20 pages * 250 coins per page = 5000 coins
        params = {
            "vs_currency": "usd",
            "order": "market_cap_desc",
            "category": "meme-token",
            "per_page": 250,
            "page": page
        }
        response = requests.get(url, params=params)
        if response.status_code == 200:
            all_coins.extend(response.json())
        else:
            print(f"Failed to fetch page {page}: {response.status_code}")
        time.sleep(16)  # Sleep to avoid hitting rate limits
    
    return all_coins

# Step 1: Fetch top 5000 coins by market cap
top_5000_coins = fetch_top_5000_coins()

# Step 2: Extract coin IDs
coin_ids = [coin['id'] for coin in top_5000_coins]

KeyboardInterrupt: 

# Adding Descriptions for Classification

In [None]:
headers = {'x-cg-pro-api-key': "CG-FtwnDTUpog7Z3GPYYqxXxQDb"}
def fetch_coin_descriptions(coin_ids):
    """Fetch descriptions for each coin using the /coins/{id} endpoint."""
    descriptions = {}
    
    for coin_id in coin_ids:
        response = requests.get(f"https://pro-api.coingecko.com/api/v3/coins/{coin_id}", headers=headers)
        if response.status_code == 200:
            data = response.json()
            descriptions[coin_id] = data['description']['en']
        else:
            print(f"Failed to fetch description for {coin_id}: {response.status_code}")
        time.sleep(0.5)  # Sleep to avoid rate limits
    
    return descriptions
coin_descriptions = fetch_coin_descriptions(coin_ids)

# Step 4: Combine market cap data with descriptions
for coin in top_5000_coins:
    coin['description'] = coin_descriptions.get(coin['id'], "")

# Step 5: Convert to DataFrame and save to CSV
df = pd.DataFrame(top_5000_coins)
df = df.sort_values(by="market_cap", ascending=False)  # Ensure sorting by market cap
df.to_csv("top_5000_meme_coins.csv", index=False)

print("Saved top 5000 coins with descriptions to CSV.")

Saved top 5000 coins with descriptions to CSV.


# Project

In [None]:
import pandas as pd
import requests
import json
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import nltk
from nltk.corpus import stopwords
import re
from collections import defaultdict, Counter

# Process Data

In [None]:


df = pd.read_csv("top_5000_meme_coins.csv")
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if type(text) is not str:
        return ''
    text = text.lower()  
    text = re.sub(r'[^a-z\s]', '', text)  
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]  
    return ' '.join(tokens)
df['processed_description'] = [preprocess_text(desc) for desc in df['description']]
df['processed_description'] = df.apply(lambda row: f"{row['name']} {row['processed_description']}", axis=1)
processed_descriptions = df['processed_description'].tolist()


['Dogecoin dogecoin cryptocurrency based popular doge internet meme features shiba inu logo dogecoin hrefcoinslitecoinlitecoina fork introduced joke currency december dogecoin quickly developed online community reached capitalization us million january compared cryptocurrencies dogecoin fast initial coin production schedule billion coins circulation mid additional billion coins every year thereafter june billionth dogecoin mined dogecoin created billy markus portland oregon jackson palmer sydney australia wanted create fun cryptocurrency appeal beyond core bitcoin audience dogecoin primarily used tipping system reddit twitter users tip creating sharing good content community active organising fundraising activities deserving causes developers dogecoin havent made major changes coin since means dogecoin could get left behind shibas leaving dogecoin join advanced platforms like ethereum one dogecoin strengths relaxed funloving community however also weakness currencies way professional p

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Build the model

In [16]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(processed_descriptions)
embeddings = normalize(embeddings)  

In [17]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10, random_state=42)  # reduce dimensionality to decrease influence of common common words
reduced_embeddings = pca.fit_transform(embeddings)

In [18]:
n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(reduced_embeddings)

# Auto Assign Labels

In [19]:
df['cluster'] = labels  

clustered_coins = df.groupby('cluster')

cluster_keywords = {}
for label, group in clustered_coins:
    all_descriptions = ' '.join(group['processed_description'].tolist())
    word_counts = Counter(all_descriptions.split())
    common_words = [word for word, count in word_counts.most_common(5)]  # Top 5 common words
    cluster_keywords[label] = common_words

print("clusters")
for label, keywords in cluster_keywords.items():
    print(f"cluster {label}: {', '.join(keywords)}")

auto_meme_types = {
    label: ' '.join(keywords) for label, keywords in cluster_keywords.items()
}

def get_meme_type(label):
    return auto_meme_types.get(label, "Unknown")

df['meme_type'] = df['cluster'].apply(get_meme_type)

clusters
cluster 0: meme, coin, memecoin, token, community
cluster 1: community, romeo, meme, token, crypto
cluster 2: community, project, token, ecosystem, together
cluster 3: loading, ily, right, pewpew, eyes
cluster 4: meme, community, token, project, coin
cluster 5: community, meme, token, solana, project
cluster 6: blockchain, project, decentralized, users, ecosystem
cluster 7: cat, community, meme, token, solana
cluster 8: community, meme, project, token, blockchain
cluster 9: community, project, token, blockchain, crypto
cluster 10: ai, agent, agents, project, users
cluster 11: meme, pepe, matt, frog, community
cluster 12: token, community, doge, project, dog
cluster 13: meme, crypto, coin, community, world
cluster 14: kitty, keeps, spinning, cat, Cat
cluster 15: meme, dog, community, doge, token
cluster 16: meme, memecoin, pepe, memes, community
cluster 17: token, community, tokens, project, meme
cluster 18: token, coin, base, tokens, Coin
cluster 19: dog, inu, doge, solana, In