# Data Fetching

In [None]:
import requests
import pandas as pd
import time

def fetch_top_5000_coins():
    """Fetch up to 5000 meme-token coins, ordered by market cap, from CoinGecko.

    Pages through the public ``/coins/markets`` endpoint restricted to the
    ``meme-token`` category: 250 coins per page, at most 20 pages.  Paging
    stops early as soon as the API returns an empty page, because later pages
    would be empty as well.

    Returns:
        list: The decoded JSON entries of every successfully fetched page, in
        request order.  A page that fails (non-200 status or a transport
        error) is reported with ``print`` and skipped.
    """
    url = "https://api.coingecko.com/api/v3/coins/markets"
    all_coins = []

    for page in range(1, 21):  # 20 pages * 250 coins per page = 5000 coins
        if page > 1:
            # Sleep *between* requests to avoid hitting rate limits; there is
            # nothing to wait for before the first request or after the last.
            time.sleep(16)

        params = {
            "vs_currency": "usd",
            "order": "market_cap_desc",
            "category": "meme-token",
            "per_page": 250,
            "page": page
        }
        try:
            # A timeout keeps one stalled connection from hanging the whole crawl.
            response = requests.get(url, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to fetch page {page}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch page {page}: {response.status_code}")
            continue

        coins = response.json()
        if not coins:
            # An empty page means the category is exhausted.
            break
        all_coins.extend(coins)

    return all_coins

# Download the market listing: one dict per coin, as returned by the API.
top_5000_coins = fetch_top_5000_coins()

# Keep just the CoinGecko id of every listed coin, in listing order.
coin_ids = [entry['id'] for entry in top_5000_coins]

# Adding Descriptions for Classification

In [None]:
import os

# The CoinGecko Pro API key is read from the COINGECKO_API_KEY environment
# variable so that it never has to be stored in the notebook.
# NOTE(review): a key was previously committed in this cell; it should be
# treated as leaked and revoked.
_coingecko_api_key = os.environ.get("COINGECKO_API_KEY", "")
if not _coingecko_api_key:
    print("COINGECKO_API_KEY is not set; CoinGecko Pro API requests will be rejected.")
headers = {'x-cg-pro-api-key': _coingecko_api_key}
def fetch_coin_descriptions(coin_ids):
    """Fetch the English description of each coin using the /coins/{id} endpoint.

    Requests are sent to the CoinGecko Pro API with the module-level
    ``headers`` and are spaced 0.5 seconds apart.

    Args:
        coin_ids: Iterable of CoinGecko coin ids.

    Returns:
        dict: Maps coin id to its English description.  A coin whose response
        carries no English description maps to ``""``.  Coins whose request
        fails (non-200 status or a transport error) are reported with
        ``print`` and left out of the result.
    """
    descriptions = {}

    for coin_id in coin_ids:
        url = f"https://pro-api.coingecko.com/api/v3/coins/{coin_id}"
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            # Keep what has been collected so far instead of aborting the loop.
            print(f"Failed to fetch description for {coin_id}: {exc}")
        else:
            if response.status_code == 200:
                data = response.json()
                # 'description' or its 'en' entry may be missing or null.
                descriptions[coin_id] = (data.get('description') or {}).get('en') or ""
            else:
                print(f"Failed to fetch description for {coin_id}: {response.status_code}")
        time.sleep(0.5)  # for rate limits

    return descriptions
# Look up a description for every fetched coin id.
coin_descriptions = fetch_coin_descriptions(coin_ids)

# Attach the description to its coin record; coins without one get "".
for coin in top_5000_coins:
    description_text = coin_descriptions.get(coin['id'], "")
    coin['description'] = description_text

# Persist the records as CSV, largest market cap first.
df = pd.DataFrame(top_5000_coins).sort_values(by="market_cap", ascending=False)
df.to_csv("top_5000_meme_coins.csv", index=False)



# Project

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import nltk
from nltk.corpus import stopwords
import re
from collections import defaultdict, Counter

# Process Data

In [None]:
# Reload the coin table written by the data-fetching step.
df = pd.read_csv("top_5000_meme_coins.csv")

# English stop words from NLTK (downloaded on first use).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Domain words removed from descriptions in addition to the stop words
# (the repeated 'meme' entry is harmless in a set).
filler_words = {
    'coin', 'solana', 'memes', 'cryptocurrency', 'crypto', 'token',
    'project', 'meme', 'community', 'network', 'platform', 'and', 'or',
    'it', 'etc', 'all', 'also', 'with', 'its', 'can', 'like', 'holders',
    'ecosystem', 'chain', 'blockchain', 'meme',
}

def preprocess_text(text):
    """Normalize a description into a space-separated string of kept words.

    Anything that is not exactly a ``str`` (e.g. a NaN cell) yields ``''``.
    Otherwise the text is lower-cased, every character other than ``a-z``
    and whitespace is deleted, and words found in the module-level
    ``stop_words`` or ``filler_words`` sets are dropped.
    """
    if type(text) is str:
        lowered = text.lower()
        letters_only = re.sub(r'[^a-z\s]', '', lowered)
        kept = []
        for word in letters_only.split():
            if word in stop_words or word in filler_words:
                continue
            kept.append(word)
        return ' '.join(kept)
    return ''
# Clean every description; non-string cells (e.g. NaN) become ''.
df['processed_description'] = [preprocess_text(raw) for raw in df['description']]

# Prefix each cleaned description with the coin's name.  Iterating the two
# columns directly replaces the row-wise df.apply(axis=1), which builds a
# Series per row and returns a DataFrame instead of a column when the frame
# is empty.
df['processed_description'] = [
    f"{name} {text}"
    for name, text in zip(df['name'], df['processed_description'])
]
processed_descriptions = df['processed_description'].tolist()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Build the model

In [30]:
# Sentence-embedding model used for both coins and categories.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding per processed description, rescaled with sklearn's normalize.
embeddings = normalize(model.encode(processed_descriptions))

In [None]:
import faiss
# Seed keyword lists, one per meme category.  The keywords of a category are
# joined into a single text and embedded; "none" starts without keywords.
categories = {
    "none": [],
    "dog": [
        "shiba", "inu", "dog", "doge", "dogecoin", "shib", "shiba", "inu",
    ],
    "wif": ["wif", "hat", "wifhat", "dogwifhat"],
    "pepe": ["pepe", "frog", "wojak", "kek", "green", "rare"],
    "cat": ["cat", "kitty", "meow", "feline"],
    "ai": ["ai", "artificial", "intelligence", "agent"],
    "frog": ["frog", "ribbit", "amphibian"],
    "fart_butt": ["fart", "butt", "gassy", "flatulence"],
    "bird": ["bird", "parrot", "duck", "chirp", "avian"],
    "dragon": ["dragon", "fire", "scale", "mythical"],
    "penguin": ["penguin", "waddle", "antarctica"],
    "fish": ["fish", "aquatic", "ocean", "fin"],
    "crypto_parody": ["crypto", "parody", "satire", "spoof"],
    "food": ["food", "snack", "burger", "pizza", "sushi"],
    "anime": ["anime", "manga", "otaku", "japan", "waifu"],
    "political": [
        "political", "president", "election", "government", "policy",
    ],
    "fantasy": ["fantasy", "magic", "wizard", "elf", "orc"],
    "gorilla": ["gorilla", "ape", "monkey", "primate"],
    "chad_chud_kek_4chan": [
        "chad", "chud", "sigma", "alpha", "giga", "4chan", "kek", "maximus",
    ],
    "mog": ["mog", "mogging", "dominance", "overpower"],
    "chill_guy": ["chill", "guy", "relax", "vibe", "laidback"],
    "peanut": [
        "peanut", "squirrel", "peanutthesquirrel", "peanut the squirrel",
        "peanutthesquirrel",
    ],
    "elon": [
        "elon", "musk", "elonmusk", "elon musk", "elonmuskdoge",
        "elonmuskdoge", "grok",
    ],
}

# One text per category (its keywords joined by spaces), embedded and
# rescaled the same way as the coin embeddings.
category_descriptions = [' '.join(words) for words in categories.values()]
category_embeddings = normalize(model.encode(category_descriptions))

# Flat L2 index over the category vectors; row k holds the k-th category.
embedding_dim = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(category_embeddings)

import numpy as np  # used to keep the category matrix in step with the index

# Minimum cosine similarity a coin needs to join an existing category.
threshold = 0.5

# Category names by position: entry k belongs to row k of the FAISS index.
category_names = list(categories.keys())
# Writable float32 copy whose rows are replaced/appended as categories change.
category_embeddings = np.array(category_embeddings, dtype=np.float32)

predicted_categories = []

for i, embedding in enumerate(embeddings):
    query = embedding.reshape(1, -1)
    distances, indices = faiss_index.search(query, 1)  # nearest category
    # IndexFlatL2 reports the *squared* L2 distance.  Both sides are
    # L2-normalized, so d^2 = 2 - 2*cos and the cosine similarity is 1 - d^2/2
    # (the former `1 - d^2` equalled 2*cos - 1, i.e. a much stricter cut-off).
    max_score = 1 - distances[0][0] / 2
    new_keywords = df['processed_description'].iloc[i].split()

    if max_score >= threshold:
        # Join the matched category and feed the coin's words back into it.
        position = int(indices[0][0])
        category = category_names[position]
        # Order-preserving de-duplication: list(set(...)) produced a
        # hash-dependent word order, making the category text (and hence its
        # embedding) differ from one kernel session to the next.
        categories[category] = list(dict.fromkeys(categories[category] + new_keywords))
        description = ' '.join(categories[category])
        category_descriptions[position] = description
        category_embeddings[position] = normalize(model.encode([description]))[0]
    else:
        # No category is close enough: start a new one from this coin's words.
        category = f"new_category_{len(categories)}"
        categories[category] = new_keywords
        category_names.append(category)
        description = ' '.join(new_keywords)
        category_descriptions.append(description)
        category_embeddings = np.vstack(
            [category_embeddings, normalize(model.encode([description]))]
        )

    # Only one category changed, so only that one was re-encoded above; the
    # flat index is then rebuilt from the cached matrix.
    faiss_index.reset()
    faiss_index.add(category_embeddings)

    predicted_categories.append(category)

# Assign Labels

In [52]:
# Store the predicted category of every coin next to its data.
df['meme_type'] = predicted_categories

# Show the first ten classified coins as a quick sanity check.
preview = df[['id', 'meme_type', 'description']].head(10)
print(preview)

# Save the labelled table.
df.to_csv("classified_meme_coins.csv", index=False)

               id            meme_type  \
0        dogecoin                  dog   
1       shiba-inu                  dog   
2            pepe                 pepe   
3            bonk                  wif   
4  pudgy-penguins              penguin   
5      dogwifcoin                  wif   
6           ai16z                   ai   
7           floki                 none   
8     based-brett  chad_chud_kek_4chan   
9        fartcoin            fart_butt   

                                         description  
0  Dogecoin is a cryptocurrency based on the popu...  
1  Shiba Inu (SHIB) is a meme token which began a...  
2  What is the project about?\r\nPepe is a commun...  
3  Bonk is the first Solana dog coin for the peop...  
4  PENGU is the official coin of Pudgy Penguins. ...  
5  Literally a dog wif a hat, dogwifhat (WIF) is ...  
6  ai16z is the first venture capital DAO led by ...  
7  FLOKI is the utility token of the Floki Ecosys...  
8  BRETT the dancer, gamer, and cultural i