In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [4]:
# Load the tweet table from CSV; the relative path is resolved against the
# kernel's working directory (presumably the notebook's folder - confirm).
tweets_csv_path = "../data/tweets_final.csv"
data = pd.read_csv(tweets_csv_path)

In [6]:
# Keep only the first 100 rows (all columns) so the rest of the notebook
# runs on a small sample.
data = data.head(100)

In [7]:
# Punctuation marks that are added to scikit-learn's English stop-word set.
punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]
# ENGLISH_STOP_WORDS is a frozenset, so this union is a frozenset as well.
stop_words = text.ENGLISH_STOP_WORDS.union(punc)
# Raw tweet texts, one entry per row of `data`.
desc = data.Text.values
# Baseline TF-IDF using the vectorizer's default tokenisation.
# scikit-learn >= 1.2 validates `stop_words` and accepts only 'english', a
# list, or None - a frozenset raises InvalidParameterError - so hand it a
# list (sorted only to make the order deterministic).
vectorizer = TfidfVectorizer(stop_words = sorted(stop_words))
X = vectorizer.fit_transform(desc)

In [9]:
# Vocabulary learned by the baseline vectorizer, in column order of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns an ndarray, so .tolist() keeps
# `word_features` a plain list of str as before.
word_features = vectorizer.get_feature_names_out().tolist()



In [10]:
# Module-level helpers shared by every call to tokenize().
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def tokenize(text):
    """Return the stemmed tokens of ``text``.

    The input is lower-cased, split into runs of ASCII letters and
    apostrophes by ``tokenizer``, and each token is passed through the
    English Snowball ``stemmer``.

    Note: inside this function the parameter ``text`` shadows the
    ``sklearn.feature_extraction.text`` module imported at the top of the
    notebook; the module is not needed here.

    Parameters
    ----------
    text : str
        A single document.

    Returns
    -------
    list
        One stem per token, in order of appearance.
    """
    lowered = text.lower()
    stems = []
    for token in tokenizer.tokenize(lowered):
        stems.append(stemmer.stem(token))
    return stems

In [11]:
# TF-IDF again, this time tokenising and stemming with `tokenize`.
# - stop_words: scikit-learn >= 1.2 rejects a frozenset here, so pass a list.
# - token_pattern=None: the pattern is ignored whenever a custom tokenizer is
#   supplied; passing None suppresses the "token_pattern will not be used"
#   warning.
# NOTE(review): the stop words are not stemmed, so some may no longer match
# the stemmed tokens - confirm whether that is intended.
vectorizer2 = TfidfVectorizer(
    stop_words = sorted(stop_words),
    tokenizer = tokenize,
    token_pattern = None,
)
X2 = vectorizer2.fit_transform(desc)
# get_feature_names() was removed in scikit-learn 1.2; .tolist() turns the
# ndarray from get_feature_names_out() back into a plain list of str so the
# printed output keeps its list formatting.
word_features2 = vectorizer2.get_feature_names_out().tolist()
print(len(word_features2))
print(word_features2[:50])

971
["'u", 'abl', 'access', 'acent', 'action', 'actualizacion', 'adttnf', 'advantag', 'age', 'ago', 'ai', 'airdrop', 'alert', 'alienworld', 'alison', 'altcoin', 'altern', 'alvin', 'ama', 'amaz', 'amazid', 'ambit', 'amidst', 'amp', 'android', 'ani', 'anim', 'anti', 'ap', 'ape', 'apesinvad', 'ar', 'arena', 'arriba', 'art', 'articl', 'asset', 'assistir', 'aswqdx', 'automat', 'auwh', 'av', 'avail', 'avatar', 'avax', 'avoid', 'axieinfin', 'ay', 'azka', 'b']




In [12]:
# Final feature matrix: stemmed TF-IDF capped at the 1000 most frequent terms.
# - stop_words: scikit-learn >= 1.2 rejects a frozenset here, so pass a list.
# - token_pattern=None: the pattern is ignored whenever a custom tokenizer is
#   supplied; passing None suppresses the corresponding warning.
vectorizer3 = TfidfVectorizer(
    stop_words = sorted(stop_words),
    tokenizer = tokenize,
    token_pattern = None,
    max_features = 1000,
)
X3 = vectorizer3.fit_transform(desc)
# get_feature_names() was removed in scikit-learn 1.2; .tolist() keeps `words`
# a plain list of str (column i of X3 corresponds to words[i]).
words = vectorizer3.get_feature_names_out().tolist()



In [25]:
# Densify the sparse TF-IDF matrix and persist it for clustering elsewhere.
# The frame itself keeps integer column labels; the vocabulary in `words` is
# written only as the CSV header row, and the row index is omitted.
dense_tfidf = X3.toarray()
d = pd.DataFrame(dense_tfidf)
d.to_csv("../data/data_to_cluster.csv", index=False, header=words)

In [16]:
# Cluster the TF-IDF rows into 8 groups. n_init=20 runs k-means with 20
# different centroid initialisations and keeps the best run; random_state
# pins those initialisations so the clusters printed below are reproducible
# on Restart & Run All.
kmeans = KMeans(n_clusters = 8, n_init = 20, random_state = 42)
kmeans.fit(X3)
# Finally, we look at the 8 clusters generated by k-means.
# argsort() orders feature indices by ascending centroid weight; reversing the
# columns and keeping the first `top_n` yields the highest-weighted terms of
# each cluster, strongest first.
top_n = 25
common_words = kmeans.cluster_centers_.argsort()[:, ::-1][:, :top_n]
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(words[word] for word in centroid))

0 : t, https, metavers, p, e, nftcommun, kucoin, mr, like, dk, bof, pvagscx, vr, tanukivers, play, u, need, nft, black, build, ethereumnft, o, theapeinvad, projectseedgam, vsx
1 : web, t, project, https, nftgiveaway, metavers, nft, nfts, token, nftproject, d, verasaw, vr, communiti, luxuryrealest, l, great, got, qlbqhc, live, list, s, metac, time, featur
2 : gold, otraeconomia, othereconomi, ethereum, cryptocurr, bitcoin, nftcommun, n, nfts, nft, usd, es, m, portug, euro, el, yin, dxi, pfi, lo, s, metavers, vjyrtbohmk, yypohh, oro
3 : wishforbul, snw, nft, b, metavers, supernftwar, https, t, gama, qus, evou, bc, upland, beog, jzktn, buidl, elpkb, realitychain, metadogworld, race, vn, galuka, moon, hoangyen, juliapham
4 : otherde, whaleanalytica, bought, world, bayc, ape, othersid, mayc, nftworld, nftworldsnft, https, t, ggrxygqhwu, zxmgysbojb, metavers, cvkyluf, tsew, owya, ap, jtr, v, nft, c, b, gad
5 : defi, bsc, java, python, blockhain, nodej, btc, drop, solananft, metavers, cryptoc