In [1]:
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
import pickle
from gensim import matutils, models
import scipy.sparse
import pandas as pd
from gensim.models import CoherenceModel

In [2]:
f_data = pd.read_csv('../data/tweets_final.csv')
f_data = f_data.iloc[0:100,:]
f_data.head(3)

Unnamed: 0,Datetime,Tweet Id,Text,Username
0,2022-06-17 23:59:41+00:00,1537948125628747777,@BitMartExchange To the Moon 💯💵💎\n@galuka156 @...,cozyhomes88
1,2022-06-17 23:59:27+00:00,1537948068431007744,THIS!!!....@metazooxyz with @MrSweMusic1...RIG...,TTJP_1
2,2022-06-17 23:59:02+00:00,1537947961983832066,In #Mars4 #game demo you can free your imagina...,Dubai_community


In [3]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

# Vedi pickle
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(f_data.Text)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = f_data.index
data_dtm = data_dtm.iloc[0:10,:]



In [4]:
# One of the required inputs is a term-document matrix
tdm = data_dtm.transpose()
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
00667,0,0,0,0,0,0,0,0,0,0
01,0,0,0,0,0,0,0,0,0,0
037,0,0,0,0,0,0,0,0,0,0
039bkt0av4,0,0,0,0,0,0,0,0,0,0
043,0,0,0,0,0,0,0,0,0,0


In [5]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(data_dtm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [6]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
cv = pickle.load(open("../data/cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [7]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=1)
lda.print_topics()

[(0,
  '0.003*"aaaaahhhhhhh" + 0.002*"ability" + 0.002*"aaaaauuugghhhhhh" + 0.002*"abject" + 0.002*"aah" + 0.002*"aaah" + 0.001*"abc" + 0.001*"aaaahhhhh" + 0.001*"abcs" + 0.001*"aaaaah"'),
 (1,
  '0.005*"abc" + 0.005*"aah" + 0.004*"ability" + 0.004*"aaaahhhhh" + 0.004*"aaaaauuugghhhhhh" + 0.004*"abcs" + 0.003*"aaaaahhhhhhh" + 0.002*"abject" + 0.001*"aaaaah" + 0.001*"aaah"')]

In [8]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)]
    return ' '.join(all_nouns)

In [9]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(f_data.Text.apply(nouns))
data_nouns

Unnamed: 0,Text
0,BitMartExchange Moon 💯💵💎 @ galuka156 @ hoangye...
1,THIS metazooxyz @ MrSweMusic1 NOW DCLMETATIGER...
2,Mars4 game demo imagination buildings rules PL...
3,PROJECT project @ ProjectSeedGame https Projec...
4,@ verasaw project level NFTGaming List https /...
...,...
95,NFT👍🥰 show NFT NodeJS DEFI BLOCKHAIN cryptocur...
96,Blockchain Keys Unlock Murky Metaverse Alison ...
97,Más subas tasa Recesión Mayor inflación / lo q...
98,METAME EU5 facescans https //t.co/QCznV0OFwN


In [11]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.Text)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn



Unnamed: 0,01,043,0jggwas0wy,2421,2kbhcq8xhw,2qejf2scii,2wqubowap3,4pvzxjwgaw,670億円のweb3ゲームファンドをローンチ,6yzvmgmjbk,...,zl7npcn7qt,zwfzskn4ay,zxmgysbojb,édition,ótimo,こうゆうmr,メタバース,的な映像技術が進化していくと思います,私は今後,複合現実
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [13]:
# Let's try topics = 3
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.063*"metaverse" + 0.048*"https" + 0.019*"nft" + 0.013*"bitcoin" + 0.012*"nfts" + 0.011*"nftcommunity" + 0.009*"btc" + 0.008*"gold" + 0.008*"cryptocurrencies" + 0.008*"whaleanalytica"'),
 (1,
  '0.072*"https" + 0.040*"metaverse" + 0.024*"nfts" + 0.021*"nft" + 0.012*"nftcommunity" + 0.010*"crypto" + 0.009*"store" + 0.008*"project" + 0.008*"nftnyc" + 0.006*"father"'),
 (2,
  '0.036*"metaverse" + 0.030*"https" + 0.009*"nft" + 0.009*"project" + 0.007*"world" + 0.007*"vr" + 0.007*"black" + 0.006*"projectseedgame" + 0.006*"huobi" + 0.006*"projectseed"')]

In [14]:
# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)]
    return ' '.join(nouns_adj)

In [16]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns_adj = pd.DataFrame(f_data.Text.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,Text
0,@ BitMartExchange Moon 💯💵💎 @ galuka156 @ hoang...
1,THIS .... @ metazooxyz @ MrSweMusic1 NOW 🔥🔥🔥🔥🎶...
2,Mars4 game demo imagination craziest buildings...
3,PROJECT big project potential 🔥🔥🔥💚👏 @ ProjectS...
4,@ verasaw project next level NFTGaming metaver...
...,...
95,NFT👍🥰 show NFT java NodeJS metaverse DEFI BLOC...
96,Blockchain Keys Unlock Murky Metaverse Alison ...
97,Más subas tasa Recesión Mayor inflación / lo q...
98,METAME metahuman unrealengine EU5 metaverse fa...


In [18]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.Text)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna



Unnamed: 0,01,039bkt0av4,043,0jggwas0wy,10,11,15,2421,2kbhcq8xhw,2qejf2scii,...,zl7npcn7qt,zwfzskn4ay,zxmgysbojb,édition,ótimo,こうゆうmr,メタバース,的な映像技術が進化していくと思います,私は今後,複合現実
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [109]:
# Let's try 3 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.038*"https" + 0.014*"crypto" + 0.008*"bitcoin" + 0.008*"nft" + 0.008*"btc" + 0.008*"nfts" + 0.008*"new" + 0.008*"title" + 0.008*"news" + 0.006*"art"'),
 (1,
  '0.054*"https" + 0.028*"nft" + 0.018*"nfts" + 0.012*"nftcommunity" + 0.008*"cryptocurrencies" + 0.008*"defi" + 0.007*"gold" + 0.007*"bitcoin" + 0.006*"otherdeed" + 0.006*"blockhain"'),
 (2,
  '0.042*"https" + 0.016*"nft" + 0.012*"mintable" + 0.011*"store" + 0.011*"nfts" + 0.009*"crypto" + 0.007*"father" + 0.006*"gasless" + 0.006*"black" + 0.006*"blockchain"')]

In [100]:
from gensim import corpora
word2id = dict((k, v) for k, v in cvna.vocabulary_.items())
d = corpora.Dictionary()
d.id2token = id2word
d.token2id = word2id

In [110]:
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(ldana, corpusna, d)
LDAvis_prepared

  default_term_info = default_term_info.sort_values(


In [133]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    lda_model = models.LdaMulticore(corpus=corpus, num_topics=k, id2word=dictionary, passes=10,alpha=a,
                                           eta=b)
    coherence_model_lda = CoherenceModel(model=lda_model, corpus=corpus,dictionary=dictionary, coherence='u_mass')
    return coherence_model_lda.get_coherence()

In [134]:
compute_coherence_values(corpusna,d,2,0.1,0.4)

-12.75998542540841

In [135]:
import numpy as np
from gensim import models
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 3
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
model_results = {
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
        # iterate through number of topics
    for k in topics_range:
            # iterate through alpha values
        for a in alpha:
                # iterare through beta values
            for b in beta:
                    # get the coherence score for the given parameters
                cv = compute_coherence_values(corpus=corpusna, dictionary=d,
                                                  k=k, a=a, b=b)
                    # Save the model results
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)

Traceback (most recent call last):
  File "<string>", line 1, in <module>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/local/Cellar/python@3.9/3.9.7_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/local/Cellar/python@3.9/3.9.7_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
  File "/usr/local/Cellar/python@3.9/3.9.7_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
    exitcode = _main(fd, parent_sentinel)
  File "/usr/local/Cellar/python@3.9/3.9.7_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    exitcode = _main(fd, parent_sentinel)
  File "/usr/local/Cellar/python@3.9/3.9.7_1/Framewor

KeyboardInterrupt: 

In [127]:
model_results

{'Topics': [2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2],
 'Alpha': [0.01,
  0.01,
  0.01,
  0.01,
  0.01,
  0.31,
  0.31,
  0.31,
  0.31,
  0.31,
  0.61,
  0.61,
  0.61,
  0.61,
  0.61,
  0.9099999999999999,
  0.9099999999999999,
  0.9099999999999999,
  0.9099999999999999,
  0.9099999999999999,
  'symmetric',
  'symmetric',
  'symmetric',
  'symmetric',
  'symmetric',
  'asymmetric',
  'asymmetric',
  'asymmetric',
  'asymmetric',
  'asymmetric'],
 'Beta': [0.01,
  0.31,
  0.61,
  0.9099999999999999,
  'symmetric',
  0.01,
  0.31,
  0.61,
  0.9099999999999999,
  'symmetric',
  0.01,
  0.31,
  0.61,
  0.9099999999999999,
  'symmetric',
  0.01,
  0.31,
  0.61,
  0.9099999999999999,
  'symmetric',
  0.01,
  0.31,
  0.61,
  0.9099999999999999,
  'symmetric',
  0.01,
  0.31,
  0.61,
  0.9099999999999999,
  'symmetric'],
 'Coherence': [-12.577708567196947,
  -13.485681601331065,
  -12.9360

In [None]:
#Plot multidimensionale?
 import matplotlib.pyplot as plt
 def plot_graph(model_results):
    plt.plot(model_results['Topics'], model_results['Coherence'])
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

plot_graph(model_results)