In [None]:
import glob
import pandas as pd

# get data file names
# NOTE(review): these are absolute, machine-specific locations; adjust them to
# wherever the per-coin CSV folders live on the machine running the notebook.
path = r'D:/VisualStudioCode/twitter data/bitcoin'
path1 = r'D:/VisualStudioCode/twitter data/ethereum'
path2 = r'D:/VisualStudioCode/twitter data/litecoin'

# glob.glob() returns matches in arbitrary order; sorting keeps the row order
# of the combined frame reproducible between runs and machines.
filenames = sorted(glob.glob(path + "/*.csv"))
filenames1 = sorted(glob.glob(path1 + "/*.csv"))
filenames2 = sorted(glob.glob(path2 + "/*.csv"))

all_filenames = filenames + filenames1 + filenames2
if not all_filenames:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        "No CSV files found under: " + ", ".join([path, path1, path2])
    )

# Read every CSV once (bitcoin, then ethereum, then litecoin).
dfs = [pd.read_csv(filename) for filename in all_filenames]

# Concatenate all data into one DataFrame
df = pd.concat(dfs, ignore_index=True)
# Keep only the columns used downstream; .copy() makes the result an
# independent frame so later column assignments do not act on a slice.
df = df[['tweets', 'likes', 'time']].copy()
df.head()

In [None]:
# Drop retweets and rows that have no tweet text.
# - r'\bRT\b' matches "RT" only as a standalone token, so ordinary words that
#   merely contain those letters (e.g. "SMART", "ALERT") are no longer dropped.
# - na=False keeps the mask strictly boolean when a tweet is missing; those
#   rows are removed explicitly because the later cleaning steps need strings.
is_retweet = df.tweets.str.contains(r'\bRT\b', na=False)
df = df[df.tweets.notna() & ~is_retweet]
df = df.reset_index(drop=True)
df.head()

In [None]:
#initial cleaning of the tweets 
import re

def cleanTweet(Tweet):
    """Strip URLs, hashtags, mentions, newlines and digits from a tweet.

    Every removed element is replaced by a single space so the words around
    it stay separated. Runs of whitespace are not collapsed.

    Parameters
    ----------
    Tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with the elements above replaced by spaces.
    """
    # URLs go first: they can contain '@' or '#' (e.g. medium.com/@author/...),
    # and stripping mentions/hashtags first would cut the URL in two and leave
    # its tail behind in the text.
    Tweet = re.sub(r'https?://\S+', ' ', Tweet)
    # \w also matches '_', which is legal inside hashtags and handles.
    Tweet = re.sub(r'#\w+', ' ', Tweet)
    Tweet = re.sub(r'@\w+', ' ', Tweet)
    # Newlines become spaces so words on adjacent lines are not glued together.
    Tweet = re.sub(r'\n', ' ', Tweet)
    Tweet = re.sub(r'[0-9]', ' ', Tweet)
    return Tweet

# Run the cleaner over every tweet and store the result back in the same column.
cleaned_tweets = df['tweets'].apply(cleanTweet)
df['tweets'] = cleaned_tweets

df.head()

In [None]:
# Keep only the cleaned tweets and add a second column, 'text', that starts
# out with the same values and is the one the following steps transform.
tweets_only = df['tweets']
df = pd.DataFrame({'tweets': tweets_only, 'text': tweets_only})

df.head()

In [None]:
#lowercasing
# Replace the 'text' column with its lower-cased version; the 'tweets' column
# is not modified by this cell.

# `string` is not used in this cell; string.punctuation is read by
# remove_punctuation further down.
import string
df['text'] = df["text"].str.lower()

df.head()

In [None]:
#removing punctuation

# Deletion table for every character in string.punctuation. Built once here
# rather than on each call, since the function is applied to every row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return `text` with all characters of string.punctuation deleted.

    Characters are removed, not replaced by a space, so "don't" becomes
    "dont" and "btc/eth" becomes "btceth".

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text without ASCII punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every row of the working column; the function is
# handed to apply() directly.
df["text"] = df["text"].apply(remove_punctuation)
df.head()

In [None]:
#removing stopwords

from nltk.corpus import stopwords

# NLTK's English stopword list as a set for constant-time membership tests.
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(text):
    """Drop every whitespace-separated token of `text` found in STOPWORDS.

    The input is passed through str() first, and the remaining tokens are
    re-joined with single spaces.
    """
    kept = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept)


df["text"] = df["text"].apply(remove_stopwords)
df.head()

In [None]:
#removing emojis
import re
# Compiled once at definition time; the function is applied to every row.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                            u"\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
                            u"\U00002702-\U000027B0"
                            # NOTE(review): this range runs from U+24C2 to
                            # U+1F251 and therefore also removes many
                            # non-emoji characters (CJK text, for example).
                            # Kept as-is to preserve existing behaviour.
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Return `string` with emoji characters removed.

    Runs of characters inside the code-point ranges of _EMOJI_PATTERN are
    deleted (not replaced by a space). Besides the original ranges, the
    U+1F900-U+1F9FF and U+1FA70-U+1FAFF blocks are covered so that emoji
    from those blocks are stripped as well.

    Parameters
    ----------
    string : str
        Input text.

    Returns
    -------
    str
        The text without characters matched by the pattern.
    """
    return _EMOJI_PATTERN.sub(r'', string)

# Remove emoji from every row of the working column; the function is handed
# to apply() directly.
df["text"] = df["text"].apply(remove_emoji)
df.head()

In [None]:
#Lemmatization with PART OF SPEECH TAGGING
import nltk

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# First letter of the tag returned by nltk.pos_tag -> WordNet part of speech.
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}


def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of `text`.

    Tokens are POS-tagged with nltk.pos_tag; the first letter of each tag is
    looked up in wordnet_map, falling back to NOUN when it is not listed.
    The lemmas are re-joined with single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)


df["text_lemmatized"] = df["text"].apply(lemmatize_words)
df.head()

In [None]:
#removing frequent words

df1 = pd.DataFrame(df)

from collections import Counter

# Count word frequencies on the lemmatized column, because that is the column
# the frequent words are removed from afterwards. Counting the un-lemmatized
# 'text' column could pick surface forms (e.g. plurals) that never occur in
# the lemmatized text and so would never be removed.
cnt = Counter(
    word
    for text in df1["text_lemmatized"].values
    for word in text.split()
)

cnt.most_common(15)



In [None]:
# The 15 most common words from the counter, as a set for fast lookups.
FREQWORDS = {word for word, _count in cnt.most_common(15)}


def remove_freqwords(text):
    """Drop every whitespace-separated token of `text` found in FREQWORDS.

    The input is passed through str() first, and the remaining tokens are
    re-joined with single spaces.
    """
    kept = [token for token in str(text).split() if token not in FREQWORDS]
    return " ".join(kept)


df1["text_no_freqW"] = df1["text_lemmatized"].apply(remove_freqwords)
df1.head()

In [None]:
#import libraries for LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models  
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Turn both text columns into plain Python lists of documents:
# `topic` keeps the frequent words, `topic_nofreq` has them removed.
lemmatized_column = df["text_lemmatized"]
nofreq_column = df1["text_no_freqW"]

topic = lemmatized_column.values.tolist()
topic_nofreq = nofreq_column.values.tolist()

# Show the first document of each list as a sanity check.
print(topic[:1])
print(topic_nofreq[:1])

In [None]:
#tokenizing
def tokenize(sentences):
    """Yield one token list per sentence.

    Each sentence is converted with str() and tokenized by
    gensim.utils.simple_preprocess with deacc=True.
    """
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

topic_words = list(tokenize(topic))
topic_words_nof = list(tokenize(topic_nofreq))

# Show the first tokenized document of EACH list (the second print used to
# repeat topic_words).
print(topic_words[:1])
print(topic_words_nof[:1])

In [None]:
# Build the bigram models
bigram = gensim.models.Phrases(topic_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)


#Build the bigram models NOFREQ
bigram1 = gensim.models.Phrases(topic_words_nof, min_count=5, threshold=100)
# Wrap bigram1 (not bigram): the NOFREQ phraser must use the phrases learned
# from topic_words_nof, otherwise bigram1 is never used.
bigram_mod1 = gensim.models.phrases.Phraser(bigram1)

In [None]:
#making bigrams
def make_bigrams(texts):
    """Run every token list in `texts` through bigram_mod; return the results as a list."""
    merged = []
    for doc in texts:
        merged.append(bigram_mod[doc])
    return merged


def make_bigrams1(texts):
    """Run every token list in `texts` through bigram_mod1; return the results as a list."""
    merged = []
    for doc in texts:
        merged.append(bigram_mod1[doc])
    return merged


# Replace both tokenized corpora with their bigram-processed versions.
topic_words = make_bigrams(topic_words)
topic_words_nof = make_bigrams1(topic_words_nof)

In [None]:
# Dictionary built from the tokenized documents.
id2word = corpora.Dictionary(topic_words)

# The documents themselves, under the name used when building the corpus.
texts = topic_words

# Bag-of-words representation of every document.
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document's bag of words.
print(corpus[:1])

In [None]:
#NOFREQ
# Create Dictionary
id2word1 = corpora.Dictionary(topic_words_nof)

# Create Corpus
texts1 = topic_words_nof

# Term Document Frequency
# Built from texts1 with id2word1: lda_model1 is constructed with
# id2word=id2word1, so its corpus has to use that dictionary's token ids.
# Using id2word / texts here would just duplicate the first corpus.
corpus1 = [id2word1.doc2bow(text) for text in texts1]

# View
print(corpus1[:1])

In [None]:
# Build LDA model
# Training options are collected in one place so they are easy to compare
# with the NOFREQ model's options.
lda_settings = dict(
    num_topics=15,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=15,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    **lda_settings,
)

In [None]:
#NOFREQ
#Build LDA model
# Options for the model trained on the corpus without the frequent words.
lda_settings_nofreq = dict(
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=20,
    alpha='auto',
    per_word_topics=True,
)
lda_model1 = gensim.models.ldamodel.LdaModel(
    corpus=corpus1,
    id2word=id2word1,
    **lda_settings_nofreq,
)

In [None]:
from pprint import pprint

# Pretty-print the topics reported by the first model.
topics_summary = lda_model.print_topics()
pprint(topics_summary)

# Apply the first model to the corpus it was trained on.
doc_lda = lda_model[corpus]

In [None]:
# Pretty-print the topics reported by the NOFREQ model.
pprint(lda_model1.print_topics())
# lda_model1 is trained on corpus1 (token ids from id2word1), so it has to be
# applied to corpus1; the ids in `corpus` belong to a different dictionary.
doc_lda1 = lda_model1[corpus1]

In [None]:
# Compute Perplexity
# log_perplexity returns gensim's per-word likelihood bound (perplexity itself
# is 2 ** -bound), so a higher, i.e. less negative, value means a better fit.
perplexity_bound = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', perplexity_bound)

# Compute Coherence Score
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=topic_words,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Compute Perplexity
# Evaluated on corpus1, the corpus lda_model1 is trained on; `corpus` uses the
# token ids of a different dictionary. log_perplexity returns gensim's
# per-word likelihood bound (perplexity itself is 2 ** -bound), so a higher,
# i.e. less negative, value means a better fit.
print('\nPerplexity: ', lda_model1.log_perplexity(corpus1))

# Compute Coherence Score
coherence_model_lda1 = CoherenceModel(model=lda_model1, texts=topic_words_nof, dictionary=id2word1, coherence='c_v')
coherence_lda1 = coherence_model_lda1.get_coherence()
print('\nCoherence Score: ', coherence_lda1)

In [None]:
# Write both trained models to files in the current working directory.
for trained_model, target_path in ((lda_model, 'topic.model'), (lda_model1, 'topic.model1')):
    trained_model.save(target_path)

In [None]:
#Visualize the topics
# Switch pyLDAvis to notebook output, prepare the visualization data from the
# first LDA model together with the corpus and dictionary it was trained
# with, and render the result in the cell output.
pyLDAvis.enable_notebook(local=True)
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
pyLDAvis.display(vis)

In [None]:
#Visualize the topics NOFREQ
# Same visualization for the NOFREQ model: lda_model1 with corpus1 and
# id2word1.
pyLDAvis.enable_notebook()
vis1 = pyLDAvis.gensim_models.prepare(lda_model1, corpus1, id2word1)
# A bare last expression is shown as the cell's output.
vis1

In [None]:
#tokenizing

# One-column frame holding the lemmatized text. From here on `topic` refers
# to this frame, not to the list of documents built earlier.
topic = pd.DataFrame({'text_lemmatized': df['text_lemmatized']})

def tokenize(text):
    """Split `text` into tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    list of str
        The non-empty tokens in order; an empty list for empty input.
    """
    # Raw string: '\W' in a plain string literal is an invalid escape sequence.
    # re.split returns '' when the text is empty or starts/ends with a
    # separator; those are dropped so '' never becomes a dictionary term.
    return [token for token in re.split(r'\W+', text) if token]

# Tokenize every lemmatized document and keep the token lists in 'text'.
tokenized_docs = topic['text_lemmatized'].apply(tokenize)
topic['text'] = tokenized_docs

topic.head()

In [None]:
import gensim #the library for Topic modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim_models #LDA visualization library

# Token lists the dictionary and the matrix are both built from.
tokenized_docs = topic['text']

#create dictionary
dictionary = corpora.Dictionary(tokenized_docs)

#create document term matrix
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_docs))

In [None]:
lda = gensim.models.ldamodel.LdaModel

num_topics=10
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

In [None]:
#tokenizing

# 'text_no_freqW' is assigned on df1, not df, so it is read from df1 here;
# looking it up on df only works if the two frames happen to share columns.
# From here on `topic_nofreq` refers to this frame, not to the earlier list.
topic_nofreq = pd.DataFrame(df1['text_no_freqW'])

def tokenize(text):
    """Split `text` into tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    list of str
        The non-empty tokens in order; an empty list for empty input.
    """
    # Raw string: '\W' in a plain string literal is an invalid escape sequence.
    # re.split returns '' when the text is empty or starts/ends with a
    # separator; those are dropped so '' never becomes a dictionary term.
    return [token for token in re.split(r'\W+', text) if token]

# Tokenize every frequent-word-free document and keep the token lists in 'text'.
tokenized_docs_nofreq = topic_nofreq['text_no_freqW'].apply(tokenize)
topic_nofreq['text'] = tokenized_docs_nofreq

topic_nofreq.head()

In [None]:
import gensim #the library for Topic modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim_models #LDA visualization library

# Token lists the dictionary and the matrix are both built from.
tokenized_docs_nofreq = topic_nofreq['text']

#create dictionary
# Note: this rebinds `dictionary` and `doc_term_matrix` to the NOFREQ data.
dictionary = corpora.Dictionary(tokenized_docs_nofreq)

#create document term matrix
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_docs_nofreq))

In [None]:
lda = gensim.models.ldamodel.LdaModel

num_topics=10
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)