In [None]:
import glob
import pandas as pd

# Folders holding the tweet CSV exports, one folder per coin.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
path =r'D:/VisualStudioCode/twitter data/bitcoin'
path1 =r'D:/VisualStudioCode/twitter data/ethereum'
path2 =r'D:/VisualStudioCode/twitter data/litecoin'

# Columns kept from every CSV file.
TWEET_COLUMNS = ['tweets', 'likes', 'time']


def load_tweets(folder, columns=TWEET_COLUMNS):
    """Read every ``*.csv`` file in ``folder`` into one DataFrame.

    Parameters
    ----------
    folder : str
        Directory that directly contains the CSV files.
    columns : sequence of str
        Columns to keep from the concatenated data.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, re-indexed from 0, restricted to ``columns``.

    Raises
    ------
    FileNotFoundError
        If ``folder`` contains no CSV file (instead of the opaque
        "No objects to concatenate" error raised by ``pd.concat``).
    KeyError
        If one of ``columns`` is missing from the concatenated data.
    """
    # glob returns files in arbitrary order; sorting makes the row order
    # (and everything derived from it) reproducible across runs and machines.
    filenames = sorted(glob.glob(folder + "/*.csv"))
    if not filenames:
        raise FileNotFoundError(f"No CSV files found in {folder!r}")
    frames = [pd.read_csv(filename) for filename in filenames]
    return pd.concat(frames, ignore_index=True)[list(columns)]


# Concatenate all data into one DataFrame per coin
bitcoin_df = load_tweets(path)
ethereum_df = load_tweets(path1)
litecoin_df = load_tweets(path2)

bitcoin_df.head()


In [None]:
# Preview the first rows of the Ethereum tweets.
ethereum_df.head()

In [None]:
# Preview the first rows of the Litecoin tweets.
litecoin_df.head()

In [None]:
#dropping retweets
# A retweet is recognised by the standalone "RT @user" marker. Matching the bare
# letters "RT" anywhere also discarded original tweets that merely contain
# words such as "SMART" or "PORTFOLIO".
RETWEET_PATTERN = r'\bRT\s+@'


def drop_retweets(df):
    """Return ``df`` without retweets and without rows lacking tweet text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweets`` column holding the raw tweet text.

    Returns
    -------
    pandas.DataFrame
        Filtered copy with a fresh 0..n-1 index.
    """
    # Rows without text cannot be classified (and would break the later
    # string-based cleaning), so they are removed first.
    df = df.dropna(subset=['tweets'])
    # na=False keeps the mask strictly boolean even for non-string cells,
    # so the ``~`` negation below cannot fail.
    is_retweet = df['tweets'].str.contains(RETWEET_PATTERN, regex=True, na=False)
    return df[~is_retweet].reset_index(drop=True)


bitcoin_df = drop_retweets(bitcoin_df)

ethereum_df = drop_retweets(ethereum_df)

litecoin_df = drop_retweets(litecoin_df)

In [None]:
#initial cleaning of the tweets 
import re

def cleanTweet(Tweet):
    """Strip URLs, hashtags, @mentions, line breaks and digits from a tweet.

    Every removed element is replaced by a single space so that neighbouring
    words are never glued together; callers that split on whitespace are
    unaffected by the extra spaces.

    Parameters
    ----------
    Tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with the elements above blanked out.
    """
    # URLs go first: a URL may itself contain '@' or '#', and removing those
    # fragments beforehand would split the URL and leave its tail behind.
    Tweet = re.sub(r'https?://\S+', ' ', Tweet)
    # Hashtags and handles may contain underscores, so '_' is part of the token.
    Tweet = re.sub(r'#[A-Za-z0-9_]+', ' ', Tweet)
    Tweet = re.sub(r'@[A-Za-z0-9_]+', ' ', Tweet)
    # A line break separates words; replace it with a space, not nothing.
    Tweet = re.sub(r'\n', ' ', Tweet)
    Tweet = re.sub(r'[0-9]', ' ', Tweet)
    return Tweet

# Run the regex clean-up over the raw tweet text of every coin's frame.
for coin_frame in (bitcoin_df, ethereum_df, litecoin_df):
    coin_frame['tweets'] = coin_frame['tweets'].apply(cleanTweet)


In [None]:
# Keep only the tweet text and add a 'text' column that starts as a copy of it;
# the following preprocessing steps rewrite 'text' and leave 'tweets' as is.
bitcoin_df = bitcoin_df[['tweets']].assign(text=bitcoin_df['tweets'])

ethereum_df = ethereum_df[['tweets']].assign(text=ethereum_df['tweets'])

litecoin_df = litecoin_df[['tweets']].assign(text=litecoin_df['tweets'])


In [None]:
#lowercasing
import string

# Lower-case the working text of every coin's frame.
for coin_frame in (bitcoin_df, ethereum_df, litecoin_df):
    coin_frame['text'] = coin_frame['text'].str.lower()

In [None]:
#removing punctuation

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# Strip punctuation from the working text of every coin's frame.
for coin_frame in (bitcoin_df, ethereum_df, litecoin_df):
    coin_frame["text"] = coin_frame["text"].apply(remove_punctuation)

In [None]:
#removing stopwords

from nltk.corpus import stopwords

# English stop words from NLTK, held in a set for fast membership tests.
STOPWORDS = {*stopwords.words('english')}
def remove_stopwords(text):
    """Return ``text`` with all whitespace-separated tokens in STOPWORDS removed.

    The input is converted with ``str()`` first; the surviving tokens are
    re-joined with single spaces.
    """
    kept = []
    for token in str(text).split():
        if token not in STOPWORDS:
            kept.append(token)
    return " ".join(kept)

# Drop stop words from the working text of every coin's frame.
for coin_frame in (bitcoin_df, ethereum_df, litecoin_df):
    coin_frame["text"] = coin_frame["text"].apply(remove_stopwords)


In [None]:
#removing emojis
import re
# Built once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                           u"\U0001FA00-\U0001FAFF"  # chess symbols, symbols & pictographs extended-A
                           u"\U00002702-\U000027B0"
                           # NOTE(review): the span below is very wide and also covers
                           # non-emoji scripts (e.g. CJK ideographs at U+4E00-U+9FFF);
                           # it is kept unchanged to preserve existing output.
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Return ``string`` with all characters matched by the emoji pattern removed.

    Compared with the commonly copied version of this snippet, the pattern
    additionally covers U+1F900-U+1FAFF, so newer emoji are stripped too.

    Parameters
    ----------
    string : str
        Text to clean. (The parameter name shadows the ``string`` module
        inside this function only; it is kept for interface compatibility.)

    Returns
    -------
    str
        The text with every matched run deleted (nothing is inserted).
    """
    return _EMOJI_PATTERN.sub(r'', string)

# Strip emoji from the working text of every coin's frame.
for coin_frame in (bitcoin_df, ethereum_df, litecoin_df):
    coin_frame["text"] = coin_frame["text"].apply(remove_emoji)

In [None]:
#Lemmatization with PART OF SPEECH TAGGING
import nltk

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# First letter of the tag returned by nltk.pos_tag -> WordNet part of speech.
wordnet_map = dict(N=wordnet.NOUN, V=wordnet.VERB, J=wordnet.ADJ, R=wordnet.ADV)
def lemmatize_words(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Tokens are POS-tagged with ``nltk.pos_tag``; the first letter of each tag
    selects the WordNet part of speech, falling back to noun for tags that
    are not in ``wordnet_map``. The lemmas are re-joined with single spaces.
    """
    lemmas = []
    for word, pos in nltk.pos_tag(text.split()):
        wordnet_pos = wordnet_map.get(pos[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return " ".join(lemmas)

# Lemmatize the working text of every coin's frame.
for coin_frame in (bitcoin_df, ethereum_df, litecoin_df):
    coin_frame["text"] = coin_frame["text"].apply(lemmatize_words)

In [None]:
# Frequent words: each coin's own name and ticker, removed from that coin's
# tweets by the next step.
btc, eth, ltc = (
    ['bitcoin', 'btc'],
    ['ethereum', 'eth'],
    ['litecoin', 'ltc'],
)

In [None]:
def remove_freqwords(text, freqwords):
    """Return ``text`` without the whitespace-separated tokens found in ``freqwords``.

    ``text`` is converted with ``str()`` first; surviving tokens are re-joined
    with single spaces.
    """
    surviving = (token for token in str(text).split() if not token in freqwords)
    return " ".join(surviving)

# Remove each coin's own name/ticker from that coin's working text.
for coin_frame, coin_words in ((bitcoin_df, btc), (ethereum_df, eth), (litecoin_df, ltc)):
    coin_frame["text"] = coin_frame["text"].apply(remove_freqwords, args=(coin_words,))


In [None]:
#import libraries for LDA
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models  
import matplotlib.pyplot as plt
%matplotlib inline

from pprint import pprint

In [None]:
# Pull the preprocessed text of each coin out of its frame as a plain list.
topic_btc = bitcoin_df["text"].tolist()

topic_eth = ethereum_df["text"].tolist()

topic_ltc = litecoin_df["text"].tolist()

In [None]:
def tokenize(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is converted with ``str()`` and passed through
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )

# Materialise the token lists for each coin.
topic_words_btc, topic_words_eth, topic_words_ltc = (
    list(tokenize(documents))
    for documents in (topic_btc, topic_eth, topic_ltc)
)

In [None]:
# Dictionary: one gensim vocabulary per coin, built from that coin's tokens.
id2word_btc, id2word_eth, id2word_ltc = (
    corpora.Dictionary(token_lists)
    for token_lists in (topic_words_btc, topic_words_eth, topic_words_ltc)
)

# Corpus: the tokenised tweets, under the names used below.
texts_btc, texts_eth, texts_ltc = topic_words_btc, topic_words_eth, topic_words_ltc

# Term Document Frequency: each tweet converted with its own coin's dictionary.
corpus_btc = list(map(id2word_btc.doc2bow, texts_btc))
corpus_eth = list(map(id2word_eth.doc2bow, texts_eth))
corpus_ltc = list(map(id2word_ltc.doc2bow, texts_ltc))

In [None]:
# Fit a 6-topic LDA model on the Bitcoin corpus (fixed random_state).
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus_btc,
    id2word=id2word_btc,
    num_topics=6,
    passes=20,
    chunksize=100,
    update_every=1,
    alpha='auto',
    per_word_topics=True,
    random_state=100,
)

# Show the fitted topics, then apply the model to the corpus it was fitted on.
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus_btc]

In [None]:
from textblob import TextBlob

def getPolarity(Tweet):
    """Return the TextBlob sentiment polarity of ``Tweet``."""
    blob = TextBlob(Tweet)
    return blob.sentiment.polarity

def getSentiment(score):
    """Map a polarity score to a label.

    Returns 'Negative' for scores below zero, 'Neutral' for exactly zero and
    'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'


In [None]:
# topics and their sentiment btc
# Keyword strings for three Bitcoin topics; each one is scored as a whole.
user_encouragment = 'Project go buy make crypto good great best future'
finances = 'Get wallet money account world need currency'
actions = 'Use value trading trade asset stock profit mining earn'

# Print one sentiment label per topic, in declaration order.
for topic_keywords in (user_encouragment, finances, actions):
    print(getSentiment(getPolarity(topic_keywords)))


In [None]:
# Fit a 6-topic LDA model on the Ethereum corpus (fixed random_state).
lda_model1 = gensim.models.ldamodel.LdaModel(
    corpus=corpus_eth,
    id2word=id2word_eth,
    num_topics=6,
    passes=20,
    chunksize=100,
    update_every=1,
    alpha='auto',
    per_word_topics=True,
    random_state=100,
)

# Show the fitted topics, then apply the model to the corpus it was fitted on.
pprint(lda_model1.print_topics())
doc_lda1 = lda_model1[corpus_eth]

In [None]:
# topics and their sentiment eth
# Keyword strings for three Ethereum topics; each one is scored as a whole.
actions1 = 'Purchase buy get token market use'
disruption = 'Project collectible game vision future best people time hope'
real_world_application = 'Gas price transaction asset take change become'

# Print one sentiment label per topic, in declaration order.
for topic_keywords in (actions1, disruption, real_world_application):
    print(getSentiment(getPolarity(topic_keywords)))

In [None]:
# Fit a 6-topic LDA model on the Litecoin corpus (fixed random_state).
lda_model2 = gensim.models.ldamodel.LdaModel(corpus=corpus_ltc,
                                           id2word=id2word_ltc,
                                           num_topics=6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

pprint(lda_model2.print_topics())
# Apply the Litecoin model to the Litecoin corpus. The previous code passed
# corpus_btc here, whose token ids come from the Bitcoin dictionary and do not
# correspond to this model's vocabulary.
doc_lda2 = lda_model2[corpus_ltc]

In [None]:
# topics and their sentiment ltc
# Keyword strings for three Litecoin topics; each one is scored as a whole.
actions2 = 'Get go buy coin market token earn trade money'
success = 'Price expect peak transaction good want big'
research = 'analysis daily news say follow keep tech'

# Print one sentiment label per topic, in declaration order.
for topic_keywords in (actions2, success, research):
    print(getSentiment(getPolarity(topic_keywords)))

In [None]:
# Visualize the topics of the Bitcoin model with pyLDAvis.
pyLDAvis.enable_notebook(local=True)
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus_btc,
    dictionary=id2word_btc,
)
pyLDAvis.display(vis)

In [None]:
# Visualize the topics of the Ethereum model with pyLDAvis.
pyLDAvis.enable_notebook(local=True)
vis1 = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model1,
    corpus=corpus_eth,
    dictionary=id2word_eth,
)
pyLDAvis.display(vis1)

In [None]:
# Visualize the topics of the Litecoin model with pyLDAvis.
pyLDAvis.enable_notebook(local=True)
vis2 = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model2,
    corpus=corpus_ltc,
    dictionary=id2word_ltc,
)
pyLDAvis.display(vis2)