# Initial Setup

### Import Modules

In [None]:
import pandas as pd
import numpy as np
import datetime
import pickle as pkl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import defaultdict
from collections import OrderedDict
import re
from guess_language import guess_language
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
#import nltk
set(stopwords.words('english'))

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from gensim import corpora, models, similarities, matutils
from nltk.stem.porter import *
import corex as ct
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


### Reduce Memory Usage

In [None]:
def _smallest_int_dtype(lo, hi):
    """Return the narrowest NumPy integer dtype whose range contains [lo, hi]."""
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a column whose max is exactly 255 still fits uint8.
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return candidates[-1]


def reduce_mem_usage(props):
    """
    Downcast the numeric columns of a DataFrame to smaller dtypes.

    Non-numeric columns are skipped. In a numeric column, missing values are
    replaced by ``min - 1`` of that column so that an integer dtype can be
    used. A column whose values are all whole numbers is cast to the narrowest
    signed/unsigned integer dtype covering its range; any other numeric column
    is cast to ``float32``. Columns without a single non-missing value are
    left unchanged.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, after conversion.

    Progress, the memory footprint before/after and the columns that had
    missing values filled in are printed.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        if not pd.api.types.is_numeric_dtype(props[col].dtype):  # Exclude strings, dates, ...
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",props[col].dtype)

        mx = props[col].max()
        mn = props[col].min()

        if pd.isna(mn):
            # Empty or all-missing column: there is no range to base a dtype on.
            print("dtype after: ",props[col].dtype)
            print("******************************")
            continue

        # Integer does not support NA, therefore, NA needs to be filled
        if props[col].isna().any():
            NAlist.append(col)
            # The sentinel becomes the new minimum; track it so the dtype
            # chosen below can actually hold it.
            mn = mn - 1
            props[col] = props[col].fillna(mn)

        values = props[col]

        # test if column can be converted to an integer
        if (pd.api.types.is_integer_dtype(values.dtype)
                or pd.api.types.is_bool_dtype(values.dtype)):
            IsInt = True
        elif not np.isfinite(values).all() or mn < -2.0**63 or mx >= 2.0**63:
            # inf or values outside the int64 range cannot be stored as integers
            IsInt = False
        else:
            # Compare element-wise: summing the differences would let
            # fractional parts of opposite sign cancel each other out.
            IsInt = bool((values == values.astype(np.int64)).all())

        if IsInt:
            # Make Integer/unsigned Integer datatypes
            props[col] = values.astype(_smallest_int_dtype(int(mn), int(mx)))
        else:
            # Make float datatypes 32 bit
            props[col] = values.astype(np.float32)

        # Print new column type
        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    if NAlist:
        print("Columns with missing values filled in: ",NAlist)
    return props

### Load Data

In [None]:
tweets = pd.read_csv('~/Desktop/Projects/Project_4/Data/tweets.csv')

In [None]:
tweets = reduce_mem_usage(tweets)

In [None]:
tweet = tweets[['tweet-id', 'timestamp']]
del tweets

# Preprocessing Functions

In [None]:
#Creating a timestamp for each tweet in order to be able to make an hourly count
def timestamp(tweet):
    """
    Map every distinct raw timestamp string to the start of its hour.

    Parameters
    ----------
    tweet : pandas.DataFrame
        Frame with a ``timestamp`` column of strings formatted as
        ``'%Y-%m-%d %H:%M:%S'``. Missing entries are ignored.

    Returns
    -------
    dict
        ``{raw timestamp string: pandas.Timestamp truncated to the hour}``.
        A plain dict is returned so that ``Series.map`` yields a missing
        value (instead of an empty string) for keys that are not present.
    """
    hour_by_raw = {}
    # Parse each distinct value once; missing values cannot be parsed.
    for raw in tweet.timestamp.dropna().unique():
        parsed = datetime.datetime.strptime(raw, '%Y-%m-%d %H:%M:%S')
        hour_by_raw[raw] = pd.Timestamp(year=parsed.year, month=parsed.month,
                                        day=parsed.day, hour=parsed.hour)
    return hour_by_raw

In [None]:
#Count the tweets happening at certain hours
def hourlytweets(df):
    """
    Add an ``activity`` column holding the tweet count of each row's hour.

    The count is the number of non-missing ``tweet-id`` values that share the
    row's ``hours`` value. ``df`` is modified in place and returned.
    """
    tweets_per_hour = df.groupby('hours')['tweet-id'].count().to_dict()
    df['activity'] = df['hours'].map(tweets_per_hour)
    return df

In [None]:
def remove_pattern(txt, pattern):
    """
    Return ``txt`` with every match of the regular expression ``pattern`` removed.

    The removal is done by the regex engine itself, so only the matched
    spans are deleted; text that merely equals an earlier, shorter match
    (e.g. the ``@ab`` inside ``@abc``) is left to its own match.
    """
    return re.sub(pattern, '', txt)

In [None]:
def gethashtags(txt):
    """
    Collect the hashtags of each text.

    Parameters
    ----------
    txt : iterable of str
        The texts to scan.

    Returns
    -------
    list of list of str
        One list per input text with its matches of ``#`` followed by zero or
        more word characters, in order of appearance (``#`` included).
    """
    # Raw string: "\w" is not a valid escape in a normal string literal.
    return [re.findall(r"#[\w]*", tweet) for tweet in txt]

In [None]:
#Delete duplicates
def deleteretweets(df):
    """
    Return a copy of ``df`` without incomplete rows, duplicate texts and retweets.

    Rows containing any missing value are dropped, then only the first row of
    each distinct ``text`` is kept, then rows whose ``text`` contains the
    stand-alone token ``RT`` are removed. The input frame is not modified.
    """
    df = df.dropna()
    df = df.drop_duplicates(subset = 'text', inplace = False)
    # Word boundaries keep words that merely contain "RT" (e.g. "SMART").
    is_retweet = df.text.str.contains(r'\bRT\b', regex=True)
    return df[~is_retweet]

# Calling Preprocessing Functions

### Adding Timestamp

In [None]:
#Calling the function to create timestamps
d_timestamp= timestamp(tweet)
del tweet
# Context manager so the pickle file is flushed and closed deterministically
with open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/timestamps.pkl', 'wb') as f:
    pkl.dump(d_timestamp, f)

In [None]:
#Add timestamps to existing DataFrame
tweets = pd.read_csv('~/Desktop/Projects/Project_4/Data/tweets.csv')
tweets['hours'] = tweets['timestamp'].map(d_timestamp)

### Computing Hourly Activity

In [None]:
#Compute hourly tweets
df = hourlytweets(tweets)
# Context manager so the pickle file is flushed and closed deterministically
with open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweets_timestamps.pkl','wb') as f:
    pkl.dump(df, f)

### Deleting Retweets marked with RT

In [None]:
#df_sin = deleteretweets(df)
#pkl.dump(df_sin, open('tweets_no_retweets.pkl', 'wb'))

### Cleaning Tweets

In [None]:
df = pkl.load(open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweets_timestamps.pkl', 'rb'))

In [None]:
df['text'] = df.text.astype(str)
#remove twitter handles (@user)
df['clean_tweet']= np.vectorize(remove_pattern)(df['text'], r"@[\w]*")
print('Removed all @...')

#Convert string to lowercase letter
df['clean_tweet'] = df.clean_tweet.apply(lambda x: x.lower())
print('Converted string to lowercase')

#Get Hashtags from tweets
df['hashtag'] = gethashtags(df['clean_tweet'])
#Eliminate list brackets from hashtags
df['hashtag_text'] = df['hashtag'].apply(lambda x: ' '.join(map(str, x)))
df['hashtag_text'] = df['hashtag_text'].str.replace("#", "", regex=False)
# remove special characters, numbers, punctuations.
# regex=True is explicit: without it recent pandas versions treat the
# character class as literal text and replace nothing.
df['hashtag_text'] = df['hashtag_text'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

#Remove all hashtags from tweets
df['clean_tweet']= np.vectorize(remove_pattern)(df['clean_tweet'], r"#[\w]*")
print('Removed all #...')

#Remove URLs
df['clean_tweet']= np.vectorize(remove_pattern)(df['clean_tweet'], r"http\S+")

# remove special characters, numbers, punctuations.
df['clean_tweet'] = df['clean_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

# Create a column with hashtags and tweet combined.
# The separating space keeps the last tweet word and the first hashtag
# from being fused into a single token.
df["whole_text"] = df["clean_tweet"].map(str) + " " + df["hashtag_text"]
print('Removed all special characters')

In [None]:
pkl.dump(df, open('cleaned_tweets.pkl', 'wb'))

In [None]:
df_sin = df.drop_duplicates(subset = 'clean_tweet')

In [None]:
pkl.dump(df_sin, open('cleaned_tweets_no_duplicates.pkl', 'wb'))

In [None]:
#Remove all non-english tweets
is_english = df.clean_tweet.apply(lambda x: guess_language(x) == 'en')
# Keep the cleaned text for English tweets, missing value for all others
df['clean_tweet_english'] = df.clean_tweet.where(is_english)
# Drop only on the language column so rows are not lost because some
# unrelated column happens to contain a missing value.
df_english = df.dropna(subset = ['clean_tweet_english'], inplace = False)
with open('clean_tweets_english.pkl', 'wb') as f:
    pkl.dump(df_english, f)

In [None]:
#df_english = pkl.load(open('clean_tweets_english.pkl', 'rb'))

In [None]:
df = pkl.load(open('cleaned_tweets_no_duplicates.pkl', 'rb'))

The tweet dates range from 02/26/2016 to 03/28/2019.

### Stemming

In [None]:
def stem(df, column):
    """
    Return ``df[column]`` with every whitespace-separated word Porter-stemmed.

    Each text is split on whitespace, every token is stemmed and the stems
    are joined back with single spaces.
    """
    porter = PorterStemmer()

    def _stem_text(text):
        # tokenize -> stem -> re-join, one text at a time
        return ' '.join([porter.stem(word) for word in text.split()])

    return df[column].apply(_stem_text)

In [None]:
df_english['tweet_stem'] = stem(df_english, 'whole_text')

In [None]:
df_english.head()

In [None]:
pkl.dump(df_english, open('tweets_final.pkl', 'wb'))

### Train / Test Data Set

In [None]:
# Split the data into training and test sets
#X_train, X_test = train_test_split(df_english, test_size=0.25, random_state=98)

In [None]:
#pkl.dump(X_train, open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweets_train.pkl', 'wb'))
#pkl.dump(X_test, open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweets_test.pkl', 'wb'))

# Vectorization

### Count Vectorizer

In [None]:
def vectorization(train):
    """
    Build two document-term matrices from the texts in ``train``.

    Returns a tuple ``(counts, presence)``: unigram counts, and binary
    unigram+bigram indicators. Both vectorizers drop English stop words and
    terms with a document frequency above 0.7 or below 0.05.
    """
    shared_options = {'stop_words': 'english', 'max_df': 0.7, 'min_df': 0.05}

    # Plain unigram counts
    unigram_counts = CountVectorizer(**shared_options).fit_transform(train)

    # Document term matrix including bigrams (presence/absence only)
    bigram_vectorizer = CountVectorizer(ngram_range=(1,2), binary=True, **shared_options)
    bigram_presence = bigram_vectorizer.fit_transform(train)

    return unigram_counts, bigram_presence

In [None]:
df = pkl.load(open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweets_final.pkl', 'rb'))

In [None]:
X_train_cv, X_train_cv2 = vectorization(df['whole_text'])

In [None]:
# Context managers so both pickle files are flushed and closed deterministically
with open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/cv.pkl', 'wb') as f:
    pkl.dump(X_train_cv, f)
with open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/cv_binary.pkl', 'wb') as f:
    pkl.dump(X_train_cv2, f)

### TF-IDF 

In [None]:
# Create TF-IDF versions of the Count Vectorizers created earlier in the exercise
def tfidf(train):
    """
    Build two TF-IDF matrices from the texts in ``train``.

    Returns a tuple ``(unigram, bigram)``: a unigram TF-IDF matrix, and a
    unigram+bigram TF-IDF matrix computed with binary term presence. Both
    vectorizers drop English stop words and terms with a document frequency
    above 0.7 or below 0.05.
    """
    shared_options = {'stop_words': 'english', 'max_df': 0.7, 'min_df': 0.05}

    unigram_matrix = TfidfVectorizer(**shared_options).fit_transform(train)

    bigram_vectorizer = TfidfVectorizer(ngram_range=(1,2), binary=True, **shared_options)
    bigram_matrix = bigram_vectorizer.fit_transform(train)

    return unigram_matrix, bigram_matrix

In [None]:
#df = pkl.load(open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweets_train.pkl', 'rb'))
#df_test = pkl.load(open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweets_test.pkl', 'rb'))

In [None]:
X_train_tfidf1, X_train_tfidf2 = tfidf(df['whole_text'])

In [None]:
with open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/train_tfidf.pkl', 'wb') as f:
    pkl.dump(X_train_tfidf1, f)
# The second file must receive the bigram matrix, not a second copy of the first
with open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/train_tfidf2.pkl', 'wb') as f:
    pkl.dump(X_train_tfidf2, f)

In [None]:
df.head()

# Topic Modeling

In [None]:
stopwords1 = stopwords.words('english')

In [None]:
stopwords1.append('just')
stopwords1.append('like')
stopwords1.append('com')
#stopwords1.append('crypto')
#stopwords1.append('cryptocurrency')
#stopwords1.append('blockchain')

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """
    Print the highest-weighted terms of every topic in ``model.components_``.

    For each row of ``model.components_`` a header is printed (the entry of
    ``topic_names`` when one is given and truthy, the row index otherwise),
    followed by the ``no_top_words`` terms of ``feature_names`` with the
    largest weights, in descending order of weight.
    """
    for topic_idx, weights in enumerate(model.components_):
        has_name = bool(topic_names) and bool(topic_names[topic_idx])
        if has_name:
            print("\nTopic: '",topic_names[topic_idx],"'")
        else:
            print("\nTopic ", topic_idx)

        # indices sorted by descending weight, truncated to the requested count
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[idx] for idx in top_indices]
        print(", ".join(top_terms))

### Latent Semantic Analysis (LSA)

In [None]:
def lsa(text):
    """
    Fit a 3-component truncated SVD (LSA) on word counts of ``text``.

    The texts are vectorized with a ``CountVectorizer`` (custom stop words,
    document frequency between 0.01 and 0.12), the ten strongest terms of
    each component are printed, and the document-topic matrix returned by
    ``fit_transform`` is returned.
    """
    #Applying Vectorization
    cv = CountVectorizer(stop_words = stopwords1, max_df = 0.12, min_df = 0.01)
    X_cv = cv.fit_transform(text)

    svd = TruncatedSVD(3)
    doc_topic = svd.fit_transform(X_cv)

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # use its replacement when available and fall back to the old name.
    get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names

    #Show most common words in topics
    display_topics(svd, get_names(), 10)

    return doc_topic

In [None]:
doc_topic = lsa(df['whole_text'])
#doc_topic_english = lsa(df_english['whole_text'])

In [None]:
pd.Series(doc_topic.argmax(axis = 1)).value_counts()

### Non-Negative Matrix Factorization (NMF)

In [None]:
def nmf(text):
    """
    Fit a 3-component NMF topic model on word counts of ``text``.

    The texts are vectorized with a ``CountVectorizer`` (custom stop words,
    terms in at least 10 documents and at most 12% of them), the ten
    strongest terms of each component are printed, and the document-topic
    matrix returned by ``fit_transform`` is returned.
    """
    #Applying Vectorization
    cv = CountVectorizer(stop_words = stopwords1, max_df = 0.12, min_df = 10)
    X_cv = cv.fit_transform(text)

    nmf_model = NMF(3)
    doc_topic = nmf_model.fit_transform(X_cv)

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # use its replacement when available and fall back to the old name.
    get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names

    #Show most common words in topics
    display_topics(nmf_model, get_names(), 10)

    return doc_topic

In [None]:
doc_topic = nmf(df['whole_text'])
#doc_topic_english = nmf(df_english['whole_text'])

In [None]:
doc_topic[:,0]

In [None]:
pd.Series(doc_topic.argmax(axis = 1)).value_counts()

In [None]:
df.reset_index(inplace = True)

In [None]:
# Index of the strongest topic per document, translated to its label
labels = ['Blockchain News', 'Cryptocurrency', 'Market']
strongest_topic = pd.Series(doc_topic.argmax(axis = 1))
df['Topic'] = strongest_topic.map(dict(enumerate(labels)))
    

In [None]:
df['Blockchain_News_proba'] = doc_topic[:,0]
df['Cryptocurrency_proba'] = doc_topic[:,1]
df['Market_News_proba'] = doc_topic[:,2]

In [None]:
# Grouped bar chart: one group per hour, one bar per topic, height = tweet count
fig, ax = plt.subplots(figsize=(15,7))
# use unstack()
df.groupby(['hours','Topic'])['Topic'].count().unstack().plot(kind = 'bar',ax=ax)
plt.xlabel('Date')
plt.ylabel('Tweet Count')
# NOTE(review): this hides every 200000th tick label (starting with the first)
# and leaves all others visible; if the goal was to thin out crowded labels
# the selection looks inverted — confirm the intent.
for label in ax.xaxis.get_ticklabels()[::200000]:
    label.set_visible(False)
plt.show()
#plt.xticks(ticks = [0,300000,600000,900000] , labels = ['2016', '2017', '2018', '2019'])

In [None]:
# 'Topic' holds text labels at this point, so comparing it with the numeric
# id 1.0 never matches; select the label that stands for topic 1 instead.
crypto_news = df['text'][df.Topic == 'Cryptocurrency']

In [None]:
doc_topic_news = nmf(crypto_news)

In [None]:
# Show full column contents instead of truncating long strings.
# NOTE(review): -1 as "no limit" is deprecated in newer pandas releases in
# favour of None — confirm against the pandas version this notebook targets.
pd.set_option('display.max_colwidth', -1)

### Latent Dirichlet Allocation

In [None]:
def lda(text):
    """
    Fit a 3-topic gensim LDA model on ``text`` and return per-document topics.

    The texts are turned into unigram+bigram counts (custom stop words, tokens
    of at least two lowercase letters), converted to a gensim corpus and used
    to train ``LdaModel`` with 5 passes. The result is a list with one
    topic vector per document, as produced by indexing the model with the
    corpus.
    """
    # Count words and word pairs
    vectorizer = CountVectorizer(ngram_range=(1, 2),
                                 stop_words=stopwords1, token_pattern="\\b[a-z][a-z]+\\b")
    # gensim expects terms as rows and documents as columns
    term_doc_counts = vectorizer.fit_transform(text).transpose()
    gensim_corpus = matutils.Sparse2Corpus(term_doc_counts)

    # Column index -> term, the inverse of the vectorizer's vocabulary
    id2word = {index: term for term, index in vectorizer.vocabulary_.items()}

    topic_model = models.LdaModel(corpus=gensim_corpus, num_topics=3, id2word=id2word, passes=5)
    topic_model.print_topics()

    # Materialize the lazily transformed corpus so it can be inspected
    return list(topic_model[gensim_corpus])

In [None]:
#doc_topic = lda(df['whole_text'])
doc_topic_english = lda(df_english['whole_text'])

In [None]:
def ldask(text):
    """
    Fit a 4-topic scikit-learn LDA model on unigram+bigram counts of ``text``.

    Returns a tuple ``(doc_topic, cv, lda_model)``: the document-topic matrix
    from ``fit_transform``, the fitted ``CountVectorizer`` and the fitted
    ``LatentDirichletAllocation`` model (seeded with ``random_state=454``).
    """
    cv = CountVectorizer(ngram_range=(1, 2),
                         stop_words=stopwords1, token_pattern="\\b[a-z][a-z]+\\b")
    lda_model = LatentDirichletAllocation(n_components=4,random_state=454)

    word_counts = cv.fit_transform(text)
    doc_topic = lda_model.fit_transform(word_counts)

    return doc_topic, cv, lda_model

In [None]:
#doc_topic = ldask(df['whole_text'])
doc_topic_english, cv, lda_model = ldask(df_english['whole_text'])

In [None]:
#Show most common words in topics
display_topics(lda_model, cv.get_feature_names(), 10)

In [None]:
vectorizer = CountVectorizer(stop_words='english')
doc_word = vectorizer.fit_transform(df_english['whole_text'])
doc_word.shape

### Corex

In [None]:
vectorizer = CountVectorizer(stop_words=stopwords1, token_pattern="\\b[a-z][a-z]+\\b",
                             binary=True, max_df = 0.15, min_df = 0.01)

doc_word = vectorizer.fit_transform(df['whole_text'])
words = list(np.asarray(vectorizer.get_feature_names()))

In [None]:
topic_model = ct.Corex(n_hidden=3, words=words, seed=10)
topic_model.fit(doc_word, words=words, docs=df.whole_text, 
                anchors=[['news', 'cryptonews'], 
                         ['eth'], 
                         ['price', 'market']],anchor_strength=0.6)

In [None]:
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_ = zip(*topic)
    print('{}: '.format(n) + ','.join(topic_words))

In [None]:
#Top documents for each model
topic_model.get_top_docs(topic=0, n_docs=2)

In [None]:
predictions = pd.DataFrame(topic_model.predict(doc_word), columns=['topic'+str(i) for i in range(3)])
predictions.head()

In [None]:
predictions.topic0.value_counts()

In [None]:
x = topic_model.predict_proba(doc_word)

# Sentiment Analysis

In [None]:
!pip install vaderSentiment

In [None]:
def sentiment_scores(sentence):
    """
    Return the VADER compound sentiment score of ``sentence``.

    ``polarity_scores`` yields a dictionary with ``pos``, ``neg``, ``neu``
    and ``compound`` entries; only the ``compound`` value is returned.

    The analyzer is created on the first call and reused afterwards, so
    applying this function to many texts does not rebuild it for each one.
    """
    analyzer = getattr(sentiment_scores, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentiment_scores._analyzer = analyzer

    sentiment_dict = analyzer.polarity_scores(sentence)
    return sentiment_dict['compound']

In [None]:
def compoundconversion(value):
    """
    Translate a compound sentiment value into a label.

    Values of 0.05 and above are "Positive", values of -0.05 and below are
    "Negative", everything else is "Neutral".
    """
    if value <= - 0.05:
        return "Negative"
    if value >= 0.05:
        return "Positive"
    return "Neutral"

In [None]:
def topicactivity(df):
    """
    Attach per-hour and per-day tweet counts for each topic to every row.

    Four columns are added, in this order, each by counting ``tweet-id``
    within a group and left-merging the counts back onto the rows:

    * ``topic_activity_hour`` - same ``hours`` and ``Topic``
    * ``sentiment_topic_count_hour`` - same ``hours``, ``Topic`` and ``sentiment``
    * ``topic_activity_day`` - same ``Day`` and ``Topic``
    * ``sentiment_topic_count_day`` - same ``Day``, ``Topic`` and ``sentiment``

    A new frame is returned.
    """
    count_specs = (
        (['hours', 'Topic'], 'topic_activity_hour'),
        (['hours', 'Topic', 'sentiment'], 'sentiment_topic_count_hour'),
        (['Day', 'Topic'], 'topic_activity_day'),
        (['Day', 'Topic', 'sentiment'], 'sentiment_topic_count_day'),
    )
    for group_keys, count_name in count_specs:
        counts = df.groupby(group_keys)['tweet-id'].count().reset_index()
        counts = counts.rename(columns={'tweet-id': count_name})
        df = df.merge(counts, on = group_keys, how = 'left')
    return df

In [None]:
# sentiment_scores rates one text at a time, so apply it row by row
# instead of handing it the whole column.
compound = df['clean_tweet'].apply(sentiment_scores)

In [None]:
df.sentiment.value_counts()

In [None]:
y = np.vectorize(sentiment_scores)(df['whole_text'])
# Keep the scores on the frame: the label conversion further down reads
# df['sentiment_compound'], and the frame is pickled right after this cell.
df['sentiment_compound'] = y

In [None]:
pkl.dump(df, open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweet_sentiment.pkl', 'wb'))

In [None]:
df = pkl.load(open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweet_sentiment.pkl', 'rb'))

In [None]:
df.columns

In [None]:
df.hours.sort_values(ascending = True)

In [None]:
#df['sentiment'] = 
df['sentiment'] = np.vectorize(compoundconversion)(df['sentiment_compound'])
#s = compoundconversion(df.iloc[0:1000], 'sentiment_compound')

In [None]:
months = df["hours"].dt.month
years = df["hours"].dt.year
# First day of the month / midnight of the day each tweet hour falls in.
# Computed with the datetime accessors so that missing hours simply stay
# missing (concatenating year and month as strings breaks once the
# components turn into floats) and no per-row Python call is needed.
df['Month'] = df['hours'].dt.to_period('M').dt.to_timestamp()
df['Day'] = df['hours'].dt.normalize()

In [None]:
df = topicactivity(df)

In [None]:
pkl.dump(df, open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweets.pkl', 'wb'))

In [None]:
df = pkl.load(open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweets.pkl', 'rb'))

In [None]:
df.columns

In [None]:
def addsentiment(df, column):
    """
    Add per-``column`` tweet counts of each sentiment label to every row.

    ``tweet-id`` is counted per (``column``, ``sentiment``) pair; the counts
    of the 'Negative', 'Neutral' and 'Positive' labels are then left-merged
    onto the rows as ``Negative_this_<column>``, ``Neutral_this_<column>``
    and ``Positive_this_<column>`` (in that order). Groups without tweets of
    a given label get a missing value. A new frame is returned.
    """
    counts = df.groupby([column, 'sentiment'])['tweet-id'].count().reset_index()
    merged = df
    for label in ('Negative', 'Neutral', 'Positive'):
        count_name = label + '_this_' + column
        label_counts = counts[counts.sentiment == label].rename(columns={'tweet-id': count_name})
        merged = merged.merge(label_counts[[column, count_name]], on = column, how = 'left')
    return merged

In [None]:
df = addsentiment(df, 'Day')
df = addsentiment(df, 'Month')
df = addsentiment(df, 'hours')

In [None]:
df.columns

In [None]:
def posoverneg(df):
    """
    Add positive-to-negative tweet count ratios per day, month and hour.

    The columns ``pos_neg_ratio_day``, ``pos_neg_ratio_month`` and
    ``pos_neg_ratio_hour`` are written onto ``df`` (in place) from the
    matching ``Positive_this_*`` / ``Negative_this_*`` columns; ``df`` is
    returned.
    """
    # ratio column suffix -> suffix of the count columns it is derived from
    periods = (('day', 'Day'), ('month', 'Month'), ('hour', 'hours'))
    for ratio_suffix, count_suffix in periods:
        positives = df['Positive_this_' + count_suffix]
        negatives = df['Negative_this_' + count_suffix]
        df['pos_neg_ratio_' + ratio_suffix] = positives / negatives
    return df

In [None]:
df = posoverneg(df)

In [None]:
pkl.dump(df, open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweets.pkl', 'wb'))

In [None]:
df = pkl.load(open('/Users/ferdinandwohlenberg/Desktop/Projects/Project_4/Data/tweets.pkl', 'rb'))