In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tweepy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os
import nltk
from collections import Counter
import time
import sys
import pandas as pd
import re

# Twitter API credentials come from the environment; a value is None when its
# variable is not set.
consumer_key = os.environ.get("TWITTER_PUBLIC_API")
consumer_secret = os.environ.get("TWITTER_SECRET_KEY")

# Lexicon used by the VADER SentimentIntensityAnalyzer imported above.
# NOTE(review): tokenizer() also uses word_tokenize, stopwords and
# WordNetLemmatizer, whose NLTK data is not downloaded here - confirm it is
# already installed on the machine running this notebook.
nltk.download('vader_lexicon')

# Extra tokens that tokenizer() drops on top of NLTK's English stop words.
addl_stopwords = [
    ',', '`', '', 'rt', 'http', 'https', 'RT', 'BTC', 'bitcoin', 'ETH',
    'LTC', 'XRP', 'co', 'crypto', 'blockchain', 'cryptocurrency', 'cripto',
    'litecoin',
]

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\cscat\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
def tokenizer(text):
    """Tokenize, normalize and filter a piece of text.

    The text is split with NLTK's ``word_tokenize``; each token is lower-cased,
    stripped of every character that is not an ASCII letter or a space,
    lemmatized with ``WordNetLemmatizer`` and finally dropped if it is an
    English stop word or appears in the module-level ``addl_stopwords``.

    Args:
        text: str; the raw text to tokenize.

    Returns:
        list of str: the surviving tokens, in their original order.
    """
    non_letters = re.compile("[^a-zA-Z ]")
    lemmatizer = WordNetLemmatizer()
    # Tokens are lower-cased before filtering, so the stop words must be
    # lower-cased as well; otherwise entries such as 'BTC' or 'ETH' in
    # addl_stopwords can never match anything.
    sw = {word.lower() for word in stopwords.words('english') + addl_stopwords}

    cleaned = (non_letters.sub('', token.lower()) for token in word_tokenize(text))
    lemmas = (lemmatizer.lemmatize(token) for token in cleaned)
    # '' is part of addl_stopwords, so tokens emptied by the regex are removed here.
    return [token for token in lemmas if token not in sw]

def token_count(tokens, N=10):
    """Return the N most frequent tokens as (token, count) pairs, most common first."""
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies.most_common(N)


# Functions for Twitter


def get_tweets_list(topic_of_tweet, num_of_tweets):
    '''
    Returns a dataframe of up to 'N' tweets matching a Twitter search, tokenized and counted.

    Arguments: `topic_of_tweet` : str; the search query (e.g. a hashtag) sent to Twitter
               `num_of_tweets` : int; maximum number of tweets to fetch

    Returns: DataFrame indexed by each tweet's `created_at` value with the columns
             'Tweet' (full text), 'Tokens' (output of `tokenizer` for that text) and
             'Word_Count' (output of `token_count` for those tokens).
    '''
    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    api = tweepy.API(auth)
    cursor = tweepy.Cursor(api.search, q=topic_of_tweet, tweet_mode='extended')

    # Named so that they do not shadow the imported `time` module.
    tweet_texts, created_times = [], []
    for tweet in cursor.items(num_of_tweets):
        tweet_texts.append(tweet.full_text)
        created_times.append(tweet.created_at)

    tweets_df = pd.DataFrame({'Tweet': tweet_texts}, index=created_times)
    tweets_df['Tokens'] = [tokenizer(tweet_text) for tweet_text in tweets_df.Tweet]
    tweets_df['Word_Count'] = [token_count(tokens) for tokens in tweets_df.Tokens]

    return tweets_df

def twitter_sent_analysis(tweet_df):
    """Attach VADER sentiment scores to a DataFrame of tweets.

    Args:
        tweet_df: DataFrame with a 'Tweet' column holding the tweet text.

    Returns:
        The same DataFrame object, modified in place, with four added columns:
        'Compound', 'Positive', 'Negative' and 'Neutral', filled from the
        analyzer's 'compound', 'pos', 'neg' and 'neu' polarity scores.
    """
    analyzer = SentimentIntensityAnalyzer()
    # Score every tweet once and keep the plain result dicts.
    polarity = [analyzer.polarity_scores(tweet) for tweet in tweet_df.Tweet]

    column_to_key = (
        ('Compound', 'compound'),
        ('Positive', 'pos'),
        ('Negative', 'neg'),
        ('Neutral', 'neu'),
    )
    for column, key in column_to_key:
        tweet_df[column] = [scores[key] for scores in polarity]

    return tweet_df

def count(df):
    '''
    Takes a DataFrame with a "Compound" column and returns a basic count of
    positive, neutral, and negative sentiment in a dict format.

    A score >= 0.05 counts as positive and a score <= -0.05 as negative; every
    other value (including NaN, which fails both comparisons) counts as neutral.

    Returns: dict with the keys 'Positive Tweets', 'Neutral Tweets' and
             'Negative Tweets'.
    '''
    positive_count, negative_count, neutral_count = 0, 0, 0
    for score in df['Compound']:
        if score >= 0.05:
            positive_count += 1
        elif score <= -0.05:
            negative_count += 1
        else:
            neutral_count += 1
    # Returned directly rather than bound to a local called `count`, which
    # would shadow this function inside its own body.
    return {
        'Positive Tweets': positive_count,
        'Neutral Tweets': neutral_count,
        'Negative Tweets': negative_count,
    }

def get_twitter_scores(topic_of_tweet, num_of_tweets):
    """Fetch tweets for a search term and return them with sentiment columns.

    Passes the result of `get_tweets_list` straight to `twitter_sent_analysis`.
    """
    return twitter_sent_analysis(get_tweets_list(topic_of_tweet, num_of_tweets))

In [3]:
def twitter_score(df, num_of_tweets=20):
    """Score every coin in ``df`` by the mean sentiment of its hashtag's tweets.

    Args:
        df: DataFrame with a 'Name' column; each value is searched as '#<name>'.
        num_of_tweets: int; number of tweets requested per coin (default 20).

    Returns:
        list of single-entry dicts, one per coin, mapping the coin name to a dict
        with the mean 'Compound', 'Positive', 'Negative' and 'Neutral' scores.
    """
    sentiment_columns = ('Compound', 'Positive', 'Negative', 'Neutral')
    scores = []
    for ico in df.Name:
        search_term = '#' + str(ico)
        print(f"Searching and Scoring {search_term}")
        tweet_df = get_twitter_scores(search_term, num_of_tweets)
        scores.append({
            ico: {column: tweet_df[column].mean() for column in sentiment_columns}
        })
        print(f"{ico} scored")
    print(f"Scoring of {len(scores)} tweet concluded, creating dataframe")

    return scores

In [4]:
def progressbar(it, prefix="", size=60, file=sys.stdout):
    """Yield the items of ``it`` while drawing a text progress bar.

    Args:
        it: a sized iterable (``len(it)`` is called before iteration starts).
        prefix: text written in front of the bar.
        size: width of the bar in characters.
        file: writable text stream that receives the bar; defaults to ``sys.stdout``.

    Yields:
        Each item of ``it`` in order. The bar is redrawn once the consumer asks
        for the next item, and a newline is written after the last one.
    """
    total = len(it)

    def show(done):
        # An empty iterable is drawn as a full bar instead of dividing by zero.
        filled = int(size * done / total) if total else size
        file.write("%s[%s%s] %i/%i\r" % (prefix, "#" * filled, "." * (size - filled), done, total))
        file.flush()

    show(0)
    for done, item in enumerate(it, start=1):
        yield item
        show(done)
    file.write("\n")
    file.flush()

In [19]:
def twitter_df_score(df, N, wait_steps=10, step_seconds=4.5):
    '''
    Scores an entire DataFrame of coins based on 'N' tweets per coin.

    Arguments: `df` : DataFrame with a 'Name' column; each value is used as the search term
               `N` : int; number of tweets requested per coin
               `wait_steps` : int; number of progress-bar ticks in the pause after each coin
               `step_seconds` : float; seconds slept per tick (the defaults pause 45 seconds)

    Returns: list of single-entry dicts, one per coin, mapping the coin name to a dict with
             the mean 'Compound', 'Positive', 'Negative' and 'Neutral' scores.
    '''
    sentiment_columns = ('Compound', 'Positive', 'Negative', 'Neutral')
    scores = []
    for name in df.Name.values:
        search_term = str(name)
        print(f"Searching and Scoring {search_term}")
        tweet_df = get_twitter_scores(search_term, N)
        scores.append({
            name: {column: tweet_df[column].mean() for column in sentiment_columns}
        })
        print(f"{name} scored")
        # Pause between coins to stay under the Twitter rate limit; skipped
        # entirely when no ticks are requested.
        if wait_steps > 0:
            for _ in progressbar(range(wait_steps), "Waiting for Twitter Rate Limit: ", 40):
                time.sleep(step_seconds)
    print(f"Scoring of {len(scores)} tweet concluded, creating dataframe")

    return scores

In [20]:
# Load the coin data; twitter_df_score reads its 'Name' column for the search terms.
df = pd.read_csv('../../data/cleandata/success_df.csv')

In [None]:
# Score every coin in `df` using 50 tweets each. Despite its name, `twitter_df`
# is the list of per-coin score dicts returned by twitter_df_score, not a DataFrame.
# This is slow: twitter_df_score sleeps 10 x 4.5 seconds after each coin.
twitter_df = twitter_df_score(df, 50)
    

Searching and Scoring  Ethereum
 Ethereum scored
Waiting for Twitter Rate Limit: [########################################] 10/10
Searching and Scoring  Binance Coin
 Binance Coin scored
Waiting for Twitter Rate Limit: [########################################] 10/10
Searching and Scoring  EOS
 EOS scored
Waiting for Twitter Rate Limit: [########################################] 10/10
Searching and Scoring  Cardano
 Cardano scored
Waiting for Twitter Rate Limit: [########################################] 10/10
Searching and Scoring  Huobi Token
 Huobi Token scored
Waiting for Twitter Rate Limit: [########################################] 10/10
Searching and Scoring  Tezos
 Tezos scored
Waiting for Twitter Rate Limit: [########################################] 10/10
Searching and Scoring  Cosmos
 Cosmos scored
Waiting for Twitter Rate Limit: [########################################] 10/10
Searching and Scoring  Dash
 Dash scored
Waiting for Twitter Rate Limit: [########################

In [None]:
# Display the raw per-coin score dicts.
twitter_df

In [None]:
# `twitter_df` is a list of single-entry dicts ({coin name: {metric: value}}).
# Passing that list straight to pd.DataFrame yields one column per coin whose
# cells are dict objects or NaN, so flatten it to one row per coin with one
# column per metric instead.
tdf = pd.DataFrame(
    [metrics for entry in twitter_df for metrics in entry.values()],
    index=[name for entry in twitter_df for name in entry],
)

In [None]:
# Write the sentiment scores in `tdf` to disk alongside the other cleaned data.
tdf.to_csv('../../data/cleandata/twitter_sentiment.csv')