# Twitter Scraping & NLP

In [15]:
import requests 
import json
from config import consumer_key, consumer_secret, access_key, access_secret, bearer_token

In [16]:
from textblob import TextBlob
import pandas as pd
import sys
import tweepy
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import time
import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('wordnet')

!pip install gensim
import gensim
from gensim.parsing.preprocessing import remove_stopwords 
import torch
import flair
from flair.models import TextClassifier
from flair.data import Sentence
from segtok.segmenter import split_single

from wordcloud import WordCloud, STOPWORDS
from PIL import Image
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer

# Display max column width 
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/memme11/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/memme11/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/memme11/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/memme11/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




## Twitter API (Tweepy)

In [17]:
# Initialize and gain access to Twitter API
def initialize():
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    return api

api = initialize()

In [18]:
# Call on tweepy API and create dataframe
search_words = ("bitcoin", "etherium", "cardano")
crypto_data = pd.DataFrame()

def get_data(data):
    data = {
        'text': data.full_text,
        'date': data.created_at,
        'followers': data.user.followers_count,
        'favourites': data.user.favourites_count,
        'retweets': data.retweet_count
    }
    return data

for tweets in search_words:
    comp_tweets = api.search(q=tweets, lang = 'en', result_type = 'recent', count=250, tweet_mode='extended')
    
    for tweet in comp_tweets:
        row = get_data(tweet)
        crypto_data = crypto_data.append(row, ignore_index=True)
        
crypto_data

Unnamed: 0,date,favourites,followers,retweets,text
0,2021-07-28 00:51:20,1008.0,85.0,38.0,RT @nf4mation: Senator Warren Urges Treasury Secretary Yellen to Urgently Adopt Policy to Mitigate Cryptocurrencies' Risks – Regulation Bit…
1,2021-07-28 00:51:19,24.0,67802.0,0.0,#affiliatemarketing #affiliates #affiliateprogram #affiliated #crypto #cryptocurrency #cryptocurrenices #dogearmy #dogecoin #affiliatemarketing claim free #btc #eth #Litecoin #bitcoin #doge #Binance #tronarmy https://t.co/q16YusaF7U
2,2021-07-28 00:51:18,2121.0,246.0,459.0,RT @everyapetoken: $EVAPE IS UP 2600% from launch! This is mooning like no other!🚀🚀🚀\n\nGo ape in at uniswap! \nhttps://t.co/COYHNNL2g1\n\nCan o…
3,2021-07-28 00:51:18,2236.0,299.0,0.0,After months of of trading below the 200 EMA #bitcoin finally closed above the 200 EMA
4,2021-07-28 00:51:18,0.0,8201.0,0.0,Bitcoin Short Squeeze Revives Trading Volume And Volatility https://t.co/ovicwkaYNU #ETH #QASH #CryptoNews #BCH #CryptoCurrency #BTC #Ethereum #BitcoinNews #BLockchain #XRP #BlockchainNews #Cryptocurrencynews
...,...,...,...,...,...
280,2021-07-28 00:41:04,6801.0,391.0,385.0,"RT @CryptoadvisorN: Green candles once again!! A very promising future is coming in crypto🤑🤑 keep full your bags, invest smart!!!\n#btc #ETH…"
281,2021-07-28 00:41:03,1520.0,97.0,10.0,"RT @CardanoGuru: 90%+ Sold out\n\nI believe in the art world, they call that a success\n\nBefore they run out, want to say congratulations to @…"
282,2021-07-28 00:41:00,950.0,18.0,350.0,RT @SafeCardano: Hello Everyone Raising hands\n\n👉 Here is Contract address details:\n 0x9f94d5457082a5bf59f1ffdff6dd9e158f56fc2d\n\n👉Here is…
283,2021-07-28 00:40:53,23999.0,381.0,0.0,$STORJ 😉😉😉\n\n#STORJ #Data #DataStorage\n\n#crypto #cryptocurrency #cryptocurrencies #CryptoNews $BTC #Bitcoin #bitcoinnews $ONE #Ethereum #Algorand $ALGO $ETH $VET $HBAR #HBAR #MATIC $MATIC $ENJ #ENJ $SOL #Solana #chainlink $link #ADA #Cardano https://t.co/jHckrYkSld


## Data Preprocessing

In [19]:
# Formatting
# Keep only tweets with over 1000 favourites
crypto_data = crypto_data.loc[crypto_data['favourites']>1000]

# Clean text column using Regex
crypto_data['cleaned_text'] = crypto_data['text']
clean_text = '(RT) @[\w]*:|(@[A-Za-z0-9]+)|([^\,\!\.\'\%0-9A-Za-z \t])|(\w+:\/\/\S+)'
crypto_data['cleaned_text'] = crypto_data['cleaned_text'].str.replace(clean_text, " ", regex=True)
crypto_data['cleaned_text'] = crypto_data['cleaned_text'].str.lower()

# Convert date dtype to datetime, set index, sort index and drop duplicates
crypto_data['date'] = pd.to_datetime(crypto_data['date'])
crypto_data = crypto_data.set_index('date').sort_index(ascending=False)
crypto_data.drop_duplicates(inplace=True)

crypto_data.head(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas

Unnamed: 0_level_0,favourites,followers,retweets,text,cleaned_text
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-07-28 00:51:20,1008.0,85.0,38.0,RT @nf4mation: Senator Warren Urges Treasury Secretary Yellen to Urgently Adopt Policy to Mitigate Cryptocurrencies' Risks – Regulation Bit…,senator warren urges treasury secretary yellen to urgently adopt policy to mitigate cryptocurrencies' risks regulation bit
2021-07-28 00:51:18,2236.0,299.0,0.0,After months of of trading below the 200 EMA #bitcoin finally closed above the 200 EMA,after months of of trading below the 200 ema bitcoin finally closed above the 200 ema
2021-07-28 00:51:18,2121.0,246.0,459.0,RT @everyapetoken: $EVAPE IS UP 2600% from launch! This is mooning like no other!🚀🚀🚀\n\nGo ape in at uniswap! \nhttps://t.co/COYHNNL2g1\n\nCan o…,evape is up 2600% from launch! this is mooning like no other! go ape in at uniswap! can o
2021-07-28 00:51:17,6045.0,1552.0,0.0,#bitcoin was here👇 https://t.co/eo6ExHWwUk,bitcoin was here
2021-07-28 00:51:15,5334.0,90.0,0.0,@85Aurum @AurumKoko @LUNAPR1 #Aurum - the Golden Standard\nNUMBER 1 🏆 SOON\nAND HERE TO STAY 😎\n\n@AURofficial_ #Bitcoin \n#BinanceSmartChain #BSCGems,aurum the golden standard number 1 soon and here to stay bitcoin binancesmartchain bscgems
2021-07-28 00:51:14,17349.0,714.0,306.0,RT @TheCryptoLark: #bitcoin just closed a daily candle above the 200 day EMA! This is the first time it has crossed this line since the cra…,bitcoin just closed a daily candle above the 200 day ema! this is the first time it has crossed this line since the cra
2021-07-28 00:51:13,2281.0,297.0,2.0,RT @Emre10530357: @florsbeny The best project is #LunaLand\n\n#LunaLand to the moon 🚀 🚀 🚀\n\nCheck out the information in the photo and see how…,the best project is lunaland lunaland to the moon check out the information in the photo and see how
2021-07-28 00:51:12,2935.0,26.0,1326.0,"RT @cz_binance: Fiat or crypto, #bitcoin is the most energy efficient money network.","fiat or crypto, bitcoin is the most energy efficient money network."
2021-07-28 00:51:11,29084.0,3653.0,1.0,RT @engielar: Any bitcoin update?,any bitcoin update
2021-07-28 00:51:11,5334.0,90.0,0.0,@AurumKoko #Aurum - the Golden Standard\nNUMBER 1 🏆 SOON\nAND HERE TO STAY 😎\n\n@AURofficial_ #Bitcoin \n#BinanceSmartChain #BSCGems,aurum the golden standard number 1 soon and here to stay bitcoin binancesmartchain bscgems


## Tokenization

In [20]:
# Tokenizing Functions

def get_wordnet_pos(word):
# Map POS tag to the first character lemmatize() accepts
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

# Function for tokenizing tweets (already cleaned using regex)
def second_clean(tweet):
    tweet = remove_stopwords(tweet) # remove stopwords with Gensim

    lemmatizer = WordNetLemmatizer()
    tokenized = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(tweet)]
    
    # remove left over stop words with nltk
    tokenized = [token for token in tokenized if token not in stopwords.words("english")] 

    # remove non-alpha characters and keep the words of length >2 only
    tokenized = [token for token in tokenized if token.isalpha() and len(token)>2]

    return tokenized

# Function for joining tokenized list into string
def combine_tokens(tokenized): 
    non_tokenized = ' '.join([w for w in tokenized])
    return non_tokenized

# Execute function 
crypto_data['tokens'] = crypto_data['cleaned_text'].apply(lambda x: second_clean(x))
crypto_data['final_clean'] = crypto_data['tokens'].apply(lambda x: combine_tokens(x))

crypto_data

Unnamed: 0_level_0,favourites,followers,retweets,text,cleaned_text,tokens,final_clean
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-07-28 00:51:20,1008.0,85.0,38.0,RT @nf4mation: Senator Warren Urges Treasury Secretary Yellen to Urgently Adopt Policy to Mitigate Cryptocurrencies' Risks – Regulation Bit…,senator warren urges treasury secretary yellen to urgently adopt policy to mitigate cryptocurrencies' risks regulation bit,"[senator, warren, urge, treasury, secretary, yellen, urgently, adopt, policy, mitigate, cryptocurrencies, risk, regulation, bit]",senator warren urge treasury secretary yellen urgently adopt policy mitigate cryptocurrencies risk regulation bit
2021-07-28 00:51:18,2236.0,299.0,0.0,After months of of trading below the 200 EMA #bitcoin finally closed above the 200 EMA,after months of of trading below the 200 ema bitcoin finally closed above the 200 ema,"[month, trading, ema, bitcoin, finally, close, ema]",month trading ema bitcoin finally close ema
2021-07-28 00:51:18,2121.0,246.0,459.0,RT @everyapetoken: $EVAPE IS UP 2600% from launch! This is mooning like no other!🚀🚀🚀\n\nGo ape in at uniswap! \nhttps://t.co/COYHNNL2g1\n\nCan o…,evape is up 2600% from launch! this is mooning like no other! go ape in at uniswap! can o,"[evape, launch, moon, like, ape, uniswap]",evape launch moon like ape uniswap
2021-07-28 00:51:17,6045.0,1552.0,0.0,#bitcoin was here👇 https://t.co/eo6ExHWwUk,bitcoin was here,[bitcoin],bitcoin
2021-07-28 00:51:15,5334.0,90.0,0.0,@85Aurum @AurumKoko @LUNAPR1 #Aurum - the Golden Standard\nNUMBER 1 🏆 SOON\nAND HERE TO STAY 😎\n\n@AURofficial_ #Bitcoin \n#BinanceSmartChain #BSCGems,aurum the golden standard number 1 soon and here to stay bitcoin binancesmartchain bscgems,"[aurum, golden, standard, number, soon, stay, bitcoin, binancesmartchain, bscgems]",aurum golden standard number soon stay bitcoin binancesmartchain bscgems
...,...,...,...,...,...,...,...
2021-07-27 23:30:13,1083.0,29.0,8.0,RT @CKadatz: @IcedKnife $Raptor ! Under a week awhile till they launch a swap different from all the other swaps. CEX and DEX platforms. Hu…,raptor ! under a week awhile till they launch a swap different from all the other swaps. cex and dex platforms. hu,"[raptor, week, awhile, till, launch, swap, different, swap, cex, dex, platform]",raptor week awhile till launch swap different swap cex dex platform
2021-07-27 23:30:11,4950.0,978.0,1.0,😍you’re doing a give away?! then please let me know and after you give your give away tag me and put your eth address and I will air drop you your flowers minted onto the etherium blockchain to make it official! 🔥im gonna go create it to be ready🤘🏼#community #nftsarethefuture https://t.co/wMm92DBT4f,you re doing a give away ! then please let me know and after you give your give away tag me and put your eth address and i will air drop you your flowers minted onto the etherium blockchain to make it official! im gonna go create it to be ready community nftsarethefuture,"[away, let, know, away, tag, eth, address, air, drop, flower, mint, etherium, blockchain, official, gon, create, ready, community, nftsarethefuture]",away let know away tag eth address air drop flower mint etherium blockchain official gon create ready community nftsarethefuture
2021-07-27 23:29:49,1083.0,29.0,12.0,RT @CKadatz: @Investments_CEO $Raptor. Just launched farming. Already has staking and a lottery. Soon to launch raptorswap and NFT’s. Gonna…,ceo raptor. just launched farming. already has staking and a lottery. soon to launch raptorswap and nft s. gonna,"[ceo, raptor, launch, farm, stake, lottery, soon, launch, raptorswap, nft, gon]",ceo raptor launch farm stake lottery soon launch raptorswap nft gon
2021-07-27 23:29:20,1083.0,29.0,7.0,"RT @CKadatz: @SharksCoins $Raptor. They have farming , staking and a lottery. Their swap is coming out soon followed by NFT’s. Huge hidden…","raptor. they have farming , staking and a lottery. their swap is coming out soon followed by nft s. huge hidden","[raptor, farm, stake, lottery, swap, come, soon, follow, nft, huge, hidden]",raptor farm stake lottery swap come soon follow nft huge hidden


## NLP - Vader Sentiment Model

In [21]:
# Initialize analyzer
sia = SentimentIntensityAnalyzer()

# Sentiment labels function 
def sentiment_labels(df, feature, value): 
    df.loc[df[value] > 0,feature] = 'positive'
    df.loc[df[value] == 0,feature] = 'neutral'
    df.loc[df[value] < 0,feature] = 'negative'
    
# Vader sentiment analysis

def vader_sentiment(df):
    
    target_col='cleaned_text'
    prefix = 'vader_clean_'
        
    scores_col=prefix+'scores'
    compound_col = prefix+'polarity'
    sentiment = prefix+'sentiment'
    
    df[scores_col] = df[target_col].apply(lambda x:sia.polarity_scores(x))
    df[compound_col] = df[scores_col].apply(lambda d: d['compound'])
    sentiment_labels(df, sentiment, compound_col)
    
#Execute vader function
start = time.time()
vader_sentiment(crypto_data)
stop = time.time()

print(f'Vader analysis took: {round((stop-start)/60, 3)}minutes')

crypto_data.head(30)

Vader analysis took: 0.0minutes


Unnamed: 0_level_0,favourites,followers,retweets,text,cleaned_text,tokens,final_clean,vader_clean_scores,vader_clean_polarity,vader_clean_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-07-28 00:51:20,1008.0,85.0,38.0,RT @nf4mation: Senator Warren Urges Treasury Secretary Yellen to Urgently Adopt Policy to Mitigate Cryptocurrencies' Risks – Regulation Bit…,senator warren urges treasury secretary yellen to urgently adopt policy to mitigate cryptocurrencies' risks regulation bit,"[senator, warren, urge, treasury, secretary, yellen, urgently, adopt, policy, mitigate, cryptocurrencies, risk, regulation, bit]",senator warren urge treasury secretary yellen urgently adopt policy mitigate cryptocurrencies risk regulation bit,"{'neg': 0.113, 'neu': 0.699, 'pos': 0.188, 'compound': 0.1027}",0.1027,positive
2021-07-28 00:51:18,2236.0,299.0,0.0,After months of of trading below the 200 EMA #bitcoin finally closed above the 200 EMA,after months of of trading below the 200 ema bitcoin finally closed above the 200 ema,"[month, trading, ema, bitcoin, finally, close, ema]",month trading ema bitcoin finally close ema,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0,neutral
2021-07-28 00:51:18,2121.0,246.0,459.0,RT @everyapetoken: $EVAPE IS UP 2600% from launch! This is mooning like no other!🚀🚀🚀\n\nGo ape in at uniswap! \nhttps://t.co/COYHNNL2g1\n\nCan o…,evape is up 2600% from launch! this is mooning like no other! go ape in at uniswap! can o,"[evape, launch, moon, like, ape, uniswap]",evape launch moon like ape uniswap,"{'neg': 0.097, 'neu': 0.753, 'pos': 0.15, 'compound': 0.2905}",0.2905,positive
2021-07-28 00:51:17,6045.0,1552.0,0.0,#bitcoin was here👇 https://t.co/eo6ExHWwUk,bitcoin was here,[bitcoin],bitcoin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0,neutral
2021-07-28 00:51:15,5334.0,90.0,0.0,@85Aurum @AurumKoko @LUNAPR1 #Aurum - the Golden Standard\nNUMBER 1 🏆 SOON\nAND HERE TO STAY 😎\n\n@AURofficial_ #Bitcoin \n#BinanceSmartChain #BSCGems,aurum the golden standard number 1 soon and here to stay bitcoin binancesmartchain bscgems,"[aurum, golden, standard, number, soon, stay, bitcoin, binancesmartchain, bscgems]",aurum golden standard number soon stay bitcoin binancesmartchain bscgems,"{'neg': 0.0, 'neu': 0.909, 'pos': 0.091, 'compound': 0.0772}",0.0772,positive
2021-07-28 00:51:14,17349.0,714.0,306.0,RT @TheCryptoLark: #bitcoin just closed a daily candle above the 200 day EMA! This is the first time it has crossed this line since the cra…,bitcoin just closed a daily candle above the 200 day ema! this is the first time it has crossed this line since the cra,"[bitcoin, close, daily, candle, day, ema, time, cross, line, cra]",bitcoin close daily candle day ema time cross line cra,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0,neutral
2021-07-28 00:51:13,2281.0,297.0,2.0,RT @Emre10530357: @florsbeny The best project is #LunaLand\n\n#LunaLand to the moon 🚀 🚀 🚀\n\nCheck out the information in the photo and see how…,the best project is lunaland lunaland to the moon check out the information in the photo and see how,"[best, project, lunaland, lunaland, moon, check, information, photo]",best project lunaland lunaland moon check information photo,"{'neg': 0.0, 'neu': 0.811, 'pos': 0.189, 'compound': 0.6369}",0.6369,positive
2021-07-28 00:51:12,2935.0,26.0,1326.0,"RT @cz_binance: Fiat or crypto, #bitcoin is the most energy efficient money network.","fiat or crypto, bitcoin is the most energy efficient money network.","[fiat, crypto, bitcoin, energy, efficient, money, network]",fiat crypto bitcoin energy efficient money network,"{'neg': 0.0, 'neu': 0.622, 'pos': 0.378, 'compound': 0.6674}",0.6674,positive
2021-07-28 00:51:11,29084.0,3653.0,1.0,RT @engielar: Any bitcoin update?,any bitcoin update,"[bitcoin, update]",bitcoin update,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0,neutral
2021-07-28 00:51:11,5334.0,90.0,0.0,@AurumKoko #Aurum - the Golden Standard\nNUMBER 1 🏆 SOON\nAND HERE TO STAY 😎\n\n@AURofficial_ #Bitcoin \n#BinanceSmartChain #BSCGems,aurum the golden standard number 1 soon and here to stay bitcoin binancesmartchain bscgems,"[aurum, golden, standard, number, soon, stay, bitcoin, binancesmartchain, bscgems]",aurum golden standard number soon stay bitcoin binancesmartchain bscgems,"{'neg': 0.0, 'neu': 0.909, 'pos': 0.091, 'compound': 0.0772}",0.0772,positive


## Analysis

In [23]:
# View sentiment totals
crypto_data['vader_clean_sentiment'].value_counts()

positive    72
neutral     63
negative     9
Name: vader_clean_sentiment, dtype: int64

In [25]:
# Function for determining avg sentiment for each score in the model and overall average sentiment
def pos_neg_neutral_avg(df):
    
    positive = []
    neutral = []
    negative = []
    
    for values in df:
        if values > 0:
            positive.append(values)
        
        elif values < 0:
            negative.append(values)
        
        else:
            neutral.append(values)
    
    print(f'Positive score average for {df.name} = {round(np.mean(positive), 2)}')
    print(f'Neutral score average for {df.name} = {round(np.mean(neutral), 2)}')
    print(f'Negative score average for {df.name} = {round(np.mean(negative), 2)}')
    
    print(f'Overall crypto sentiment score is = {round(np.mean(df), 4)}')
    
# Average scores for each sentiment category, and overall sentiment score
vader_values = crypto_data.loc[:, 'vader_clean_polarity']
pos_neg_neutral_avg(vader_values)

Positive score average for vader_clean_polarity = 0.46
Neutral score average for vader_clean_polarity = 0.0
Negative score average for vader_clean_polarity = -0.41
Overall crypto sentiment score is = 0.202
