# Twitter Scraping & NLP

In [60]:
import requests 
import json
from config import consumer_key, consumer_secret, access_key, access_secret, bearer_token

In [61]:
from textblob import TextBlob
import pandas as pd
import sys
import tweepy
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import time
import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('wordnet')

!pip install gensim
import gensim
from gensim.parsing.preprocessing import remove_stopwords 
import torch
import flair
from flair.models import TextClassifier
from flair.data import Sentence
from segtok.segmenter import split_single

from wordcloud import WordCloud, STOPWORDS
from PIL import Image
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/memme11/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/memme11/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/memme11/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/memme11/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




In [62]:
# Display max column width 
pd.set_option('display.max_colwidth', None)

## Twitter API (Tweepy)

In [63]:
# Initialize and gain access to Twitter API
def initialize():
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    return api

api = initialize()

In [64]:
# Use get statement to see json format
url = 'https://api.twitter.com/1.1/search/tweets.json'
params = {'q': 'bitcoin',
          'tweet_mode': 'extended', 
         'lang': 'en', 
         'count': 2}
headers = {'authorization': 'Bearer '+bearer_token}

sample_data = requests.get(url, params=params, headers=headers).json()

sample_data

In [66]:
# Call on tweepy API and create dataframe
search_words = ("bitcoin", "etherium", "cardano")
crypto_data = pd.DataFrame()

def get_data(data):
    data = {
        'text': data.full_text,
        'date': data.created_at,
        'followers': data.user.followers_count,
        'favourites': data.user.favourites_count,
        'retweets': data.retweet_count
    }
    return data

for tweets in search_words:
    comp_tweets = api.search(q=tweets, lang = 'en', result_type = 'recent', count=250, tweet_mode='extended')
    
    for tweet in comp_tweets:
        row = get_data(tweet)
        crypto_data = crypto_data.append(row, ignore_index=True)
        
crypto_data

## Data Preprocessing

In [68]:
# Formatting
# Keep only tweets with over 1000 favourites
crypto_data = crypto_data.loc[crypto_data['favourites']>1000]

# Clean text column using Regex
crypto_data['cleaned_text'] = crypto_data['text']
clean_text = '(RT) @[\w]*:|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)'
crypto_data['cleaned_text'] = crypto_data['cleaned_text'].str.replace(clean_text, " ", regex=True)
crypto_data['cleaned_text'] = crypto_data['cleaned_text'].str.lower()

# Convert date dtype to datetime, set index and drop duplicates
crypto_data['date'] = pd.to_datetime(crypto_data['date'])
crypto_data = crypto_data.set_index('date').sort_index(ascending=False)
crypto_data.drop_duplicates(inplace=True)

crypto_data.head(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas

Unnamed: 0_level_0,favourites,followers,retweets,text,cleaned_text
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-07-26 21:44:23,1240.0,38.0,1549.0,RT @0xEarthDefi: Check our Earth Defi!\nHosted on the State-of-the-art #PolygonNetwork \n#yieldfarming and swap router\n\n👉👉 https://t.co/VjBV0…,check our earth defi hosted on the state of the art polygonnetwork yieldfarming and swap router
2021-07-26 21:44:19,6647.0,68.0,17.0,RT @adamliaw: If there's another protest this weekend and the NSW police want to immobilise anyone without using force they should just ask…,if there s another protest this weekend and the nsw police want to immobilise anyone without using force they should just ask
2021-07-26 21:44:19,2902.0,80.0,415.0,RT @BTC_Archive: 😱 43% of Singapore residents own #Bitcoin or crypto - IRCI survey.,43 of singapore residents own bitcoin or crypto irci survey
2021-07-26 21:44:18,5982.0,2765.0,0.0,@BTCGandalf The wizard has spoken. Lookout for #bitcoin price to change relative to the prior day,the wizard has spoken lookout for bitcoin price to change relative to the prior day
2021-07-26 21:44:17,3669.0,1253.0,0.0,"@n3ocortex @Dontbewallst @glassnode different measurements point to 34K zone (HnS technical objective (granted, its a shallow shoulder); fibonacci extension; zone of clustered orders previously) plus options interest by end July to land price around 34K. $btc #bitcoin #crypto $btcusd $xbt\ncoincidence? 🤔 https://t.co/aXNsBqbM9Q",different measurements point to 34k zone hns technical objective granted its a shallow shoulder fibonacci extension zone of clustered orders previously plus options interest by end july to land price around 34k btc bitcoin crypto btcusd xbt coincidence
...,...,...,...,...,...
2021-07-26 21:00:23,93205.0,12229.0,3.0,RT @Trent209: This $POODL never stops progressing👏🏻\n\nBy far my favorite #altcoin 😎\n@POODLETOKEN \nTelegram: https://t.co/2W5KHU8fjW \n#Poodl…,this poodl never stops progressing by far my favorite altcoin telegram poodl
2021-07-26 20:59:31,83109.0,1651.0,8.0,"RT @TopDogBeachClub: 😈Mutley Crew present X with @TheMonsterRehab; 🍖 CHUCKLES 🍞\n\nCharles (Chuckles for short) was an accountant by day, int…",mutley crew present x with chuckles charles chuckles for short was an accountant by day int
2021-07-26 20:58:09,1216.0,646.0,0.0,People wondering what #nft prices are going to do when the price of #etherium rises.\n\nCryptopunks- https://t.co/Iektic1bO7,people wondering what nft prices are going to do when the price of etherium rises cryptopunks
2021-07-26 20:57:33,3517.0,227.0,589.0,RT @azziadoor: 🏦$1000 in $ETH to a random person Just Follows and Retweets \n\nBest of luck Followers 🥰\n#ETH #etherium #ETHEREUM #eth,1000 in eth to a random person just follows and retweets best of luck followers eth etherium ethereum eth


## Tokenization

In [70]:
# Tokenizing Functions

def get_wordnet_pos(word):
# Map POS tag to the first character lemmatize() accepts
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

# Function for tokenizing tweets (already cleaned using regex)
def second_clean(tweet):
    tweet = remove_stopwords(tweet) # remove stopwords with Gensim

    lemmatizer = WordNetLemmatizer()
    tokenized = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(tweet)]
    
    # remove left over stop words with nltk
    tokenized = [token for token in tokenized if token not in stopwords.words("english")] 

    # remove non-alpha characters and keep the words of length >2 only
    tokenized = [token for token in tokenized if token.isalpha() and len(token)>2]

    return tokenized

# Function for joining tokenized list into string
def combine_tokens(tokenized): 
    non_tokenized = ' '.join([w for w in tokenized])
    return non_tokenized

In [71]:
# Execute function 
crypto_data['tokens'] = crypto_data['cleaned_text'].apply(lambda x: second_clean(x))
crypto_data['final_clean'] = crypto_data['tokens'].apply(lambda x: combine_tokens(x))

crypto_data

## NLP - Vader Sentiment Model

In [86]:
# Sentiment labels function 
def sentiment_labels(df, feature, value): 
    df.loc[df[value] > 0,feature] = 'positive'
    df.loc[df[value] == 0,feature] = 'neutral'
    df.loc[df[value] < 0,feature] = 'negative'

In [87]:
# Vader sentiment analysis
# define function and variable for SentimentIntensityAnalyzer()

sia = SentimentIntensityAnalyzer()


def vader_sentiment(df):
    
    target_col='final_clean'
    prefix = 'vader_clean_'
        
    scores_col=prefix+'scores'
    
    compound_col = prefix+'polarity'
    
    sentiment = prefix+'sentiment'
    
    df[scores_col] = df[target_col].apply(lambda x:sia.polarity_scores(x))
    
    df[compound_col] = df[scores_col].apply(lambda d: d['compound'])
    
    sentiment_labels(df, sentiment, compound_col)

In [88]:
#Execute vader function
start = time.time()

vader_sentiment(crypto_data)
stop = time.time()

print(f'Vader analysis took: {round((stop-start)/60, 3)}minutes')

crypto_data.head(30)

Vader analysis took: 0.0minutes


Unnamed: 0_level_0,favourites,followers,retweets,text,cleaned_text,tokens,final_clean,vader_clean_scores,vader_clean_polarity,vader_clean_sentiment,flair_score,flair_score2,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-07-26 21:44:23,1240.0,38.0,1549.0,RT @0xEarthDefi: Check our Earth Defi!\nHosted on the State-of-the-art #PolygonNetwork \n#yieldfarming and swap router\n\n👉👉 https://t.co/VjBV0…,check our earth defi hosted on the state of the art polygonnetwork yieldfarming and swap router,"[check, earth, defi, host, state, art, polygonnetwork, yieldfarming, swap, router]",check earth defi host state art polygonnetwork yieldfarming swap router,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0,neutral,0.99,0.87,positive
2021-07-26 21:44:19,6647.0,68.0,17.0,RT @adamliaw: If there's another protest this weekend and the NSW police want to immobilise anyone without using force they should just ask…,if there s another protest this weekend and the nsw police want to immobilise anyone without using force they should just ask,"[protest, weekend, nsw, police, want, immobilise, force, ask]",protest weekend nsw police want immobilise force ask,"{'neg': 0.215, 'neu': 0.645, 'pos': 0.14, 'compound': -0.1779}",-0.1779,negative,-0.99,-0.99,negative
2021-07-26 21:44:19,2902.0,80.0,415.0,RT @BTC_Archive: 😱 43% of Singapore residents own #Bitcoin or crypto - IRCI survey.,43 of singapore residents own bitcoin or crypto irci survey,"[singapore, resident, bitcoin, crypto, irci, survey]",singapore resident bitcoin crypto irci survey,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0,neutral,-0.96,-1.0,negative
2021-07-26 21:44:18,5982.0,2765.0,0.0,@BTCGandalf The wizard has spoken. Lookout for #bitcoin price to change relative to the prior day,the wizard has spoken lookout for bitcoin price to change relative to the prior day,"[wizard, spoken, lookout, bitcoin, price, change, relative, prior, day]",wizard spoken lookout bitcoin price change relative prior day,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0,neutral,-0.81,0.56,negative
2021-07-26 21:44:17,3669.0,1253.0,0.0,"@n3ocortex @Dontbewallst @glassnode different measurements point to 34K zone (HnS technical objective (granted, its a shallow shoulder); fibonacci extension; zone of clustered orders previously) plus options interest by end July to land price around 34K. $btc #bitcoin #crypto $btcusd $xbt\ncoincidence? 🤔 https://t.co/aXNsBqbM9Q",different measurements point to 34k zone hns technical objective granted its a shallow shoulder fibonacci extension zone of clustered orders previously plus options interest by end july to land price around 34k btc bitcoin crypto btcusd xbt coincidence,"[different, measurement, point, zone, technical, objective, grant, shallow, shoulder, fibonacci, extension, zone, cluster, order, previously, plus, option, end, july, land, price, btc, bitcoin, crypto, btcusd, xbt, coincidence]",different measurement point zone technical objective grant shallow shoulder fibonacci extension zone cluster order previously plus option end july land price btc bitcoin crypto btcusd xbt coincidence,"{'neg': 0.0, 'neu': 0.912, 'pos': 0.088, 'compound': 0.3612}",0.3612,positive,0.92,-0.99,positive
2021-07-26 21:44:17,22256.0,172.0,23.0,RT @therealjuicyj: Bitcoin is bouncing back 💸💸,bitcoin is bouncing back,"[bitcoin, bounce]",bitcoin bounce,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0,neutral,-0.97,-0.93,negative
2021-07-26 21:44:16,28297.0,117.0,0.0,"I swear, if bitcoin drops before I can load up on puts… 🤬",i swear if bitcoin drops before i can load up on puts,"[swear, bitcoin, drop, load, put]",swear bitcoin drop load put,"{'neg': 0.524, 'neu': 0.476, 'pos': 0.0, 'compound': -0.3182}",-0.3182,negative,-1.0,-1.0,negative
2021-07-26 21:44:16,7646.0,215.0,6.0,RT @sokane1: NEW: Tesla finally turned a profit on the products it sells https://t.co/HKIoZ5wUHV,new tesla finally turned a profit on the products it sells,"[new, tesla, finally, turn, profit, product, sell]",new tesla finally turn profit product sell,"{'neg': 0.0, 'neu': 0.674, 'pos': 0.326, 'compound': 0.4404}",0.4404,positive,0.86,0.99,positive
2021-07-26 21:44:14,6980.0,953.0,0.0,"@ekkysyrdn Hey I joined a network called SurveyJ and earned $7,679 today! You get paid for surveys and posting on social media. Sign up with my link for an instant $40 signup bonus! https://t.co/aDOY6tyqzI #SurveyJ #cashapp #bitcoin",hey i joined a network called surveyj and earned 7 679 today you get paid for surveys and posting on social media sign up with my link for an instant 40 signup bonus surveyj cashapp bitcoin,"[hey, join, network, call, surveyj, earn, today, paid, survey, post, social, medium, sign, link, instant, signup, bonus, surveyj, cashapp, bitcoin]",hey join network call surveyj earn today paid survey post social medium sign link instant signup bonus surveyj cashapp bitcoin,"{'neg': 0.0, 'neu': 0.759, 'pos': 0.241, 'compound': 0.6908}",0.6908,positive,0.53,-0.9,neutral
2021-07-26 21:44:14,127578.0,77514.0,17.0,"RT @CryptoBoomNews: Amazon denies the City A.M. report about accepting #Bitcoin this year and #Bitcoin immediately dropped back below $40,0…",amazon denies the city a m report about accepting bitcoin this year and bitcoin immediately dropped back below 40 0,"[amazon, denies, city, report, accept, bitcoin, year, bitcoin, immediately, drop]",amazon denies city report accept bitcoin year bitcoin immediately drop,"{'neg': 0.322, 'neu': 0.395, 'pos': 0.283, 'compound': -0.1531}",-0.1531,negative,-1.0,-1.0,negative


## Analysis

In [97]:
# View sentiment totals
crypto_data['vader_clean_sentiment'].value_counts()

positive    66
neutral     63
negative    24
Name: vader_clean_sentiment, dtype: int64

In [95]:
# Function for determining avg sentiment for each score in the model and overall average sentiment
def pos_neg_neutral_avg(df):
    
    positive = []
    neutral = []
    negative = []
    
    for values in df:
        if values > 0:
            positive.append(values)
        
        elif values < 0:
            negative.append(values)
        
        else:
            neutral.append(values)
    
    print(f'Positive score average for {df.name} = {round(np.mean(positive), 2)}')
    print(f'Neutral score average for {df.name} = {round(np.mean(neutral), 2)}')
    print(f'Negative score average for {df.name} = {round(np.mean(negative), 2)}')
    
    print(f'Overall crypto sentiment score is = {round(np.mean(df), 2)}')

In [96]:
# Average scores for each sentiment category, and overall sentiment score
vader_values = crypto_data.loc[:, 'vader_clean_polarity']
pos_neg_neutral_avg(vader_values)

Positive score average for vader_clean_polarity = 0.53
Neutral score average for vader_clean_polarity = 0.0
Negative score average for vader_clean_polarity = -0.35
Overall crypto sentiment score is = 0.17
