In [121]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import requests
import os
import json

In [122]:
def get_tweets(query, max_num_tweets, curr_coin):
    """Collect at most ``max_num_tweets`` tweets matching ``query``.

    Args:
        query: twitter search query
                (info on query: https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query)
        max_num_tweets: maximum number of tweets to return; zero or a
                negative value yields an empty list
        curr_coin: label stored as the first field of every returned row
                (the caller passes the coin ticker)

    Returns:
        A list with one inner list per tweet, in the order produced by the
        scraper. Each inner list holds, in this order:
        [curr_coin, url, date, content, id, user, replyCount, retweetCount,
        likeCount, quoteCount, source]
        (overview of the tweet fields:
        https://miro.medium.com/max/1400/1*b7499m8QPju3AH7WUreP2A.png)
    """
    # Nothing requested: skip creating the scraper altogether.
    if max_num_tweets <= 0:
        return []

    tweets_generator = sntwitter.TwitterSearchScraper(query).get_items()

    tweets_list = []
    for tweet in tweets_generator:
        tweets_list.append([curr_coin, tweet.url, tweet.date, tweet.content, tweet.id, tweet.user,
                            tweet.replyCount, tweet.retweetCount, tweet.likeCount, tweet.quoteCount,
                            tweet.source])
        # Stop as soon as the quota is reached so that no extra item is
        # pulled from the generator and exactly max_num_tweets rows
        # (not max_num_tweets + 1) are returned.
        if len(tweets_list) >= max_num_tweets:
            break

    return tweets_list

In [134]:
def check_spam(all_crypto_df, following_limit=2001, max_ratio=1.1):
    """Remove rows that match the notebook's spam-account heuristic.

    A row is dropped when either
      * its ``following`` value equals ``following_limit``, or
      * its ``following`` value is greater than ``following_limit`` and also
        greater than ``max_ratio`` times its ``followers`` value.

    The filter is computed with a boolean mask over whole columns, so it does
    not depend on the index labels: duplicated or non-contiguous labels are
    handled, and only the offending rows are removed.

    Args:
        all_crypto_df: DataFrame with numeric ``following`` and ``followers``
                columns. It is not modified.
        following_limit: threshold applied to ``following`` (default 2001).
        max_ratio: multiplier applied to ``followers`` in the ratio test
                (default 1.1).

    Returns:
        DataFrame containing the rows that were kept, with their original
        index labels. An empty input is returned unchanged.
    """
    # An empty frame may have no columns at all; there is nothing to filter.
    if len(all_crypto_df) == 0:
        return all_crypto_df

    following = all_crypto_df["following"]
    followers = all_crypto_df["followers"]

    at_limit = following == following_limit
    over_limit_and_lopsided = (following > following_limit) & (following > followers * max_ratio)
    is_spam = at_limit | over_limit_and_lopsided

    return all_crypto_df.loc[~is_spam]

In [138]:
# Search window and per-coin tweet cap used for every query below.
start_date = "2019-01-01"
end_date = "2022-10-01"
max_tweets = 100

# Coins to search for, as (name, ticker) pairs; case insensitive
coins = [("Bitcoin", "BTC"),
        ("Bitcoin Cash", "BCH"),
        ("Binance Coin", "BNB"),
        ("EOS.IO", "EOS"),
        ("Ethereum Classic", "ETC"),
        ("Ethereum", "ETH"),
        ("Litecoin", "LTC"),
        ("Monero", "XMR"),
        ("TRON", "TRX"),
        ("Stellar", "XLM"),
        ("Cardano", "ADA"),
        ("IOTA", "MIOTA"),
        ("Maker", "MKR"),
        ("Dogecoin", "DOGE")]

# Column names for the rows returned by get_tweets, in the same field order.
# The tweet id, url and source fields are stored as tweetId, tweetUrl and
# machineType.
tweet_columns = ["coinTicker", "tweetUrl", "date", "content", "tweetId", "user", "replyCount",
                 "retweetCount", "likeCount", "quoteCount", "machineType"]
# Columns derived from the per-tweet user object.
user_columns = ["username", "userId", "following", "followers"]

coin_frames = []  # one DataFrame per coin, combined once after the loop

for coin_name, ticker in coins:
    # Quote the name so multi-word names ("Bitcoin Cash") are matched as a
    # phrase, and parenthesise the OR group so the lang/since/until filters
    # apply to both alternatives instead of only the ticker.
    search_key = f'("{coin_name}" OR {ticker})'
    sns_tweets = get_tweets(f'{search_key} lang:en since:{start_date} until:{end_date}',
                            max_tweets, ticker)  # pass query, max tweets, ticker

    crypto_df = (
        pd.DataFrame(sns_tweets, columns=tweet_columns)
        .drop_duplicates(subset=["tweetId"])
        # Series.map (unlike a row-wise apply) also works when no tweets came back.
        .assign(
            username=lambda d: d["user"].map(lambda u: u.username),
            userId=lambda d: d["user"].map(lambda u: u.id),
            following=lambda d: d["user"].map(lambda u: u.friendsCount),
            followers=lambda d: d["user"].map(lambda u: u.followersCount),
        )
    )

    if not crypto_df.empty:
        coin_frames.append(crypto_df)

# Build the combined frame in one step. DataFrame.append no longer exists in
# pandas 2.x, and ignore_index=True gives every row a unique label, which
# label-based processing downstream relies on.
if coin_frames:
    all_crypto_df = pd.concat(coin_frames, ignore_index=True)
else:
    all_crypto_df = pd.DataFrame(columns=tweet_columns + user_columns)

all_crypto_df = check_spam(all_crypto_df) # check for spam accounts

all_crypto_df.to_csv('tweets.csv')