The code performs various tasks related to Twitter data analysis, including scraping tweets, preprocessing text, counting word frequencies, and searching for accounts based on hashtags

### Install requirements

In [1]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from datetime import datetime, timedelta
import math
import os
#from googletrans import Translator
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('omw-1.4')
import re
from collections import Counter
import spacy

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Асхат\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Асхат\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Scraping tweets during last 'n' hours

#### Scrap tweets, likes, reposts and then process the text using spaCy

In [None]:
# Read the Excel file and remove "https://twitter.com/" from the Link column
df = pd.read_excel('Twitter Account Database.xlsx', nrows = 5)

df['Link'] = df['Link'].str.replace('https://twitter.com/', '')

# Drop duplicates from 'Link' column
df = df.drop_duplicates(subset=['Link'])

# Set the time range for scraping tweets
now = datetime.now()
start = now - timedelta(hours=24)

# Initialize an empty list to store the tweets
tweets = []

# Loop over each username in the "Link" column and scrape their tweets from the last 24 hours
for username in df['Link']:
    for tweet in sntwitter.TwitterSearchScraper(f'from:{username} since:{start:%Y-%m-%d} until:{now:%Y-%m-%d}').get_items():
        text = tweet.content
        link = tweet.url
        tweets.append([username, tweet.date, text, link, tweet.likeCount, tweet.retweetCount])

# Convert the list of tweets to a DataFrame
df_tweets = pd.DataFrame(tweets, columns=['Username', 'Datetime', 'text', 'link', 'like_count', 'retweet_count'])

# Remove timezone information from the Datetime column
df_tweets['Datetime'] = df_tweets['Datetime'].apply(lambda x: x.replace(tzinfo=None) if x is not None else None)

# Load the spacy language model
nlp = spacy.load("en_core_web_sm")

# Define a function to preprocess the text
def preprocess_text(text):
    # Apply the spacy tokenizer to the text
    doc = nlp(text)
    
    # Remove stop words, punctuation, and numbers, and lemmatize each remaining token
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and not token.like_num]
    
    # Join the tokens back into a string
    return " ".join(tokens)

df_tweets['processed_text'] = df_tweets['text'].apply(preprocess_text)
df_tweets['processed_text_list'] = df_tweets['processed_text'].apply(lambda x: x.split())

df_tweets

#### Use the spaCy library to tokenize each tweet text, remove stop words, lemmatize each token and count occurence of each word

In [None]:
df_tweets = pd.read_excel('sss.xlsx')

In [None]:
# Load the spacy language model
nlp = spacy.load("en_core_web_sm")

# Define a function to preprocess the text
def preprocess_text(text):
    # Apply the spacy tokenizer to the text
    doc = nlp(text)
    
    # Remove stop words, punctuation, and numbers, and lemmatize each remaining token
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and not token.like_num]
    
    # Join the tokens back into a string
    return " ".join(tokens)

df_tweets['processed_text'] = df_tweets['Text'].apply(preprocess_text)
df_tweets['processed_text_list'] = df_tweets['processed_text'].apply(lambda x: x.split())

df_tweets.head()

In [None]:
# Create a Series with all the words in the "text" column
all_words = df_tweets['Text'].apply(lambda x: [WordNetLemmatizer().lemmatize(token.lemma_.lower()) for token in nlp(x) if not token.is_stop and token.is_alpha]).explode()

# Count how many times each word appears
word_counts = all_words.value_counts()

word_counts

In [None]:
# Generate current date in the desired format
current_date = datetime.now().strftime('%y_%m_%d')

# Save dataframes to different sheets in the same Excel file with current date in the name
with pd.ExcelWriter(f'SSS_{current_date}.xlsx') as writer:
    df_tweets.to_excel(writer, sheet_name='tweets', index=True)
    word_counts.to_excel(writer, sheet_name='words_count', index=True)

# Scraping twitter accounts by hashtags

In [None]:
# Set search variables
hashtags = ["#eth",
            "#ethereum"
            "#crypto", 
            "#token",
            "#btc", 
            "#bitcoin", 
            "#binance", 
            "#web3", 
            "#blockchain", 
            "#bnb", 
            "#nft", 
            "#defi", 
            "#dao", 
            "#dex",
            "#tether",
            "#usdt", 
            "#dfinity", 
            "#cosmos", 
            "#evmos", 
            "#ibc", 
            "#near", 
            "#solana", 
            "#usdc", 
            "#xrp", 
            "#dogecoin", 
            "#doge",
            "#polygon", 
            "#matic", 
            "#busd", 
            "#polkadot", 
            "#litecoin", 
            "#shiba", 
            "#tron", 
            "#trx", 
            "#avalanche", 
            "#avax", 
            "#dai", 
            "#chainlink", 
            "#toncoin"
           ]
max_accounts = 100000
min_followers = 2000
min_likes = 100
min_retweets = 50
min_replies = 50

# Create a list to hold our accounts and a set to keep track of usernames
accounts_list = []
usernames_set = set()

# Loop through each hashtag in the list
for hashtag in hashtags:
    # Define the search query
    search_query = f"{hashtag} lang:en min_faves:{min_likes} min_retweets:{min_retweets} min_replies:{min_replies}"
    # Loop through each tweet obtained from the search query
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(search_query).get_items()):
        if i >= max_accounts:
            break
        # Get the user object associated with the tweet
        user = tweet.user
        # Check if the user has more than 10,000 followers and hasn't been added already
        if user.followersCount < min_followers or user.username in usernames_set:
            continue
        # Add the username to the set of usernames
        usernames_set.add(user.username)
        # Create a dictionary to store account information
        account_dict = {
            "username": user.username,
            "bio": user.description,
            "url": user.url,
            "hashtag": hashtag
        }
        # Append the dictionary to the list of accounts
        accounts_list.append(account_dict)

# Create a pandas DataFrame from the list of accounts
accounts_df = pd.DataFrame(accounts_list)

# Print the DataFrame
accounts_df.head()

In [2]:
accounts_df = pd.read_excel("accounts_by_hashtags.xlsx")

#fill NaN values in bio column
accounts_df['bio'] = accounts_df['bio'].fillna('')

# Filter out accounts that contain the words "airdrop" or "airdrops" in their bio
accounts_df = accounts_df[~accounts_df['bio'].str.contains('airdrop|airdrops|giveaway|giveaways|mint|pre-mint|collection', case=False)]
accounts_df

Unnamed: 0.1,Unnamed: 0,username,bio,url,hashtag
0,0,NFT_BOYKA,Web 3 Marketing | Community Builder | Advisor ...,https://twitter.com/NFT_BOYKA,#eth
1,1,MQ3458,|𝕎𝕖𝕓𝟛 𝔼𝕟𝕥𝕙𝕦𝕤𝕚𝕒𝕤𝕥𝕚𝕔 🌍| ℕ𝔽𝕋 ℂ𝕠𝕝𝕝𝕖𝕜𝕥𝕠𝕣 | ℂ𝕪𝕣𝕡𝕥𝕠 ...,https://twitter.com/MQ3458,#eth
2,2,tunaferith,"I don't use AI, only handmade drawings. Discor...",https://twitter.com/tunaferith,#eth
3,3,CryptoAgresivoo,Building Communities & Holder | @Twitch Stream...,https://twitter.com/CryptoAgresivoo,#eth
4,4,SabianNFT,"Early to Crypto, Early to NFT’s 🐳",https://twitter.com/SabianNFT,#eth
...,...,...,...,...,...
3967,4179,Nord_Finance,"A safe, simple, and effective Digital Asset in...",https://twitter.com/Nord_Finance,#avax
3968,4180,akitavax,Akitavax is the community driven project of th...,https://twitter.com/akitavax,#avax
3969,4181,RabiaKhanGul,Mental Health Therapist\nBlogger\nCo Host and ...,https://twitter.com/RabiaKhanGul,#dai
3970,4183,BigSean,#DETROITMixtape,https://twitter.com/BigSean,#dai


In [3]:
#save it to excel/csv file
accounts_df.to_excel("accounts_by_hashtags.xlsx")

# Twitter accounts from Cryptorank

In [2]:
import snscrape.modules.twitter as sntwitter
import pandas as pd

# Step 1: Read Excel file with TOP twitter accounts from cryptorank and create dataframe
df_cryptorank = pd.read_excel('cryptorank_cryptocurrencies.xlsx')

# Step 2: Filter dataframe by 'twitter' in 'Social Media Links-href' column
df_cryptorank['Social Media Links-href'].fillna('', inplace=True) # Fill NaN with empty string
df_cryptorank_filtered = df_cryptorank[df_cryptorank['Social Media Links-href'].str.contains('twitter', case=False, na=False)]

df_cryptorank_filtered

Unnamed: 0,web-scraper-order,web-scraper-start-url,Social Media Links,Social Media Links-href,Link,Link-href,Name,next
1,1681398074-328,https://cryptorank.io/?rows=50,,https://twitter.com/xx_network,xx networkXX,https://cryptorank.io/price/elixxir,xx network,
5,1681398077-332,https://cryptorank.io/?rows=50,,https://twitter.com/moonbeamnetwork,xcUSDTXCUSDT,https://cryptorank.io/price/xcusdt,xcUSDT,
8,1681398082-335,https://cryptorank.io/?rows=50,,https://twitter.com/moonbeamnetwork,xcDOTXCDOT,https://cryptorank.io/price/xcdot,xcDOT,
11,1681398085-338,https://cryptorank.io/?rows=50,,https://twitter.com/xwinfinance,xWIN FinanceXWIN,https://cryptorank.io/price/xwin-finance,xWIN Finance,
17,1681398088-344,https://cryptorank.io/?rows=50,,https://twitter.com/xtokenterminal,xTokenXTK,https://cryptorank.io/price/xtoken,xToken,
...,...,...,...,...,...,...,...,...
24996,1681413996-25323,https://cryptorank.io/?rows=50,,https://twitter.com/circle/,USD CoinUSDC,https://cryptorank.io/price/usdcoin,USD Coin,
25002,1681413999-25329,https://cryptorank.io/?rows=50,,https://twitter.com/binance,BNBBNB,https://cryptorank.io/price/bnb,BNB,
25003,1681413999-25330,https://cryptorank.io/?rows=50,,https://twitter.com/BNBCHAIN,BNBBNB,https://cryptorank.io/price/bnb,BNB,
25006,1681414002-25333,https://cryptorank.io/?rows=50,,https://twitter.com/tether_to,TetherUSDT,https://cryptorank.io/price/tether,Tether,
