<a href="https://www.kaggle.com/code/gpreda/collect-elon-musk-tweets?scriptVersionId=126162342" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Install packages

In [1]:
!pip install tweepy

Collecting tweepy
  Downloading tweepy-4.13.0-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.8/102.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tweepy
Successfully installed tweepy-4.13.0
[0m

# Import packages

In [2]:
import os
import tweepy as tw
import pandas as pd
import wandb
from tqdm import tqdm

# Initialize secrets

In [3]:
# Client used by later cells to look up credentials by name via
# `user_secrets.get_secret(...)`, so no API key is written in the notebook itself.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

# Initialize Weights & Biases (W&B)

In [4]:
class Config:
    """Static settings for this collection run; also passed to W&B as the run config."""

    # W&B project name.
    project = "EMTC"
    # Switch checked before logging in to / finishing the W&B run.
    wandb = True
    # Dataset identifier; reused as the W&B run name and group.
    dataset = 'elon-musk-tweets'
    # Extra metadata entry; the single leading underscore means class2dict keeps it.
    _wandb_kernel = 'gpreda'


config = Config()

def class2dict(f):
    """Return a dict of every attribute of ``f`` whose name does not start with ``__``.

    Names are taken from ``dir(f)``, so inherited attributes and methods are
    included as well; keys appear in the order ``dir`` reports them.
    """
    kept_names = [attr for attr in dir(f) if not attr.startswith('__')]
    return {attr: getattr(f, attr) for attr in kept_names}

In [5]:
# Start a W&B run for this collection job (skipped when tracking is disabled).
if config.wandb:
    # The API key is read from the notebook secrets rather than hard-coded.
    wandb.login(key=user_secrets.get_secret("wandb_api"))
    wandb.init(
        # Project, run name and group all come from the shared config object,
        # so editing Config is enough to redirect the run.
        project=config.project,
        name=config.dataset,
        config=class2dict(config),
        group=config.dataset,
        job_type="collect",
        anonymous=None
    )


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgpreda[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Twitter connection and cursor

In [6]:
def twitter_connection():
    """Build a tweepy API client from the consumer key/secret held in the notebook secrets.

    Returns:
        The ``tw.API`` instance, created with ``wait_on_rate_limit=True``.
    """
    credentials = [
        user_secrets.get_secret(secret_name)
        for secret_name in ("TWITTER_CONSUMER_API_KEY", "TWITTER_CONSUMER_API_SECRET")
    ]
    return tw.API(tw.OAuthHandler(*credentials), wait_on_rate_limit=True)


def create_cursor(api, search_words, date_since, language="en", items_limit=3000):
    """Run a tweet search and return all retrieved results as a list.

    Args:
        api: Client object exposing a ``search_tweets`` method (a tweepy API).
        search_words: Search query string.
        date_since: Earliest date (``YYYY-MM-DD``) to include; a falsy value
            disables the date filter.
        language: Language code passed as the ``lang`` search parameter.
        items_limit: Maximum number of tweets to take from the cursor.

    Returns:
        list: The tweet objects yielded by the cursor.
    """
    # `since` is not among the keyword parameters that `API.search_tweets`
    # declares, so the date bound is expressed with the `since:` query
    # operator instead of an extra keyword argument.
    query = f"{search_words} since:{date_since}" if date_since else search_words

    tweets = tw.Cursor(api.search_tweets,
                       q=query,
                       lang=language).items(items_limit)

    print("retreive new tweets ...")
    # Materialize the cursor into a list, with tqdm reporting progress.
    tweets_copy = list(tqdm(tweets))

    print(f"new tweets retrieved: {len(tweets_copy)}")

    return tweets_copy
   

# Build dataset

In [7]:
# Column order of the frame returned by build_dataset.
_TWEET_COLUMNS = (
    'id', 'user_name', 'user_location', 'user_description', 'user_created',
    'user_followers', 'user_friends', 'user_favourites', 'user_verified',
    'date', 'text', 'hashtags', 'source', 'retweets', 'favorites', 'is_retweet',
)


def _extract_hashtags(tweet):
    """Return the list of hashtag texts of ``tweet``, or ``None`` when there are none."""
    hashtags = []
    try:
        for hashtag in tweet.entities["hashtags"]:
            hashtags.append(hashtag["text"])
    except (AttributeError, KeyError, TypeError):
        # Best effort: a tweet without usable entities keeps whatever was
        # collected so far instead of aborting the whole build.
        pass
    return hashtags or None


def _tweet_to_record(tweet):
    """Flatten one tweet object into a dict keyed by the dataset column names."""
    user = tweet.user
    return {
        'id': tweet.id,
        'user_name': user.name,
        'user_location': user.location,
        'user_description': user.description,
        'user_created': user.created_at,
        'user_followers': user.followers_count,
        'user_friends': user.friends_count,
        'user_favourites': user.favourites_count,
        'user_verified': user.verified,
        'date': tweet.created_at,
        'text': tweet.text,
        'hashtags': _extract_hashtags(tweet),
        'source': tweet.source,
        'retweets': tweet.retweet_count,
        'favorites': tweet.favorite_count,
        'is_retweet': tweet.retweeted,
    }


def build_dataset(tweets_copy):
    """Convert an iterable of tweet objects into a DataFrame, one row per tweet.

    Rows are collected first and the frame is built once, which avoids the
    quadratic row-by-row ``DataFrame.append`` (removed in pandas 2.0).

    Args:
        tweets_copy: Iterable of tweet objects exposing the attributes read in
            ``_tweet_to_record`` (``id``, ``user``, ``text``, ...).

    Returns:
        pandas.DataFrame: Columns in ``_TWEET_COLUMNS`` order with a default
        0..n-1 index. An empty input yields an empty frame that still has
        those columns.
    """
    records = [_tweet_to_record(tweet) for tweet in tqdm(tweets_copy)]
    return pd.DataFrame(records, columns=list(_TWEET_COLUMNS))




# Update and save dataset

In [8]:
def update_and_save_dataset(tweets_df,
                            input_file_path="../input/elon-musk-tweets/elon_musk_tweets.csv",
                            output_file_path="elon_musk_tweets.csv"):
    """Merge freshly collected tweets with the previous export and write the result as CSV.

    When ``input_file_path`` exists, its rows are concatenated with
    ``tweets_df`` and de-duplicated on ``id``; because the new rows come last
    and ``keep='last'`` is used, a re-collected tweet replaces its older copy.
    When it does not exist, ``tweets_df`` is written as-is.

    Args:
        tweets_df: DataFrame of newly collected tweets (needs an ``id`` column
            when a previous export is present).
        input_file_path: CSV of previously collected tweets.
        output_file_path: Destination CSV path.
    """
    if not os.path.exists(input_file_path):
        print(f"tweets: {tweets_df.shape}")
        tweets_df.to_csv(output_file_path, index=False)
        return

    tweets_old_df = pd.read_csv(input_file_path)
    print(f"past tweets: {tweets_old_df.shape}")
    tweets_all_df = pd.concat([tweets_old_df, tweets_df], axis=0)
    print(f"new tweets: {tweets_df.shape[0]} past tweets: {tweets_old_df.shape[0]} all tweets: {tweets_all_df.shape[0]}")
    tweets_new_df = tweets_all_df.drop_duplicates(subset=["id"], keep='last')
    print(f"all tweets: {tweets_new_df.shape}")
    tweets_new_df.to_csv(output_file_path, index=False)

    # Only log when a W&B run is active: wandb.init is skipped when tracking
    # is disabled, and logging without a run would fail after the CSV is saved.
    if wandb.run is not None:
        wandb.log({"rows_old": tweets_old_df.shape[0],
                   "rows_new": tweets_df.shape[0],
                   "rows_merged": tweets_new_df.shape[0]})
    

# Run it all

In [9]:
# Full pipeline: connect, search, tabulate, then merge with the previous export and save.
api = twitter_connection()
tweets_copy = create_cursor(
    api,
    search_words="from:elonmusk -filter:retweets",
    date_since="2020-03-01",
)
tweets_df = build_dataset(tweets_copy)
update_and_save_dataset(tweets_df)

# Close the W&B run when tracking is enabled.
if config.wandb:
    wandb.finish()

retreive new tweets ...


328it [00:05, 64.88it/s]


new tweets retrieved: 328


100%|██████████| 328/328 [00:01<00:00, 223.84it/s]


past tweets: (4524, 16)
new tweets: 328 past tweets: 4524 all tweets: 4852
all tweets: (4529, 16)


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
rows_merged,▁
rows_new,▁
rows_old,▁

0,1
rows_merged,4529
rows_new,328
rows_old,4524
