In [1]:
import pandas as pd
import numpy as np

# (1) ADD EMPTY USER COLUMNS

In [2]:
# Load the raw scraped tweets and preview the first rows
raw_tweets_path = '../../datasets/tweets-data.csv'
tweets_df = pd.read_csv(raw_tweets_path)
tweets_df.head(5)

Unnamed: 0,text,username,likes,comments,retweets,quotes,is-retweet,external-link,pictures,videos,gifs
0,These Black Pitmasters Are Hustling To Preserv...,@FoodsAreGoodYes,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OIEcaYAAOQEF....,[],[]
1,#ai #food #foodporn #foodlover,@yummy_food_ai_,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0ODvvWkAARMm-....,[],[]
2,Chicken Licken Menu View South african menu; ...,@M14548Mehsud,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OG2IbIAAVBBw....,[],[]
3,Pepperoni Pizza 🍕 with Peppers homecookingvsfa...,@homevsfastfood,345,4,63,1,True,,['https://pbs.twimg.com/media/GAx6QMZXIAEGXyx....,[],[]
4,#Fine #Dining #Dinner #flashback #Focused #Che...,@ChefLondie,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OBqXXkAA7Vpe....,[],[]


**We still don't have information related to the users!!!**

First, we add some empty user columns; we will fill them in later.

In [3]:
# Profile fields we want for every user: descriptive features and numeric stats
user_features = ['image', 'bio', 'website']
user_stats = ['tweets', 'following', 'followers', 'likes', 'media']

# One NaN-filled placeholder column per field, named 'user-<field>'
placeholder_columns = ['user-' + field for field in user_features + user_stats]
for column in placeholder_columns:
    tweets_df[column] = np.nan

tweets_df.head(5)

Unnamed: 0,text,username,likes,comments,retweets,quotes,is-retweet,external-link,pictures,videos,gifs,user-image,user-bio,user-website,user-tweets,user-following,user-followers,user-likes,user-media
0,These Black Pitmasters Are Hustling To Preserv...,@FoodsAreGoodYes,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OIEcaYAAOQEF....,[],[],,,,,,,,
1,#ai #food #foodporn #foodlover,@yummy_food_ai_,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0ODvvWkAARMm-....,[],[],,,,,,,,
2,Chicken Licken Menu View South african menu; ...,@M14548Mehsud,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OG2IbIAAVBBw....,[],[],,,,,,,,
3,Pepperoni Pizza 🍕 with Peppers homecookingvsfa...,@homevsfastfood,345,4,63,1,True,,['https://pbs.twimg.com/media/GAx6QMZXIAEGXyx....,[],[],,,,,,,,
4,#Fine #Dining #Dinner #flashback #Focused #Che...,@ChefLondie,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OBqXXkAA7Vpe....,[],[],,,,,,,,


In [None]:
# Save the frame (now carrying the empty 'user-*' columns) into the datasets folder.
# The parallel-scraping section reloads '../../datasets/tweets&users-data.csv', so the
# file must be written there; the previous target ('../../tweets&users-data.csv') was
# one directory too high and was never read back.
tweets_df.to_csv('../../datasets/tweets&users-data.csv', index=False)

Now the idea is to iterate over *users_set* (the unique usernames), scrape each user's profile information, and update the dataset with it.

# (2) GET USERS DATA

### Iteration over users (SERIAL BAD WAY)

In [None]:
# for usr in users_set:
#     # Scrape usr info 
#     user_data= scraper.get_profile_info(usr)
#     # Update all the rows associated to the current user
#     for feature in user_features:
#         tweets_df.loc[tweets_df['username']== usr,['user-'+feature] ] = user_data[feature]
#     for stat in user_stats:
#         tweets_df.loc[tweets_df['username']==usr,['user-'+stat] ] = user_data['stats'][stat]

### GET USERS DATA IN PARALLEL :)

In [2]:
# Reload tweets&users-data.csv as the working frame for the scraping cells below
tweets_df = pd.read_csv('../../datasets/tweets&users-data.csv')

In [3]:
# tweets_df["username"] is a pd.Series; .unique() yields its distinct values,
# i.e. one entry per user that has to be scraped
users_set = tweets_df["username"].unique()
print("Among the", len(tweets_df), "there are", len(users_set), "unique users!")

Among the 1496 there are 810 unique users!


In [20]:
from ntscraper import Nitter
import concurrent.futures

In [21]:
# Shared scraper used by fetch_user_info. Creating it tests the available instances
# (see the progress bar in the output), which took about a minute in the recorded run.
scraper = Nitter()

Testing instances: 100%|██████████| 33/33 [01:08<00:00,  2.08s/it]


In [22]:
# Profile fields copied into the 'user-<field>' columns of the tweets frame
user_features = ['image', 'bio', 'website']
user_stats = ['tweets', 'following', 'followers', 'likes', 'media']

def fetch_user_info(username):
    """Scrape one user's profile with the notebook-level `scraper`.

    Args:
        username: handle to look up, passed unchanged to `scraper.get_profile_info`.

    Returns:
        A `(username, user_data)` tuple, where `user_data` is whatever
        `scraper.get_profile_info(username)` returned. The username is echoed
        back so the result can be matched to its rows later.
    """
    user_data = scraper.get_profile_info(username)
    return username, user_data

# Function to update DataFrame with user information
def update_dataframe(result, df=None):
    """Write one user's scraped profile into every row belonging to that user.

    Args:
        result: `(username, user_data)` tuple as returned by `fetch_user_info`.
            `user_data` must provide every key in `user_features` plus a
            `'stats'` mapping with every key in `user_stats`.
        df: frame to update in place. Defaults to the notebook-level
            `tweets_df`, which keeps existing `update_dataframe(result)` calls working.

    Raises:
        ValueError: if `user_data` is None (no profile was obtained).
        KeyError: if `user_data` lacks one of the expected fields. Nothing is
            written to the frame in that case.
    """
    if df is None:
        df = tweets_df
    username, user_data = result
    if user_data is None:
        raise ValueError(f"no profile data returned for {username!r}")

    # Gather every value before touching the frame: a missing key then fails
    # up front instead of leaving the user's rows half filled.
    values = {'user-' + feature: user_data[feature] for feature in user_features}
    values.update({'user-' + stat: user_data['stats'][stat] for stat in user_stats})

    user_rows = df['username'] == username
    for column, value in values.items():
        # The placeholder columns are all-NaN and therefore float64. Writing text
        # into a numeric column is a deprecated implicit upcast in recent pandas,
        # so convert the column to object explicitly first.
        if isinstance(value, str) and pd.api.types.is_numeric_dtype(df[column]):
            df[column] = df[column].astype(object)
        df.loc[user_rows, column] = value


In [None]:
# Number of concurrent threads (adjust as needed)
num_threads = 10
# Users for which scraping (or the DataFrame update) failed / succeeded
failed_users=set()
successful_users=set()
# Reason for each failure, keyed by username, so failures can be inspected afterwards
failed_errors = {}

with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Submit one scraping task per user (possibly split users_set into smaller chunks)
    future_to_user = {executor.submit(fetch_user_info, username): username for username in users_set}
    # Process results as they become available; update_dataframe runs here,
    # in the thread executing this cell, not in the worker threads.
    for future in concurrent.futures.as_completed(future_to_user):
        username = future_to_user[future]
        try:
            result = future.result()
            update_dataframe(result)
        except Exception as exc:
            # `except Exception` (not a bare except) so that interrupting the
            # kernel still stops the loop instead of marking a user as failed.
            failed_users.add(username)
            failed_errors[username] = repr(exc)
        else:
            successful_users.add(username)

In [None]:
# Report how many users were scraped successfully and how many failed
print('success: ',len(successful_users))
print('failed: ',len(failed_users))

In [13]:
# Retry pass: scrape again only the users currently in failed_users.
# This cell can be re-run until failed_users stops shrinking.
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Submit tasks to the thread pool. The dict is fully built before any result
    # is processed, so shrinking failed_users inside the loop below is safe.
    future_to_user = {executor.submit(fetch_user_info, username): username for username in failed_users}
    # Process results as they become available
    for future in concurrent.futures.as_completed(future_to_user):
        username = future_to_user[future]
        try:
            result = future.result()
            update_dataframe(result)
        except Exception:
            # Still failing: keep the user in failed_users for a later retry.
            # `except Exception` (not a bare except) lets a kernel interrupt through.
            continue
        successful_users.add(username)
        # discard() rather than remove(): no KeyError if the user is already gone
        failed_users.discard(username)
            

In [None]:
# Same counters again, to compare with the numbers printed before the retry pass
print('success: ',len(successful_users))
print('failed: ',len(failed_users))

# (3) SAVE THE DATASET

In [35]:
# Check if we missed some users (NB: just need to look if one of usr stats is nan).
# NaN never compares equal to anything, not even to itself, so `== np.nan` always
# selects zero rows and cannot detect missing data. isna() really finds them.
# An empty result here means every user was filled in --> GOOD :)
tweets_df[tweets_df['user-followers'].isna()]

Unnamed: 0,text,username,likes,comments,retweets,quotes,is-retweet,external-link,pictures,videos,gifs,user-image,user-bio,user-website,user-tweets,user-following,user-followers,user-likes,user-media


Export to CSV

In [30]:
# Persist the enriched frame; index=False keeps the row index out of the file
enriched_path = '../../datasets/tweets&users-data.csv'
tweets_df.to_csv(enriched_path, index=False)