In [1]:
import pandas as pd
import numpy as np

# (1) ADD EMPTY USER COLUMNS

In [2]:
# Load the merged tweets dataset, report its (rows, columns) size and preview it
tweets_df = pd.read_csv(filepath_or_buffer='../../datasets/tweets-data-merged.csv')
print(tweets_df.shape)
tweets_df.head(n=5)

(4993, 11)


Unnamed: 0,text,username,likes,comments,retweets,quotes,is-retweet,external-link,pictures,videos,gifs
0,Fried Chicken with Hot 🔥 Sauce Corn Muffin and...,@homevsfastfood,125,2,33,1,True,,['https://pbs.twimg.com/media/GBRPAZvXcAATBXG....,[],[]
1,#Morel #mushrooms at center of #food #poisonin...,@MicrobesInfo,0,0,0,0,False,,[],[],[]
2,My Blog:: Obama Legacy Facing REVISIT ... Ian...,@IanRMackintosh,1,0,0,0,False,,[],[],[]
3,Craving sushi but tired of expensive takeout? ...,@boxnipUK,0,0,0,0,False,,[],[],[]
4,Potato Pancakes should not be a rare side dish...,@LifestyleSoln,0,0,0,0,False,,['https://pbs.twimg.com/media/GBVywFMWkAE5vLY....,[],[]


**We still don't have information related to the users!!!**

First, we add empty placeholder columns for the user data; they will be filled in later.

In [3]:
# Profile fields and numeric profile counters that will be scraped per user
user_features = ['image', 'bio', 'website']
user_stats = ['tweets', 'following', 'followers', 'likes', 'media']

# One NaN-filled placeholder column per field, named 'user-<field>'
placeholder_columns = ['user-' + name for name in user_features + user_stats]
for column_name in placeholder_columns:
    tweets_df[column_name] = np.nan

tweets_df.head(5)

Unnamed: 0,text,username,likes,comments,retweets,quotes,is-retweet,external-link,pictures,videos,gifs,user-image,user-bio,user-website,user-tweets,user-following,user-followers,user-likes,user-media
0,Fried Chicken with Hot 🔥 Sauce Corn Muffin and...,@homevsfastfood,125,2,33,1,True,,['https://pbs.twimg.com/media/GBRPAZvXcAATBXG....,[],[],,,,,,,,
1,#Morel #mushrooms at center of #food #poisonin...,@MicrobesInfo,0,0,0,0,False,,[],[],[],,,,,,,,
2,My Blog:: Obama Legacy Facing REVISIT ... Ian...,@IanRMackintosh,1,0,0,0,False,,[],[],[],,,,,,,,
3,Craving sushi but tired of expensive takeout? ...,@boxnipUK,0,0,0,0,False,,[],[],[],,,,,,,,
4,Potato Pancakes should not be a rare side dish...,@LifestyleSoln,0,0,0,0,False,,['https://pbs.twimg.com/media/GBVywFMWkAE5vLY....,[],[],,,,,,,,


In [4]:
# Checkpoint: write the frame (now including the empty user-* columns) to the
# tweets&users CSV; index=False keeps the row index out of the file.
tweets_df.to_csv('../../datasets/tweets&users-data-merged.csv', index=False)

Now the idea is to iterate over *users_set*, scrape each user's profile information, and update that user's rows in the dataset with it.

# (2) GET USERS DATA

### Iteration over users (SERIAL BAD WAY)

In [None]:
# for usr in users_set:
#     # Scrape usr info 
#     user_data= scraper.get_profile_info(usr)
#     # Update all the rows associated to the current user
#     for feature in user_features:
#         tweets_df.loc[tweets_df['username']== usr,['user-'+feature] ] = user_data[feature]
#     for stat in user_stats:
#         tweets_df.loc[tweets_df['username']==usr,['user-'+stat] ] = user_data['stats'][stat]

### GET USERS DATA IN PARALLEL :)

In [2]:
# Reload the tweets&users checkpoint.
# The text placeholders (image URL, bio, website) are pinned to object dtype:
# while they are still entirely empty, read_csv would otherwise infer float64,
# and writing scraped strings into a float64 column is an incompatible-dtype
# assignment (deprecated by pandas, scheduled to become an error).
tweets_df = pd.read_csv(
    '../../datasets/tweets&users-data-merged.csv',
    dtype={'user-image': object, 'user-bio': object, 'user-website': object},
)
tweets_df.shape

(4979, 19)

In [3]:
# Distinct usernames in the dataset. Despite its name, users_set is the array
# returned by Series.unique(), not a Python set, so it can be indexed and sliced.
users_set = tweets_df["username"].unique()
print("Among the", len(tweets_df), "there are", len(users_set), "unique users!")

Among the 4981 there are 3472 unique users!


In [8]:
# Spot check: show the rows belonging to the user at position 3471 of users_set
# (hard-coded index; it raises IndexError if users_set has fewer entries).
tweets_df[tweets_df['username']==users_set[3471]].head()

Unnamed: 0,text,username,likes,comments,retweets,quotes,is-retweet,external-link,pictures,videos,gifs,user-image,user-bio,user-website,user-tweets,user-following,user-followers,user-likes,user-media
4980,I need food to fix up,@adeboye_j,0,1,0,0,False,,[],[],['https://video.twimg.com/tweet_video/GBYZZfSX...,,,,,,,,


In [9]:
from ntscraper import Nitter
import concurrent.futures

In [19]:
# Shared scraper used by fetch_user_info. Constructing it takes a while: the
# cell output shows a "Testing instances" progress bar running for ~1 minute.
scraper = Nitter()

Testing instances: 100%|██████████| 31/31 [00:55<00:00,  1.78s/it]


In [11]:
# Profile fields and numeric profile counters copied into the 'user-*' columns
user_features = ['image', 'bio', 'website']
user_stats = ['tweets', 'following', 'followers', 'likes', 'media']


def fetch_user_info(username):
    """Scrape the profile of ``username`` with the module-level ``scraper``.

    Returns
    -------
    tuple
        ``(username, user_data)``, so that the caller can match a result to
        its user when results arrive out of order.
    """
    user_data = scraper.get_profile_info(username)
    return username, user_data


def update_dataframe(result, df=None):
    """Write one user's scraped profile into every row belonging to that user.

    Parameters
    ----------
    result : tuple
        ``(username, user_data)`` as returned by ``fetch_user_info``.
        ``user_data`` must provide every key listed in ``user_features`` and a
        ``'stats'`` mapping providing every key listed in ``user_stats``.
    df : pandas.DataFrame, optional
        Frame to update in place. Defaults to the module-level ``tweets_df``.

    Raises
    ------
    ValueError
        If ``user_data`` is ``None`` (nothing was scraped for the user).
    KeyError
        If an expected key is missing. Nothing is written in that case.
    """
    username, user_data = result
    if user_data is None:
        raise ValueError(f"no profile data scraped for {username!r}")
    if df is None:
        df = tweets_df

    # Gather every value before touching the frame, so that a missing key
    # cannot leave a user half-updated.
    new_values = {'user-' + feature: user_data[feature] for feature in user_features}
    scraped_stats = user_data['stats']
    new_values.update({'user-' + stat: scraped_stats[stat] for stat in user_stats})

    # The row mask does not depend on the column: compute it once.
    user_rows = df['username'] == username
    for column, value in new_values.items():
        # Empty placeholder columns are float64 (all NaN); widen them to
        # object before storing text to avoid an incompatible-dtype assignment.
        if isinstance(value, str) and pd.api.types.is_numeric_dtype(df[column].dtype):
            df[column] = df[column].astype(object)
        df.loc[user_rows, column] = value


In [None]:
# Number of concurrent threads (adjust as needed)
num_threads = 10
# Half-open slice [chunk_start, chunk_end) of users_set scraped by this run.
# users_set can be processed in smaller pieces by changing these bounds.
# NOTE(review): the end bound is exclusive, so the user at index 3471 is not
# scraped by this run — confirm that this is intended.
chunk_start, chunk_end = 3400, 3471
# Users for which scraping (or the DataFrame update) failed / succeeded
failed_users = set()
successful_users = set()
# Reason of each failure, kept so that errors are not silently discarded
failure_reasons = {}

with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Submit one scraping task per user to the thread pool
    future_to_user = {
        executor.submit(fetch_user_info, username): username
        for username in users_set[chunk_start:chunk_end]
    }
    # Process results as they become available; the DataFrame is only
    # updated here, in the main thread.
    for future in concurrent.futures.as_completed(future_to_user):
        username = future_to_user[future]
        try:
            result = future.result()
            update_dataframe(result)
        except Exception as exc:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit can still stop a long scraping run.
            failed_users.add(username)
            failure_reasons[username] = repr(exc)
        else:
            successful_users.add(username)

In [20]:
# Report how many users were scraped successfully and how many failed
print('success: ',len(successful_users))
print('failed: ',len(failed_users))

success:  69
failed:  2


In [None]:
# Retry pass: scrape again only the users that failed in the previous run.
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Submit tasks to the thread pool. A snapshot of failed_users is iterated
    # because the set itself is modified below while results are processed.
    future_to_user = {
        executor.submit(fetch_user_info, username): username
        for username in list(failed_users)
    }
    # Process results as they become available
    for future in concurrent.futures.as_completed(future_to_user):
        username = future_to_user[future]
        try:
            result = future.result()
            update_dataframe(result)
        except Exception as exc:
            # Still failing: keep the user in failed_users for a later retry,
            # but report why instead of hiding the error. Catching Exception
            # (not a bare except) keeps the run interruptible.
            print('retry failed for', username, '-', repr(exc))
            continue
        successful_users.add(username)
        failed_users.discard(username)
            

In [22]:
# Re-check the counters after the retry pass
print('success: ',len(successful_users))
print('failed: ',len(failed_users))

success:  69
failed:  2


In [26]:
# Sanity check: the updates fill existing cells, so the shape should be unchanged
tweets_df.shape

(4979, 19)

# (3) SAVE THE DATASET

Export to CSV

In [73]:
# Overwrite the tweets&users checkpoint with the updated frame. This is the
# same file the scraping section reloads, so each run builds on the previous one.
tweets_df.to_csv('../../datasets/tweets&users-data-merged.csv', index=False)