In [1]:
import pandas as pd
import json
from tqdm import tqdm

In [2]:
tweets = json.load(open('./data/tweetsFinalREVISI.json'))
users = json.load(open('./data/usersFinalREVISI.json'))
places = json.load(open('./data/placesFinalREVISI.json'))

In [3]:
print(f'Tweets count: {len(tweets)}\nUsers count: {len(users)}\nPlaces count: {len(places)}')

Tweets count: 312463
Users count: 145428
Places count: 398


In [4]:
def getUserDetails(tweet, users):
    tweet_author = str(tweet['author_id'])
    
    for user in users:
        user_values = str(user.get('id'))
        
        if tweet_author == user_values:
            return user
        else:
            continue
            
    return None

def getPlaceDetails(tweet, places):
    tweet_place_id = str(tweet['geo']['place_id'])
    
    for place in places:
        place_values = str(place.get('id'))
                
        if tweet_place_id == place_values:
            return place
        else:
            continue
            
    return None

In [5]:
def getPlaceData(tweet, places):
    
    if 'geo' in tweet:
        if 'place_id' in tweet['geo']:
            placeDetails = getPlaceDetails(tweet, places)
            if placeDetails != None:
                return {
                    'name': placeDetails['full_name'],
                    'place_type': placeDetails['place_type'],
                    'country': placeDetails['country'],
                    'country_code': placeDetails['country_code'] 
                }
            else:
                return {
                    'name': None,
                    'place_type': None,
                    'country': None,
                    'country_code': None  
                }
    else:
        return {
            'name': None,
            'place_type': None,
            'country': None,
            'country_code': None  
        }
                    
def getUserData(tweet, users):
    
    userDetails = getUserDetails(tweet, users)
    
    if userDetails != None:
        if 'location' in userDetails:
            location = userDetails['location']
        else:
            location = None

        return {
            'username': userDetails['username'],
            'following': userDetails['public_metrics']['following_count'],
            'tweets': userDetails['public_metrics']['tweet_count'],
            'location': location,
            'verified': userDetails['verified']
        }
    
    else:
        return {
            'username': None,
            'following': None,
            'tweets': None,
            'location': None,
            'verified': None
        }

def getHashtags(tweet):
    
    if 'entities' in tweet:
        if 'hashtags' in tweet['entities']:
            hashtags = []
            if len(tweet['entities']['hashtags']) >= 1:
                for hashtag in tweet['entities']['hashtags']:
                    hashtags.append(hashtag['tag'])
                    
            return hashtags
    
    else:
        return None

def getReferencedTweets(tweet):
    if 'referenced_tweets' in tweet:
        if len(tweet['referenced_tweets']) >= 1:
            reference_type = []
               
            for reference_tweet in tweet['referenced_tweets']:
                reference_type.append(reference_tweet['type'])
            
        return reference_type
    
    else:
        return None
    
def getMentions(tweet):
    if 'entities' in tweet:
        if 'mentions' in tweet['entities']:
            if len(tweet['entities']['mentions']) >= 1:
                mentions = []
                
                for mention in tweet['entities']['mentions']:
                    mentions.append(mention['username'])
                
            return mentions
        
        else:
            return None
    else:
        return None
        
def getUrlTitle(tweet):
    if 'entities' in tweet:
        if 'urls' in tweet['entities']:
            if len(tweet['entities']['urls']) >= 1:
                urlTitles = []
                
                for url in tweet['entities']['urls']:
                    if 'title' in url:
                        urlTitles.append(url['title'])
                
                return urlTitles
        
        else:
            return None
    
    else:
        return None

In [6]:
def generateTemp():
    return {
        'text': None,
        'hashtags': [],
        'users': {
            'username': None,
            'following': None,
            'tweets': None,
            'location': None,
            'verified': None,
        },
        'created_at': None,
        'metrics': {
            'retweets': None,
            'replies': None,
            'quotes': None,
            'impressions': None,
        },
        'entities': {
            'mentions': [],
            'url title': []
        },
        'reference_type': None,
        'lang': None,
        'places': {
            'name': None,
            'place_type': None,
            'country': None,
            'country_code': None
        }
    }

In [7]:
data = []

In [8]:
for tweet in tqdm(tweets):
    
    temp = generateTemp()
                
    temp.update({
        'text': tweet['text'],
        'hashtags': getHashtags(tweet),
        'users': {
            'username': getUserData(tweet, users)['username'],
            'following': getUserData(tweet, users)['following'],
            'tweets': getUserData(tweet, users)['tweets'],
            'location': getUserData(tweet, users)['location'],
            'verified': getUserData(tweet, users)['verified'],
        },
        'created_at': tweet['created_at'],
        'metrics': {
            'retweets': tweet['public_metrics']['retweet_count'],
            'replies': tweet['public_metrics']['reply_count'],
            'quotes': tweet['public_metrics']['quote_count'],
            'impressions': tweet['public_metrics']['impression_count'],
        },
        'entities': {
            'mentions': getMentions(tweet),
            'url title': getUrlTitle(tweet)
        },
        'reference_type': getReferencedTweets(tweet),
        'lang': tweet['lang'],
        'places': {
            'name': getPlaceData(tweet, places)['name'],
            'place_type': getPlaceData(tweet, places)['place_type'],
            'country': getPlaceData(tweet, places)['country'],
            'country_code': getPlaceData(tweet, places)['country_code']
        }
    })
        
    data.append(temp)

100%|████████████████████████████████████████████████████████████████████████| 312463/312463 [3:54:37<00:00, 22.20it/s]


In [9]:
print(f'Tweets count: {len(tweets)}\nUsers count: {len(users)}\nPlaces count: {len(places)}\n-------------------------\nFinal data count: {len(data)}')

Tweets count: 312463
Users count: 145428
Places count: 398
-------------------------
Final data count: 312463


In [10]:
with open("./data/finalDataFixREVISI.json", "w") as dataFinal:
    json.dump(data, dataFinal, indent=4)