In [2]:
import os
import re
import json
import pandas as pd
from dateutil.parser import parse
from datetime import datetime, timedelta
from tqdm import tqdm
from time import sleep


def tweet_parser(tweets_data, since, until, path, keyword_search):
    tweets = []
    for tweet_obj in tqdm(tweets_data, desc='parsing tweet'):
        result = {
            'created_at': tweet_obj['created_at_local'],
            'tweet_id': '_'+tweet_obj['id_str'] if tweet_obj['id_str'] != None else None,
            'user_id': '_'+tweet_obj['user']['id_str'] if tweet_obj['user']['id_str'] != None else None,
            'user_screenname': tweet_obj['user']['screen_name'],
            'user_followings_count': tweet_obj['user']['friends_count'],
            'user_followers_count': tweet_obj['user']['followers_count'],
            'user_tweets_count': tweet_obj['user']['statuses_count'],
            'is_verified': tweet_obj['user']['verified'],
            'tweet_text': tweet_obj['full_text'],
            'favorite_count': tweet_obj['favorite_count'],
            'retweet_count': tweet_obj['retweet_count'],
            'tweet_source': re.findall(r'\>(.+)\<', tweet_obj['source'])[0],
            'place_fullname': tweet_obj['place']['full_name'] if ('place' in tweet_obj and tweet_obj['place'] != None) else None,
            'place_name': tweet_obj['place']['name'] if ('place' in tweet_obj and tweet_obj['place'] != None) else None,
            'place_type': tweet_obj['place']['place_type'] if ('place' in tweet_obj and tweet_obj['place'] != None) else None,
            'place_country': tweet_obj['place']['country'] if ('place' in tweet_obj and tweet_obj['place'] != None) else None,
            'quoted_from_tweet_id': '_'+tweet_obj['quoted_status']['id_str'] if 'quoted_status' in tweet_obj else None,
            'quoted_from_user_id': '_'+tweet_obj['quoted_status']['user']['id_str'] if 'quoted_status' in tweet_obj else None,
            'quoted_from_user_screenname': tweet_obj['quoted_status']['user']['screen_name'] if 'quoted_status' in tweet_obj else None,
            'reply_to_tweet_id': '_'+tweet_obj['in_reply_to_status_id_str'] if ('in_reply_to_status_id_str' in tweet_obj and tweet_obj['in_reply_to_status_id_str'] != None) else None,
            'reply_to_user_id': '_'+tweet_obj['in_reply_to_user_id_str'] if ('in_reply_to_user_id_str' in tweet_obj and tweet_obj['in_reply_to_user_id_str'] != None) else None,
            'reply_to_user_screenname': tweet_obj['in_reply_to_screen_name'] if 'in_reply_to_screen_name' in tweet_obj else None,
            'retweeted_from_tweet_id': '_'+tweet_obj['retweeted_status']['id_str'] if 'retweeted_status' in tweet_obj else None,
            'retweeted_from_user_id': '_'+tweet_obj['retweeted_status']['user']['id_str'] if 'retweeted_status' in tweet_obj else None,
            'retweeted_from_user_screenname': tweet_obj['retweeted_status']['user']['screen_name'] if 'retweeted_status' in tweet_obj else None,
        }
        # Collect original tweets
        if (result['quoted_from_tweet_id'] == None and 
            result['reply_to_tweet_id'] == None and 
            result['retweeted_from_tweet_id'] == None):
            result['tweet_type'] = 'tweet'
        # Collect reply to tweets
        elif (result['quoted_from_tweet_id'] == None and 
            result['reply_to_tweet_id'] != None and 
            result['retweeted_from_tweet_id'] == None):
            result['tweet_type'] = 'reply'
        # Collect quoted from tweets
        elif (result['quoted_from_tweet_id'] != None and 
            result['reply_to_tweet_id'] == None and 
            result['retweeted_from_tweet_id'] == None):
            result['tweet_type'] = 'quote'
        # Collect retweeted from tweets
        elif (result['quoted_from_tweet_id'] == None and 
            result['reply_to_tweet_id'] == None and 
            result['retweeted_from_tweet_id'] != None):
            result['tweet_type'] = 'retweet'
            result['retweet_count'] = 0
            result['favorite_count'] = 0
        # Collect retweeted from quoted tweets
        elif (result['quoted_from_tweet_id'] != None and 
            result['reply_to_tweet_id'] == None and 
            result['retweeted_from_tweet_id'] != None):
            result['tweet_type'] = 'retweet'
            result['retweet_count'] = 0
            result['favorite_count'] = 0
        # Collect reply from quoted tweets
        elif (result['quoted_from_tweet_id'] != None and 
            result['reply_to_tweet_id'] != None and 
            result['retweeted_from_tweet_id'] == None):
            result['tweet_type'] = 'reply'
        if (parse(result['created_at']) >= parse(since+' 00:00:00') and 
            parse(result['created_at']) <= parse(until+' 23:59:59')):
            tweets.append(result)

    tweets = pd.DataFrame(tweets).drop_duplicates(subset=['tweet_id']).reset_index(drop=True)
    tweets = tweets[['created_at', 'tweet_id', 'user_id', 'user_screenname',
                     'user_followings_count', 'user_followers_count', 'user_tweets_count',
                     'is_verified', 'tweet_text', 'favorite_count',
                     'retweet_count', 'tweet_source', 'tweet_type', 
                     'place_fullname', 'place_name',
                     'place_type', 'place_country', 'quoted_from_tweet_id',
                     'quoted_from_user_id', 'quoted_from_user_screenname',
                     'reply_to_tweet_id', 'reply_to_user_id', 'reply_to_user_screenname',
                     'retweeted_from_tweet_id', 'retweeted_from_user_id',
                     'retweeted_from_user_screenname']]
    tweets.to_csv('{}/collection_of_tweets_{}_{}-{}.csv'.format(path, 
                                                                            keyword_search, 
                                                                            since.replace('-',''), 
                                                                            until.replace('-','')),
                       index=False)
    return tweets

def user_parser(tweets_data, since, until, path, keyword_search, GMT=7):
    def location_lookup(loc):
        city_look = pd.read_csv('lookup/city_lookup.csv')
        city_look['keyword'] = city_look['keyword'].str.upper()
        province_look = pd.read_csv('lookup/province_lookup.csv')
        province_look['keyword'] = province_look['keyword'].str.upper()
        subdistrict_look = pd.read_csv('lookup/sub_district_lookup.csv')
        subdistrict_look['keyword'] = subdistrict_look['keyword'].str.upper()
        city = 'Undefined'
        prov = 'Undefined'
        if loc != 'UNDEFINED':
            for c in range(len(city_look)):
                if city_look['keyword'][c] in loc:
                    city = city_look['city'][c]
                    prov = city_look['state'][c]
                    break

            if ((city == 'Undefined') & (prov == 'Undefined')):
                for p in range(len(province_look)):
                    if province_look['keyword'][p] in loc:
                        city = province_look['city'][p]
                        prov = province_look['state'][p]
                        break    
                        
            if ((city == 'Undefined') & (prov == 'Undefined')):
                for s in range(len(subdistrict_look)):
                    if subdistrict_look['keyword'][s] in loc:
                        city = subdistrict_look['city'][s]
                        prov = subdistrict_look['state'][s]
                        break
        return [city, prov]
    
    users = []
    for tweet_obj in tqdm(tweets_data, desc='parsing users'):
        result = {
            'created_at': tweet_obj['created_at_local'],
            'tweet_id': '_'+tweet_obj['id_str'] if tweet_obj['id_str'] != None else None,
            'user_id': '_'+tweet_obj['user']['id_str'] if tweet_obj['user']['id_str'] != None else None,
            'user_screenname': tweet_obj['user']['screen_name'],
            'user_fullname': tweet_obj['user']['name'],
            'user_created_at': (parse(tweet_obj['user']['created_at'], ignoretz=True) + timedelta(hours=GMT)).strftime('%Y-%m-%d %H:%M:%S'),
            'user_followings_count': tweet_obj['user']['friends_count'],
            'user_followers_count': tweet_obj['user']['followers_count'],
            'user_tweets_count': tweet_obj['user']['statuses_count'],
            'user_location': tweet_obj['user']['location'].upper(),
            'is_verified': tweet_obj['user']['verified']
        }
        if (parse(result['created_at']) >= parse(since+' 00:00:00') and 
            parse(result['created_at']) <= parse(until+' 23:59:59')):
            users.append(result)
        
    users = pd.DataFrame(users).drop_duplicates(subset=['tweet_id']).reset_index(drop=True)
    users = users.drop_duplicates(subset=['user_id']).reset_index(drop=True)
    users_loc = []
    for i in tqdm(range(len(users)), desc='location cleansing'):
        loc = location_lookup(users['user_location'][i])
        users_loc.append({'user_id': users['user_id'][i],
                          'user_city': loc[0],
                          'user_province': loc[1]
                         })
    users_loc = pd.DataFrame(users_loc)
    users = pd.merge(users, users_loc, on='user_id')
    users = users[['user_id','user_screenname','user_fullname','user_created_at',
                   'user_followings_count','user_followers_count','user_tweets_count',
                   'user_city','user_province','is_verified']]
    users.to_csv('{}/collection_of_users_{}_{}-{}.csv'.format(path, 
                                                                     keyword_search, 
                                                                     since.replace('-',''), 
                                                                     until.replace('-','')),
                 index=False)
    return users

if __name__ == '__main__':
    keyword_search = input('keyword_search : ')
    since = input('since date [YYYY-MM-DD]: ')
    until = input('until date [YYYY-MM-DD]: ')
    
    # Load Tweets Data
    print('Load data...', end='\r')
    path = '{}/tweet_search_result/{}'.format(os.getcwd(), keyword_search)
    tweets_data = []
    for json_file in os.listdir(path):
        if json_file.endswith('.json') and json_file.startswith('search_tweet_'):
            tweets_data.extend([json.loads(line) for line in open('{}/{}'.format(path, json_file), errors='ignore').readlines()])
    print('Data {} loaded!'.format(keyword_search))
    sleep(3)
    
    # Parsing required fields for analysis
    tweets = tweet_parser(tweets_data, since, until, path, keyword_search)
    print('Data {} tweets parsed!'.format(keyword_search))
    
    users = user_parser(tweets_data, since, until, path, keyword_search)
    print('Data {} users parsed!'.format(keyword_search))

keyword_search : bawaslu
since date [YYYY-MM-DD]: 2019-05-14
until date [YYYY-MM-DD]: 2019-05-27
Data bawaslu loaded!


parsing tweet: 100%|█████████████████████████████████████████████████████████| 357957/357957 [02:04<00:00, 2868.36it/s]


Data bawaslu tweets parsed!


parsing users: 100%|█████████████████████████████████████████████████████████| 357957/357957 [03:08<00:00, 1903.85it/s]
location cleansing: 100%|████████████████████████████████████████████████████| 127175/127175 [4:37:00<00:00,  6.20it/s]


Data bawaslu users parsed!
