In [1]:
import json
import pandas as pd
import ast
import os
from datetime import datetime
import pytz
import pymongo
from pymongo import MongoClient

In [2]:
# Prefer a connection string supplied through the MONGODB_URI environment variable so that
# real credentials never have to be written into (or committed with) the notebook.
# The original placeholder URI is kept as the fallback to stay backward-compatible.
client = MongoClient(
    os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://<user>:<password>@cluster0.wkyhu.mongodb.net/?retryWrites=true&w=majority",
    )
)

In [3]:
# Handle to the database that holds the tweet documents
db = client["twitter_db"]

In [4]:
# Start from a clean slate: remove any previous copy of the collection, if there is one
if "tweets_data" not in db.list_collection_names():
    print('The collection does not exist.')
else:
    db.drop_collection("tweets_data")

In [5]:
# Handle to the tweets collection, plus the in-memory buffer for the raw tweet records
records = db["tweets_data"]
data = []

In [6]:
# Indexes on the fields the stored tweet documents are queried by:
# a text index on the tweet body, the author name, the hashtag list, and the date (newest first).
records.create_index(keys=[("text", pymongo.TEXT)])
records.create_index(keys=[("name", pymongo.ASCENDING)])
records.create_index(keys=[("media.hashtags", pymongo.ASCENDING)])
records.create_index(keys=[("date", pymongo.DESCENDING)])

In [7]:
def load_data(file_path, data):
    """Append every JSON record found in a line-delimited JSON file to ``data``.

    Parameters
    ----------
    file_path : str or path-like
        File containing one JSON document per line.
    data : list
        List that is extended in place with the decoded records.

    Returns
    -------
    None
        The result is delivered through ``data``; nothing is returned so that calling
        this as the last expression of a notebook cell does not dump the whole list.

    Notes
    -----
    Blank lines and lines that are not valid JSON are skipped. Any other problem
    (missing file, undecodable bytes, keyboard interrupt, ``data`` not being a list)
    is raised instead of being silently swallowed.
    """
    # Explicit UTF-8 keeps the result independent of the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Empty separator lines carry no record.
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                # If the tweet's JSON is malformed, skip it and keep loading.
                continue
            data.append(item)

In [8]:
# Read the first dump of collected tweets into the shared buffer
load_data(file_path="../../corona-out-2", data=data)

In [9]:
# Read the second dump of collected tweets into the same buffer
load_data(file_path="../../corona-out-3", data=data)

In [10]:
# Total number of records accumulated in `data` by the load_data calls above
len(data)

120434

In [11]:
# # Enable sharding on the database
# client.admin.command('enableSharding', 'twitter_db')

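# # Define the shard key
# # NOTE(review): the documents inserted by this notebook store the timestamp under 'date',
# # not 'created_at' - confirm the shard key field before enabling this cell.
# shard_key = {'created_at': 'hashed'}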

# # Configure the sharded cluster
# client.admin.command('shardCollection', 'twitter_db.tweets_data', key=shard_key)

In [12]:
def _entity_summary(entities):
    """Collect hashtag texts, URLs and mentioned screen names from an ``entities`` dict."""
    return {
        'hashtags': [tag['text'] for tag in entities['hashtags']],
        'urls': [url['url'] for url in entities['urls']],
        'mentions': [mention['screen_name'] for mention in entities['user_mentions']],
    }


def _status_summary(status):
    """Build the sub-document that describes a quoted or retweeted status."""
    return {
        'tweet_id': status['id'],
        'user_id': status['user']['id'],
        'user_name': status['user']['name'],
        'quote_count': status['quote_count'],
        'reply_count': status['reply_count'],
        'retweet_count': status['retweet_count'],
        'favorite_count': status['favorite_count'],
        # Previously only the retweet sub-document carried the post time; the quote
        # sub-document now gets it too (purely additive).
        'created_at': status['created_at'],
        'media': _entity_summary(status['entities']),
    }


def _full_text(status):
    """Return ``extended_tweet.full_text`` when it exists, otherwise the plain ``text``."""
    try:
        return status['extended_tweet']['full_text']
    except (KeyError, TypeError):
        return status['text']


def _build_tweet_records(tweet):
    """Flatten one raw tweet into ``(mongo_document, dataframe_row)``.

    Raises ``KeyError`` / ``TypeError`` / ``AttributeError`` when the record does not
    have the fields this notebook relies on; the caller skips such records.
    """
    is_quote = tweet['is_quote_status']
    is_retweet = tweet['text'].startswith('RT')

    quote = _status_summary(tweet['quoted_status']) if is_quote else None

    retweet = None
    if is_retweet:
        if is_quote:
            # NOTE(review): for a retweet of a quote tweet the original code filled the
            # 'retweet' sub-document from the *quoted* status; kept as-is - confirm intent.
            retweet = dict(quote)
        else:
            retweet = _status_summary(tweet['retweeted_status'])
        # The text stored for a retweet is the text of the retweeted status.
        text = _full_text(tweet['retweeted_status'])
    else:
        text = _full_text(tweet)

    # Everything is derived from *this* tweet only, so nothing can leak over from the
    # previously processed tweet (the original IDs used to be stale for plain tweets).
    referenced = quote if is_quote else retweet

    user = tweet['user']
    media = _entity_summary(tweet['entities'])

    # Document sent to the MongoDB collection.
    mongo_document = {
        'tweet_id': tweet['id'], 'user': user['id'],
        'name': user['name'], 'verified': user['verified'],
        'date': tweet['created_at'], 'source': tweet['source'], 'text': text,
        'in_reply_to_status_id': tweet['in_reply_to_status_id'],
        'in_reply_to_user_id': tweet['in_reply_to_user_id'],
        'is_retweet': is_retweet, 'is_quote': is_quote,
        'retweet': retweet, 'quote': quote, 'media': media,
        'favorite_count': tweet['favorite_count'], 'quote_count': tweet['quote_count'],
        'reply_count': tweet['reply_count'], 'retweet_count': tweet['retweet_count'],
    }

    # Flat row for the DataFrame / CSV export (column order unchanged).
    # pd.NA rather than None marks "no referenced tweet": None would make pandas coerce
    # the ID columns to float64, and 19-digit tweet IDs are not exactly representable there.
    dataframe_row = {
        'tweet_id': tweet['id'], 'user': user['id'], 'name': user['name'],
        'verified': user['verified'], 'date': tweet['created_at'], 'source': tweet['source'],
        'original_tweet_id': referenced['tweet_id'] if referenced is not None else pd.NA,
        'original_tweet_user_id': referenced['user_id'] if referenced is not None else pd.NA,
        'text': text, 'quote_count': tweet['quote_count'],
        'reply_count': tweet['reply_count'], 'retweet_count': tweet['retweet_count'],
        'favorite_count': tweet['favorite_count'], 'favorited': tweet['favorited'],
        'urls': media['urls'], 'hashtags': media['hashtags'], 'mentions': media['mentions'],
    }
    return mongo_document, dataframe_row


# Flattened rows for the DataFrame; rebuilt from scratch every time this cell runs.
tweets = []
# Kept for compatibility with the original cell; nothing in this cell fills it.
users = []

for tweet in data:
    try:
        mongo_data, tweet_row = _build_tweet_records(tweet)
    except (KeyError, TypeError, AttributeError):
        # Record is not a complete tweet (missing or malformed fields): skip it entirely,
        # before anything has been written, so MongoDB and the DataFrame stay in step.
        continue

    try:
        records.insert_one(mongo_data)
    # Except if it's already there
    except pymongo.errors.DuplicateKeyError:
        print("Duplicate Key")
    # Any other database error is deliberately left to propagate instead of being swallowed.

    tweets.append(tweet_row)


In [13]:
# Turn the list of flattened tweet rows into a table, one row per tweet
df_tweets = pd.DataFrame(data=tweets)

In [14]:
# (rows, columns) of the flattened tweet table
print(df_tweets.shape)

(120392, 17)


In [15]:
# Preview the first rows of the flattened tweet table
df_tweets.head()

Unnamed: 0,tweet_id,user,name,verified,date,source,original_tweet_id,original_tweet_user_id,text,quote_count,reply_count,retweet_count,favorite_count,favorited,urls,hashtags,mentions
0,1249403767180668930,1242817830946508801,juwelz v,False,Sun Apr 12 18:27:25 +0000 2020,"<a href=""http://twitter.com/download/iphone"" r...",1249315454797168641,46769281,wishing death on people is weirdo behavior.,0,0,0,0,False,[],[],[nuffsaidny]
1,1249403768023678982,1225145123920588805,efe09,False,Sun Apr 12 18:27:25 +0000 2020,"<a href=""https://mobile.twitter.com"" rel=""nofo...",1249397541596286979,1087735689091928064,"In Turkey, there are 300 thousand prisoners an...",0,0,0,0,False,[],[],[lale_karanfil]
2,1249403769193779202,101007632,Ravin Gupta,False,Sun Apr 12 18:27:26 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",1249319407177744385,1897514666,Thank You Sir !!\nReally thankful for encourag...,0,0,0,0,False,[],[],"[umesh_agr, BSNL_OR]"
3,1249403769567227906,1230170166614482944,Carpe diem,False,Sun Apr 12 18:27:26 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",1249403114614075400,1193535233242664960,"Turkey is so stubborn to change their mind, th...",0,0,0,0,False,[],[],[meysimek]
4,1249403770435493888,4707764075,UpsidedownTurtle 🧢,False,Sun Apr 12 18:27:26 +0000 2020,"<a href=""https://mobile.twitter.com"" rel=""nofo...",1249316363681910784,14135350,This image.\nThis quote.\n\n“One of the reason...,0,0,0,0,False,[],[],[biannagolodryga]


In [16]:
# strptime/strftime patterns: how the raw tweet timestamps are parsed (input_format)
# and how they are rendered back to text (output_format)
input_format, output_format = (
    "%a %b %d %H:%M:%S %z %Y",
    "%Y-%m-%d %H:%M:%S %Z%z",
)

In [17]:
# Confirm the container type of the 'date' column before converting it
type(df_tweets['date'])

pandas.core.series.Series

In [18]:
# Parse the raw timestamp strings with input_format, then write them back into the
# same column as strings rendered with output_format
df_tweets['date'] = (
    pd.to_datetime(df_tweets['date'], format=input_format)
    .dt.strftime(output_format)
)

In [19]:
# Write the flattened tweets to CSV. The target folder is created first so the export
# does not fail with "No such file or directory" on a fresh checkout.
os.makedirs('../data', exist_ok=True)
df_tweets.to_csv('../data/tweets.csv', index=False)