# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import time
import os
import pprint

# Data

In [None]:
# Root folder of the project; the raw tweet files are read from its "Data" subfolder.
# NOTE(review): this is a relative path — presumably it assumes Google Drive is
# mounted and the working directory is the Colab content root; confirm before
# running the notebook anywhere else.
project_path = "drive/MyDrive/Colab_Notebooks/Projects/Crypto"

In [None]:
# Names of the raw tweet files to load.
# - 'tweet_df.csv' is excluded because the last cell of this notebook writes it into
#   this same folder; without the filter a re-run would pass it to json.load and fail.
# - sorted() makes the order reproducible: os.listdir returns entries in arbitrary
#   order, which would otherwise change which file is index 0 and the row order of
#   the combined DataFrame from run to run.
json_file_names = sorted(
    name
    for name in os.listdir(project_path + "/Data")
    if name != 'tweet_df.csv'
)

**Example Tweet Object**

In [None]:
# Load the first listed file so that a single tweet object can be inspected below.
example_path = project_path + "/Data/" + json_file_names[0]
with open(example_path) as infile:
    tweets_json = json.load(infile)

In [None]:
# Use the first tweet of the loaded file as the worked example.
tweet = tweets_json[0]

In [None]:
# Pretty-print the nested tweet dictionary to show its structure.
pprint.pprint(tweet)

{'contributors': None,
 'coordinates': None,
 'created_at': 'Mon Nov 08 16:10:21 +0000 2021',
 'entities': {'hashtags': [],
              'symbols': [],
              'urls': [{'display_url': 'medium.com/@arker-officia…',
                        'expanded_url': 'https://medium.com/@arker-official/arker-metaverse-2e417ba05ec1',
                        'indices': [109, 132],
                        'url': 'https://t.co/ewiy4qirjB'}],
              'user_mentions': [{'id': 919873360837926912,
                                 'id_str': '919873360837926912',
                                 'indices': [3, 18],
                                 'name': 'Arker The legend of Ohm',
                                 'screen_name': 'ArkerCommunity'}]},
 'favorite_count': 0,
 'favorited': False,
 'filter_level': 'low',
 'geo': None,
 'id': 1457742296263233547,
 'id_str': '1457742296263233547',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_

**Helper functions**

In [99]:
def get_user_info(tweet_obj,sublevel=''):
  """Copy selected user fields to the top level of ``tweet_obj`` (in place).

  With the default ``sublevel`` the fields are read from ``tweet_obj['user']``
  and stored under ``user-<field>``.  With a non-empty ``sublevel`` (for example
  ``'retweeted_status'`` or ``'quoted_status'``) they are read from
  ``tweet_obj[sublevel]['user']`` and stored under ``<sublevel>-user-<field>``.

  The copied fields are ``screen_name``, ``created_at`` and ``followers_count``.
  Nothing is returned, and missing keys are not handled (a plain dict raises
  ``KeyError``).
  """
  nested = sublevel != ''

  # Key prefix for the flattened columns, then the object that owns the 'user' dict.
  prefix = sublevel + '-' if nested else sublevel
  source = tweet_obj[sublevel] if nested else tweet_obj

  user = source['user']
  for field in ('screen_name', 'created_at', 'followers_count'):
    tweet_obj[prefix + 'user-' + field] = user[field]

In [94]:
def consolidate_text(tweet_obj,sublevel=''):
  """Store the fullest available text of a status at the top level (in place).

  Top level (``sublevel == ''``): if the tweet has an ``extended_tweet`` entry,
  ``tweet_obj['text']`` is overwritten with its ``full_text``; otherwise the
  tweet is left untouched.

  Nested status (e.g. ``'retweeted_status'`` or ``'quoted_status'``): the key
  ``<sublevel>-text`` is set to the nested ``extended_tweet`` ``full_text`` when
  present, and to the nested ``text`` otherwise.

  Nothing is returned.
  """
  if sublevel == '':
    if 'extended_tweet' in tweet_obj:
      tweet_obj['text'] = tweet_obj['extended_tweet']['full_text']
    return

  status = tweet_obj[sublevel]
  if 'extended_tweet' in status:
    full_text = status['extended_tweet']['full_text']
  else:
    full_text = status['text']
  tweet_obj[sublevel + '-text'] = full_text


In [95]:
def flatten_tweets(tweets_json):
    """Lift the relevant nested tweet fields into top-level keys.

    For every tweet dictionary in ``tweets_json`` this adds, in place:

    * the author's user fields and the consolidated text (via
      ``get_user_info`` / ``consolidate_text``);
    * the same for ``retweeted_status`` and ``quoted_status`` when present;
    * ``user-location`` / ``user-place`` when the user dict has those keys.

    Returns a new list holding the same (now mutated) tweet dictionaries, in
    input order.
    """
    tweets_list = []

    for tweet in tweets_json:
        # Author info and text of the tweet itself.
        get_user_info(tweet)
        consolidate_text(tweet)

        # Retweets and quote tweets carry a nested status of their own.
        for nested_status in ('retweeted_status', 'quoted_status'):
            if nested_status in tweet:
                get_user_info(tweet, nested_status)
                consolidate_text(tweet, nested_status)

        # Geographic information attached to the user, when available.
        user = tweet['user']
        for geo_field in ('location', 'place'):
            if geo_field in user:
                tweet['user-' + geo_field] = user[geo_field]

        tweets_list.append(tweet)

    return tweets_list

**Extract relevant information and concatenate into DataFrames**

In [100]:
list_of_dfs = []

# Build one DataFrame per raw file; the frames are combined in the next cell.
for file_name in json_file_names:
  file_path = project_path + "/Data/" + file_name
  with open(file_path) as infile:
    raw_tweets = json.load(infile)
  # The file is fully parsed at this point, so flattening happens after it is closed.
  list_of_dfs.append(pd.DataFrame(flatten_tweets(raw_tweets)))
  

In [101]:
# Stack the per-file frames into one.  ignore_index=True gives the result a fresh
# 0..n-1 index; without it each per-file frame contributes its own 0-based labels
# and the combined index contains duplicates until it is reset.
tweet_df = pd.concat(list_of_dfs, ignore_index=True)

In [102]:
# Renumber the rows 0..n-1 and discard the old index labels.
tweet_df = tweet_df.reset_index(drop=True)

In [103]:
# Columns kept for the analysis, in output order.
# NOTE(review): get_user_info also produces 'quoted_status-user-screen_name', which
# is not selected here — confirm that the omission is intentional.
columns = [
    # tweet-level fields
    'created_at',
    'id',
    'text',
    'in_reply_to_screen_name',
    'place',
    'quote_count',
    'reply_count',
    'retweet_count',
    'favorite_count',
    'entities',
    # author
    'user-screen_name',
    'user-created_at',
    'user-followers_count',
    'user-location',
    # retweeted status
    'retweeted_status-user-screen_name',
    'retweeted_status-user-created_at',
    'retweeted_status-user-followers_count',
    'retweeted_status-text',
    # quoted status
    'quoted_status-user-created_at',
    'quoted_status-user-followers_count',
    'quoted_status-text',
]

In [87]:
# Show the column labels currently present in tweet_df.
tweet_df.columns

Index(['created_at', 'id', 'id_str', 'text', 'source', 'truncated',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'retweeted_status', 'is_quote_status', 'quote_count',
       'reply_count', 'retweet_count', 'favorite_count', 'entities',
       'favorited', 'retweeted', 'possibly_sensitive', 'filter_level', 'lang',
       'timestamp_ms', 'user-screen_name', 'user-created_at',
       'user-followers_count', 'retweeted_status-text', 'user-location',
       'quoted_status_id', 'quoted_status_id_str', 'quoted_status',
       'quoted_status_permalink', 'quoted_status-text', 'display_text_range',
       'extended_tweet', 'extended_entities'],
      dtype='object')

In [104]:
# Keep only the analysis columns, in the order given by `columns`.
tweet_df = tweet_df.loc[:, columns]

**Preliminary Data Quality Assurance**

In [105]:
# Number of rows and columns after the column selection.
tweet_df.shape

(50000, 21)

In [106]:
# Per-column non-null counts and dtypes.
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 21 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   created_at                             50000 non-null  object 
 1   id                                     50000 non-null  int64  
 2   text                                   50000 non-null  object 
 3   in_reply_to_screen_name                7533 non-null   object 
 4   place                                  82 non-null     object 
 5   quote_count                            50000 non-null  int64  
 6   reply_count                            50000 non-null  int64  
 7   retweet_count                          50000 non-null  int64  
 8   favorite_count                         50000 non-null  int64  
 9   entities                               50000 non-null  object 
 10  user-screen_name                       50000 non-null  object 
 11  us

In [107]:
# Preview the first rows of the flattened data.
tweet_df.head()

Unnamed: 0,created_at,id,text,in_reply_to_screen_name,place,quote_count,reply_count,retweet_count,favorite_count,entities,user-screen_name,user-created_at,user-followers_count,user-location,retweeted_status-user-screen_name,retweeted_status-user-created_at,retweeted_status-user-followers_count,retweeted_status-text,quoted_status-user-created_at,quoted_status-user-followers_count,quoted_status-text
0,Mon Nov 08 16:10:21 +0000 2021,1457742296263233547,RT @ArkerCommunity: Discover the new metaverse...,,,0,0,0,0,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",Faiz_2294,Wed May 12 08:32:39 +0000 2021,32,"Magelang Utara, Indonesia",ArkerCommunity,Mon Oct 16 10:31:14 +0000 2017,26263.0,"Discover the new metaverse of Arker, start a n...",,,
1,Mon Nov 08 16:10:21 +0000 2021,1457742296821243918,RT @FEhrsam: Have invested in a few groups wor...,,,0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ...",SpaceXEng,Thu Aug 16 00:05:01 +0000 2012,2266,420.69,FEhrsam,Sun Jun 12 19:33:04 +0000 2011,170231.0,Have invested in a few groups working on new c...,Sun May 08 16:03:03 +0000 2011,2709261.0,Crypto cities!\n\nhttps://t.co/mUdpYJSDq0
2,Mon Nov 08 16:10:21 +0000 2021,1457742297320202246,RT @djs_crypto: Crowdsale Contract is the core...,,,0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ...",Embr_Dil,Mon Jul 26 19:28:15 +0000 2021,66,,djs_crypto,Fri Apr 23 10:39:56 +0000 2021,131.0,Crowdsale Contract is the core of the Embr IDO...,,,
3,Mon Nov 08 16:10:21 +0000 2021,1457742297366470657,RT @backerfigth: The #SmartDeFi #launchpad wit...,,,0,0,0,0,"{'hashtags': [{'text': 'SmartDeFi', 'indices':...",yasinsoyudemir1,Thu May 16 15:19:16 +0000 2019,144,,backerfigth,Tue Feb 02 18:21:04 +0000 2021,437.0,The #SmartDeFi #launchpad with its integration...,,,
4,Mon Nov 08 16:10:21 +0000 2021,1457742297425235968,RT @cryptoman71: @bezoge AMA live now!!\n\nhtt...,,,0,0,0,0,"{'hashtags': [{'text': 'Crypto', 'indices': [6...",CryptoEarner968,Sat Feb 20 16:45:54 +0000 2021,3224,"Paris, France",cryptoman71,Mon May 17 10:04:43 +0000 2021,228.0,@bezoge AMA live now!!\n\nhttps://t.co/EXZofTr...,,,


In [60]:
# Re-check the frame's shape (same check as earlier in this section).
tweet_df.shape

(50000, 21)

In [108]:
# Timestamps in this dataset look like 'Mon Nov 08 16:10:21 +0000 2021' (see the
# sample rows above).  Passing the format explicitly avoids per-value format
# guessing, which is slow on this many rows and depends on the pandas version.
created_at_format = '%a %b %d %H:%M:%S %z %Y'

# Convert every "*created_at*" column (tweet, user, retweeted/quoted user) to datetimes.
# Missing values become NaT.
for col in tweet_df.columns:
  if "created_at" in col:
    tweet_df[col] = pd.to_datetime(tweet_df[col], format=created_at_format)

In [109]:
# Persist the cleaned frame as CSV.
# NOTE(review): the file lands in the same "Data" folder that the raw tweet files
# are listed from, so make sure the loader ignores it when the notebook is re-run.
csv_path = project_path + '/Data/' + 'tweet_df.csv'
tweet_df.to_csv(csv_path)