# Twitter Data Exploration & Analysis

In [1]:
import os
import json

In [2]:
# Utility function for locating the data directory
def getDataDir():
    """Return the path of the "data" directory inside the user's home directory.

    The home directory is taken from the HOME environment variable. When HOME
    is not set, os.path.expanduser("~") is used instead so that the path join
    is never attempted on None.

    Returns:
        str: the home directory joined with "data".
    """
    homeDir = os.getenv("HOME")
    if homeDir is None:
        # os.path.join(None, "data") would raise TypeError; fall back to the
        # platform's own notion of the home directory.
        homeDir = os.path.expanduser("~")
    return os.path.join(homeDir, "data")

# Show the resolved data directory as this cell's output
getDataDir()

'/home/jovyan/data'

## Part 1 - Parsing & Exploring the Data

The first step is to read the data from the file one record at a time and
gather statistics and information about the fields. We process one record
at a time to avoid loading everything into memory.

In [3]:
def fnGetDataFromJSON(sLine):
    """Decode one line of text as JSON.

    Parameters:
        sLine: the text to decode.

    Returns:
        The decoded Python object, or None when decoding raised an exception
        (the exception is printed with a "Parsing error:" prefix).
    """
    try:
        return json.loads(sLine)
    except Exception as parseError:
        # Best effort: report the problem and signal failure with None
        print("Parsing error:", parseError)
        return None

In [4]:
# Path of the sample data file: "corona-out-2" inside the directory returned by getDataDir()
filepath = os.path.join(getDataDir(), "corona-out-2")

In [10]:
def fnProcessObj(data, fields, iFieldTypeConflict):
    """Fold the fields of one JSON object into the running statistics in ``fields``.

    For every key in ``data`` the matching record in ``fields`` (created the
    first time the key is seen) has its "count" incremented. When the value is
    not None, "nonNull" is incremented as well and "type" is set to the
    value's Python type.

    Parameters:
        data: dict of field name -> value, or None (nothing is recorded).
        fields: dict of field name -> {"count", "nonNull", "type"}; updated
            in place.
        iFieldTypeConflict: number of type conflicts counted so far.

    Returns:
        int: the conflict count after this object, i.e. the incoming count
        plus one for every non-null value whose type differs from the type
        previously recorded for that field. Integers are immutable, so the
        caller must keep this return value; incrementing the argument inside
        the function is not visible to the caller.
    """
    if data is None:
        return iFieldTypeConflict

    for field, value in data.items():
        # Create a record of this field the first time it shows up
        fieldRec = fields.get(field)
        if fieldRec is None:
            fieldRec = fields[field] = {"count": 0, "nonNull": 0, "type": None}
        fieldRec["count"] += 1

        # NULL entries only contribute to "count"
        if value is None:
            continue

        # Conflict only if a type was already known and it changes
        fieldType = type(value)
        knownType = fieldRec["type"]
        if knownType is not None and fieldType != knownType:
            iFieldTypeConflict += 1

        # Remember the most recently seen type
        fieldRec["nonNull"] += 1
        fieldRec["type"] = fieldType

    return iFieldTypeConflict

In [6]:
def fnPrintStats(fields, iFieldTypeConflict):
    """Print the type-conflict tally followed by a fixed-width table of field statistics.

    Parameters:
        fields: dict of field name -> record with the keys "count", "nonNull"
            and "type" (a Python type, or None when no type was recorded).
        iFieldTypeConflict: value printed after "Fields with conflicting types ".

    Each row shows the field name, its count, its non-null count and the
    type's ``__name__`` ("Undefined" when the recorded type is None).
    """
    # Same column layout for the heading (all strings) and the rows (ints in the middle)
    headerTemplate = "{:32s} {:<6s} {:<6s} {:>16s}"
    rowTemplate = "{:32s} {:<6d} {:<6d} {:>16s}"

    print("Fields with conflicting types ", iFieldTypeConflict)
    print("")

    # One row per field: shows whether a field is always present (1:1) or sparse
    print("Fields")
    print(headerTemplate.format("name", "count", "not null", "type"))

    for fieldName in fields:
        stats = fields[fieldName]
        seenCount = stats["count"]
        nonNullCount = stats["nonNull"]
        seenType = stats["type"]

        typeLabel = "Undefined" if seenType is None else seenType.__name__

        print(rowTemplate.format(fieldName, seenCount, nonNullCount, typeLabel))

In [25]:
# Read up to iMax records. If iMax is None, read them all
def fnGetRecords(filepath, iMax, bDoPrint):
    """Read line-delimited JSON records from ``filepath`` and print field statistics.

    Every non-blank line is parsed with fnGetDataFromJSON. The fields of the
    record itself and of its nested "user", "place", "retweeted_status",
    "quoted_status", "entities", "extended_tweet" and "extended_entities"
    objects are tallied with fnProcessObj. Once reading stops, one table per
    group is printed with fnPrintStats.

    Lines that do not parse, or that parse to something other than a JSON
    object, are skipped and counted instead of aborting the whole scan.

    Parameters:
        filepath: path of the file to read, one JSON document per line.
        iMax: maximum number of records to tally; None reads the whole file.
        bDoPrint: when true, print a "Record N :" heading and the indented
            JSON of every line that is parsed.

    Returns:
        None. All results are printed. Any exception raised while reading or
        printing is caught and reported with a printed message.
    """
    # (key inside a tweet, heading printed above its table), in output order
    nestedSections = [
        ("user", "User fields"),
        ("place", "Place Fields"),
        ("retweeted_status", "Retweet Fields"),
        ("quoted_status", "Quoted Status"),
        ("entities", "Entities"),
        ("extended_tweet", "Extended Tweet"),
        ("extended_entities", "Extended Entities"),
    ]

    def fnTally(obj, objFields, iConflicts):
        """Run fnProcessObj and carry the conflict count forward."""
        iUpdated = fnProcessObj(obj, objFields, iConflicts)
        # An int argument cannot be changed by the callee, so the count has to
        # come back as a return value; if nothing is returned keep what we have.
        return iConflicts if iUpdated is None else iUpdated

    # Keep track of fields and count for the # of tweets those fields appeared in
    fields = {}
    iFieldTypeConflict = 0
    nestedFields = {key: {} for key, _ in nestedSections}
    nestedConflicts = {key: 0 for key, _ in nestedSections}

    # Keep track of how many records were tallied / skipped
    iProcess = 0
    iSkipped = 0

    # Read JSON sample data with tweets
    try:
        print("Reading from ", filepath)

        # JSON text is UTF-8; do not depend on the locale's default encoding
        with open(filepath, "r", encoding="utf-8") as sampleFile:
            # Lets get it one line at a time to avoid loading everything into memory
            for sLine in sampleFile:
                # Ignore whitespaces
                if sLine.isspace():
                    continue

                # For previewing the data: stop before tallying more than iMax records
                if iMax is not None and iProcess >= iMax:
                    break

                if bDoPrint:
                    print("Record", iProcess, ":")
                data = fnGetDataFromJSON(sLine)

                # Should we print / preview each record?
                if bDoPrint:
                    print(json.dumps(data, indent=4))

                # A failed parse yields None, and a line may hold a non-object
                # JSON value; neither has fields to tally.
                if not isinstance(data, dict):
                    iSkipped = iSkipped + 1
                    continue

                # Calculate field and stats
                iFieldTypeConflict = fnTally(data, fields, iFieldTypeConflict)

                # Same tally for each nested object that is present
                for key, _ in nestedSections:
                    nested = data.get(key)
                    if isinstance(nested, dict):
                        nestedConflicts[key] = fnTally(nested, nestedFields[key], nestedConflicts[key])

                iProcess = iProcess + 1

        # Output some basic stats for Tweets
        print("Tweets processed ", iProcess)
        if iSkipped:
            print("Records skipped ", iSkipped)
        fnPrintStats(fields, iFieldTypeConflict)

        # And nested objects
        for key, heading in nestedSections:
            print(heading)
            fnPrintStats(nestedFields[key], nestedConflicts[key])
            print("")

    except Exception as e:
        print("Error while reading JSON records from file", e)

In [None]:
# Scratch check: show the Python type of a float literal
type(32.2)

In [None]:
# Let's sneak a peek at the data: limit the read with iMax=1 and
# pretty-print each record that is read (bDoPrint=True)
fnGetRecords(filepath, 1, True)

### Field breakdown

As shown below, we can pick out the fields that are 1:1 (present in every record) and of simpler types;
these should probably be stored in a fixed type.


In [26]:
# Get stats over the whole file (iMax=None) but don't print out the individual records (bDoPrint=False)
fnGetRecords(filepath, None, False)

Reading from  /home/jovyan/data/corona-out-2
Tweets processed  18518
Fields with conflicting types  0

Fields
name                             count  not null             type
created_at                       18518  18518               str
id                               18518  18518               int
id_str                           18518  18518               str
text                             18518  18518               str
source                           18518  18518               str
truncated                        18518  18518              bool
in_reply_to_status_id            18518  2528                int
in_reply_to_status_id_str        18518  2528                str
in_reply_to_user_id              18518  2734                int
in_reply_to_user_id_str          18518  2734                str
in_reply_to_screen_name          18518  2734                str
user                             18518  18518              dict
geo                              18518  13              