# Working directory

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from google.colab import drive
import json
from collections import Counter
from pprint import pprint

# Mount Drive before changing working directory: the target directory is
# located under the /content/drive mount point
drive.mount('/content/drive')
# IPython line magic (not plain Python): switch the working directory to the
# assignment folder so that later cells can refer to files by relative name
%cd "/content/drive/My Drive/UniMelb---Cluster and Cloud Computing COMP90024_2020_SM1/assignment_twitter"

# Data

Each line of the data file (except the first line) contains JSON text for one row; the row's `doc` field holds a tweet object as described in the
[Tweet Data Dictionary](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object)

In [0]:
datafile = "tinyTwitter.json"

# The tweets contain non-ASCII text (emoji, Thai hashtags, ...), so decode the
# file explicitly as UTF-8 instead of relying on the platform default encoding
with open(datafile, encoding="utf-8") as f:
  lines = f.readlines() # one string per line, trailing newline kept

# Each line keeps its own '\n', so print() leaves a blank line after it
print(lines[0]) # 1st line
print(lines[1]) # 2nd line
print(lines[-1]) # last line

{"total_rows":215443567,"offset":211386044,"rows":[

{"id":"1212161512334336000","key":["sydney",2020,1,1],"value":1,"doc":{"_id":"1212161512334336000","_rev":"1-a51c37b5a85cf6a96735628afbea2c75","created_at":"Wed Jan 01 00:00:00 +0000 2020","id":1212161512334336000,"id_str":"1212161512334336000","text":"@La_Doine Pues ahora si, feliz 2020 🎊","truncated":false,"entities":{"hashtags":[],"symbols":[],"user_mentions":[{"screen_name":"La_Doine","name":"Star","id":320174862,"id_str":"320174862","indices":[0,9]}],"urls":[]},"metadata":{"iso_language_code":"es","result_type":"recent"},"source":"<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>","in_reply_to_status_id":1212157896512942000,"in_reply_to_status_id_str":"1212157896512942080","in_reply_to_user_id":320174862,"in_reply_to_user_id_str":"320174862","in_reply_to_screen_name":"La_Doine","user":{"id":1011510954406961200,"id_str":"1011510954406961153","name":"AySeñorQueNosComenLosMonguers","screen_name":"BocataJam

In [0]:
def parse_tweet(text):
  """
  Parse one line of the data file into a tweet dictionary

  A data line is expected to hold a JSON object whose "doc" field is the
  tweet (Tweet Data Dictionary), optionally followed by a trailing ','.

  Parameters:
    text (str): one raw line read from the data file

  Returns:
    The value stored under "doc", or an empty dict when the line is not
    valid JSON, is not a JSON object, or has no "doc" field
  """

  # Remove the row-separating ',' together with any surrounding whitespace
  # (including '\r\n' line endings) so the remainder is standalone JSON
  text = text.strip(", \t\r\n")

  # Parse json text
  try:
    data = json.loads(text)
  except json.JSONDecodeError: # illegal text, e.g. the opening line of the file
    return {}

  # Valid JSON that is not a row object (a bare number, a list, ...) carries
  # no tweet; report it the same way as illegal text instead of raising
  if not isinstance(data, dict):
    return {}

  # A row without "doc" (or with a null "doc") yields an empty dict
  return data.get("doc") or {}

# Parse every line and keep only the non-empty results
# (parse_tweet returns {} for text that is not valid JSON, such as the first line)
tweets = list(filter(None, map(parse_tweet, lines)))
pprint(tweets[-1]) # show the last parsed tweet

{'_id': '1212167199793614848',
 '_rev': '1-144bb7aa614168777f1a499c8dd6b892',
 'contributors': None,
 'coordinates': None,
 'created_at': 'Wed Jan 01 00:22:36 +0000 2020',
 'entities': {'hashtags': [],
              'symbols': [],
              'urls': [],
              'user_mentions': [{'id': 264107729,
                                 'id_str': '264107729',
                                 'indices': [3, 8],
                                 'name': '5 Seconds of Summer',
                                 'screen_name': '5SOS'}]},
 'favorite_count': 0,
 'favorited': False,
 'geo': None,
 'id': 1212167199793614800,
 'id_str': '1212167199793614848',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': False,
 'lang': 'en',
 'location': 'sydney',
 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},
 'place': None,
 'retweet_count': 6526,
 'ret

# Search for Hashtags in a tweet

The array of [Hashtag Objects](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/entities-object#hashtags) is found under the `entities` field of a tweet, which is described in the [Entities data dictionary](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/entities-object#entitiesobject).


In [0]:
def extract_hashtags(tweet):
  """
  Return the lowercased text of every hashtag object found under
  tweet["entities"]["hashtags"], in their original order
  """

  lowered = []
  for entity in tweet["entities"]["hashtags"]:
    lowered.append(entity["text"].lower()) # hashtag text comes without '#'

  return lowered

# Count how often each (lowercased) hashtag occurs across all parsed tweets
htcounts = Counter(tag for tweet in tweets for tag in extract_hashtags(tweet))
print(htcounts)

Counter({'climatechange': 6, 'scottyfrommarketing': 6, 'auspol': 6, 'australiaburns': 5, 'sonicmovie': 4, 'bushfiresnsw': 3, 'nswfires': 3, 'sydnye': 2, 'sydney': 2, 'climate': 2, 'happynewyear': 2, 'australianfires': 2, 'fireworks': 2, 'nye2020': 2, 'canberra': 2, 'เป๊กผลิตโชค': 2, 'liarfromtheshire': 2, 'morrisonfires': 2, 'countdown2020withpeck': 2, 'nye': 2, 'configmgr': 1, 'ilovesydney': 1, 'sideproject': 1, 'makers': 1, 'bulltitts': 1, 'fatca': 1, 'decadechallenge': 1, '2010vs2020': 1, 'abc': 1, 'priorities': 1, 'stowaway': 1, 'joseout': 1, 'duindorp': 1, 'gardening': 1, 'urbangardens': 1, 'newyearsday2020': 1, 'smoko': 1, 'youvestolenmychildhood': 1, 'howdareyou': 1, 'peoplearedying': 1, 'coalman': 1, 'brisbane': 1, 'labor': 1, 'loveislandau': 1, 'cheers': 1, 'skål': 1, 'avstwitterpsychic': 1, 'goavsgo': 1, 'climatecriminals': 1, '9news': 1, 'theirabc': 1, 'primeminister': 1, 'sackfomo': 1, 'sackscummo': 1, 'southcoastfires': 1, 'bushfires': 1, 'nswpol': 1, 'peckpalitchoke': 1, 

# Search for language in a tweet

The language code is stored under the field 'lang'.  
The [language names](https://developer.twitter.com/en/docs/twitter-for-websites/twitter-for-websites-supported-languages/overview) supported by Twitter will be used to produce the report.

In [0]:
# Language code -> display name, used when reporting languages.
# NOTE(review): the 'lang' values seen in tweets are not all keys of this
# table, so look codes up with a fallback, e.g. langnames.get(code, code)
langnames = {
  "ar": "Arabic",
  "bn": "Bengali",
  "cs": "Czech",
  "da": "Danish",
  "de": "German",
  "el": "Greek",
  "en": "English",
  "es": "Spanish",
  "fa": "Persian",
  "fi": "Finnish",
  "fil": "Filipino",
  "fr": "French",
  "he": "Hebrew",
  "hi": "Hindi",
  "hu": "Hungarian",
  "id": "Indonesian",
  "it": "Italian",
  "ja": "Japanese",
  "ko": "Korean",
  "msa": "Malay",
  "nl": "Dutch",
  "no": "Norwegian",
  "pl": "Polish",
  "pt": "Portuguese",
  "ro": "Romanian",
  "ru": "Russian",
  "sv": "Swedish",
  "th": "Thai",
  "tr": "Turkish",
  "uk": "Ukrainian",
  "ur": "Urdu",
  "vi": "Vietnamese",
  "zh-cn": "Chinese Simplified",
  "zh-tw": "Chinese Traditional",
}

def extract_language(tweet):
  """
  Return the language code stored under the tweet's 'lang' field
  """

  language_code = tweet["lang"]
  return language_code

# Tally how many parsed tweets carry each language code
langcounts = Counter(extract_language(tweet) for tweet in tweets)
print(langcounts)

Counter({'en': 832, 'und': 69, 'fr': 18, 'pt': 17, 'es': 16, 'th': 9, 'tl': 6, 'ar': 4, 'zh': 4, 'de': 4, 'ja': 3, 'in': 2, 'et': 2, 'fa': 2, 'ht': 2, 'ko': 2, 'cy': 1, 'tr': 1, 'nl': 1, 'ro': 1, 'pl': 1, 'hu': 1, 'ru': 1})
