# Working directory

In [2]:
import json
import os
import re
from collections import Counter
from itertools import islice
from pprint import pprint

from google.colab import drive

# Mount Drive before changing working directory
# (the %cd target below lives under the /content/drive mount point)
drive.mount('/content/drive')
# %cd is an IPython line magic: it changes the kernel's working directory, so the
# relative data path used by later cells ("data/tinyTwitter.json") resolves from here
%cd "/content/drive/My Drive/unimelb-cluster-and-cloud-computing-comp90024-2020-sm1/twitter_hashtags"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/unimelb-cluster-and-cloud-computing-comp90024-2020-sm1/twitter_hashtags


# Data

Each line of the data file (except the first and last lines) contains JSON text for one row; the tweet itself, a
[Tweet Data Dictionary](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object), is stored under the field "doc".

In [3]:
datafile = "data/tinyTwitter.json"
# The tweets contain emoji and non-Latin text, so decode explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open(datafile, encoding="utf-8") as f:
  lines = f.readlines()

print(lines[0]) # 1st line
print(lines[1]) # 2nd line
print(lines[-1]) # last line

{"total_rows":215443567,"offset":211386044,"rows":[

{"id":"1212161512334336000","key":["sydney",2020,1,1],"value":1,"doc":{"_id":"1212161512334336000","_rev":"1-a51c37b5a85cf6a96735628afbea2c75","created_at":"Wed Jan 01 00:00:00 +0000 2020","id":1212161512334336000,"id_str":"1212161512334336000","text":"@La_Doine Pues ahora si, feliz 2020 🎊","truncated":false,"entities":{"hashtags":[],"symbols":[],"user_mentions":[{"screen_name":"La_Doine","name":"Star","id":320174862,"id_str":"320174862","indices":[0,9]}],"urls":[]},"metadata":{"iso_language_code":"es","result_type":"recent"},"source":"<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>","in_reply_to_status_id":1212157896512942000,"in_reply_to_status_id_str":"1212157896512942080","in_reply_to_user_id":320174862,"in_reply_to_user_id_str":"320174862","in_reply_to_screen_name":"La_Doine","user":{"id":1011510954406961200,"id_str":"1011510954406961153","name":"AySeñorQueNosComenLosMonguers","screen_name":"BocataJam

# Process line (elementary workflow)

## Remove unwanted characters

Each line may end with extra characters (e.g. ',' ']' '}') that belong to the enclosing JSON document rather than to the row itself, which makes the line on its own badly formatted JSON text; those trailing characters need to be removed.

In [0]:
def remove_trails(text):
  """
  Remove unwanted trailing characters from json text
  text: line of json text

  Returns the line without trailing whitespace and without a trailing
  ',' and/or ']' (optionally followed by '}') that comes right after a
  closing '}'. A line that already ends with a complete object
  (e.g. '...}}') is returned unchanged, so the function is idempotent.
  """

  text = text.rstrip() # remove trailing whitespace (\n)

  # The removed suffix must contain a ',' or a ']'. A bare final '}' is never
  # removed on its own: it is the closing brace of the row object, and
  # stripping it would turn a well-formed line into invalid json.
  text = re.sub(r"(?<=\})(?:,\]?|\])\}?$", "", text)

  return text

# Sanity checks: both kinds of trailing junk must reduce to the same clean object
expected_clean = '{"id":"1212","doc":{"_id":"1212"}}'
print(remove_trails('{"id":"1212","doc":{"_id":"1212"}},\n'))
for dirty in ('{"id":"1212","doc":{"_id":"1212"}},\n',
              '{"id":"1212","doc":{"_id":"1212"}}]}\n'):
  assert remove_trails(dirty) == expected_clean

# Real data: clean every line read from the data file, then peek at the first three
lines_clean = [remove_trails(raw) for raw in lines]
print("\n".join(lines_clean[:3]))

{"id":"1212","doc":{"_id":"1212"}}
{"total_rows":215443567,"offset":211386044,"rows":[
{"id":"1212161512334336000","key":["sydney",2020,1,1],"value":1,"doc":{"_id":"1212161512334336000","_rev":"1-a51c37b5a85cf6a96735628afbea2c75","created_at":"Wed Jan 01 00:00:00 +0000 2020","id":1212161512334336000,"id_str":"1212161512334336000","text":"@La_Doine Pues ahora si, feliz 2020 🎊","truncated":false,"entities":{"hashtags":[],"symbols":[],"user_mentions":[{"screen_name":"La_Doine","name":"Star","id":320174862,"id_str":"320174862","indices":[0,9]}],"urls":[]},"metadata":{"iso_language_code":"es","result_type":"recent"},"source":"<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>","in_reply_to_status_id":1212157896512942000,"in_reply_to_status_id_str":"1212157896512942080","in_reply_to_user_id":320174862,"in_reply_to_user_id_str":"320174862","in_reply_to_screen_name":"La_Doine","user":{"id":1011510954406961200,"id_str":"1011510954406961153","name":"AySeñorQueNosComenLos

## Parse for Tweet Data Dictionary

In [0]:
def parse_tweet(text):
  """
  Parse for Tweet Data Dictionary in json text, under field "doc"
  text: line of json text

  Returns the dict stored under "doc". Returns {} when the text is not
  valid json, when the parsed value is not a json object, or when it has
  no dict under "doc" -- callers treat an empty dict as "no tweet here".
  """

  try:
    data = json.loads(text)
  except json.decoder.JSONDecodeError: # illegal text
    return {}

  # Valid json is not necessarily a row object: it may be a list/number,
  # or an object without "doc". Those must not raise KeyError/TypeError.
  if not isinstance(data, dict):
    return {}

  tweet = data.get("doc")
  return tweet if isinstance(tweet, dict) else {}

# Sanity checks on a minimal row and on the (unparseable) header line
sample_row = '{"id":"1212","doc":{"_id":"1212"}}'
header_row = '{"total_rows":215443567,"offset":211386044,"rows":['
print(parse_tweet(sample_row))
assert parse_tweet(sample_row) == {"_id":"1212"}
assert not parse_tweet(header_row) # not any tweet data

# Real data: keep only lines that produced a non-empty tweet dict (drops the first line)
tweets = list(filter(None, map(parse_tweet, lines_clean)))
pprint(tweets[-1])

{'_id': '1212'}
{'_id': '1212167199793614848',
 '_rev': '1-144bb7aa614168777f1a499c8dd6b892',
 'contributors': None,
 'coordinates': None,
 'created_at': 'Wed Jan 01 00:22:36 +0000 2020',
 'entities': {'hashtags': [],
              'symbols': [],
              'urls': [],
              'user_mentions': [{'id': 264107729,
                                 'id_str': '264107729',
                                 'indices': [3, 8],
                                 'name': '5 Seconds of Summer',
                                 'screen_name': '5SOS'}]},
 'favorite_count': 0,
 'favorited': False,
 'geo': None,
 'id': 1212167199793614800,
 'id_str': '1212167199793614848',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': False,
 'lang': 'en',
 'location': 'sydney',
 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},
 'place': None,
 'retweet_cou

## Search for Hashtags

The array of [Hashtag Objects](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/entities-object#hashtags) is found under the `hashtags` field of the tweet's [Entities data dictionary](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/entities-object#entitiesobject).


In [0]:
def extract_hashtags(tweet):
  """
  Extract hashtags from Tweet Data Dictionary and convert to lowercase
  tweet: Tweet Data Dictionary

  Returns a list of lowercased hashtag names (without '#'), in the order
  they appear in tweet["entities"]["hashtags"]. Returns [] for an empty
  tweet or when "entities" / "hashtags" is missing or null.
  """

  if not tweet:
    return []

  # Look the fields up defensively (like extract_language does) so a doc
  # without entities counts as "no hashtags" instead of raising KeyError.
  entities = tweet.get("entities") or {}
  hashtags = entities.get("hashtags") or []

  return [tag["text"].lower() for tag in hashtags] # lowercased hashtag names (without #)

# Tally every hashtag occurrence across all parsed tweets
htcounts = Counter(
  name
  for tweet_doc in tweets
  for name in extract_hashtags(tweet_doc)
)
print(htcounts)

Counter({'climatechange': 6, 'scottyfrommarketing': 6, 'auspol': 6, 'australiaburns': 5, 'sonicmovie': 4, 'bushfiresnsw': 3, 'nswfires': 3, 'sydnye': 2, 'sydney': 2, 'climate': 2, 'happynewyear': 2, 'australianfires': 2, 'fireworks': 2, 'nye2020': 2, 'canberra': 2, 'เป๊กผลิตโชค': 2, 'liarfromtheshire': 2, 'morrisonfires': 2, 'countdown2020withpeck': 2, 'nye': 2, 'configmgr': 1, 'ilovesydney': 1, 'sideproject': 1, 'makers': 1, 'bulltitts': 1, 'fatca': 1, 'decadechallenge': 1, '2010vs2020': 1, 'abc': 1, 'priorities': 1, 'stowaway': 1, 'joseout': 1, 'duindorp': 1, 'gardening': 1, 'urbangardens': 1, 'newyearsday2020': 1, 'smoko': 1, 'youvestolenmychildhood': 1, 'howdareyou': 1, 'peoplearedying': 1, 'coalman': 1, 'brisbane': 1, 'labor': 1, 'loveislandau': 1, 'cheers': 1, 'skål': 1, 'avstwitterpsychic': 1, 'goavsgo': 1, 'climatecriminals': 1, '9news': 1, 'theirabc': 1, 'primeminister': 1, 'sackfomo': 1, 'sackscummo': 1, 'southcoastfires': 1, 'bushfires': 1, 'nswpol': 1, 'peckpalitchoke': 1, 

In [0]:
# Ten most frequent hashtags as (name, count) pairs, highest count first
htcounts.most_common(10)

[('climatechange', 6),
 ('scottyfrommarketing', 6),
 ('auspol', 6),
 ('australiaburns', 5),
 ('sonicmovie', 4),
 ('bushfiresnsw', 3),
 ('nswfires', 3),
 ('sydnye', 2),
 ('sydney', 2),
 ('climate', 2)]

## Search for language in a tweet

The language code is under the field 'lang'.  
[Language names](https://developer.twitter.com/en/docs/twitter-for-websites/twitter-for-websites-supported-languages/overview) supported by Twitter will be used to make the report.

In [0]:
# Lookup table: language code -> human-readable language name, for reporting.
# NOTE(review): the language counts printed further down include codes that have
# no entry here (e.g. 'und', 'tl', 'in', 'zh'), so look names up with a fallback,
# e.g. langnames.get(code, code), rather than langnames[code].
langnames = {'ar': 'Arabic',
             'bn': 'Bengali',
             'cs': 'Czech',
             'da': 'Danish',
             'de': 'German',
             'el': 'Greek',
             'en': 'English',
             'es': 'Spanish',
             'fa': 'Persian',
             'fi': 'Finnish',
             'fil': 'Filipino',
             'fr': 'French',
             'he': 'Hebrew',
             'hi': 'Hindi',
             'hu': 'Hungarian',
             'id': 'Indonesian',
             'it': 'Italian',
             'ja': 'Japanese',
             'ko': 'Korean',
             'msa': 'Malay',
             'nl': 'Dutch',
             'no': 'Norwegian',
             'pl': 'Polish',
             'pt': 'Portuguese',
             'ro': 'Romanian',
             'ru': 'Russian',
             'sv': 'Swedish',
             'th': 'Thai',
             'tr': 'Turkish',
             'uk': 'Ukrainian',
             'ur': 'Urdu',
             'vi': 'Vietnamese',
             'zh-cn': 'Chinese Simplified',
             'zh-tw': 'Chinese Traditional'}

def extract_language(tweet):
  """
  Return the language code stored in a Tweet Data Dictionary
  tweet: Tweet Data Dictionary

  The value of the 'lang' field is returned as-is; an empty string is
  returned when the field is absent.
  """

  if "lang" in tweet:
    return tweet["lang"]
  return ""

# One count per tweet, keyed by its language code
langcounts = Counter(extract_language(tweet_doc) for tweet_doc in tweets)
print(langcounts)

Counter({'en': 832, 'und': 69, 'fr': 18, 'pt': 17, 'es': 16, 'th': 9, 'tl': 6, 'ar': 4, 'zh': 4, 'de': 4, 'ja': 3, 'in': 2, 'et': 2, 'fa': 2, 'ht': 2, 'ko': 2, 'cy': 1, 'tr': 1, 'nl': 1, 'ro': 1, 'pl': 1, 'hu': 1, 'ru': 1})


In [0]:
# Ten most frequent language codes as (code, count) pairs, highest count first
pprint(langcounts.most_common(10))

[('en', 832),
 ('und', 69),
 ('fr', 18),
 ('pt', 17),
 ('es', 16),
 ('th', 9),
 ('tl', 6),
 ('ar', 4),
 ('zh', 4),
 ('de', 4)]


## Tweet Processor

In [0]:
class Tweet:
    """Single tweet built from one raw line of json text"""

    def __init__(self, text):
        """
        text: line of json text
        """

        # Keep the raw line, then strip trailing junk and parse it in one step;
        # data ends up as an empty dict when the line holds no tweet
        self.text = text
        self.data = parse_tweet(remove_trails(text))

        # Fields derived from the parsed tweet
        self.hashtags, self.lang = extract_hashtags(self.data), extract_language(self.data)

# Build Tweet objects, dropping badly formatted lines (those with empty data)
tweets_pro = []
for raw_line in lines:
  candidate = Tweet(raw_line)
  if candidate.data:
    tweets_pro.append(candidate)

# Count hashtags, languages
htcounts_pro = Counter(tag for tw in tweets_pro for tag in tw.hashtags)
langcounts_pro = Counter(tw.lang for tw in tweets_pro)
print(htcounts_pro, langcounts_pro, sep="\n")

Counter({'climatechange': 6, 'scottyfrommarketing': 6, 'auspol': 6, 'australiaburns': 5, 'sonicmovie': 4, 'bushfiresnsw': 3, 'nswfires': 3, 'sydnye': 2, 'sydney': 2, 'climate': 2, 'happynewyear': 2, 'australianfires': 2, 'fireworks': 2, 'nye2020': 2, 'canberra': 2, 'เป๊กผลิตโชค': 2, 'liarfromtheshire': 2, 'morrisonfires': 2, 'countdown2020withpeck': 2, 'nye': 2, 'configmgr': 1, 'ilovesydney': 1, 'sideproject': 1, 'makers': 1, 'bulltitts': 1, 'fatca': 1, 'decadechallenge': 1, '2010vs2020': 1, 'abc': 1, 'priorities': 1, 'stowaway': 1, 'joseout': 1, 'duindorp': 1, 'gardening': 1, 'urbangardens': 1, 'newyearsday2020': 1, 'smoko': 1, 'youvestolenmychildhood': 1, 'howdareyou': 1, 'peoplearedying': 1, 'coalman': 1, 'brisbane': 1, 'labor': 1, 'loveislandau': 1, 'cheers': 1, 'skål': 1, 'avstwitterpsychic': 1, 'goavsgo': 1, 'climatecriminals': 1, '9news': 1, 'theirabc': 1, 'primeminister': 1, 'sackfomo': 1, 'sackscummo': 1, 'southcoastfires': 1, 'bushfires': 1, 'nswpol': 1, 'peckpalitchoke': 1, 

# Program (process big file)

## Read specific chunk

In [0]:
def read_lines(filename, start=0, end=-1, encoding="utf-8"):
  """
  Read specific chunk of filename line by line (lazy)
  filename: json file containing tweet data (big file)
  start: byte position to read from (defaults to start of file)
  end: byte position to read to (defaults to end of file)
  encoding: text encoding used to decode each line (defaults to utf-8)

  Yields str lines, line terminator included. A line is yielded when it
  *starts* before `end`, so the line straddling `end` is returned in full;
  if `start` falls inside a line, the first item is the tail of that line.
  """

  if end < 0:
    end = os.path.getsize(filename)

  # Binary mode: start/end are byte offsets, and seeking a text-mode file to
  # an arbitrary byte offset is unsupported (it can land inside a multi-byte
  # character and make readline() raise UnicodeDecodeError).
  with open(filename, "rb") as f:
    f.seek(start)
    while f.tell() < end:
      raw = f.readline()
      if not raw: # end of file reached before `end` -> stop instead of looping forever
        break
      # errors="replace": a chunk may begin mid-character; that partial first
      # line is decoded leniently rather than aborting the whole chunk.
      # Note: "\r\n" endings are kept as-is (no newline translation in binary mode).
      yield raw.decode(encoding, errors="replace")

# Test
filename = '/tmp/data.txt'
with open(filename, 'w') as fh:
  fh.write("Hello World!\nHow are you today?\nThank you!")
print(*read_lines(filename, end=15), sep="\n")
print(*read_lines(filename, start=15, end=20), sep="\n")
print(*read_lines(filename, start=20), sep="\n")

# Real data
# islice keeps the reader lazy: only the first 3 lines are read, instead of
# materialising the whole (potentially very big) file just to slice it.
print(*islice(read_lines(datafile), 3), sep="\n") # first 3 lines


Hello World!

How are you today?

w are you today?

 you today?

Thank you!
{"total_rows":215443567,"offset":211386044,"rows":[

{"id":"1212161512334336000","key":["sydney",2020,1,1],"value":1,"doc":{"_id":"1212161512334336000","_rev":"1-a51c37b5a85cf6a96735628afbea2c75","created_at":"Wed Jan 01 00:00:00 +0000 2020","id":1212161512334336000,"id_str":"1212161512334336000","text":"@La_Doine Pues ahora si, feliz 2020 🎊","truncated":false,"entities":{"hashtags":[],"symbols":[],"user_mentions":[{"screen_name":"La_Doine","name":"Star","id":320174862,"id_str":"320174862","indices":[0,9]}],"urls":[]},"metadata":{"iso_language_code":"es","result_type":"recent"},"source":"<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>","in_reply_to_status_id":1212157896512942000,"in_reply_to_status_id_str":"1212157896512942080","in_reply_to_user_id":320174862,"in_reply_to_user_id_str":"320174862","in_reply_to_screen_name":"La_Doine","user":{"id":1011510954406961200,"id_str":"10115109

## Count hashtags and languages in chunk

In [0]:
def count_hashtags_langs(filename, start=0, end=-1):
  """
  Tally hashtags and languages found in one byte range of filename
  filename: json file containing tweet data (big file)
  start: byte position to read from (defaults to start of file)
  end: byte position to read to (defaults to end of file)

  Returns a pair of Counters: (hashtag counts, language counts).
  """

  tag_tally, lang_tally = Counter(), Counter()

  parsed = (Tweet(raw) for raw in read_lines(filename, start, end))
  for item in parsed:
    if item.data: # lines that did not parse into a tweet are ignored
      tag_tally.update(item.hashtags)
      lang_tally[item.lang] += 1

  return tag_tally, lang_tally

# Non-splitting
htcounts, langcounts = count_hashtags_langs(datafile)
print(htcounts)
print(langcounts)

# Splitting
# Cut points are derived from the file size (roughly equal thirds) instead of
# hard-coded byte offsets, so the cell keeps working for any data file.
size = os.path.getsize(datafile)
cut1, cut2 = size // 3, 2 * size // 3
htcounts1, langcounts1 = count_hashtags_langs(datafile, end=cut1)
htcounts2, langcounts2 = count_hashtags_langs(datafile, start=cut1, end=cut2)
htcounts3, langcounts3 = count_hashtags_langs(datafile, start=cut2)
htcounts_split = htcounts1 + htcounts2 + htcounts3
langcounts_split = langcounts1 + langcounts2 + langcounts3
print(htcounts_split)
print(langcounts_split)

# Splitting must not change the totals
assert htcounts_split == htcounts, "hashtag counts differ between split and non-split runs"
assert langcounts_split == langcounts, "language counts differ between split and non-split runs"

Counter({'climatechange': 6, 'scottyfrommarketing': 6, 'auspol': 6, 'australiaburns': 5, 'sonicmovie': 4, 'bushfiresnsw': 3, 'nswfires': 3, 'sydnye': 2, 'sydney': 2, 'climate': 2, 'happynewyear': 2, 'australianfires': 2, 'fireworks': 2, 'nye2020': 2, 'canberra': 2, 'เป๊กผลิตโชค': 2, 'liarfromtheshire': 2, 'morrisonfires': 2, 'countdown2020withpeck': 2, 'nye': 2, 'configmgr': 1, 'ilovesydney': 1, 'sideproject': 1, 'makers': 1, 'bulltitts': 1, 'fatca': 1, 'decadechallenge': 1, '2010vs2020': 1, 'abc': 1, 'priorities': 1, 'stowaway': 1, 'joseout': 1, 'duindorp': 1, 'gardening': 1, 'urbangardens': 1, 'newyearsday2020': 1, 'smoko': 1, 'youvestolenmychildhood': 1, 'howdareyou': 1, 'peoplearedying': 1, 'coalman': 1, 'brisbane': 1, 'labor': 1, 'loveislandau': 1, 'cheers': 1, 'skål': 1, 'avstwitterpsychic': 1, 'goavsgo': 1, 'climatecriminals': 1, '9news': 1, 'theirabc': 1, 'primeminister': 1, 'sackfomo': 1, 'sackscummo': 1, 'southcoastfires': 1, 'bushfires': 1, 'nswpol': 1, 'peckpalitchoke': 1, 

## Process chunk

In [0]:
def process_chunk(filename, chunks, number):
  """
  Pick and process equal-size chunk of filename according to its number
  filename: json file containing tweet data (big file)
  chunks: total number of chunks
  number (starting 0): current chunk number

  Returns (hashtag counts, language counts) for that chunk.
  """

  # Size the file that is actually being processed (the `filename` argument,
  # not the notebook-level `datafile` global).
  size = os.path.getsize(filename)

  # Integer arithmetic: chunk k ends exactly where chunk k+1 starts, and the
  # last chunk ends exactly at `size`, with no float rounding involved.
  start = number * size // chunks # start byte position
  end = (number + 1) * size // chunks # end byte position

  return count_hashtags_langs(filename, start, end)

# Process the file as three chunks, then merge the per-chunk tallies
(htcounts1, langcounts1), (htcounts2, langcounts2), (htcounts3, langcounts3) = (
  process_chunk(datafile, 3, chunk_no) for chunk_no in range(3)
)
htcounts_split = sum((htcounts1, htcounts2, htcounts3), Counter())
langcounts_split = sum((langcounts1, langcounts2, langcounts3), Counter())
print(htcounts_split, langcounts_split, sep="\n")

Counter({'climatechange': 6, 'scottyfrommarketing': 6, 'auspol': 6, 'australiaburns': 5, 'sonicmovie': 4, 'bushfiresnsw': 3, 'nswfires': 3, 'sydnye': 2, 'sydney': 2, 'climate': 2, 'happynewyear': 2, 'australianfires': 2, 'fireworks': 2, 'nye2020': 2, 'canberra': 2, 'เป๊กผลิตโชค': 2, 'liarfromtheshire': 2, 'morrisonfires': 2, 'countdown2020withpeck': 2, 'nye': 2, 'configmgr': 1, 'ilovesydney': 1, 'sideproject': 1, 'makers': 1, 'bulltitts': 1, 'fatca': 1, 'decadechallenge': 1, '2010vs2020': 1, 'abc': 1, 'priorities': 1, 'stowaway': 1, 'joseout': 1, 'duindorp': 1, 'gardening': 1, 'urbangardens': 1, 'newyearsday2020': 1, 'smoko': 1, 'youvestolenmychildhood': 1, 'howdareyou': 1, 'peoplearedying': 1, 'coalman': 1, 'brisbane': 1, 'labor': 1, 'loveislandau': 1, 'cheers': 1, 'skål': 1, 'avstwitterpsychic': 1, 'goavsgo': 1, 'climatecriminals': 1, '9news': 1, 'theirabc': 1, 'primeminister': 1, 'sackfomo': 1, 'sackscummo': 1, 'southcoastfires': 1, 'bushfires': 1, 'nswpol': 1, 'peckpalitchoke': 1, 

## Sum respective counts from all chunks and find overall top 10s

In [0]:
N = 3 # number of chunks

def main():
  """Process every chunk of datafile, merge the tallies and report the overall top 10s"""
  per_chunk = [process_chunk(datafile, N, idx) for idx in range(N)]

  # Merge the per-chunk counters into overall hashtag / language counts
  htcounts_all = sum((tags for tags, _ in per_chunk), Counter())
  langcounts_all = sum((langs for _, langs in per_chunk), Counter())

  totals = (htcounts_all, langcounts_all)
  for total in totals:
    print(total)
  for total in totals:
    pprint(total.most_common(10))

main()

Counter({'climatechange': 6, 'scottyfrommarketing': 6, 'auspol': 6, 'australiaburns': 5, 'sonicmovie': 4, 'bushfiresnsw': 3, 'nswfires': 3, 'sydnye': 2, 'sydney': 2, 'climate': 2, 'happynewyear': 2, 'australianfires': 2, 'fireworks': 2, 'nye2020': 2, 'canberra': 2, 'เป๊กผลิตโชค': 2, 'liarfromtheshire': 2, 'morrisonfires': 2, 'countdown2020withpeck': 2, 'nye': 2, 'configmgr': 1, 'ilovesydney': 1, 'sideproject': 1, 'makers': 1, 'bulltitts': 1, 'fatca': 1, 'decadechallenge': 1, '2010vs2020': 1, 'abc': 1, 'priorities': 1, 'stowaway': 1, 'joseout': 1, 'duindorp': 1, 'gardening': 1, 'urbangardens': 1, 'newyearsday2020': 1, 'smoko': 1, 'youvestolenmychildhood': 1, 'howdareyou': 1, 'peoplearedying': 1, 'coalman': 1, 'brisbane': 1, 'labor': 1, 'loveislandau': 1, 'cheers': 1, 'skål': 1, 'avstwitterpsychic': 1, 'goavsgo': 1, 'climatecriminals': 1, '9news': 1, 'theirabc': 1, 'primeminister': 1, 'sackfomo': 1, 'sackscummo': 1, 'southcoastfires': 1, 'bushfires': 1, 'nswpol': 1, 'peckpalitchoke': 1, 

In [0]:
import regex

text = "can't, Å, é, and #中ABC _ #sh_t! #abc #มาพ่องเพิ่งอะไร"
# Raw string: '\w' inside a normal string literal is an invalid escape sequence
# (Python warns about it); r'...' passes the backslash through to the regex engine.
print(regex.findall(r'#\w+', text))

['#中ABC', '#sh_t', '#abc', '#มาพ่องเพิ่งอะไร']


# Run application

In [0]:
# NOTE(review): %run is given no script path here, so there is nothing to execute -- TODO: add the application's script file
%run 