# Twitter SUDO Notebook
Team 84 - Melbourne\
Brendan Pichler(bpichler@student.unimelb.edu.au) 1212335\
George Wang (wagw@student.unimelb.edu.au) 1084224\
Luchen Zhou(luczhou@student.unimelb.edu.au) 1053412\
Wei Wang(wangw16@student.unimelb.edu.au) 900889\
Yihan Wang (yihwang3@student.unimelb.edu.au) 1056614\


In [7]:
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import random
import math
import sys
import time
import ijson
import json
import nltk
import re
from nltk.corpus import stopwords

## Twitter Preprocessing 
Filter 65GB data into 1.5GB using sal data

In [2]:
# Abbreviated forms of Australian States/Territories
STATE_ABBREVS = {
    'New South Wales': '(nsw)',
    'Victoria': '(vic.)',
    'Queensland': '(qld)',
    'South Australia': '(sa)',
    'Western Australia': '(wa)',
    'Tasmania': '(tas.)',
    'Northern Territory': '(nt)',
    'Australian Capital Territory': '(act)',
}

# Australian State Capitals
STATE_CAPITALS = {
    'Sydney': '1gsyd',
    'Melbourne': '2gmel',
    'Brisbane': '3gbri',
    'Adelaide': '4gade',
    'Perth': '5gper',
    'Hobart': '6ghob',
    'Darwin': '7gdar',
    'Canberra': '8acte',
}

def parse_location(location):
    m = re.match('(?P<region>[a-z\s\-]+)(?:,\s)?(?P<state>.+)?', location, re.IGNORECASE)
    return m.groupdict() if m is not None else None

# Generate candidate keys for look up in sal
def generate_candidate_keys(region, state):
    state_abbrev = STATE_ABBREVS.get(state)
    candidate_keys = region.lower().split(' - ')

    # Keep original as a potential candidate
    candidate_keys.append(region)

    if state_abbrev is not None:
        for region in region.split(' - '):
            candidate_key = region.lower() + ' ' + state_abbrev
            candidate_keys.append(candidate_key)
    
    return candidate_keys

# Get candidates gccs
def get_candidate_gccs(keys):
    candidate_gccs = set()

    for key in keys:
        if sal.get(key) is not None:
            candidate_gccs.add(sal.get(key)['gcc'])

    return candidate_gccs

# Test if unique gcc resolved
def location_resolved(gccs):
    return len(gccs) == 1

# Test if location is a capital city
def is_capital(location):
    return location in STATE_CAPITALS
def process_location(region, state):

    # Check if capital in location information
    if is_capital(state):
        return STATE_CAPITALS.get(state)
    
    if is_capital(region):
        return STATE_CAPITALS.get(region)
    
    # Otherwise generate candidate keys and look up in sal
    keys = generate_candidate_keys(region, state)
    gccs = get_candidate_gccs(keys)

    # Return gcc if unique, otherwise None
    if location_resolved(gccs):
        return gccs.pop()
    else:
        return None

In [4]:
SAL_file = 'C:/Users/81409/OneDrive/桌面/CCC/sal.json'
sal = json.load(open(SAL_file, encoding='utf-8'))

In [5]:
policy_bow = ["government","democracy",  "elections", "voting","campaigns","political parties","legislation", "policy", "administration", "diplomacy", "foreign policy","domestic policy", "public policy", "law",     "constitution",     "civil rights",     "civil liberties",     "social justice",     "equality",     "political ideology",     "political spectrum",     "lobbying",     "special interest groups",     "media",     "political commentary",     "political satire",     "corruption",     "transparency",     "accountability",     "political science",     "international relations",     "public opinion",     "propaganda",     "power",     "authority",     "leadership",     "governance",     "policy making",     "public administration",    "bureaucracy",    "campaign finance",    "censorship",    "checks and balances",    "citizenship",    "constituency",    "crisis management",    "debates",    "defamation",    "dictatorship",    "discrimination",    "divisiveness",    "economic policy",    "election security",    "emergency powers",    "fascism",    "freedom of speech",    "human rights",    "impeachment",    "judicial system",    "legislative branch",    "libertarianism",    "lobbyists",    "military",    "minorities",    "nationalism","patriotism","peacekeeping","political asylum","political correctness","political culture","political economy","political stability","populism","protest","public service","reform","representation","revolution","separation of powers","socialism","sovereignty","state","totalitarianism","veto","war","welfare state"]
criminal_bow = ['theft', 'robbery', 'burglary', 'fraud', 'embezzlement', 'forgery', 'extortion', 'blackmail', 'smuggling', 'money laundering', 'racketeering', 'homicide', 'assault', 'kidnapping', 'arson', 'drug trafficking', 'prostitution', 'gambling', 'terrorism', 'cybercrime', 'piracy', 'carjacking', 'vandalism', 'shoplifting', 'pickpocketing', 'cyberbullying', 'hate crime', 'white-collar crime', 'organized crime', 'juvenile delinquency', 'prison', 'parole', 'probation', 'criminal record', 'suspect', 'defendant', 'convict', 'witness', 'prosecutor', 'defense attorney', 'judge', 'jury', 'plea bargain', 'sentencing', 'probation officer', 'correctional facility', 'parole board', 'inmate', 'parolee', 'fugitive', 'surveillance', 'wiretap', 'sting operation', 'forensic','evidence', 'DNA', 'investigation']
employment_bow = ['career', 'profession', 'resume', 'CV', 'interview', 'compensation', 'workplace', 'coworker', 'supervisor', 'manager', 'networking', 'diversity', 'harassment', 'opportunity', 'laws', 'overtime', 'sick leave', 'vacation', 'retirement', 'pension', 'severance', 'unemployment', 'loss', 'security', 'gig', 'freelancing', 'entrepreneurship', 'advancement', 'development', 'growth', 'goals', 'balance', 'insurance', 'program', 'maternity', 'paternity', 'child care', 'scheduling', 'telecommuting', 'metrics', 'incentives', 'bonuses', 'turnover', 'unions', 'contracts', 'agreements', 'termination', 'lawsuits', 'safety', 'culture', 'ethics', 'inclusion', 'resources', 'staffing', 'boards', 'fairs', 'associations', 'shadowing', 'mentorship', 'leadership', 'apprenticeships', 'internships', 'co-op']
traffic_bow = ['vehicle', 'driver', 'road', 'highway', 'street', 'lane', 'intersection', 'stoplight', 'stop sign', 'yield sign', 'speed limit', 'traffic signal', 'pedestrian', 'crosswalk', 'sidewalk', 'parking', 'parking lot', 'parking meter', 'public transit', 'bus', 'train', 'subway', 'light rail', 'bike lane', 'carpool', 'commute', 'congestion', 'accident', 'collision', 'tow truck', 'highway patrol', 'traffic jam', 'detour', 'roadwork', 'construction', 'bridge', 'tunnel', 'toll road', 'expressway', 'roundabout', 'lane closure', 'merge', 'yield', 'U-turn', 'speed bump', 'traffic circle', 'interchange', 'overpass', 'underpass', 'median', 'shoulder', 'off-ramp', 'on-ramp']
# policy_bow = ["election", "vote", "candidate", "campaign", "party", "government", "policy", "law", "justice", "democracy", "freedom", "rights", "constitution", "representative", "power", "sovereignty", "diplomacy", "foreign policy", "national security", "border", "immigration", "citizenship", "regime", "ideology", "political system", "political party", "political institution", "executive", "legislative", "judiciary", "impeachment", "constitutional amendment", "civil rights", "civil liberties", "civic duty", "activism", "protest", "social justice", "inequality", "discrimination", "human rights", "public opinion", "opinion poll", "media", "propaganda", "lobbying", "interest group", "corruption", "accountability", "transparency"]

# just for removing the prefix and suffix
def remove_prefix(bow):
  # Define patterns to match prefixes and suffixes
  prefix_pattern = re.compile(r'^(un|dis|non|anti|in|im|il|ir|over|out|pre|post|sub|super|re)+', re.IGNORECASE)
  suffix_pattern = re.compile(r'(ing|ed|s|es|ly|ment|able|ible|ive|tion|ion|ate|al|ish|ous|ic)+$', re.IGNORECASE)
  
  bow = [term.lower() for term in bow]
  processed_words= []
  for words in bow:
      # Remove any prefixes or suffixes
      words = re.sub(prefix_pattern, '', words)
      words = re.sub(suffix_pattern, '', words)
      # Split the term by spaces
      words_split = words.split(' ')
      # Add each part to the processed list
      for word in words_split:
          processed_words.append(word)
          
  # Remove any duplicates from the list
  processed_terms = list(set(processed_words))

  return processed_terms

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
my_str = 'hypothetically|ancient|700|year|Jewish|temple|was|found|another|country|legislation|burglaries'
# use the nltk library to convert a list of words into root words
def to_root_words(bow):
    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer()
    root_list = []
    for words in bow:
        words_split = words.split(' ')
        for word in words_split:
            new_word = word.lower()
            new_word = ps.stem(new_word)
            new_word = lemmatizer.lemmatize(new_word)
            root_list.append(new_word)
    # root_list = [lemmatizer.lemmatize(word) for word in root_list]
    # bow = [word.lower() for word in bow]
    # root_list = [lemmatizer.lemmatize(word) for word in bow]
    # root_list = [ps.stem(word) for word in root_list]
    return list(set(root_list))
def check_include_topic(input_string,topic_bow):
    input = re.findall(r'\w+', input_string)

    # convert to all lower case
    input = [word.lower() for word in input]
    input_remove = to_root_words(input)
    for word in input_remove:
        if word in topic_bow:
            return True
    return False
print(check_include_topic(my_str, criminal_bow))

False


In [9]:
employment_bow = to_root_words(employment_bow)
traffic_bow = to_root_words(traffic_bow)
criminal_bow = to_root_words(criminal_bow)

In [None]:
import re
import json
from collections import defaultdict
#mypath = 'drive/MyDrive/CCC_Ass2/data/'
#TWEET_NAME = 'twitter-huge.json'
#TWEET_FILE = mypath+TWEET_NAME
TWEET_FILE = "D:/ccc/mnt/ext100/twitter-huge.json"
## {id:"",user_name:"",key:"",tokens:"",full_name:"",location:"",lang:"",text:""}

# {rows.item.doc.data.entities.mentions.item.id, 
# rows.item.doc.data.entities.mentions.item.username,
# rows.item.key =>  contain several values() start with value=start_array end with value=end_array
# rows.item.value.tokens : text like 
# rows.item.doc.includes.places.item.full_name : state location info
# rows.item.doc.data.lang: language info
# rows.item.doc.data.text: 
# }

with open(TWEET_FILE, encoding='utf-8') as file:
    parser = ijson.parse(file)
    with open('spolicy_tweets.json', 'w', encoding='utf-8') as outfile:
        for prefix, event, value in parser:
            try:
                if prefix == 'rows.item' and event =='start_map':
                    tweets_dict = defaultdict()
                    policy_realted = False
                    if_en = False
                elif prefix == 'rows.item.id' and event == 'string':
                    tweets_dict['tweet_id'] = value
                #elif prefix == 'rows.item.doc.data.text' and event == 'string':
                    #tweets_dict['tweet_text'] = value
                elif prefix == 'rows.item.doc.includes.places.item.full_name' and event == 'string':
                    location = parse_location(value)
                    if (location is not None):
                        region = location.get('region') if location.get('region') is not None else None
                        state = location.get('state') if location.get('state') is not None else None
                        gcc = process_location(region, state)
                        tweets_dict['region'] = region
                        tweets_dict['state'] = state
                        tweets_dict['gcc'] = gcc
                        tweets_dict['location'] = value
                elif prefix == 'rows.item.value.tokens' and event == 'string':
                    tweets_dict['token'] = value
                    if check_include_topic(value,policy_bow) != []:
                        tweets_dict['policy_realted'] = True
                    else:
                        tweets_dict['policy_realted'] = False
                elif prefix == 'rows.item.doc.data.author_id' and event == 'string':
                    tweets_dict['author'] = value
                elif prefix == 'rows.item.doc.data.created_at' and event == 'string':
                    tweets_dict['date_create'] = value
                elif prefix == 'rows.item.doc.data.lang' and value == 'en':
                    if_en = True
                elif prefix == 'rows.item.doc.data.public_metrics.retweet_count' and event == 'number':
                    tweets_dict['retweet_count'] = value
                elif prefix == 'rows.item.doc.data.public_metrics.reply_count' and event == 'number':
                    tweets_dict['reply_count'] = value
                elif prefix == 'rows.item.doc.data.public_metrics.like_count' and event == 'number':
                    tweets_dict['like_count'] = value
                elif prefix == 'rows.item.doc.data.public_metrics.quote_count' and event == 'number':
                    tweets_dict['quote'] = value
                elif prefix == 'rows.item.doc.data.context_annotations.item.domain.name' and event=='string':
                    if check_include_topic(value,policy_bow) != []:
                        tweets_dict['domain'] = value
                        policy_realted = True
                elif prefix == 'rows.item.doc.data.context_annotations.item.entity.name' and event=='string':
                    if check_include_topic(value,policy_bow) != []:
                        policy_realted = True
                        tweets_dict['entity'] = value
                if prefix == 'rows.item' and event =='end_map' and 'location' in tweets_dict and if_en:
                    if 'domain' not in tweets_dict:
                        tweets_dict['domain'] = 'NA'
                    if 'entity' not in tweets_dict:
                        tweets_dict['entity'] = 'NA'
                    if 'region' not in tweets_dict:
                        tweets_dict['region'] = 'NA'
                    if 'state' not in tweets_dict:
                        tweets_dict['state'] = 'NA'
                    if 'gcc' not in tweets_dict:
                        tweets_dict['gcc'] = 'NA'
                    if 'location' not in tweets_dict:
                        tweets_dict['location'] = 'NA'
                    json.dump(tweets_dict, outfile)
                    outfile.write(',')
                    outfile.write('\n')
            except:
                continue

In [None]:
with open('policy_tweets.json', 'r') as file:
    # Read the first line
    line = file.readline()
    # Loop through the file until we reach the end
    while line:
        # Print the line
        json_data = json.loads(line)

        # Print the JSON data
        print(json_data)
        line = file.readline()

## Add data to couchDB and create map reduce

In [None]:
import couchdb
import ijson
import json
import re
# Set up the connection string with username and password
username = 'admin'
password = 'admin'
server_url = f'http://{username}:{password}@172.26.132.19:5984/'
    #http://172.26.132.54:5984/_utils 

# Connect to the server
server = couchdb.Server(server_url)

# Create the 'test_data' database if it does not exist
if 'twitter' not in server:
    server.create('twitter')

# Select the 'test_data' database
db = server['twitter']

# Insert the sample documents into the 'test_data' database
with open('ccc/tweets.json', 'r') as file:
    # Read the first line
    line = file.readline()
    # Loop through the file until we reach the end
    while line:
        line = re.search(r'\{.*?\}', line).group()
        try:
            json_data = json.loads(line)
            # Insert the documents into the test_data database
            db.save(json_data)
        except:
            continue
        line = file.readline()


In [None]:
import couchdb
from couchdb.design import ViewDefinition
# Connect to the CouchDB server
username = 'admin'
password = 'admin'
server_url = f'http://{username}:{password}@172.26.132.19:5984/'
server = couchdb.Server(server_url)
# Connect to the database
db = server['twitter']

# Define the map function
def map_func(doc):
    if doc.get('id'):
        yield (doc['_id'], doc['region'])

# Define the reduce function
def reduce_func(keys, values, rereduce):
    return len(values)

# Define the view
view = ViewDefinition('mydesign', 'myview', map_func, reduce_func=reduce_func)
view.get_doc(db)
print(db)
# Save the view to the database
view.sync(db)

# Query the view
results = db.view('_design/mydesign/_view/myview')
print(results)

In [6]:
import couchdb
from couchdb.design import ViewDefinition
# Connect to the CouchDB server
username = 'admin'
password = 'admin'
server_url = f'http://{username}:{password}@172.26.132.19:5984/'
server = couchdb.Server(server_url)

db = server['twitter']

design_doc = {
    "_id": "_design/WesternAustraliaCount",
    "language": "javascript",
    "views": {
        "myview": {
            "map": "function(doc) {if (doc.state && doc.state.includes('Western Australia')) {emit(doc.location, 1);}}",
            "reduce": "function(keys, values, rereduce) { return sum(values); }"
        }
    }
}

# save the design document to the database
db.save(design_doc)

('_design/WesternAustraliaCount', '1-479c8eb8d60f64233ebf2198ce8a490d')

## Process data and add crime_related, traffic_related and employment_related

In [None]:
import re
#[{"tweet_id": "1491567527322808321", "token":, "policy_realted": true, "author": "45472006", "domain": "Interests and Hobbies", "date_create": "2022-02-10T00:19:45.000Z", "retweet_count": 0, "reply_count": 0, "like_count": 0, "quote": 0, "region": "Rockhampton", "state": "Queensland", "gcc": null, "location": "Rockhampton, Queensland", "entity": "NA"}
new_tweet_dict = {}
count = 0
with open('ccc/tweets.json', 'r') as file:
    with open('ccc/new_tweet_count.json', 'w') as outfile:
    # Read the first line
        line = file.readline()
    # Loop through the file until we reach the end
        while line:

            line = re.search(r'\{.*?\}', line).group()
            json_data = json.loads(line)
            new_tweet_dict['tweet_id'] = json_data['tweet_id']
            token = json_data['token']
            new_tweet_dict['token']= token
            new_tweet_dict['policy_related']= json_data['policy_realted']
            new_tweet_dict['author']= json_data['author']
            new_tweet_dict['domain']= json_data['domain']
            new_tweet_dict['date_create']= json_data['date_create']
            new_tweet_dict['retweet_count']= json_data['retweet_count']
            new_tweet_dict['reply_count']= json_data['reply_count']
            new_tweet_dict['like_count']= json_data['like_count']
            new_tweet_dict['region']= json_data['region']
            new_tweet_dict['state']= json_data['state']
            new_tweet_dict['gcc']= json_data['gcc']
            new_tweet_dict['location']= json_data['location']
            new_tweet_dict['entity']= json_data['entity']
        
            new_tweet_dict['entity']= json_data['entity']
        #employment_bow = to_root_words(employment_bow)
        #traffic_bow = to_root_words(traffic_bow)
        #criminal_bow = to_root_words(criminal_bow)
            new_tweet_dict['crime_related'] = check_include_topic(token,criminal_bow)
            new_tweet_dict['triffic_related'] = check_include_topic(token,traffic_bow)
            new_tweet_dict['employment_related'] = check_include_topic(token,employment_bow)
            json.dump(new_tweet_dict, outfile)

            outfile.write('\n')
            count+=1
            if (count%10000==0):
                print(count)
            line = file.readline()
            

In [None]:
with open('ccc/new_tweets.json', 'w') as f:
    json.dump(new_tweet_dict, f)

## Get the count of topic related data and make twitter general anlysis

In [None]:
import ijson
import json
# Open the JSON file
count = 0
count_dict = {}
with open('ccc/new_tweet_count.json', 'r') as f:
    # Use ijson.items to iterate over the items in the JSON file
    line = f.readline()
    # Loop through the file until we reach the end
    while line:
        json_data = json.loads(line)
        count_dict['total_count'] = count_dict.get('total_count', 0) + 1
        
        if (json_data['policy_related']):
            count_dict['policy_count'] = count_dict.get('policy_count', 0) + 1
        if (json_data['triffic_related']):
            count_dict['traffic_count'] = count_dict.get('traffic_count', 0) + 1
        if (json_data['employment_related']):
            count_dict['employment_count'] = count_dict.get('employment_count', 0) + 1
        if (json_data['crime_related']):
            count_dict['crime_count'] = count_dict.get('crime_count', 0) + 1
        line = f.readline()

In [92]:
with open('ccc/total_count_topics.json', 'w') as f:
    json.dump(count_dict, f)

In [15]:
count = 0
frequency = {}
stop_words = set(stopwords.words('english'))
def get_word_frequency(token,frequency):
    words = token.lower().split("|")
    stop_words.add('the')
    stop_words.add('like')
    stop_words.add('amp')
    words = [word for word in words if not word in stop_words]
    
    for word in words:
        if word in frequency:
            frequency[word] += 1
        else:
            frequency[word] = 1
    return frequency

In [None]:
with open('ccc/tweets.json', 'r') as file:
    # Read the first line
    line = file.readline()
    # Loop through the file until we reach the end
    while line:

        line = re.search(r'\{.*?\}', line)
        if line is not None:
            line = line.group()

            json_data = json.loads(line)
            if (json_data['policy_realted']==True):
                token = json_data['token']
                frequency = get_word_frequency(token)
                count += 1
                if (count%100000==0):
                    print(count)

        line = file.readline()

In [None]:
total = {'total_user':set(),'total_post':0,'total_topic_related_user':set(), 'total_topic_related_post':0,
         'topic_user_ratio':0.0,'topic_post_ration':0.0}
count = 0

with open('ccc/new_tweet_count.json', 'r') as f:
    # Use ijson.items to iterate over the items in the JSON file
    line = f.readline()
    # Loop through the file until we reach the end
    while line:
        json_data = json.loads(line)
        user = json_data['author']
        total['total_post'] += 1
        total['total_user'].add(user)
        if(json_data['crime_related'] or json_data['policy_related'] or json_data['triffic_related'] or json_data['employment_related']):
            total['total_topic_related_post'] += 1
            total['total_topic_related_user'].add(user)
        if (count%10000==0):
            print(count)
        count+=1
        line = f.readline()
total['total_topic_related_user'] = len(total['total_topic_related_user'])
total['topic_user_ratio']  = total['total_topic_related_user'] /total['total_user']
total['topic_post_ration']  = total['total_topic_related_post'] /total['total_post']

In [16]:
import ijson
import json
# Open the JSON file
count = 0
location_info_with_topic_count = {}
word_cloud_each_state_topic = {}

with open('ccc/new_tweet_count.json', 'r') as f:
    # Use ijson.items to iterate over the items in the JSON file
    line = f.readline()
    # Loop through the file until we reach the end
    while line:
        json_data = json.loads(line)
        state = json_data['state']
        author = json_data['author']
        token = json_data['token']
        
        if (state is not None):
            location = state
            if (location not in location_info_with_topic_count):
                location_info_with_topic_count[location] = {}
            if(location not in word_cloud_each_state_topic):
                word_cloud_each_state_topic[location] = {}
                word_cloud_each_state_topic[location]['policy_related'] = {}
                word_cloud_each_state_topic[location]['crime_related'] = {}
                word_cloud_each_state_topic[location]['traffic_related'] = {}
                word_cloud_each_state_topic[location]['employment_related'] = {}
            
            location_info_with_topic_count[location]['total_tweet_count'] = location_info_with_topic_count[location].get('total_tweet_count', 0) + 1
            if ('total_author' not in location_info_with_topic_count[location]):
                location_info_with_topic_count[location]['total_author'] = set()
                
            if ('policy_related' not in location_info_with_topic_count[location]):
                location_info_with_topic_count[location]['policy_related'] = {}
                location_info_with_topic_count[location]['policy_related']['author'] = set()
                location_info_with_topic_count[location]['policy_related']['count'] = 0

                
            if ('crime_related' not in location_info_with_topic_count[location]):
                location_info_with_topic_count[location]['crime_related'] = {}
                location_info_with_topic_count[location]['crime_related']['author'] = set()
                location_info_with_topic_count[location]['crime_related']['count'] = 0
                
            if ('traffic_related' not in location_info_with_topic_count[location]):
                location_info_with_topic_count[location]['traffic_related'] = {}
                location_info_with_topic_count[location]['traffic_related']['author'] = set()
                location_info_with_topic_count[location]['traffic_related']['count'] = 0
                
            if ('employment_related' not in location_info_with_topic_count[location]):
                location_info_with_topic_count[location]['employment_related'] = {}
                location_info_with_topic_count[location]['employment_related']['author'] = set()
                location_info_with_topic_count[location]['employment_related']['count'] = 0
                
                
            location_info_with_topic_count[location]['total_author'].add(author)
            if(json_data['policy_related']):
                location_info_with_topic_count[location]['policy_related']['count'] = location_info_with_topic_count[location]['policy_related'].get('count', 0) + 1
                location_info_with_topic_count[location]['policy_related']['author'].add(author)
                word_cloud_each_state_topic[location]['policy_related'] = get_word_frequency(token,word_cloud_each_state_topic[location]['policy_related'])
            
            if(json_data['crime_related']):
                location_info_with_topic_count[location]['crime_related']['count'] = location_info_with_topic_count[location]['crime_related'].get('count', 0) + 1
                location_info_with_topic_count[location]['crime_related']['author'].add(author)
                word_cloud_each_state_topic[location]['crime_related'] = get_word_frequency(token,word_cloud_each_state_topic[location]['crime_related'])
            
            if(json_data['triffic_related']):
                location_info_with_topic_count[location]['traffic_related']['count'] = location_info_with_topic_count[location]['traffic_related'].get('count', 0) + 1
                location_info_with_topic_count[location]['traffic_related']['author'].add(author)
                word_cloud_each_state_topic[location]['traffic_related'] = get_word_frequency(token,word_cloud_each_state_topic[location]['traffic_related'])
            
            if(json_data['employment_related']):
                location_info_with_topic_count[location]['employment_related']['count'] = location_info_with_topic_count[location]['employment_related'].get('count', 0) + 1
                location_info_with_topic_count[location]['employment_related']['author'].add(author)
                word_cloud_each_state_topic[location]['employment_related'] = get_word_frequency(token,word_cloud_each_state_topic[location]['employment_related'])
        count+=1

        #if (count%10000==0):
            #print(count)

        line = f.readline()
        


In [19]:
location_info_with_topic_count['Victoria']['employment_related']['count']

47523

In [21]:
word_cloud_each_state_topic_new = {}
for state in word_cloud_each_state_topic:
    new_state = state.replace(" ", "_")
    if (new_state in australian_state):
        word_cloud_each_state_topic_new[new_state] = {}
        word_cloud_each_state_topic_new[new_state]['political'] = dict(sorted(word_cloud_each_state_topic[state]['policy_related'].items(), key=lambda x: x[1], reverse=True)[:50])
        word_cloud_each_state_topic_new[new_state]['employment'] = dict(sorted(word_cloud_each_state_topic[state]['employment_related'].items(), key=lambda x: x[1], reverse=True)[:50])
        word_cloud_each_state_topic_new[new_state]['traffic'] = dict(sorted(word_cloud_each_state_topic[state]['traffic_related'].items(), key=lambda x: x[1], reverse=True)[:50])
        word_cloud_each_state_topic_new[new_state]['criminal'] = dict(sorted(word_cloud_each_state_topic[state]['crime_related'].items(), key=lambda x: x[1], reverse=True)[:50])

In [46]:
word_cloud_each_state_topic_top50 = {}
for state in word_cloud_each_state_topic:
    if (state in australian_state):
        word_cloud_each_state_topic_top50[state] = word_cloud_each_state_topic[state]

In [23]:
total_author_by_state ={}
for state in location_info_with_topic_count:
    total_author_by_state[state] = len(location_info_with_topic_count[state]['total_author'])

In [25]:
with open('ccc/total_author_count_by_state.json', 'w') as f:
    json.dump(total_author_by_state, f)

In [24]:
author_count_by_topic_by_state ={}
for state in location_info_with_topic_count:
    author_count_by_topic_by_state[state] = {}
    author_count_by_topic_by_state[state]['total_tweet_count'] = location_info_with_topic_count[state]['total_tweet_count']
    author_count_by_topic_by_state[state]['total_author_count'] = len(location_info_with_topic_count[state]['total_author'])
    
    author_count_by_topic_by_state[state]['policy_author_count'] = len(location_info_with_topic_count[state]['policy_related']['author'])
    author_count_by_topic_by_state[state]['policy_related_ratio'] = author_count_by_topic_by_state[state]['policy_author_count']/author_count_by_topic_by_state[state]['total_author_count']
    author_count_by_topic_by_state[state]['policy_tweet_count'] = location_info_with_topic_count[state]['policy_related']['count']

    
    author_count_by_topic_by_state[state]['employment_author_count'] = len(location_info_with_topic_count[state]['employment_related']['author'])
    author_count_by_topic_by_state[state]['employment_related_ratio'] = author_count_by_topic_by_state[state]['employment_author_count']/author_count_by_topic_by_state[state]['total_author_count']
    author_count_by_topic_by_state[state]['employment_tweet_count'] = location_info_with_topic_count[state]['employment_related']['count']

    
    author_count_by_topic_by_state[state]['crime_author_count'] = len(location_info_with_topic_count[state]['crime_related']['author'])
    author_count_by_topic_by_state[state]['crime_related_ratio'] = author_count_by_topic_by_state[state]['crime_author_count']/author_count_by_topic_by_state[state]['total_author_count']
    author_count_by_topic_by_state[state]['crime_tweet_count'] = location_info_with_topic_count[state]['crime_related']['count']

    
    author_count_by_topic_by_state[state]['traffic_author_count'] = len(location_info_with_topic_count[state]['traffic_related']['author'])
    author_count_by_topic_by_state[state]['traffic_related_ratio'] = author_count_by_topic_by_state[state]['traffic_author_count']/author_count_by_topic_by_state[state]['total_author_count']
    author_count_by_topic_by_state[state]['traffic_tweet_count'] = location_info_with_topic_count[state]['traffic_related']['count']


In [25]:
topic_state_info ={'political':{},'criminal':{},'employment':{},'traffic':{}}
for state in location_info_with_topic_count:
    new_state = state.replace(" ", "_")
    if (new_state in australian_state):
        topic_state_info['political'][new_state] ={}
        topic_state_info['political'][new_state]['user_ratio'] = author_count_by_topic_by_state[state]['policy_related_ratio']
        topic_state_info['political'][new_state]['tweet_ratio'] = author_count_by_topic_by_state[state]['policy_tweet_count']/author_count_by_topic_by_state[state]['total_tweet_count']
        topic_state_info['political'][new_state]['user_count'] = author_count_by_topic_by_state[state]['policy_author_count']
        topic_state_info['political'][new_state]['tweet_count'] = author_count_by_topic_by_state[state]['policy_tweet_count']
        topic_state_info['political'][new_state]['total_tweet_count'] = author_count_by_topic_by_state[state]['total_tweet_count']
        topic_state_info['political'][new_state]['total_user_count'] = author_count_by_topic_by_state[state]['total_author_count']
        
        topic_state_info['employment'][new_state] ={}
        topic_state_info['employment'][new_state]['user_ratio'] = author_count_by_topic_by_state[state]['employment_related_ratio']
        topic_state_info['employment'][new_state]['tweet_ratio'] = author_count_by_topic_by_state[state]['employment_tweet_count']/author_count_by_topic_by_state[state]['total_tweet_count']
        topic_state_info['employment'][new_state]['user_count'] = author_count_by_topic_by_state[state]['employment_author_count']
        topic_state_info['employment'][new_state]['tweet_count'] = author_count_by_topic_by_state[state]['employment_tweet_count']
        topic_state_info['employment'][new_state]['total_tweet_count'] = author_count_by_topic_by_state[state]['total_tweet_count']
        topic_state_info['employment'][new_state]['total_user_count'] = author_count_by_topic_by_state[state]['total_author_count']
        
        topic_state_info['criminal'][new_state] ={}
        topic_state_info['criminal'][new_state]['user_ratio'] = author_count_by_topic_by_state[state]['crime_related_ratio']
        topic_state_info['criminal'][new_state]['tweet_ratio'] = author_count_by_topic_by_state[state]['crime_tweet_count']/author_count_by_topic_by_state[state]['total_tweet_count']
        topic_state_info['criminal'][new_state]['user_count'] = author_count_by_topic_by_state[state]['crime_author_count']
        topic_state_info['criminal'][new_state]['tweet_count'] = author_count_by_topic_by_state[state]['crime_tweet_count']
        topic_state_info['criminal'][new_state]['total_tweet_count'] = author_count_by_topic_by_state[state]['total_tweet_count']
        topic_state_info['criminal'][new_state]['total_user_count'] = author_count_by_topic_by_state[state]['total_author_count']
        
        topic_state_info['traffic'][new_state] ={}
        topic_state_info['traffic'][new_state]['user_ratio'] = author_count_by_topic_by_state[state]['traffic_related_ratio']
        topic_state_info['traffic'][new_state]['tweet_ratio'] = author_count_by_topic_by_state[state]['traffic_tweet_count']/author_count_by_topic_by_state[state]['total_tweet_count']
        topic_state_info['traffic'][new_state]['user_count'] = author_count_by_topic_by_state[state]['traffic_author_count']
        topic_state_info['traffic'][new_state]['tweet_count'] = author_count_by_topic_by_state[state]['traffic_tweet_count']
        topic_state_info['traffic'][new_state]['total_tweet_count'] = author_count_by_topic_by_state[state]['total_tweet_count']
        topic_state_info['traffic'][new_state]['total_user_count'] = author_count_by_topic_by_state[state]['total_author_count']

In [27]:
with open('ccc/topic_state_info.json', 'w') as f:
    json.dump(topic_state_info, f)

In [17]:
australian_state = ['New_South_Wales','Victoria','Queensland','South_Australia','Western_Australia','Tasmania','Australian_Capital_Territory','Northern_Territory']

In [24]:
australian_state = set(australian_state)
australian_state

{'Australian_Capital_Territory',
 'New_South_Wales',
 'Northern_Territory',
 'Queensland',
 'South_Australia',
 'Tasmania',
 'Victoria',
 'Western_Australia'}

In [39]:
state_author_count = {}
for state in author_count_by_topic_by_state:
    if (state in australian_state):
        state_author_count[state] = author_count_by_topic_by_state[state]

In [14]:
for location in location_info_with_topic_count:
    if ('policy_related_count' not in location_info_with_topic_count[location]):
        location_info_with_topic_count[location]['policy_related_count'] = 0
    if ('crime_related_count' not in location_info_with_topic_count[location]):
        location_info_with_topic_count[location]['crime_related_count'] = 0
    if ('employment_related_count' not in location_info_with_topic_count[location]):
        location_info_with_topic_count[location]['employment_related_count'] = 0
    if ('triffic_related_count' not in location_info_with_topic_count[location]):
        location_info_with_topic_count[location]['triffic_related_count'] = 0

In [15]:
location_info_with_topic_count
sorted_locations_total_top100 = dict(sorted(location_info_with_topic_count.items(), key=lambda x: x[1]['total_count'], reverse=True)[:100])

In [59]:
sorted_locations_policy_top100_ratio =  dict(sorted(sorted_locations_total_top100.items(), key=lambda x: x[1]['policy_related_count']/x[1]['total_count'], reverse=True)[:100])
sorted_locations_crime_top100_ratio =  dict(sorted(sorted_locations_total_top100.items(), key=lambda x: x[1]['crime_related_count']/x[1]['total_count'], reverse=True)[:100])
sorted_locations_employment_top100_ratio =  dict(sorted(sorted_locations_total_top100.items(), key=lambda x: x[1]['employment_related_count']/x[1]['total_count'], reverse=True)[:100])
sorted_locations_triffic_top100_ratio =  dict(sorted(sorted_locations_total_top100.items(), key=lambda x: x[1]['triffic_related_count']/x[1]['total_count'], reverse=True)[:100])

## Create maps

In [28]:
import geopy
import folium
import googlemaps

# Initialize the geocoder
api_key = "AIzaSyABe6PkJW8dvA3RH-BeAFSzzEZpkxVYVI0"
gmaps = googlemaps.Client(key=api_key)
for location in sorted_locations_triffic_top100:
    sorted_locations_triffic_top100[location]['geo_info'] = gmaps.geocode(location)

In [64]:
with open('ccc/sorted_locations_triffic_top100_ratio.json', 'w') as f:
    json.dump(sorted_locations_triffic_top100_ratio, f)

In [None]:
used_for_map = {}
for location in sorted_locations_top100:
    if (sorted_locations_top100[location]['location_code'] is not None):
        used_for_map[location] = sorted_locations_top100[location]

In [85]:
# Create a map centered at Sydney, Australia
first_item = next(iter(sorted_locations_triffic_top100_ratio.values()))
print(first_item)
start_lat = first_item['geo_info'][0]['geometry']['location']['lat']
start_lng = first_item['geo_info'][0]['geometry']['location']['lng']
map = folium.Map(location=[start_lat, start_lng], zoom_start=10)
count = 0
for location in sorted_locations_triffic_top100_ratio:
    result = sorted_locations_employment_top100_ratio[location]['geo_info']
    # Get the latitude and longitude from the result
    location_lat = result[0]['geometry']['location']['lat']
    location_lng = result[0]['geometry']['location']['lng']
    # Create a marker at the location
    if (count == 0 ): marker = folium.Marker(location=[location_lat, location_lng], icon=folium.Icon(color='red'))
    elif (count == 1):   
        marker = folium.Marker(location=[location_lat, location_lng], icon=folium.Icon(color='pink'))
    elif (count == 2):   
        marker = folium.Marker(location=[location_lat, location_lng], icon=folium.Icon(color='green'))
    else: 
        marker = folium.Marker(location=[location_lat, location_lng])
    popup_text = f"<b>{sorted_locations_employment_top100_ratio[location]['region']}</b><br> Total: {sorted_locations_employment_top100_ratio[location]['total_count']}, <br>policy: {sorted_locations_crime_top100_ratio[location]['policy_related_count']}, <br>crime: {sorted_locations_crime_top100_ratio[location]['crime_related_count']}, <br> employment: {sorted_locations_crime_top100_ratio[location]['employment_related_count']}, <br>triffic: {sorted_locations_crime_top100_ratio[location]['triffic_related_count']}"
    marker.add_child(folium.Popup(popup_text,max_width=150))
    marker.add_to(map)

    # Draw a bounding box around the location
    bounds = result[0]['geometry']['viewport']
    sw = [bounds['southwest']['lat'], bounds['southwest']['lng']]
    ne = [bounds['northeast']['lat'], bounds['northeast']['lng']]
    folium.Rectangle(bounds=[sw, ne], color='#3186cc', fill=True, fill_color='#3186cc', fill_opacity=0.2).add_to(map)
    count+=1
map.save('ccc/traffic_count_map_ratio.html')
map

{'total_count': 2294, 'triffic_related_count': 1135, 'state': 'Canberra', 'location_code': '8acte', 'region': 'Paddys River', 'employment_related_count': 29, 'policy_related_count': 1, 'crime_related_count': 0, 'geo_info': [{'address_components': [{'long_name': 'Paddys River', 'short_name': 'Paddys River', 'types': ['locality', 'political']}, {'long_name': 'Australian Capital Territory', 'short_name': 'ACT', 'types': ['administrative_area_level_1', 'political']}, {'long_name': 'Australia', 'short_name': 'AU', 'types': ['country', 'political']}, {'long_name': '2620', 'short_name': '2620', 'types': ['postal_code']}], 'formatted_address': 'Paddys River ACT 2620, Australia', 'geometry': {'bounds': {'northeast': {'lat': -35.3195077, 'lng': 149.0762151}, 'southwest': {'lat': -35.5354139, 'lng': 148.8676703}}, 'location': {'lat': -35.444795, 'lng': 148.9604438}, 'location_type': 'APPROXIMATE', 'viewport': {'northeast': {'lat': -35.3195077, 'lng': 149.0762151}, 'southwest': {'lat': -35.5354139

In [None]:
# Create a map centered at the location
map = folium.Map(location=[location_lat, location_lng], zoom_start=15)

# Create a marker at the location
marker = folium.Marker(location=[location_lat, location_lng])
popup_text = f"{state_name}"
marker.add_child(folium.Popup(popup_text))
marker.add_to(map)

# Draw a bounding box around the location
bounds = result[0]['geometry']['viewport']
sw = [bounds['southwest']['lat'], bounds['southwest']['lng']]
ne = [bounds['northeast']['lat'], bounds['northeast']['lng']]
folium.Rectangle(bounds=[sw, ne], color='#3186cc', fill=True, fill_color='#3186cc', fill_opacity=0.2).add_to(map)

# Display the map
map

In [None]:
marker = folium.Marker(location=[location_lat, location_lng])

    # Add a popup to the marker showing the area name and count
popup_text = f"{state_name}"
marker.add_child(folium.Popup(popup_text))

# Add the marker to the map
marker.add_to(map)
map

## SUDO and twitter data anlysis

In [2]:
import pandas as pd

In [3]:
traffic_df = pd.read_csv('ccc/traffic.csv') 
employment_df = pd.read_csv('ccc/employment.csv') 
income_df = pd.read_csv('ccc/household.csv') 
criminal_df = pd.read_csv('ccc/crime.csv') 
political_df = pd.read_csv('ccc/political.csv') 
sa4 = pd.read_csv('ccc/SA4 State.csv') 

In [5]:
australian_states = {'New South Wales', 'Victoria', 'Queensland', 'Western Australia', 'South Australia', 'Tasmania', 'Australian Capital Territory', 'Northern Territory'}

In [48]:
sa4['SA4_CODE_2016'][1]

102

In [6]:
sa4_code_dict = {}
for i in range(len(sa4['SA4_CODE_2016'])):
    code = sa4['SA4_CODE_2016'][i]
    state = sa4['STATE_NAME_2016'][i]
    sa4_code_dict[code] = state

In [7]:
political_df.columns

Index(['lq_3_3', ' lq_3', ' stateab', ' poll_id'], dtype='object')

In [138]:
political_dict = {}
for i in range(len(political_df[' stateab'])):
    state = state_names_uppercase[political_df[' stateab'][i]]
    new_state = state.replace(" ", "_")
    if (new_state not in political_dict):
        political_dict[new_state] = {'first_voter_counts':0.0,'poll_counts':0}
    political_dict[new_state]['first_voter_counts']+=political_df[' lq_3'][i]
    political_dict[new_state]['poll_counts'] += 1

In [139]:
political_dict

{'New_South_Wales': {'first_voter_counts': 2279.7798963160008,
  'poll_counts': 2561},
 'Victoria': {'first_voter_counts': 1529.9651703689985, 'poll_counts': 1698},
 'Queensland': {'first_voter_counts': 1240.6236592580003, 'poll_counts': 1344},
 'South_Australia': {'first_voter_counts': 580.3644889039997,
  'poll_counts': 642},
 'Western_Australia': {'first_voter_counts': 694.8065777769998,
  'poll_counts': 741},
 'Tasmania': {'first_voter_counts': 259.0274962959998, 'poll_counts': 305},
 'Northern_Territory': {'first_voter_counts': 81.59863703400005,
  'poll_counts': 69},
 'Australian_Capital_Territory': {'first_voter_counts': 99.01782222299998,
  'poll_counts': 79}}

In [140]:
for state in political_dict:
    political_dict[state]['first_voter_average'] = political_dict[state]['first_voter_counts']/political_dict[state]['poll_counts']

In [142]:
file_path = 'ccc/political_sudo.json'
with open(file_path, 'w') as file:
    json.dump(political_dict, file)

In [4]:
state_names = {
    'NSW': 'New South Wales',
    'Qld': 'Queensland',
    'SA': 'South Australia',
    'Tas': 'Tasmania',
    'NT': 'Northern Territory',
    'ACT': 'Australian Capital Territory',
    'VIC':'Victoria',
    'WA': 'Western Australia'
}
state_names_uppercase = {key.upper(): value for key, value in state_names.items()}
state_names_uppercase

{'NSW': 'New South Wales',
 'QLD': 'Queensland',
 'SA': 'South Australia',
 'TAS': 'Tasmania',
 'NT': 'Northern Territory',
 'ACT': 'Australian Capital Territory',
 'VIC': 'Victoria',
 'WA': 'Western Australia'}

In [119]:
criminal_dict = {}
for i in range(len(criminal_df)):
    state = state_names[criminal_df['State'][i]]
    new_state = state.replace(" ", "_")
    criminal_dict[new_state] = {'acts_intended_to_cause_injury':criminal_df['Acts intended to cause injury(b)'][i],
                                'offences_against_justice': criminal_df['Offences against justice(c)'][i],
                                 'theft':criminal_df['Theft'][i]}

In [120]:
criminal_dict

{'New_South_Wales': {'acts_intended_to_cause_injury': 39.4,
  'offences_against_justice': 12.6,
  'theft': 10.7},
 'Queensland': {'acts_intended_to_cause_injury': 18.4,
  'offences_against_justice': 22.6,
  'theft': 13.4},
 'South_Australia': {'acts_intended_to_cause_injury': 35.5,
  'offences_against_justice': 10.5,
  'theft': 14.7},
 'Tasmania': {'acts_intended_to_cause_injury': 28.9,
  'offences_against_justice': 16.7,
  'theft': 11.6},
 'Northern_Territory': {'acts_intended_to_cause_injury': 43.5,
  'offences_against_justice': 19.0,
  'theft': 4.5},
 'Australian_Capital_Territory': {'acts_intended_to_cause_injury': 31.3,
  'offences_against_justice': 23.5,
  'theft': 15.7}}

In [122]:
file_path = 'ccc/criminal_sudo.json'
with open(file_path, 'w') as file:
    json.dump(criminal_dict, file)

In [11]:
employment_dict = {}
for i in range(len(employment_df[' sa4_code11'])):
    sa4 = employment_df[' sa4_code11'][i]
    if (sa4 in sa4_code_dict):
        state = sa4_code_dict[sa4]
        new_state = state.replace(" ", "_")
        if (new_state not in employment_dict):
            employment_dict[new_state] = {'science_industry':0.0,'construction_industry':0.0,'financial_insurance_industry':0.0,'total_industry':0.0,'sa4_counts':0}
        employment_dict[new_state]['science_industry'] += employment_df[' proj_empy_grth_five_yrs_may_2022_pr100_pro_sci_tech'][i]
        employment_dict[new_state]['construction_industry'] += employment_df[' proj_empy_grth_five_yrs_may_2022_pr100_cons'][i]
        employment_dict[new_state]['financial_insurance_industry'] += employment_df[' proj_empy_grth_five_yrs_may_2022_pr100_finc_insur'][i]
        employment_dict[new_state]['total_industry'] += employment_df[' proj_empy_grth_five_yrs_may_2022_000_tot_industry'][i]
        employment_dict[new_state]['sa4_counts'] += 1

In [14]:
for state in employment_dict:
    employment_dict[state]['science_average_growth'] = employment_dict[state]['science_industry']/employment_dict[new_state]['sa4_counts']
    employment_dict[state]['construction_average_growth'] = employment_dict[state]['construction_industry']/employment_dict[new_state]['sa4_counts']
    employment_dict[state]['financial_average_growth'] = employment_dict[state]['financial_insurance_industry']/employment_dict[new_state]['sa4_counts']
    employment_dict[state]['total_average_growth'] = employment_dict[state]['total_industry']/employment_dict[new_state]['sa4_counts']

In [16]:
employment_final_dict = {}
for state in employment_dict:
    employment_final_dict[state] ={}
    employment_final_dict[state]['science_average_growth'] = employment_dict[state]['science_average_growth']
    employment_final_dict[state]['construction_average_growth'] = employment_dict[state]['construction_average_growth']
    employment_final_dict[state]['financial_average_growth'] = employment_dict[state]['financial_average_growth']
    employment_final_dict[state]['total_average_growth'] = employment_dict[state]['total_average_growth']
employment_final_dict

{'New_South_Wales': {'science_average_growth': 159.01888757999998,
  'construction_average_growth': 177.44163839999996,
  'financial_average_growth': 63.11964656,
  'total_average_growth': 159.97625014999997},
 'Victoria': {'science_average_growth': 105.4256052,
  'construction_average_growth': 108.79536395,
  'financial_average_growth': 23.817779475000005,
  'total_average_growth': 137.2852897},
 'Queensland': {'science_average_growth': 87.74902469999998,
  'construction_average_growth': 56.330685835,
  'financial_average_growth': 42.039010239999996,
  'total_average_growth': 90.63743324500001},
 'South_Australia': {'science_average_growth': 12.661294300000002,
  'construction_average_growth': 26.6815509,
  'financial_average_growth': 7.665607615,
  'total_average_growth': 19.42952895},
 'Western_Australia': {'science_average_growth': 24.896312350000002,
  'construction_average_growth': 32.5555484,
  'financial_average_growth': 16.736769250000002,
  'total_average_growth': 45.3072578}

In [18]:
file_path = 'ccc/employment_sudo.json'
with open(file_path, 'w') as file:
    json.dump(employment_final_dict, file)

In [94]:
traffic_df

Unnamed: 0,sa4_code_2021,worked_home_p,two_methods_train_tot_p,one_method_train_p,tot_p,one_met_tram_or_lt_rail_p,two_methods_bus_ferry_p,two_methds_othr_two_methds_p,one_method_bicycle_p,one_method_truck_p,...,two_methods_train_ferry_p,one_method_other_p,one_method_car_as_driver_p,one_method_car_as_passenger_p,two_methods_bus_other_p,one_method_tot_one_method_p,three_meth_othr_three_meth_p,one_method_ferry_p,two_methods_trn_car_as_pass_p,three_meth_tot_three_meth_p
0,101,13322,55,38,111423,8,0,685,369,1382,...,0,573,72422,5168,18,84989,38,13,3,79
1,102,38433,926,1126,150834,9,10,624,273,1192,...,8,580,74408,4924,33,85838,23,24,72,211
2,103,11354,34,24,95519,0,0,568,246,1046,...,4,464,62148,4730,8,73126,33,23,0,45
3,104,6710,18,10,59192,0,0,349,416,593,...,0,249,38422,2985,3,45126,9,7,0,28
4,105,5073,7,4,49502,7,0,313,174,576,...,0,314,32347,2822,3,39113,32,8,0,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,604,3260,10,4,50109,0,0,259,128,339,...,0,265,34962,2433,11,40426,18,9,0,28
85,701,3235,30,6,74631,7,0,593,1266,398,...,0,866,50635,4399,58,63244,33,27,6,113
86,702,1252,3,8,31846,6,3,406,872,153,...,0,380,17098,2680,14,27296,35,14,3,61
87,801,27189,84,110,248388,2281,0,2412,5216,1009,...,0,1068,146631,12458,308,190134,112,33,4,535


In [92]:
traffic_dict = {}
for i in range(len(traffic_df['sa4_code_2021'])):
    sa4 = traffic_df['sa4_code_2021'][i]
    state = sa4_code_dict[sa4]
    new_state = state.replace(" ", "_")
    if (new_state not in traffic_dict):
        traffic_dict[new_state] = {'one_method_total':0,'two_method_total':0,'three_method_total':0,'total_people':0}
    traffic_dict[new_state]['one_method_total'] += np.int(traffic_df[' one_method_tot_one_method_p'][i])
    traffic_dict[new_state]['two_method_total'] += np.int(traffic_df[' two_methods_tot_two_methods_p'][i])
    traffic_dict[new_state]['three_method_total'] += np.int(traffic_df[' three_meth_tot_three_meth_p'][i])
    traffic_dict[new_state]['total_people'] += np.int(traffic_df[' tot_p'][i])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  traffic_dict[new_state]['one_method_total'] += np.int(traffic_df[' one_method_tot_one_method_p'][i])
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  traffic_dict[new_state]['two_method_total'] += np.int(traffic_df[' two_methods_tot_two_methods_p'][i])
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  traffic_dict[new_state]['three_method_total'] += np.int(traffic_df[' three_meth_tot_three_meth_p'][i])
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  traffic_dict[new_state]['total_people'] += np.int(traffic_df[' tot_p'][i])


In [95]:
for state in traffic_dict:
    traffic_dict[state]['one_method_ratio'] = traffic_dict[state]['one_method_total']/traffic_dict[state]['total_people']
    traffic_dict[state]['two_method_ratio'] = traffic_dict[state]['two_method_total']/traffic_dict[state]['total_people']
    traffic_dict[state]['three_method_ratio'] = traffic_dict[state]['three_method_total']/traffic_dict[state]['total_people']
json_data = json.dumps(traffic_dict)
file_path = 'ccc/traffic_sudo.json'
with open(file_path, 'w') as file:
    json.dump(traffic_dict, file)