Remember to make sure the notebook runs top-to-bottom by clicking "Restart & Run All" from the "Kernel" menu. You will receive zero credit for answers submitted to Canvas that don't match the work in this notebook!

As a more general note, don't try to draw networks with more than a couple hundred nodes. NetworkX isn't good at drawing large networks and it will take forever, eating all your CPU cycles. 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import gzip
import json
import networkx as nx
from operator import itemgetter

In [3]:
# This could take a few seconds to run.
tweets = []
for line in gzip.open('/content/drive/MyDrive/2022 FALL COURSES/INFO-I368/HW13/worldcupqatar2022.json.gz', 'rt'):
      tweet = json.loads(line)
      tweets.append(tweet)

In [4]:
assert(len(tweets) == 48710)

## Q1

In [5]:
# Q1 What proportion of tweets in this list are retweets? Round to three decimal places, example: 0.123
R = nx.DiGraph()
num_tweets = len(tweets)
num_retweets = 0
for tweet in tweets:
  #CHECK IF REF
  if 'referenced_tweets' in tweet:
    for ref_tweet in tweet['referenced_tweets']:
      # REF NEEDS TO BE RETWEETED
      if ref_tweet['type'] == 'retweeted':
        num_retweets=num_retweets+1
        # AUTHOR OF TWEET (USER RETWEETING / RETWEETER)
        retweeter_user = tweet['author']['username']
        # AUTHOR OF REFERENCED TWEET (RETWEETED USER)
        retweeted_user = ref_tweet['author']['username']
        if 'author' in ref_tweet:
          # EDGE: (RETWEETER_USER ---> RETWEETED_USEER)
          #go in the direction of attention: if you retweet me, 
          #there is an edge from you to me. (Note: this is the opposite of information flow.)
          if R.has_edge(retweeter_user, retweeted_user):
            R.edges[retweeter_user, retweeted_user]
          else:
            R.add_edge(retweeter_user, retweeted_user)

p = num_retweets/num_tweets
p = round(p, 3)
print(f"THERE ARE {num_retweets} RETWEETS OUT OF {num_tweets} TOTAL TWEETS")
print(f"PROPORTION OF TWEETS THAT ARE RETWEETS: {p}")

THERE ARE 30958 RETWEETS OUT OF 48710 TOTAL TWEETS
PROPORTION OF TWEETS THAT ARE RETWEETS: 0.636


## Q2

In [6]:
# Q2 How many nodes are in the retweet network? (exclude self-loops)
print(f"NUMBER OF NODES IN RETWEET NETWORK PRIOR TO REMOVING SELF-LOOPS AND SINGLETONS: {R.number_of_nodes()}")
self_loop_list=[]
for edge in R.edges():
  if edge[0] == edge[1]:
    self_loop_list.append(edge)
R.remove_edges_from(self_loop_list)

singletons = []
for t in R.nodes():
  if R.in_degree(t) == 0 and R.out_degree(t) == 0:
    singletons.append(t)
R.remove_nodes_from(singletons)

print(f"NUMBER OF NODES IN RETWEET NETWORK: {R.number_of_nodes()}")

NUMBER OF NODES IN RETWEET NETWORK PRIOR TO REMOVING SELF-LOOPS AND SINGLETONS: 27621
NUMBER OF NODES IN RETWEET NETWORK: 27567


## Q3

In [7]:
# Q3 How many edges are in the retweet network?
print(f"NUMBER OF EDGES IN RETWEET NETWORK: {R.number_of_edges()}")

NUMBER OF EDGES IN RETWEET NETWORK: 27754


## Q4

In [8]:
# Q4 What is the highest out-degree in the retweet network?
out_degs = {}
for n in R.nodes():
  out_degs[n]=R.out_degree(n)
print(f"HIGHEST OUT-DEGREE IN RETWEET NETWORK: {max(out_degs.values())}")

HIGHEST OUT-DEGREE IN RETWEET NETWORK: 55


## Q5

In [9]:
# Q5 What is the highest in-degree in the retweet network?
in_degs = {}
for n in R.nodes():
  in_degs[n]=R.in_degree(n)
print(f"HIGHEST IN-DEGREE IN RETWEET NETWORK: {max(in_degs.values())}")

HIGHEST IN-DEGREE IN RETWEET NETWORK: 8479


## Q6

In [10]:
# Q6 Which node has the highest in-degree in the network? 
top_5 = sorted(R.in_degree(), key=itemgetter(1), reverse=True)[:5]
print(f"NODE/TWEET WITH HIGHEST IN-DEGREE: {top_5[0][0]} WITH IN-DEGREE OF {top_5[0][1]}")

NODE/TWEET WITH HIGHEST IN-DEGREE: TomorrowLand_We WITH IN-DEGREE OF 8479


## Q7

In [11]:
# Q7 Which node has the second-highest in-degree in the network?
print(f"NODE/TWEET WITH SECOND-HIGHEST IN-DEGREE: {top_5[1][0]} WITH IN-DEGREE OF {top_5[1][1]}")

NODE/TWEET WITH SECOND-HIGHEST IN-DEGREE: CATASTROPHYCLUB WITH IN-DEGREE OF 5500


## Q8

In [12]:
# Q8 What proportion of nodes in this network have zero in-degree? Round to three decimal places, ex: 0.123
num_zero_in = 0
for n in R.nodes():
  if R.in_degree(n) == 0:
    num_zero_in=num_zero_in+1

p = num_zero_in/R.number_of_nodes()
p = round(p, 3)
print(f"THERE ARE {num_zero_in} NODES WITH IN-DEGREE OF ZERO OUT OF {R.number_of_nodes()} TOTAL NODES")
print(f"PROPORTION OF NODES WITH IN-DEGREE ZERO: {p}")

#WE REMOVED ALL SINGLETONS!!

THERE ARE 26314 NODES WITH IN-DEGREE OF ZERO OUT OF 27567 TOTAL NODES
PROPORTION OF NODES WITH IN-DEGREE ZERO: 0.955


## Q11

In [13]:
# Q11 Connectivity
print(f"NETWORK IS STRONGLY CONNECTED: {nx.is_strongly_connected(R)}")
print(f"NETWORK IS WEAKLY CONNECTED: {nx.is_weakly_connected(R)}")
print(f"NUMBER OF WEAKLY CONNECTED COMPONENTS WITHIN THE NETWORK: {nx.number_weakly_connected_components(R)}")

NETWORK IS STRONGLY CONNECTED: False
NETWORK IS WEAKLY CONNECTED: False
NUMBER OF WEAKLY CONNECTED COMPONENTS WITHIN THE NETWORK: 832


## Q12

In [31]:
# Q12 What proportion of this network's nodes are in its largest weakly-connected component? Round to three decimal places.
#print(nx.number_weakly_connected_components(R))
dict_cc = {}
for w in nx.weakly_connected_components(R):
  dict_cc[len(w)] = w
  #print(len(w))

largest_cc = max(list(dict_cc.keys()))
print("THE CONNECTED COMPONENT IS: ", dict_cc[largest_cc])
p = largest_cc/R.number_of_nodes()
p = round(p, 3)
print(f"THERE ARE {num_zero_in} NODES IN THE LARGEST WEAKLY CONNECTED COMPONENT OUT OF {R.number_of_nodes()} TOTAL NODES")
print(f"PROPORTION OF NODES IN THE LARGEST WEAKLY CONNECTED COMPONENT: {p}")

THE CONNECTED COMPONENT IS:  {'bab_coin', 'MVSpree', 'Aaron43db17', 'Tom04110', 'yusaglry_', 'aanggunawan33', 'Syllaou34765828', 'doanthienhanh', 'hseyin08145489', 'l1m4bd', 'Frank_Lampards', 'Maldeven', 'BillBetsy3', 'WillieGibbon2', 'Boy_hariwibowo', 'hadiabbaspour67', 'peterliu_eed05', '77xc', 'Ahmetbalik8', 'FWWinterberg', 'bdkamrul001', 'DonaldJimmy16', 'RanggaSon', 'Jannatu61668355', 'needmoneynotyu', 'Zulfin13', 'sulQAR', 'Adirofik2', 'SaimunHossain01', 'tsearningzone', 'FeltusManes', 'RohitGu92782570', 'iampiusfelix', 'santuyaelur', 'TshepoMoloks19', 'cznftsol', 'LombardoVega', '07Smsm', 'monach18581950', 'sarafsaddam', 'Phorn29089473', 'bapakau_bigwin', 'Johnstontrash', 'DinoDinoOM', 'DavidJi73834854', 'TristaMarlowe', 'aa7zm', 'yvette455716', 'FredericCronin4', 'adjawajahate', 'jocelynAG_1234', 'cohenskigozi31', 'AnnaGorge1', 'Sekoufo73412018', 'NellyRalph3', 'HareemPhoenix', 'MableReeves16', 'nintendoaggy', 'AriesStandoff', 'ndoeldolies', 'fairysbigwin', 'AJemilah', 'sahib87

## Q 14


replacing everything after the last slash with the actual tweet ID.


In [32]:
# Q14 Your task is to find the most retweeted tweet. Exclude self-retweet.
# ask if this is in reguards to the tweets - retweet_count or in the network
# EDGE: (RETWEETER_USER ---> RETWEETED_USER)

#key: user_id value: number retweets 
# Q6 Which node has the highest in-degree in the network? 
#most_retweeted = sorted(R.in_degree(), key=itemgetter(1), reverse=True)[:3]
#print(f"NODE/TWEET WITH HIGHEST RETWEET COUNT: {most_retweeted[0][0]} WITH {most_retweeted[0][1]} RETWEETS")

#for tweet in tweets:
#  if tweet['author']['username'] == most_retweeted[0][0]:
#    print(f"NODE/TWEET ID WITH HIGHEST RETWEET COUNT: {tweet['id']} SCREEN NAME: {most_retweeted[0][0]} WITH {most_retweeted[0][1]} RETWEETS")

###############################################
#attempt 2

tweet_ids_rts = {}
for search_tweet in tweets:
  tweet_ids_rts[search_tweet['id']] = 0

for tweet in tweets:
  for ref_tweet in tweet['referenced_tweets']:
    if ref_tweet['id'] in tweet_ids_rts.keys():
      if ref_tweet['type'] == 'retweeted':
        tweet_ids_rts[ref_tweet['id']]=tweet_ids_rts[ref_tweet['id']]+1


for tweet_id, num_rts in tweet_ids_rts:
  print(f"TWEET_ID: {tweet_id} # OF RETWEETS: {num_rts}");

#https://twitter.com/anything_here/statuses/1590372207490117634 == number of retweets is 891
#https://twitter.com/anything_here/statuses/1589514475442679809 == number of retweets is 110

KeyError: ignored