# Laboratory 1: setting up Twitter API

## Set up

In [None]:
import os
os.environ['TOKEN'] = #bearer token here

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import libraries

In [2]:
import requests 
import pandas as pd 
import time

### Set up headers

In [None]:
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

In [None]:
headers = create_headers(os.environ['TOKEN'])

## Download data

In [None]:
def create_url(keyword, start_date, end_date, env_label, endpoint="fullarchive"):
    
    search_url = "https://api.twitter.com/1.1/tweets/search/{}.json".format(endpoint+"/"+env_label) 

    #change params based on the endpoint you are using
    query_params = {'query': keyword, 'fromDate': start_date, 'toDate': end_date}
    return (search_url, query_params)

In [None]:
def connect_to_endpoint(url, headers, params, next_token = None):
    if next_token is not None and next_token != '':
      params['next'] = next_token
    response = requests.request("GET", url, headers = headers, params = params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [None]:
def get_data(keyword, start_time, end_time, next_token, env_label, endpoint):
  results = []
  
  while next_token is not None:
    ##this part here for one request
    url = create_url(keyword, start_time,end_time, env_label, endpoint)
    json_response = connect_to_endpoint(url[0], headers, url[1], next_token)
    
    if "results" in json_response:
      results.extend(json_response["results"])
    ### up until this point
    if "next" in json_response:
        next_token = json_response["next"]
    else:
      next_token = None
    time.sleep(1)
  
  return results

In [None]:
def get_single_response(keyword, start_time, end_time, env_label, endpoint):
  results = []
  url = create_url(keyword, start_time,end_time, env_label, endpoint)
  json_response = connect_to_endpoint(url[0], headers, url[1])
  
  if "results" in json_response:
    results.extend(json_response["results"])
  
  return results

#Will return 100 results ad will use only 1 request

In [None]:
#tweets = get_data("Sarajevo", "202002090000", "202002100000", "")

In [None]:
#tweets = get_single_response("COVID19 lang:en", "202110060000", "202111050000", "30day", "30day")

In [None]:
#Tweets already prepered for us, so no need to re-run the code (and run out of requests)

### Inspect data

In [None]:
len(tweets)

In [None]:
tweets[0]

# Laboratory 2: working with Twitter data

First, we want to convert the data into Pandas DataFrame. This format enables us easy manipulation of the data as well as saving/loading data.

Since we have our tweets saved as a list of dictionaries, we can easily convert it to DataFrame by executing the cell blow.

In [None]:
tweets_df = pd.DataFrame(tweets)

In [None]:
tweets_df

### Saving the results

Once we have our Tweets as a DataFrame it is a good idea to save it on the disk. 

Be mindful of the fact that the storage of a Colab notebook is deleted everytime runtime is interrupted or restarted, so you need to manually download it to your computer or mount your Google Drive and save it there (this option is unavailable if you're using university's email account for Drive).

In [None]:
path = "/content/" #enter the path to your Drive or leave this as default

We can save it as a comma-separated values file, which enables opening it in a spreadsheet editor and inspecting it.

In [None]:
tweets_df.to_csv(path+"tweets.csv", index=False)

In order to preserve datatypes, we should save it as a parquet or pickle file.

In [None]:
tweets_df.to_pickle(path+"tweets.pkl")

### Loading the data

The different column have different type, once you save as a cvs and load they will all be strings.
We want to preserve the data so it is very convinient for us to use pkl file

If you want to load the results you have previously saved, simply execute the next code, specifying the path to the file.

You will need to either upload it to the Colab workspace or copy the path to the file on Drive.

In [None]:
#tweets_df = pd.read_pickle(path+"tweets.pkl")

In [None]:
#locally
tweets_df = pd.read_pickle("/home/sabrina/Desktop/NetworkScience/labs/"+"tweets.pkl")
tweets_df

In [4]:
#drive
tweets_df=pd.read_pickle("/content/drive/MyDrive/Colab Notebooks/NSdata/"+"tweets.pkl")
tweets_df

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,user,geo,coordinates,place,contributors,retweeted_status,quoted_status_id,quoted_status_id_str,quoted_status,quoted_status_permalink,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,entities,favorited,retweeted,filter_level,lang,matching_rules,extended_tweet,possibly_sensitive,display_text_range,extended_entities
0,Sat Nov 06 23:59:58 +0000 2021,1457135701732904962,1457135701732904962,RT @mansukhmandviya: Appreciate @Murugan_MoS j...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,,"{'id': 1366620482997420034, 'id_str': '1366620...",,,,,{'created_at': 'Sat Nov 06 16:23:21 +0000 2021...,1.456847e+18,1456847227930890241,{'created_at': 'Sat Nov 06 04:53:40 +0000 2021...,"{'url': 'https://t.co/0gTV7O4gcB', 'expanded':...",True,0,0,0,0,"{'hashtags': [{'text': 'HarGharDastak', 'indic...",False,False,low,en,[{'tag': None}],,,,
1,Sat Nov 06 23:59:57 +0000 2021,1457135697337458691,1457135697337458691,RT @StephenSteglik: Shoutout to @BigBird for n...,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,,,"{'id': 38947085, 'id_str': '38947085', 'name':...",,,,,{'created_at': 'Sat Nov 06 23:59:24 +0000 2021...,1.456972e+18,1456971880666046465,{'created_at': 'Sat Nov 06 13:09:00 +0000 2021...,"{'url': 'https://t.co/r58ZIiZC90', 'expanded':...",True,0,0,0,0,"{'hashtags': [{'text': 'GetVaccinated', 'indic...",False,False,low,en,[{'tag': None}],,,,
2,Sat Nov 06 23:59:56 +0000 2021,1457135694367924227,1457135694367924227,RT @apsmunro: If you hear there have been &gt;...,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,,,"{'id': 16078191, 'id_str': '16078191', 'name':...",,,,,{'created_at': 'Sat Nov 06 12:36:15 +0000 2021...,,,,,False,0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ...",False,False,low,en,[{'tag': None}],,,,
3,Sat Nov 06 23:59:55 +0000 2021,1457135690093776899,1457135690093776899,Tonight on CHEK News at 5 p.m. with @JasmineBa...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",True,,,,,,"{'id': 156749387, 'id_str': '156749387', 'name...",,,,,,,,,,False,0,1,2,2,"{'hashtags': [{'text': 'COVID19', 'indices': [...",False,False,low,en,[{'tag': None}],{'full_text': 'Tonight on CHEK News at 5 p.m. ...,False,,
4,Sat Nov 06 23:59:51 +0000 2021,1457135673773727748,1457135673773727748,RT @SovernNation: A #COVID19 outbreak among th...,"<a href=""https://about.twitter.com/products/tw...",False,,,,,,"{'id': 1295326471821520899, 'id_str': '1295326...",,,,,{'created_at': 'Sat Nov 06 23:59:33 +0000 2021...,,,,,False,0,0,0,0,"{'hashtags': [{'text': 'COVID19', 'indices': [...",False,False,low,en,[{'tag': None}],,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Sat Nov 06 23:55:43 +0000 2021,1457134631937929221,1457134631937929221,RT @disclosetv: NEW - Sesame Street with Ameri...,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,,,"{'id': 3233164824, 'id_str': '3233164824', 'na...",,,,,{'created_at': 'Sat Nov 06 12:45:38 +0000 2021...,,,,,False,0,0,0,0,"{'hashtags': [{'text': 'COVID19', 'indices': [...",False,False,low,en,[{'tag': None}],,False,,"{'media': [{'id': 1456965743774474244, 'id_str..."
96,Sat Nov 06 23:55:43 +0000 2021,1457134631392665602,1457134631392665602,RT @UNICEFIndia: As we celebrate the #COVID19 ...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,,"{'id': 1407216867551244288, 'id_str': '1407216...",,,,,{'created_at': 'Tue Oct 19 03:30:00 +0000 2021...,,,,,False,0,0,0,0,"{'hashtags': [{'text': 'COVID19', 'indices': [...",False,False,low,en,[{'tag': None}],,,,
97,Sat Nov 06 23:55:38 +0000 2021,1457134610958139398,1457134610958139398,RT @BulliedAutistic: @Tammlin1963 You can't tr...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,,"{'id': 1213639608304431104, 'id_str': '1213639...",,,,,{'created_at': 'Sat Nov 06 23:55:27 +0000 2021...,,,,,False,0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ...",False,False,low,en,[{'tag': None}],,,,
98,Sat Nov 06 23:55:35 +0000 2021,1457134599432282114,1457134599432282114,@CoxeAnne @LovesBulldawgs @AnarchistGolden @Do...,"<a href=""http://twitter.com/download/android"" ...",True,1.457113e+18,1457113186658705409,1.163548e+18,1163548191913988096,CoxeAnne,"{'id': 1368049995534974976, 'id_str': '1368049...",,,,,,,,,,False,0,0,0,0,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",False,False,low,en,[{'tag': None}],{'full_text': '@CoxeAnne @LovesBulldawgs @Anar...,,"[95, 140]",


### Preprocessing the data

In our dataframe we have the entire Tweet object. Some columns that might be of particular interest to us are: 

*   created_at - date when Tweet was posted
*   id/id_str - unique Tweet identifiers
*   text - the content of the Tweet
*   user - information about the user who posted the Tweet
*   retweeted_status  - information about the original Tweet
*   quote/reply/retweet/favorite count - Tweet metrics
*   entities - hashtags, urls, user_mentions present in Tweet

We can filter the dataframe and keep only columns we are interested in. You can pick which columns you'd like to keep and put them int the column_list below.



In [5]:
tweets_filtered = tweets_df.copy() #it's a good idea to work on the copy of original dataframe, so we can always go back to it if we mess something up
column_list = ["created_at", "id_str", "text", "user", "retweeted_status", "quote_count", "reply_count", "retweet_count", "favorite_count", "entities"]
tweets_filtered = tweets_filtered[column_list]

In [6]:
tweets_filtered

Unnamed: 0,created_at,id_str,text,user,retweeted_status,quote_count,reply_count,retweet_count,favorite_count,entities
0,Sat Nov 06 23:59:58 +0000 2021,1457135701732904962,RT @mansukhmandviya: Appreciate @Murugan_MoS j...,"{'id': 1366620482997420034, 'id_str': '1366620...",{'created_at': 'Sat Nov 06 16:23:21 +0000 2021...,0,0,0,0,"{'hashtags': [{'text': 'HarGharDastak', 'indic..."
1,Sat Nov 06 23:59:57 +0000 2021,1457135697337458691,RT @StephenSteglik: Shoutout to @BigBird for n...,"{'id': 38947085, 'id_str': '38947085', 'name':...",{'created_at': 'Sat Nov 06 23:59:24 +0000 2021...,0,0,0,0,"{'hashtags': [{'text': 'GetVaccinated', 'indic..."
2,Sat Nov 06 23:59:56 +0000 2021,1457135694367924227,RT @apsmunro: If you hear there have been &gt;...,"{'id': 16078191, 'id_str': '16078191', 'name':...",{'created_at': 'Sat Nov 06 12:36:15 +0000 2021...,0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ..."
3,Sat Nov 06 23:59:55 +0000 2021,1457135690093776899,Tonight on CHEK News at 5 p.m. with @JasmineBa...,"{'id': 156749387, 'id_str': '156749387', 'name...",,0,1,2,2,"{'hashtags': [{'text': 'COVID19', 'indices': [..."
4,Sat Nov 06 23:59:51 +0000 2021,1457135673773727748,RT @SovernNation: A #COVID19 outbreak among th...,"{'id': 1295326471821520899, 'id_str': '1295326...",{'created_at': 'Sat Nov 06 23:59:33 +0000 2021...,0,0,0,0,"{'hashtags': [{'text': 'COVID19', 'indices': [..."
...,...,...,...,...,...,...,...,...,...,...
95,Sat Nov 06 23:55:43 +0000 2021,1457134631937929221,RT @disclosetv: NEW - Sesame Street with Ameri...,"{'id': 3233164824, 'id_str': '3233164824', 'na...",{'created_at': 'Sat Nov 06 12:45:38 +0000 2021...,0,0,0,0,"{'hashtags': [{'text': 'COVID19', 'indices': [..."
96,Sat Nov 06 23:55:43 +0000 2021,1457134631392665602,RT @UNICEFIndia: As we celebrate the #COVID19 ...,"{'id': 1407216867551244288, 'id_str': '1407216...",{'created_at': 'Tue Oct 19 03:30:00 +0000 2021...,0,0,0,0,"{'hashtags': [{'text': 'COVID19', 'indices': [..."
97,Sat Nov 06 23:55:38 +0000 2021,1457134610958139398,RT @BulliedAutistic: @Tammlin1963 You can't tr...,"{'id': 1213639608304431104, 'id_str': '1213639...",{'created_at': 'Sat Nov 06 23:55:27 +0000 2021...,0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ..."
98,Sat Nov 06 23:55:35 +0000 2021,1457134599432282114,@CoxeAnne @LovesBulldawgs @AnarchistGolden @Do...,"{'id': 1368049995534974976, 'id_str': '1368049...",,0,0,0,0,"{'hashtags': [], 'urls': [{'url': 'https://t.c..."


## Extracting words/hashtags

There are many ways to build networks from the data we download from Twitter.

One possibility is to have a bipartite network of Tweets and words/hashtags and then observe word, hashtag or word-hashtag projections.

### Extracting words

In order to extract words, we first need to clean the Tweet text. This way we will remove punctuation, hashtags/mentions/urls (they are preserved in the entity column anyway). We will also turn all letters to lowercase.

You can also consider removing stopwords, removing words that are not in the english language corpora, lematizing the words, etc. I suggest you research nltk library and its possibilities.

In [9]:
! pip install emoji

Collecting emoji
  Downloading emoji-1.6.1.tar.gz (170 kB)
[?25l[K     |██                              | 10 kB 7.7 MB/s eta 0:00:01[K     |███▉                            | 20 kB 11.9 MB/s eta 0:00:01[K     |█████▉                          | 30 kB 15.5 MB/s eta 0:00:01[K     |███████▊                        | 40 kB 17.1 MB/s eta 0:00:01[K     |█████████▋                      | 51 kB 9.9 MB/s eta 0:00:01[K     |███████████▋                    | 61 kB 8.3 MB/s eta 0:00:01[K     |█████████████▌                  | 71 kB 8.2 MB/s eta 0:00:01[K     |███████████████▍                | 81 kB 9.2 MB/s eta 0:00:01[K     |█████████████████▍              | 92 kB 8.2 MB/s eta 0:00:01[K     |███████████████████▎            | 102 kB 8.9 MB/s eta 0:00:01[K     |█████████████████████▏          | 112 kB 8.9 MB/s eta 0:00:01[K     |███████████████████████▏        | 122 kB 8.9 MB/s eta 0:00:01[K     |█████████████████████████       | 133 kB 8.9 MB/s eta 0:00:01[K     |████████

In [10]:
import re
import string
import emoji

In [11]:
def cleaner(tweet):
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) # remove mentions
    tweet = re.sub("#[A-Za-z0-9]+", "",tweet) # remove hashtags
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) # remove http links
    tweet = " ".join(tweet.split())
    tweet = str.lower(tweet) #to lowercase
    table = str.maketrans(dict.fromkeys(string.punctuation)) 
    tweet = tweet.translate(table)# remove punctuation         
    return tweet

In [12]:
#applay function cleaner to the column text using the map function, row by row it will appy the function to every value of the column
tweets_filtered["clean_text"] = tweets_filtered["text"].map(cleaner)

In [13]:
tweets_filtered

Unnamed: 0,created_at,id_str,text,user,retweeted_status,quote_count,reply_count,retweet_count,favorite_count,entities,clean_text
0,Sat Nov 06 23:59:58 +0000 2021,1457135701732904962,RT @mansukhmandviya: Appreciate @Murugan_MoS j...,"{'id': 1366620482997420034, 'id_str': '1366620...",{'created_at': 'Sat Nov 06 16:23:21 +0000 2021...,0,0,0,0,"{'hashtags': [{'text': 'HarGharDastak', 'indic...",rt appreciate mos jis initiative to support p...
1,Sat Nov 06 23:59:57 +0000 2021,1457135697337458691,RT @StephenSteglik: Shoutout to @BigBird for n...,"{'id': 38947085, 'id_str': '38947085', 'name':...",{'created_at': 'Sat Nov 06 23:59:24 +0000 2021...,0,0,0,0,"{'hashtags': [{'text': 'GetVaccinated', 'indic...",rt shoutout to for not going to a podcaster c...
2,Sat Nov 06 23:59:56 +0000 2021,1457135694367924227,RT @apsmunro: If you hear there have been &gt;...,"{'id': 16078191, 'id_str': '16078191', 'name':...",{'created_at': 'Sat Nov 06 12:36:15 +0000 2021...,0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ...",rt if you hear there have been gt100 child de...
3,Sat Nov 06 23:59:55 +0000 2021,1457135690093776899,Tonight on CHEK News at 5 p.m. with @JasmineBa...,"{'id': 156749387, 'id_str': '156749387', 'name...",,0,1,2,2,"{'hashtags': [{'text': 'COVID19', 'indices': [...",tonight on chek news at 5 pm with an outbreak...
4,Sat Nov 06 23:59:51 +0000 2021,1457135673773727748,RT @SovernNation: A #COVID19 outbreak among th...,"{'id': 1295326471821520899, 'id_str': '1295326...",{'created_at': 'Sat Nov 06 23:59:33 +0000 2021...,0,0,0,0,"{'hashtags': [{'text': 'COVID19', 'indices': [...",rt a outbreak among the team left the bears w...
...,...,...,...,...,...,...,...,...,...,...,...
95,Sat Nov 06 23:55:43 +0000 2021,1457134631937929221,RT @disclosetv: NEW - Sesame Street with Ameri...,"{'id': 3233164824, 'id_str': '3233164824', 'na...",{'created_at': 'Sat Nov 06 12:45:38 +0000 2021...,0,0,0,0,"{'hashtags': [{'text': 'COVID19', 'indices': [...",rt new sesame street with american tv networ...
96,Sat Nov 06 23:55:43 +0000 2021,1457134631392665602,RT @UNICEFIndia: As we celebrate the #COVID19 ...,"{'id': 1407216867551244288, 'id_str': '1407216...",{'created_at': 'Tue Oct 19 03:30:00 +0000 2021...,0,0,0,0,"{'hashtags': [{'text': 'COVID19', 'indices': [...",rt as we celebrate the vaccination drive in i...
97,Sat Nov 06 23:55:38 +0000 2021,1457134610958139398,RT @BulliedAutistic: @Tammlin1963 You can't tr...,"{'id': 1213639608304431104, 'id_str': '1213639...",{'created_at': 'Sat Nov 06 23:55:27 +0000 2021...,0,0,0,0,"{'hashtags': [], 'urls': [], 'user_mentions': ...",rt you cant trust the covid19 vaccine especia...
98,Sat Nov 06 23:55:35 +0000 2021,1457134599432282114,@CoxeAnne @LovesBulldawgs @AnarchistGolden @Do...,"{'id': 1368049995534974976, 'id_str': '1368049...",,0,0,0,0,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",ya know what else pr…


We are going to loop through the dataframe and then through the words in the clean text. We are going to add the words as keys to dictionary and use their frequencies as values.

In [14]:
#initialize an empty dict
unique_words = {}
for row in tweets_filtered.clean_text:
  for word in row.split(" "):
    #if the word is encountered for the first time add to dict as key and set its value to 0
    unique_words.setdefault(word,0)
    #increase the value (i.e the count) of the word by 1 every time it is encountered
    unique_words[word] += 1

In [19]:
#remove blank space
unique_words.pop("")
#remove word 'rt'
unique_words.pop("rt")

We can inspect the words as a dataframe. 


You can always save this dataframe as .csv for future reference.

In [20]:
uw_df = pd.DataFrame.from_dict(unique_words, orient='index').reset_index()
uw_df.rename(columns = {'index':'Word', 0:'Count'}, inplace=True)
uw_df.sort_values(by=['Count'], ascending=False, inplace=True)
uw_df

Unnamed: 0,Word,Count
55,the,61
4,to,37
35,is,27
16,a,26
9,of,24
...,...,...
314,flu,1
313,every,1
312,halfthere,1
309,comment,1


### Extracting the hashtags

We are going to loop through the dataframe and then through the hashtags in the entities. We are going to add the hashtags as keys to dictionary and use their frequencies as values. At the same time, we are going to save them in a list and add them to a separate column to facilitate our future work.

In [21]:
unique_hashtags = {}

tweets_filtered["hashtags"] = ""

for idx, row in tweets_filtered.iterrows():
  hashtag_list = []
  for hashtag in row["entities"]["hashtags"]:
    unique_hashtags.setdefault("#"+hashtag["text"], 0)
    unique_hashtags['#'+hashtag["text"]] += 1
    hashtag_list.append(hashtag["text"])
  tweets_filtered.at[idx,"hashtags"] = hashtag_list


In [24]:
uh_df = pd.DataFrame.from_dict(unique_hashtags, orient='index').reset_index()
uh_df.rename(columns = {'index':'Hashtag', 0:'Count'}, inplace=True)
uh_df.sort_values(by=["Count"], ascending=False)

Unnamed: 0,Hashtag,Count
3,#COVID19,34
2,#Covid19,5
20,#Vaccine,4
22,#CCP,3
25,#covid19,2
1,#GetVaccinated,2
9,#auspol,2
30,#AI,2
31,#leadership,1
26,#abstract,1


## Building the network

We are going to use the networkx library, which is a Python library that enables network science analysis of the data.

We are going to use it to create our network and extract edgelist from it, since we can easily import it to Gephi (a software we are going to see in visualization labs).

However, it offers implemented algorithms for analysis (for example PageRank) that you can use out-of-box to analyze your network.

But first, we will loop through our dataframe and connect words and hashtags if they appear together in the same Tweet.

In [25]:
import itertools
import networkx as nx

In [26]:
uh = unique_hashtags.keys()
uw = unique_words.keys()  

In [27]:
network = {}
network_key = 0

for index, row in tweets_filtered.iterrows():
    #hashtags extracted from Tweet do not have the # sign in front of them but we will add it to differentiate hashtags from words
    combined_list = ['#'+hashtag for hashtag in row["hashtags"] if '#'+hashtag in unique_hashtags] + [word for word in str.split(row["clean_text"], " ") if word in uw]
    #itertool product creates Cartesian product of each element in the combined list
    for pair in itertools.product(combined_list, combined_list):
        #exclude self-loops and count each pair only once because our graph is undirected and we do not take self-loops into account
        if pair[0]!=pair[1] and not(pair[::-1] in network):
            network.setdefault(pair,0)
            network[pair] += 1 #* row["retweetCount"]
    
network_df = pd.DataFrame.from_dict(network, orient="index")

In [28]:
network_df.reset_index(inplace=True)
network_df.columns = ["pair","weight"]
network_df.sort_values(by="weight",inplace=True, ascending=False)
network_df

Unnamed: 0,pair,weight
1038,"(the, to)",26
510,"(#COVID19, the)",24
1947,"(the, this)",17
1307,"(covid19, the)",16
352,"(is, a)",16
...,...,...
3591,"(chronic, sack)",1
3590,"(chronic, move)",1
3589,"(chronic, the)",1
3588,"(chronic, feared)",1


In [29]:
#to get weighted graph we need a list of 3-element tuplels (u,v,w) where u and v are nodes and w is a number representing weight
up_weighted = []
for edge in network:
    #we can filter edges by weight by uncommenting the next line and setting desired weight threshold
    #if(network[edge])>1:
    up_weighted.append((edge[0],edge[1],network[edge]))

G = nx.Graph()
G.add_weighted_edges_from(up_weighted)

In [30]:
print(len(G.nodes()))
print(len(G.edges()))

796
9856


#### SAVE EDGELIST

In [31]:
filename = "./edgelist.csv"

In [32]:
nx.write_weighted_edgelist(G, filename, delimiter=",")

In [33]:
#add header with appropriate column names (works on collab and Linux/Mac(?))
!sed -i.bak 1i"Source,Target,Weight" ./edgelist.csv

### Create Node List


In [34]:
word_nodes = pd.DataFrame.from_dict(unique_words,orient="index")
word_nodes.reset_index(inplace=True)
word_nodes["Label"] = word_nodes["index"]
word_nodes.rename(columns={"index":"Id",0:"delete"},inplace=True)
word_nodes = word_nodes.drop(columns=['delete'])

word_nodes

Unnamed: 0,Id,Label
0,appreciate,appreciate
1,mos,mos
2,jis,jis
3,initiative,initiative
4,to,to
...,...,...
749,attention,attention
750,science,science
751,ya,ya
752,else,else


In [35]:
hashtag_nodes = uh_df.copy()
hashtag_nodes["Label"] = hashtag_nodes["Hashtag"]
hashtag_nodes.rename(columns={"Hashtag":"Id"},inplace=True)
hashtag_nodes = hashtag_nodes.drop(columns=['Count'])
hashtag_nodes

Unnamed: 0,Id,Label
0,#HarGharDastak,#HarGharDastak
1,#GetVaccinated,#GetVaccinated
2,#Covid19,#Covid19
3,#COVID19,#COVID19
4,#Britain,#Britain
5,#brexit,#brexit
6,#BorisHasFailedBritain,#BorisHasFailedBritain
7,#BorisJohnson,#BorisJohnson
8,#FennaTugemebwe,#FennaTugemebwe
9,#auspol,#auspol


#### SAVE NODELIST

In [36]:
nodelist = hashtag_nodes.append(word_nodes, ignore_index=True)

nodelist.to_csv("nodelist.csv",index=False)

Tasks: 

*   Extract username of user who posted the tweet into a column "screen_name". Follow the procedure we used to get the hashtags.
*   Create a network of users using the mention relation. Is this a directed or undirected graph?
*   We created a network where nodes are mixed (both words and hashtags). Create network of words only and one of hashtags only.
* Pick one of these network and rank the nodes using PageRank centrality. Extract information about top-20 rated nodes.



