A large part of the following code is based on the Panacea Lab's tutorial for accessing their
open-source database of COVID-19-related tweets.

[Original Repo](https://github.com/thepanacealab/covid19_twitter/blob/master/COVID_19_dataset_Tutorial.ipynb)

## Required packages
***

In [13]:
from IPython.display import clear_output
'''
Required Packages
--------------------------
twarc #Twarc
tweepy # Tweepy 3.8.0
argparse #Argparse
xtract #Xtract 
wget #Wget 3.2
'''
# UNCOMMENT THE FOLLOWING LINES IF YOU DON'T HAVE THE PACKAGES INSTALLED
# %pip install twarc
# %pip install tweepy==3.8.0
# %pip install argparse
# %pip install xtract
# %pip install wget

# clear_output()

Note: you may need to restart the kernel to use updated packages.


## DataSet Import

In [14]:
import gzip
import shutil
import os
import wget
import csv
import linecache
from shutil import copyfile
import ipywidgets as widgets
import numpy as np
import pandas as pd

In [1]:
# Daily "clean" dataset (gzip-compressed TSV) to download.
dataset_URL = "https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2021-01-20/2021-01-20_clean-dataset.tsv.gz?raw=true"
#@param {type:"string"}

archive_path = 'clean-dataset.tsv.gz'
dataset_path = 'clean-dataset.tsv'

# wget.download() never overwrites an existing file: it saves the new copy
# under a " (1)"-suffixed name instead. An archive left over from an
# interrupted run would then be the one that gets unpacked, so remove it first.
if os.path.exists(archive_path):
    os.unlink(archive_path)

#Downloads the dataset (compressed in a GZ format)
wget.download(dataset_URL, out=archive_path)

#Unzips the dataset and gets the TSV dataset
with gzip.open(archive_path, 'rb') as f_in, open(dataset_path, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

#Deletes the compressed GZ file
os.unlink(archive_path)

#Gets all possible languages from the dataset
df = pd.read_csv(dataset_path, sep="\t")

# Missing values are dropped and everything is coerced to str so that sorted()
# cannot fail on a mix of str and float (NaN) entries. "all" is the extra
# entry that stands for "do not filter by language".
lang_list = sorted(set(df["lang"].dropna().astype(str)) | {"all"})
lang_picker = widgets.Dropdown(options=lang_list, value="all")

# Last expression of the cell: renders the dropdown.
lang_picker

Dropdown(options=('all', 'am', 'ar', 'bg', 'bn', 'bo', 'ca', 'ckb', 'cs', 'cy', 'da', 'de', 'dv', 'el', 'en', …

## Filter by Language

In [2]:
#Creates a new clean dataset with the specified language (if specified)
filtered_language = lang_picker.value

# The dropdown's "no filter" entry is the string "all". The empty string is
# still accepted so that code setting filtered_language = "" keeps working.
if filtered_language in ("", "all"):
    #If no language specified, it will get all records from the dataset
    copyfile('clean-dataset.tsv', 'clean-dataset-filtered.tsv')

#If language specified, it will create another tsv file with the filtered records
else:
    with open("clean-dataset.tsv", encoding="utf-8", newline="") as tsvfile:
        raw_lines = tsvfile.readlines()

    # Raw line n is paired with parsed row n, so the output keeps the original
    # text of every matching line byte-for-byte.
    parsed_rows = csv.reader(raw_lines, delimiter="\t")
    header_fields = next(parsed_rows, [])

    # Locate the language column by name; fall back to the 4th column
    # (index 3) when the header does not contain a "lang" field.
    lang_col = header_fields.index("lang") if "lang" in header_fields else 3

    # The header line is always kept as the first entry.
    filtered_tw = raw_lines[:1]
    filtered_tw.extend(
        raw_line
        for raw_line, fields in zip(raw_lines[1:], parsed_rows)
        # Rows too short to have a language field (e.g. blank lines) are skipped.
        if len(fields) > lang_col and fields[lang_col] == filtered_language
    )

    print('\033[1mShowing first 5 tweets from the filtered dataset\033[0m')
    # Index 0 is the header, so the first five tweets are entries 1..5.
    print(filtered_tw[1:6])

    with open('clean-dataset-filtered.tsv', 'w', encoding="utf-8", newline="") as f_output:
        f_output.writelines(filtered_tw)

[1mShowing first 5 tweets from the filtered dataset[0m
['1351757472873197568\t2021-01-20\t05:04:30\tes\tNULL\n', '1351757472965353477\t2021-01-20\t05:04:30\tes\tNULL\n', '1351757481723240448\t2021-01-20\t05:04:32\tes\tNULL\n', '1351757481727455234\t2021-01-20\t05:04:32\tes\tNULL\n', '1351757488706760705\t2021-01-20\t05:04:34\tes\tNULL\n']


### Authenticate with Twitter

In [8]:
import json
import tweepy
from tweepy import OAuthHandler

# Authenticate
# getpass() is used instead of input() so the secrets are not echoed into the
# cell output (and therefore not stored in the saved notebook).
from getpass import getpass

CONSUMER_KEY = getpass("Enter consumer Key") #@param {type:"string"}
CONSUMER_SECRET_KEY = getpass("Enter consumer Secret") #@param {type:"string"}
ACCESS_TOKEN_KEY = getpass("Enter access token") #@param {type:"string"}
ACCESS_TOKEN_SECRET_KEY = getpass("Enter access token secret") #@param {type:"string"}

#Creates a JSON file with the API credentials
# NOTE: the credentials are written to disk in plain text; keep api_keys.json
# out of version control.
with open('api_keys.json', 'w') as outfile:
    json.dump({
        "consumer_key": CONSUMER_KEY,
        "consumer_secret": CONSUMER_SECRET_KEY,
        "access_token": ACCESS_TOKEN_KEY,
        "access_token_secret": ACCESS_TOKEN_SECRET_KEY,
    }, outfile)


In [10]:
# The lines below are just to test if the twitter credentials are correct
# Authenticate
try:
    # With the pinned Tweepy 3.8.0, AppAuthHandler requests the application
    # bearer token in its constructor, so rejected credentials raise TweepError
    # here. A tweepy.API object is always truthy, so testing `not api` can
    # never detect a failure.
    auth = tweepy.AppAuthHandler(CONSUMER_KEY, CONSUMER_SECRET_KEY)
except tweepy.TweepError as auth_error:
    api = None
    print ("Can't Authenticate")
    print(auth_error)
else:
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    print("Authentication Succesful")

Authentication Succesful


### Hydrating the tweets

In [9]:
## The following code downloads a mining tool (SMMT), provided by the Panacea Lab team, to help with the hydration of tweet ids
# NOTE: the download is skipped when get_metadata.py already exists, so this
# cell is safe to re-run. wget.download() does not overwrite an existing file;
# it would save a second copy under a " (1)"-suffixed name instead.
import os
from IPython.display import clear_output
import wget

url = 'https://raw.githubusercontent.com/thepanacealab/SMMT/master/data_acquisition/get_metadata.py'
path = 'get_metadata.py'
if not os.path.exists(path):
    wget.download(url, out=path)

# Last expression of the cell: shows the script's file name.
path


'get_metadata.py'

In [13]:
# Runs the downloaded get_metadata.py script as a shell command on the filtered
# TSV, passing the api_keys.json credentials file.
# NOTE(review): flag meanings (-i input, -o output, -k keys) are inferred from
# the argument values; confirm against get_metadata.py.
# NOTE(review): the output argument points into ./datasets, which nothing in
# this notebook creates. TODO confirm whether the script creates it.
!python get_metadata.py -i "clean-dataset-filtered.tsv" -o "./datasets/hydrated_tweets" -k api_keys.json

# clear_output()

^C


In [None]:
# Runs get_metadata.py on the filtered TSV with the api_keys.json credentials;
# here the output argument is "hydrated_tweets", with no directory component.
# NOTE(review): flag meanings are inferred from the argument values; confirm
# against get_metadata.py.
!python get_metadata.py -i clean-dataset-filtered.tsv -o hydrated_tweets -k api_keys.json

In [18]:
# Loads the unfiltered dataset file into df; header=0 takes the column names
# from the first row of the TSV.
df = pd.read_csv('clean-dataset.tsv', sep='\t',  header=0)

In [19]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

(424648, 5)