In [1]:
import os, sys
import re
from contextlib import closing

import fire
import pandas as pd
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException

In [106]:
#%%writefile ../pyscrap_url.py

def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters:
        url: address to download.
        timeout: seconds passed to the request as its `timeout`; without
            one, a server that stops responding would block the caller
            forever.

    Returns:
        The raw response body (`resp.content`) when `is_good_response`
        accepts the response, otherwise None. None is also returned,
        after the error has been passed to `log_error`, when the request
        raises a RequestException (timeouts included).
    """
    try:
        # closing() guarantees the streamed connection is released.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (compared case-insensitively). A response that
    carries no Content-Type header at all is treated as not HTML.
    """
    # .get() rather than indexing: a missing header must yield False
    # instead of raising KeyError.
    content_type = resp.headers.get('Content-Type')
    return (resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower())


def log_error(e):
    """
    Report an error.

    For now the error is simply printed to standard output; this is the
    single place to change if errors should go somewhere else.
    """
    message = e
    print(message)
    
def get_elements(url, tag='', search=None, fname=None):
    """
    Parses a page and collects pieces of it into a list.

    Parameters:
        url: a URL string, which is downloaded with `simple_get`, or
            anything else, which is treated as already-downloaded markup
            and handed to BeautifulSoup as is.
        tag: optional selector passed to `select`. The text of every
            matching element is split on newlines; only lines containing
            '@' are kept, stripped of surrounding whitespace.
        search: optional dict with a 'find' and/or 'find_all' key, each
            mapping to the keyword arguments for the BeautifulSoup method
            of the same name. When both are given, 'find_all' runs inside
            the element returned by 'find'.
        fname: currently unused.

    Returns:
        A list holding the '@' lines from `tag` followed by the items
        gathered through `search`.

    Raises:
        Exception: if there is no content to parse (the download failed
            or None was passed in).
    """
    if isinstance(url, str):
        response = simple_get(url)
    else:
        # The caller already holds the downloaded page.
        response = url

    # Raise an exception if we failed to get any data from the url
    if response is None:
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')

    res = []
    if tag:
        for element in html.select(tag):
            # Keep only the lines that mention a handle/address ('@').
            res.extend(
                line.strip()
                for line in element.text.split('\n')
                if '@' in line
            )

    if search:
        res.extend(_collect_search_matches(html, search))

    return res


def _collect_search_matches(soup, search):
    """Runs the optional 'find' / 'find_all' lookups and flattens the matches."""
    matches = None

    if 'find' in search:
        print('findaing', search['find'])
        soup = soup.find(**search['find'])
        if soup is None:
            # Nothing matched: there is no element to search inside, so a
            # following 'find_all' must not be attempted on None.
            return []
        matches = soup

    if 'find_all' in search:
        print('findaing all of', search['find_all'])
        matches = soup.find_all(**search['find_all'])

    collected = []
    if matches:
        for node in matches:
            if len(node) == 0:
                continue
            if isinstance(node, str):
                # A text node is one item; extend() would split it into
                # single characters.
                collected.append(node)
            else:
                # An element contributes its direct children.
                collected.extend(node)
    return collected
    
    
# Expose get_elements as a command-line tool, but only when this code runs
# as a plain Python script. Inside IPython/Jupyter `get_ipython` exists and
# the CLI must not start; in a plain interpreter the name is undefined.
try:
    get_ipython
except NameError:
    if __name__ == '__main__':
        # `fire` is the module; the entry point is `fire.Fire`.
        fire.Fire(get_elements)

Writing ../pyscrap_url.py


In [20]:
#python code to obtain top 10 African Twitter Influencers
res = get_elements('https://africafreak.com/100-most-influential-twitter-users-in-africa', tag='h2')
# The headings come back with rank 1 last, so reverse to put it first.
res.reverse()

# Slice once so the printed list and the saved file cannot drift apart.
top_influencers = res[0:10]

#displaying the top 10 african twitter influencers
print("Top 10 African Twitter Influencers: \n")
for influencer in top_influencers:
    print(influencer)

#saving data as csv file (column header spelling fixed: 'Influencers')
dat = pd.DataFrame(top_influencers, columns=['Influencers'])
dat.to_csv('top_10_african_influencers.csv')

Top 10 African Twitter Influencers: 

1. Trevor Noah (@Trevornoah)
2. Gareth Cliff (@GarethCliff)
3. Jacob G. Zuma (@SAPresident)
4. News24 (@News24)
5. Julius Sello Malema (@Julius_S_Malema)
6. Helen Zille (@helenzille)
7. mailandguardian (@mailandguardian)
8. 5FM (@5FM)
9. loyiso gola (@loyisogola)
10. Computicket (@Computicket)


In [105]:
#Python code to obtain the twitter account of African top government officials
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = simple_get(url)

res_gov = get_elements(response, search={'find_all':{'class_':'wp-block-embed__wrapper'}})

#converting the data to string so the handles can be pulled out with a regex
list_gov = str(res_gov)

#deriving the government officials twitter handles using regex
#(`\w+` requires at least one character after '@', so a lone '@' is not a handle)
listnew = re.findall(r"@\w+", list_gov)

#displaying 10 african top government officials handles
print("""\n10 African top government officials twitter handles: 
        \n------------------------------------------------------\n""")
for count, hand in enumerate(listnew[0:10], start=1):
    print(f"{count}. {hand}")

#saving data as csv file
dat2 = pd.DataFrame(listnew[0:10], columns=['Government Officials'])
dat2.to_csv('10_government_officials.csv')

findaing all of {'class_': 'wp-block-embed__wrapper'}

10 African top government officials twitter handles: 
        
------------------------------------------------------

1. @EswatiniGovern1
2. @MalawiGovt
3. @hagegeingob
4. @FinanceSC
5. @PresidencyZA
6. @Dora_Siliya
7. @ChitaluChilufy3
8. @noalaskinner
9. @coumbagadio_ZM
10. @unicefzambia
