<a href="https://colab.research.google.com/github/dmarinere/NikeCampaign/blob/master/scrapping_starter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Web scraping using Python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

In [1]:
# Mount Google Drive at /content/drive so later cells can read and write CSV files there.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
# NOTE(review): root_path is a relative path and is not referenced by any later
# cell visible in this notebook; those cells spell out
# '/content/drive/My Drive/...' directly.
root_path = 'drive/My Drive/'

Mounted at /content/drive


In [2]:
import string
from requests import get
import numpy as np
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys
import re 
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import logging
import json
pd.set_option('display.max_rows', None)

#sentiment analysis package
!pip install textblob
from textblob import TextBlob

#general text pre-processor
!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

#tweet pre-processor 
!pip install tweet-preprocessor
import preprocessor as p

logger = logging.getLogger()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Column layout shared by every tweet DataFrame built in this notebook.
# The order matters: rows are filled positionally.
cols = [
    'created_at',
    'source',
    'original_text',
    'lang',
    'favorite_count',
    'retweet_count',
    'original_author',
    'name',
    'following',
    'followers',
    'hashtags',
    'mention',
    'place_coord_boundaries',
    'location',
]

In [7]:
class TweetSearch():
    '''
    Basic helper to search and download Twitter data through tweepy.

    Parameters
    ----------
    cols : list of str, optional
        Column names of the DataFrames produced by `get_tweets`. Every tweet
        yields 14 values, so a custom list must also contain 14 names.
        Defaults to `DEFAULT_COLS`.
    auth : tweepy auth handler, optional
        Ready-made authentication handler. When omitted, the credentials are
        read from the TWITTER_API_KEY, TWITTER_API_SECRET,
        TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_TOKEN_SECRET environment
        variables.

    Attributes
    ----------
    cols : the column names in use.
    auth : the authentication handler in use.
    api  : the tweepy.API client built from `auth`.
    '''

    # Default column layout; the order matches the row built by _status_to_row.
    DEFAULT_COLS = ['created_at', 'source', 'original_text',
                    'lang',
                    'favorite_count', 'retweet_count', 'original_author',
                    'name', "following", 'followers',
                    'hashtags', 'mention', 'place_coord_boundaries', 'location']

    def __init__(self, cols=None,auth=None):
        # Copy the default so instances never share (and mutate) one list.
        self.cols = list(self.DEFAULT_COLS) if cols is None else cols

        if auth is None:
            # Credentials come from the environment, never from the notebook.
            consumer_key = os.environ.get('TWITTER_API_KEY')
            consumer_secret = os.environ.get('TWITTER_API_SECRET')
            access_token = os.environ.get('TWITTER_ACCESS_TOKEN')
            access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')

            auth = OAuthHandler(consumer_key, consumer_secret)
            auth.set_access_token(access_token, access_token_secret)

        self.auth = auth
        self.api = tweepy.API(auth, wait_on_rate_limit=True,
          wait_on_rate_limit_notify=True)

    @staticmethod
    def _status_to_row(status):
        '''Flatten one tweet JSON dict into a list of 14 values (column order).'''
        author = status['user']
        row = [status['created_at'],
               status['source'], status['text'],
               status['lang'], status['favorite_count'],
               status['retweet_count'],
               author['screen_name'], author['name'],
               author['friends_count'], author['followers_count']]

        entities = status['entities']
        row.append(", ".join(tag['text'] for tag in entities['hashtags']))
        row.append(", ".join(mention['screen_name']
                             for mention in entities['user_mentions']))

        try:
            boxes = status['place']['bounding_box']['coordinates']
            # Flatten one nesting level of the bounding-box structure.
            coordinates = [coord for box in boxes for coord in box]
        except TypeError:
            # status['place'] (or its bounding box) is None for this tweet.
            coordinates = None
        row.append(coordinates)

        # `author` was already indexed successfully above, so this lookup
        # cannot raise the TypeError the original code guarded against.
        row.append(author['location'])
        return row

    def get_tweets(self, user, csvfile=None):
        '''
        Download the timeline of `user` and return it as a DataFrame.

        Up to 40 pages of `user_timeline` results are requested with
        include_rts=False. When `csvfile` is given, rows already stored in
        that file are kept in front of the newly fetched ones and the
        combined frame is written back to the same path.

        Parameters
        ----------
        user : identifier passed to the API as `id`.
        csvfile : str, optional
            Path of a CSV file used to accumulate results across calls.

        Returns
        -------
        pandas.DataFrame with the columns in `self.cols`.
        '''
        existing = None
        if csvfile is not None and os.path.exists(csvfile):
            # Continue from what an earlier run already saved.
            existing = pd.read_csv(csvfile, header=0)

        # Use this instance's client: the original referenced a global `api`
        # that the class never defines.
        timeline = tweepy.Cursor(self.api.user_timeline, id=user, include_rts=False)

        # Collect plain rows and build the frame once; appending a one-row
        # DataFrame per tweet copies the whole frame each time.
        rows = []
        for page in timeline.pages(40):
            for status in page:
                rows.append(self._status_to_row(status._json))

        fetched = pd.DataFrame(rows, columns=self.cols)
        if existing is None or existing.empty:
            df = fetched
        elif fetched.empty:
            df = existing
        else:
            df = pd.concat([existing, fetched], ignore_index=True)

        if csvfile is not None:
            df.to_csv(csvfile, columns=self.cols, index=False, encoding="utf-8")

        return df

In [8]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Fetch `url` with an HTTP GET request.

    Returns the raw response body when `is_good_response` accepts the
    response, and None otherwise. A RequestException is reported through
    `log_error` and also results in None.
    """
    try:
        with closing(get(url, stream=True)) as response:
            # Hand back the body only for responses that pass the HTML check.
            return response.content if is_good_response(response) else None
    except RequestException as err:
        log_error('Error during requests to {0} : {1}'.format(url, str(err)))
        return None


def is_good_response(resp):
    """
    Return True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (case-insensitive). A response without a
    Content-Type header is rejected instead of raising KeyError.
    """
    # .get() returns None when the header is absent; check before lower().
    content_type = resp.headers.get('Content-Type')
    if resp.status_code != 200 or content_type is None:
        return False
    return 'html' in content_type.lower()


def log_error(e):
    """
    Report an error.

    The message is simply printed; swap the body for any other reporting
    mechanism if needed.
    """
    message = str(e)
    print(message)
    
def get_elements(url, tag='', search=None, fname=None):
    """
    Download (or accept) an HTML page and return a list of extracted items.

    Parameters
    ----------
    url : str or markup
        A URL string is fetched with `simple_get`; any other value is
        treated as already-loaded page content and parsed directly.
    tag : str
        CSS selector passed to `select`. The text of every match is split on
        newlines and each non-empty piece is stripped and added to the result.
    search : dict, optional
        May hold the keys 'find' and/or 'find_all', each mapping to a dict of
        keyword arguments for the BeautifulSoup method of the same name.
        'find' narrows the search scope and 'find_all' then runs inside it.
        The children of every non-empty match are added to the result.
    fname : unused; kept so existing callers keep working.

    Returns
    -------
    list of the collected items (possibly empty).

    Raises
    ------
    Exception if no page content could be obtained.
    """
    # Only strings are fetched; anything else is assumed to be page content.
    response = simple_get(url) if isinstance(url, str) else url

    if response is None:
        # Raise an exception if we failed to get any data from the url
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')

    res = []
    if tag:
        for li in html.select(tag):
            for name in li.text.split('\n'):
                if len(name) > 0:
                    res.append(name.strip())

    # `search` defaults to None rather than a shared mutable dict.
    if search:
        soup = html
        r = ''
        if 'find' in search:
            print('findaing',search['find'])
            soup = soup.find(**search['find'])
            r = soup

        # find() returns None when nothing matches; skip find_all in that
        # case instead of failing with AttributeError on None.
        if 'find_all' in search and soup is not None:
            print('findaing all of',search['find_all'])
            r = soup.find_all(**search['find_all'])

        if r:
            for x in list(r):
                if len(x) > 0:
                    res.extend(x)

    return res
    
    
# NOTE(review): this guard compares the class name of the object returned by
# get_ipython() with the string '__main__'. Neither `fire` nor
# `get_tag_elements` is imported or defined anywhere in the visible notebook,
# so the guarded call would raise NameError if the condition were ever true.
# Looks like a leftover command-line entry point — TODO confirm and remove.
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

In [9]:
# Collect the text of every <h2> element on the "100 most influential Twitter
# users in Africa" page; each non-empty text line becomes one entry of `res`.
res = get_elements('https://africafreak.com/100-most-influential-twitter-users-in-africa',tag='h2')
# NOTE(review): nothing is displayed here; inspect `res` (e.g. res[:5]) to
# confirm that the data was actually pulled.


In [10]:
# First parsing step: keep only the first 100 scraped strings (the remaining
# ones are not useful) and split each on "." because every entry starts with
# its rank followed by a dot.
numbers = [res[position].split('.') for position in range(100)]

In [11]:
# The piece before the first "." is the rank.
index = [parts[0] for parts in numbers]

# The last piece holds "Full Name (@handle)"; splitting on "(" separates the
# full name from the handle.
names = [parts[-1].split("(") for parts in numbers]

# Full name: first fragment, without the surrounding spaces.
fullname = [fragments[0].strip() for fragments in names]

# Handle: last fragment with its final character (the closing bracket) cut off.
handle = [fragments[-1][:-1] for fragments in names]


In [12]:
# Assemble the three parsed lists into one DataFrame, one column per list.
df = pd.DataFrame({'Number': index, 'Name': fullname, 'Handle': handle})

In [16]:
# Order the rows by rank (converted to integers so the sort is numeric), then
# discard the rank column and renumber the index from 0.
df = (
    df.assign(Number=df.Number.astype(int))
      .sort_values('Number')
      .drop(columns='Number')
      .reset_index(drop=True)
)

In [14]:
# Show the first five rows of the influencer table.
df.head()

Unnamed: 0,Name,Handle
99,Trevor Noah,@Trevornoah
98,Gareth Cliff,@GarethCliff
97,Zuma,@SAPresident
96,News24,@News24
95,Julius Sello Malema,@Julius_S_Malema


In [15]:
# Article that is scraped next for the Twitter handles of African leaders.
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/'

In [25]:
# Download the article once and hand the loaded page to get_elements, which
# collects the children of every element with class 'wp-block-embed__wrapper'.
# get_elements raises if simple_get returned None.
# Note that this rebinds `res`, replacing the influencer headings scraped earlier.
response = simple_get(url)
res = get_elements(response, search={'find_all':{'class_':'wp-block-embed__wrapper'}})
# Handle pattern: "@", one or more letters, optional digits, an optional run
# of "/", "_" or "*", then optional letters.
pattern = "@[a-zA-Z]+[0-9]*[/_*]*[a-zA-Z]*"
# The pattern is applied to the string form of the whole result list.
twitter_handle = re.findall(pattern, str(res))
print("The total number of African leaders extracted", len(twitter_handle))
twitter_handle[:5]

findaing all of {'class_': 'wp-block-embed__wrapper'}
The total number of African leaders extracted 52


['@EswatiniGovern1',
 '@MalawiGovt',
 '@hagegeingob',
 '@FinanceSC',
 '@PresidencyZA']

### Twitter Scraping to find the follower counts

In [26]:

# Look up the follower count of every influencer handle.
#
# `api` is not defined by any earlier cell of this notebook, so on a fresh
# kernel this cell failed with NameError on every lookup. Build the client
# from the TweetSearch class defined above; it reads its credentials from
# the TWITTER_* environment variables.
api = TweetSearch().api

followers_count = {}
for screen_name in handle:
    try:
        followers_count[screen_name] = api.get_user(screen_name).followers_count
    except Exception as e:
        # Best effort: a handle that cannot be resolved is left out of the
        # mapping (and becomes NaN later), but the reason is reported rather
        # than silently discarded.
        logger.warning("could not fetch follower count for %s: %s", screen_name, e)

In [27]:
# Attach the follower counts to the table; handles missing from
# followers_count end up as NaN.
df['Follower_count']= df['Handle'].map(followers_count)

In [28]:
# Largest follower counts first.
df = df.sort_values('Follower_count', ascending=False)

In [29]:
# Renumber the rows from 0 after sorting.
df = df.reset_index(drop=True)

In [30]:
# List every row that has at least one missing value, to see which handles
# did not get a follower count.
has_missing_value = df.isna().any(axis=1)
dfnull = df.loc[has_missing_value]
print(dfnull)

                Name            Handle  Follower_count
93       John Robbie    @702JohnRobbie             NaN
94       The New Age      @The_New_Age             NaN
95      Afrinnovator     @Afrinnovator             NaN
96      drew hinshaw     @drewfhinshaw             NaN
97       Karen Allen    @BBCKarenAllen             NaN
98  Vanessa Raphaely  @hurricanevaness             NaN
99         andBeyond  @andBeyondSafari             NaN


In [31]:
# Remove the rows without a follower count. According to the author these
# are blocked accounts, which was confirmed with a manual Twitter search.
df = df.dropna()

In [32]:
# Top ten influencers: df is already sorted by follower count, largest first.
first_ten = df.head(10)

In [33]:
 first_ten.to_csv ("/content/drive/My Drive/Colab Notebooks/African_influencer.csv", index = False, header=True)
# The line above writes the top-ten table; the line below writes the full table.
# NOTE(review): the first statement of this cell starts with a stray leading
# space, which is an IndentationError in a plain Python script — TODO remove it.
df.to_csv ("/content/drive/My Drive/Colab Notebooks/African_influencer_all.csv", index = False, header=True)

In [34]:
# Look up follower count and display name for every leader handle.
#
# `api` is not defined by any earlier cell of this notebook; build the client
# from the TweetSearch class defined above so the cell works on a fresh
# kernel (credentials come from the TWITTER_* environment variables).
api = TweetSearch().api

african_follower = {}   # handle -> follower count
new_one = {}            # handle -> display name
# The original cell also rebound `fullname` to an empty list that was never
# used; that is dropped so the influencer names parsed earlier are kept.
for leader_handle in twitter_handle:
    try:
        user = api.get_user(leader_handle)
        african_follower[leader_handle] = user.followers_count
        new_one[leader_handle] = user.name
    except Exception as exc:
        # Best effort: unresolved handles are left out of both mappings, but
        # the failure is reported rather than silently discarded.
        logger.warning("could not fetch profile for %s: %s", leader_handle, exc)

In [35]:
# Leaders table: one row per extracted handle with its follower count and
# display name, largest follower counts first and exact duplicate rows removed.
df2 = (
    pd.DataFrame(twitter_handle, columns=['Twitter Handle'])
    .assign(
        Follower_count=lambda frame: frame['Twitter Handle'].map(african_follower),
        Name=lambda frame: frame['Twitter Handle'].map(new_one),
    )
    .sort_values('Follower_count', ascending=False)
    .drop_duplicates()
    # Per the author, the rows at positions 0 and 2 are Boris Johnson and
    # WHO, which are not African leaders, so they are removed by position.
    .pipe(lambda frame: frame.drop(frame.index[[0, 2]]))
    .reset_index(drop=True)
)

In [36]:
# Show the first five rows of the leaders table.
df2.head()

Unnamed: 0,Twitter Handle,Follower_count,Name
0,@MBuhari,3268716,Muhammadu Buhari
1,@PaulKagame,1981672,Paul Kagame
2,@KagutaMuseveni,1810390,Yoweri K Museveni
3,@PresidencyZA,1597602,Presidency | South Africa 🇿🇦
4,@NAkufoAddo,1505244,Nana Akufo-Addo


In [37]:
# Top ten leaders: df2 is already sorted by follower count, largest first.
topten_african_leader = df2.head(10)

In [38]:
# Save the top-ten table and the full table to separate files. Both writes
# originally targeted African_leaders.csv, so the full table overwrote the
# top-ten file at once. The full table now gets an "_all" suffix, the same
# naming used for the influencer files.
topten_african_leader.to_csv("/content/drive/My Drive/Colab Notebooks/African_leaders.csv", index = False, header=True)
df2.to_csv("/content/drive/My Drive/Colab Notebooks/African_leaders_all.csv", index = False, header=True)

### Scraping the tweets of the Twitter handles collected in the previous steps

In [40]:
# Empty frame with the shared tweet columns; the influencer tweets fetched
# through tweepy are accumulated into it.
df_final = pd.DataFrame(columns=cols)

In [21]:
# NOTE(review): influentials_africa is not referenced by any other cell
# visible in this notebook.
influentials_africa = '/content/drive/My Drive/influentials_africa.json'
# Twitter client wrapper with the default columns; credentials are read from
# environment variables inside TweetSearch.__init__.
ts = TweetSearch()

In [None]:
# Download the timeline of every influencer handle and add it to df_final.
# The per-user frames are gathered in a list and concatenated once: growing a
# DataFrame with append() inside a loop copies it on every pass, and
# DataFrame.append no longer exists in pandas 2.x.
collected_frames = [df_final]
for user in df['Handle']:
    try:
        collected_frames.append(ts.get_tweets(user))
        print("collecting data from", user)
    except Exception as exc:
        # Best effort: one failing account must not stop the run, but report
        # why it failed. Unlike the original bare `except:`, this lets
        # KeyboardInterrupt through so a long scrape can still be stopped.
        logger.warning("skipping %s: %s", user, exc)

# Leave empty frames out of the concatenation.
non_empty_frames = [frame for frame in collected_frames if not frame.empty]
if non_empty_frames:
    df_final = pd.concat(non_empty_frames, ignore_index=True)

In [39]:
# Report how many tweet rows are currently held in df_final.
print(f"{len(df_final)} where gotten from twitter African Influencers")

45030 where gotten from twitter African Leaders


In [38]:
# Replace df_final with the contents of a CSV stored on Drive.
# NOTE(review): this file name is spelled "all_influenncers_tweets.csv", while
# the last cell of the notebook writes "all_influencers_tweets.csv" — TODO
# confirm which file is meant.
df_final= pd.read_csv("/content/drive/My Drive/Colab Notebooks/all_influenncers_tweets.csv")

In [22]:
# Empty frame with the shared tweet columns; the tweets of the handles in
# df2 are accumulated into it.
df_infl_gov = pd.DataFrame(columns=cols)

In [None]:
# Download the timeline of every leader handle and add it to df_infl_gov.
# The per-user frames are gathered in a list and concatenated once: growing a
# DataFrame with append() inside a loop copies it on every pass, and
# DataFrame.append no longer exists in pandas 2.x.
collected_frames = [df_infl_gov]
for user in df2['Twitter Handle']:
    try:
        collected_frames.append(ts.get_tweets(user))
        print("collecting data from", user)
    except Exception as exc:
        # Best effort: one failing account must not stop the run, but report
        # why it failed. Unlike the original bare `except:`, this lets
        # KeyboardInterrupt through so a long scrape can still be stopped.
        logger.warning("skipping %s: %s", user, exc)

# Leave empty frames out of the concatenation.
non_empty_frames = [frame for frame in collected_frames if not frame.empty]
if non_empty_frames:
    df_infl_gov = pd.concat(non_empty_frames, ignore_index=True)



In [36]:
# Report how many tweet rows are currently held in df_infl_gov.
print(f"{len(df_infl_gov)} where gotten from twitter African Leaders")

25835 where gotten from twitter African Leaders


In [40]:
# Save df_infl_gov (the tweets collected for the handles in df2) as a CSV on
# Google Drive.
df_infl_gov.to_csv("/content/drive/My Drive/Colab Notebooks/all_government_tweets.csv", index = False, header=True)

In [None]:
# Save df_final (the influencer tweets) as a CSV on Google Drive.
df_final.to_csv("/content/drive/My Drive/Colab Notebooks/all_influencers_tweets.csv", index = False, header=True)