<a href="https://colab.research.google.com/github/dmarinere/NikeCampaign/blob/master/scrapping_starter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Web scraping using Python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

In [99]:
!pip install fire
import json
import logging
import os, sys
import re
import string
from contextlib import closing

import fire
import pandas as pd
import tweepy
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener

#sentiment analysis package
!pip install textblob
from textblob import TextBlob

#general text pre-processor
!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

#tweet pre-processor 
!pip install tweet-preprocessor
import preprocessor as p

logger = logging.getLogger()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [100]:

class tweetsearch():
    '''
    Download a Twitter user's timeline into a pandas DataFrame.

    Every English-language tweet is cleaned (``clean_tweets``), scored with
    TextBlob and stored as one row whose layout follows ``DEFAULT_COLS``.
    You can build on this class to extend the functionality for more
    sophisticated analysis.
    '''

    # Default output columns; `_build_row` emits its values in this order.
    DEFAULT_COLS = ['id', 'created_at', 'source', 'original_text', 'clean_text',
                    'sentiment', 'polarity', 'subjectivity', 'lang',
                    'favorite_count', 'retweet_count', 'original_author',
                    'possibly_sensitive', 'hashtags',
                    'user_mentions', 'place', 'place_coord_boundaries']

    # Happy emoticons
    EMOTICONS_HAPPY = set([
        ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
        ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
        '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
        'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
        '<3'
        ])

    # Sad emoticons
    EMOTICONS_SAD = set([
        ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
        ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
        ':c', ':{', '>:\\', ';('
        ])

    # Combined emoticon set used when filtering tokens
    EMOTICONS = EMOTICONS_HAPPY.union(EMOTICONS_SAD)

    # Emoji patterns (compiled once instead of once per tweet)
    EMOJI_PATTERN = re.compile("["
                 u"\U0001F600-\U0001F64F"  # emoticons
                 u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                 u"\U0001F680-\U0001F6FF"  # transport & map symbols
                 u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                 u"\U00002702-\U000027B0"
                 u"\U000024C2-\U0001F251"
                 "]+", flags=re.UNICODE)

    def __init__(self, cols=None, auth=None):
        '''
        cols: column names for the output DataFrame. Defaults to
              DEFAULT_COLS. A custom list needs one name per value produced
              by `_build_row` and must keep the 'created_at',
              'favorite_count' and 'retweet_count' names, which
              `get_tweets` looks up.
        auth: authentication handler passed to `tweepy.API`. When None, an
              OAuthHandler is built from the TWITTER_API_KEY,
              TWITTER_API_SECRET, TWITTER_ACCESS_TOKEN and
              TWITTER_ACCESS_TOKEN_SECRET environment variables.
        '''
        if cols is None:
            self.cols = list(self.DEFAULT_COLS)
        else:
            self.cols = cols

        if auth is None:
            # Credentials are read from the environment, never hard-coded.
            consumer_key = os.environ.get('TWITTER_API_KEY')
            consumer_secret = os.environ.get('TWITTER_API_SECRET')
            access_token = os.environ.get('TWITTER_ACCESS_TOKEN')
            access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')

            auth = OAuthHandler(consumer_key, consumer_secret)
            auth.set_access_token(access_token, access_token_secret)

        self.auth = auth
        self.api = tweepy.API(auth)

    @classmethod
    def _filter_tokens(cls, word_tokens, stop_words):
        '''Join the tokens that are not stop words, emoticons or punctuation.'''
        kept = [w for w in word_tokens
                if w not in stop_words
                and w not in cls.EMOTICONS
                and w not in string.punctuation]
        return ' '.join(kept)

    @staticmethod
    def _build_row(status, clean_text, sentiment):
        '''
        Build one output row, ordered like DEFAULT_COLS, from a tweet's JSON.

        status:     the tweet as a dict (the `_json` payload of a tweepy status).
        clean_text: the cleaned tweet text.
        sentiment:  object exposing `polarity` and `subjectivity`.
        '''
        hashtags = ", ".join([hashtag_item['text'] for hashtag_item in status['entities']['hashtags']])
        mentions = ", ".join([mention['screen_name'] for mention in status['entities']['user_mentions']])

        try:
            location = status['user']['location']
        except TypeError:
            location = ''

        try:
            boxes = status['place']['bounding_box']['coordinates']
            coordinates = [coord for loc in boxes for coord in loc]
        except TypeError:
            # status['place'] is None when the tweet carries no place
            coordinates = None

        return [status['id'], status['created_at'],
                status['source'], status['text'], clean_text,
                sentiment, sentiment.polarity, sentiment.subjectivity,
                status['lang'],
                status['favorite_count'], status['retweet_count'],
                status['user']['screen_name'],
                status.get('possibly_sensitive'),  # key is absent on some tweets
                hashtags, mentions,
                location, coordinates]

    def clean_tweets(self, twitter_text):
        '''
        Clean a raw tweet and return the filtered text.

        The text is run through the tweet preprocessor, stripped of colons,
        non-ASCII runs and emojis, tokenized, and filtered against English
        stop words, emoticons and punctuation. The result is returned and
        also kept on `self.filtered_tweet`; the raw input is kept on
        `self.twitter_text`.
        '''
        self.twitter_text = twitter_text
        # use pre processor
        tweet = p.clean(self.twitter_text)

        # after preprocessing, the colon symbol is left behind by removed mentions
        tweet = re.sub(r':', '', tweet)
        # NOTE(review): this literal looks like a mis-decoded character
        # sequence (possibly an ellipsis); kept verbatim -- confirm.
        tweet = re.sub(r'‚Ä¶', '', tweet)

        # replace consecutive non-ASCII characters with a space
        tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)

        # remove emojis from tweet
        tweet = self.EMOJI_PATTERN.sub(r'', tweet)

        # Tokenize only AFTER the cleanup above; tokenizing first would
        # throw the cleaned text away.
        stop_words = set(stopwords.words('english'))
        word_tokens = nltk.word_tokenize(tweet)

        self.filtered_tweet = self._filter_tokens(word_tokens, stop_words)
        return self.filtered_tweet

    def get_tweets(self, keyword, csvfile=None, max_pages=70):
        '''
        Collect the English tweets from a user's timeline.

        keyword:   the user whose timeline is fetched (passed to
                   `api.user_timeline` as `id`).
        csvfile:   optional CSV path. Existing rows are loaded first and the
                   combined result is written back at the end.
        max_pages: maximum number of timeline pages to request.

        Returns the DataFrame of stored tweets. A tweet whose `created_at`
        is already stored only has its favorite/retweet counts refreshed.
        '''
        df = pd.DataFrame(columns=self.cols)

        # If the file exists, then read the existing data from the CSV file.
        if csvfile is not None and os.path.exists(csvfile):
            df = pd.read_csv(csvfile, header=0)

        for page in tweepy.Cursor(self.api.user_timeline, id=keyword).pages(max_pages):
            for status in page:
                status = status._json

                # filter by language
                if status['lang'] != 'en':
                    continue

                # Tweet already stored: refresh its counters and move on.
                already_stored = df['created_at'] == status['created_at']
                if already_stored.any():
                    i = df.loc[already_stored].index[0]
                    df.at[i, 'favorite_count'] = status['favorite_count']
                    df.at[i, 'retweet_count'] = status['retweet_count']
                    continue

                clean_text = self.clean_tweets(status['text'])

                # calculate sentiment
                sentiment = TextBlob(clean_text).sentiment

                new_entry = self._build_row(status, clean_text, sentiment)

                # Rows are added one at a time because the duplicate check
                # above must also see tweets collected during this call.
                single_tweet_df = pd.DataFrame([new_entry], columns=self.cols)
                df = pd.concat([df, single_tweet_df], ignore_index=True)

        if csvfile is not None:
            # save it to file
            df.to_csv(csvfile, columns=self.cols, index=False, encoding="utf-8")

        return df

In [101]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Fetch `url` with an HTTP GET request.

    Returns the raw response content when `is_good_response` accepts the
    response, and None otherwise. A request failure is reported through
    `log_error` and also yields None.
    """
    try:
        with closing(get(url, stream=True)) as response:
            return response.content if is_good_response(response) else None
    except RequestException as err:
        log_error('Error during requests to {0} : {1}'.format(url, str(err)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header mentions 'html' (case-insensitive). A response with no
    Content-Type header is treated as not HTML instead of raising KeyError.
    """
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error.

    The message is simply written to standard output; swap the body for
    anything more elaborate if needed.
    """
    message = str(e)
    print(message)
    
def get_elements(url, tag='', search=None, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of the elements found in it.

    url:    a URL string to download, or already-loaded HTML content.
    tag:    optional CSS selector; the stripped, non-empty text lines of
            every matching element are added to the result.
    search: optional dict with a 'find' and/or 'find_all' key, each holding
            the keyword arguments for the BeautifulSoup method of that
            name. 'find' narrows the document to its first match and
            'find_all' then runs inside it. The children of the matched
            element(s) are added to the result.
    fname:  unused; kept so existing callers keep working.

    Raises Exception when the page could not be retrieved.
    """
    if isinstance(url, str):
        response = simple_get(url)
    else:
        # it is already a loaded html page
        response = url

    # Raise an exception if we failed to get any data from the url
    if response is None:
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')
    res = []

    if tag:
        for li in html.select(tag):
            for name in li.text.split('\n'):
                name = name.strip()
                # strip before testing so whitespace-only lines are skipped
                if name:
                    res.append(name)

    if search:
        soup = html
        matches = []

        if 'find' in search:
            print('findaing', search['find'])
            soup = soup.find(**search['find'])
            if soup is None:
                # nothing matched, so there is nothing to search inside
                return res
            # Wrap the single match in a list so it is handled like a
            # find_all result; iterating the tag itself would walk its
            # children and split text nodes into characters.
            matches = [soup]

        if 'find_all' in search:
            print('findaing all of', search['find_all'])
            matches = soup.find_all(**search['find_all'])

        for x in matches:
            if len(x) > 0:
                res.extend(x)

    return res
    
    
def _running_under_ipython():
    """Return True when the `get_ipython` name is available (IPython/Jupyter)."""
    try:
        get_ipython
    except NameError:
        return False
    return True


# Expose get_elements as a command-line tool when this code is run as a plain
# script; inside a notebook the cell only defines the functions.
if __name__ == '__main__' and not _running_under_ipython():
    fire.Fire(get_elements)

In [102]:
# Collect the text of every <h2> heading on the influencer ranking page.
res = get_elements('https://africafreak.com/100-most-influential-twitter-users-in-africa',tag='h2')

In [103]:
# Split each of the first 100 headings on '.'; the pieces are parsed into
# rank, name and handle by the cells below. Slicing (rather than indexing
# 0..99) avoids an IndexError when fewer than 100 headings were scraped.
numbers = [heading.split('.') for heading in res[:100]]

In [106]:
# The first piece of every split heading is its rank number.
index = [parts[0] for parts in numbers]

In [108]:
# The last piece of every split heading is split again on "(": the text
# before it is the full name, the text after it (minus its final character)
# is the handle.
names = [parts[-1].split("(") for parts in numbers]
fullname = [pieces[0].strip() for pieces in names]
handle = [pieces[-1][:-1] for pieces in names]


In [109]:
# One row per influencer: rank, full name and Twitter handle.
df = pd.DataFrame({'Number': index, 'Name': fullname, 'Handle': handle})

In [110]:
# Make the rank numeric, order the table by it and use it as the index.
df = (
    df.astype({'Number': int})
    .sort_values('Number')
    .set_index('Number')
)

In [96]:
# Mount Google Drive so results can be saved to it.
from google.colab import drive
# Drive is mounted at two mount points; later cells write CSV files under
# both /content/drive and /content/gdrive.
drive.mount('/content/drive')
drive.mount('/content/gdrive',force_remount=True)
# NOTE(review): relative path, and not referenced by any other visible cell.
root_path = 'gdrive/My Drive/Kaggle_project/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [98]:
# Repeats the /content/gdrive mount and root_path assignment of the previous cell.
drive.mount('/content/gdrive',force_remount=True)
root_path = 'gdrive/My Drive/Kaggle_project/'

Mounted at /content/gdrive


In [111]:
# Preview the first rows of the influencer table.
df.head()

Unnamed: 0_level_0,Name,Handle
Number,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Trevor Noah,@Trevornoah
2,Gareth Cliff,@GarethCliff
3,Zuma,@SAPresident
4,News24,@News24
5,Julius Sello Malema,@Julius_S_Malema


In [112]:
# Article that is scraped below for the Twitter handles of African leaders.
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/'

In [135]:

# NOTE(review): `get_influencers` is not defined in any visible cell, and the
# saved output of this cell is a NameError -- define it or remove this cell.
res = get_elements('https://africafreak.com/100-most-influential-twitter-users-in-africa','h2')
influencers = get_influencers(res)

NameError: ignored

In [141]:
  response = simple_get(url)
  res = get_elements(response, search={'find_all':{'class_':'wp-block-embed__wrapper'}})
  pattern = "@[a-zA-Z]+[0-9]*[/_*]*[a-zA-Z]*"
  twitter_handle = re.findall(pattern, str(res))
  print("The total number of African leaders extracted", len(twitter_handle))
  twitter_handle[:5]

findaing all of {'class_': 'wp-block-embed__wrapper'}
The total number of African leaders extracted 52


['@EswatiniGovern1',
 '@MalawiGovt',
 '@hagegeingob',
 '@FinanceSC',
 '@PresidencyZA']

### Twitter scraping to find the follower counts and most used hashtags

In [117]:

# Look up the follower count of every influencer handle.
# NOTE(review): `api` is not defined in any visible cell (the class above only
# exposes it as `tweetsearch().api`) -- confirm where it comes from.
followers_count = {}
for screen_name in handle:
    try:
        user = api.get_user(screen_name)
        followers_count[screen_name] = user.followers_count
    except Exception as e:
        # Best-effort lookup: a failing handle is skipped, but the failure is
        # logged instead of silently dropped so missing rows can be explained.
        logger.warning('Could not fetch follower count for %s: %s', screen_name, e)

In [118]:
# Number of handles for which a follower count was retrieved.
len(followers_count)

93

In [119]:
# Attach the follower count to each influencer; handles without one become NaN.
df = df.assign(Follower_count=df['Handle'].map(followers_count))

In [120]:
# Largest audiences first.
df = df.sort_values('Follower_count', ascending=False)

In [121]:
# Preview the influencers with the most followers.
df.head()

Unnamed: 0_level_0,Name,Handle,Follower_count
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Trevor Noah,@Trevornoah,10800941.0
4,News24,@News24,3574229.0
5,Julius Sello Malema,@Julius_S_Malema,3124656.0
2,Gareth Cliff,@GarethCliff,1974376.0
15,Euphonik™♛,@euphonik,1753722.0


In [125]:
# List the rows that contain at least one missing value, to see which
# handles did not get a follower count.
has_missing = df.isna().any(axis=1)
dfnull = df[has_missing]
print (dfnull)

                    Name            Handle  Follower_count
Number                                                    
13           John Robbie    @702JohnRobbie             NaN
36           The New Age      @The_New_Age             NaN
50          Afrinnovator     @Afrinnovator             NaN
67          drew hinshaw     @drewfhinshaw             NaN
73           Karen Allen    @BBCKarenAllen             NaN
74      Vanessa Raphaely  @hurricanevaness             NaN
88             andBeyond  @andBeyondSafari             NaN


In [126]:
# Drop the rows with a missing follower count. The author attributes these to
# blocked accounts and reports confirming that with a manual Twitter search.
df = df.dropna()

In [None]:
# The ten influencers with the most followers (df is sorted by follower count above).
first_ten = df.head(10)

In [127]:
 first_ten.to_csv ("/content/gdrive/My Drive/Colab Notebooks/African_influencer.csv", index = False, header=True)

In [143]:
# Look up the follower count and display name of every leader handle.
# NOTE(review): `api` is not defined in any visible cell -- confirm where it
# comes from.
african_follower = {}
new_one = {}
for leader_handle in twitter_handle:
    try:
        user = api.get_user(leader_handle)
        african_follower[leader_handle] = user.followers_count
        new_one[leader_handle] = user.name
    except Exception as e:
        # Best-effort lookup: a failing handle is skipped, but the failure is
        # logged instead of silently dropped.
        logger.warning('Could not fetch profile for %s: %s', leader_handle, e)

In [144]:
# Leaders table: handle, follower count and display name, largest audience
# first, with duplicate rows removed.
df2 = (
    pd.DataFrame({'Twitter Handle': twitter_handle})
    .assign(
        Follower_count=lambda frame: frame['Twitter Handle'].map(african_follower),
        Name=lambda frame: frame['Twitter Handle'].map(new_one),
    )
    .sort_values('Follower_count', ascending=False)
    .drop_duplicates()
)

In [146]:
# Top ten leaders by follower count. This must come from df2 (the leaders
# table); df holds the influencer ranking.
topten_african_leader = df2.head(10)

In [None]:
# Save the top-ten leaders table to Drive, without the index column.
topten_african_leader.to_csv("/content/drive/My Drive/Colab Notebooks/African_leaders.csv", index = False, header=True)

In [148]:
# Preview the ten leader accounts with the most followers.
df2.head(10)

Unnamed: 0,Twitter Handle,Follower_count,Name
16,@WHO,8095178,World Health Organization (WHO)
47,@MBuhari,3267424,Muhammadu Buhari
22,@BorisJohnson,2906976,Boris Johnson #StayAlert
48,@PaulKagame,1980621,Paul Kagame
29,@KagutaMuseveni,1809068,Yoweri K Museveni
4,@PresidencyZA,1596864,Presidency | South Africa 🇿🇦
41,@NAkufoAddo,1503822,Nana Akufo-Addo
21,@CyrilRamaphosa,1483153,Cyril Ramaphosa 🇿🇦 #StaySafe
49,@Macky_Sall,1372441,Macky Sall
15,@StateHouseKenya,1101951,State House Kenya


In [None]:
# Drive folder the influencer CSV above is written to:
# /content/gdrive/My Drive/Colab Notebooks