In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

In [None]:
# %load ../helpers.py
import datetime

def dated_filename (fn, ext='.csv'):
    '''Return `fn` followed by "-<today's date>" and then `ext`.

    The date is rendered with its default string form (YYYY-MM-DD), so
    dated_filename('out') gives something like 'out-2024-01-31.csv'.
    '''
    pieces = (fn, datetime.date.today(), ext)
    return '{}-{}{}'.format(*pieces)


# Get the rankings for each country

## Get Alexa rankings for one country

In [2]:
from funcy import compose

def get_rankings(country_ranking_url, country_name, timeout=30):
    '''Scrape one Alexa top-sites page into a list of ranking records.

    Args:
        country_ranking_url: URL of the top-sites page to download.
        country_name: Label copied unchanged into every returned record.
        timeout: Seconds handed to `requests.get`, so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        A list of dicts, one per `div.site-listing` element on the page,
        with keys 'rank' (int), 'url' (link text), 'site_info' (link href)
        and 'country_name'.

    Raises:
        requests.exceptions.HTTPError: the server answered with an error
            status; raised instead of silently returning an empty list.
        requests.exceptions.Timeout: no response arrived within `timeout`.
    '''

    def parse_ranking(ranking_url):
        '''Download `ranking_url` and return its parsed site-listing elements.'''
        response = requests.get(ranking_url, timeout=timeout)
        # Without this check an error page would simply contain no listings
        # and the caller would get [] with no hint that the fetch failed.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.find_all("div", {"class": "site-listing"})

    def extract_info(site_listing_html):
        '''Turn one parsed site listing into a plain dict record.'''
        attrs = site_listing_html.find_all("div", {"class":"td"})
        # First cell holds the rank, second cell holds the link to the site.
        link = attrs[1].a
        return {
            'rank': int(attrs[0].text),
            'url': link.text,
            'site_info': link['href'],
            'country_name': country_name,
        }

    return [extract_info(listing)
            for listing in parse_ranking(country_ranking_url)]
    

# Try the scraper on a single country page (Afghanistan) and display the records.
get_rankings('https://www.alexa.com/topsites/countries/AF', 'Afghanistan')


[{'rank': 1,
  'url': 'Google.com',
  'site_info': '/siteinfo/google.com',
  'country_name': 'Afghanistan'},
 {'rank': 2,
  'url': 'Youtube.com',
  'site_info': '/siteinfo/youtube.com',
  'country_name': 'Afghanistan'},
 {'rank': 3,
  'url': 'Facebook.com',
  'site_info': '/siteinfo/facebook.com',
  'country_name': 'Afghanistan'},
 {'rank': 4,
  'url': 'Yahoo.com',
  'site_info': '/siteinfo/yahoo.com',
  'country_name': 'Afghanistan'},
 {'rank': 5,
  'url': 'Acbar.org',
  'site_info': '/siteinfo/acbar.org',
  'country_name': 'Afghanistan'},
 {'rank': 6,
  'url': 'Bbc.com',
  'site_info': '/siteinfo/bbc.com',
  'country_name': 'Afghanistan'},
 {'rank': 7,
  'url': 'Wikipedia.org',
  'site_info': '/siteinfo/wikipedia.org',
  'country_name': 'Afghanistan'},
 {'rank': 8,
  'url': 'Google.com.af',
  'site_info': '/siteinfo/google.com.af',
  'country_name': 'Afghanistan'},
 {'rank': 9,
  'url': 'Jobs.af',
  'site_info': '/siteinfo/jobs.af',
  'country_name': 'Afghanistan'},
 {'rank': 10,
  '

## Get name and URL for all the countries

In [3]:
# Page scraped below for the links to the per-country ranking pages.
all_countries_url = 'https://www.alexa.com/topsites/countries'

In [4]:
# Fetch the country index page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the real country list.
countries_response = requests.get(all_countries_url, timeout=30)
countries_response.raise_for_status()
soup = BeautifulSoup(countries_response.content, 'html.parser')

In [5]:
# print(soup.prettify())
from funcy import flatten

# Gather every <a> tag inside the <ul class="countries"> lists into one flat
# list, keeping the order in which the lists and their links appear.
country_list = [
    anchor
    for countries_ul in soup.find_all('ul', {'class': 'countries'})
    for anchor in countries_ul.find_all('a')
]


## Create a dataframe/CSV with the top 50 Alexa rankings for every country

In [6]:
all_rankings = []
# The global list goes in first, ahead of the per-country rows.
all_rankings.extend(get_rankings('https://www.alexa.com/topsites', 'Global'))
# Then fetch each country's page in the order the links were collected.
for country in country_list:
    country_name = country.text
    print(country_name)  # progress: one line per country fetched
    country_url = 'https://www.alexa.com/' + country['href']
    country_rankings = get_rankings(country_url, country_name)
    all_rankings.extend(country_rankings)
    sleep(0.3)  # short pause between consecutive requests

Afghanistan
Albania
Algeria
Angola
Argentina
Armenia
Australia
Austria
Azerbaijan
Bahamas
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bermuda
Bhutan
Bolivia
Bosnia and Herzegovina
Botswana
Brazil
Brunei
Bulgaria
Burkina Faso
Burundi
Cambodia
Cameroon
Canada
Cape Verde
Chile
China
Colombia
Congo
Costa Rica
Cote d'Ivoire
Croatia
Cuba
Curaçao
Cyprus
Czech Republic
Democratic Republic of Congo
Denmark
Djibouti
Dominican Republic
Ecuador
Egypt
El Salvador
Estonia
Ethiopia
Fiji
Finland
France
French Guiana
French Polynesia
Gabon
Gambia
Georgia
Germany
Ghana
Greece
Guadeloupe
Guam
Guatemala
Guinea
Guyana
Haiti
Honduras
Hong Kong
Hungary
Iceland
India
Indonesia
Iran
Iraq
Ireland
Israel
Italy
Jamaica
Japan
Jordan
Kazakhstan
Kenya
Kuwait
Kyrgyzstan
Laos
Latvia
Lebanon
Lesotho
Liberia
Libya
Lithuania
Luxembourg
Macao
Macedonia
Madagascar
Malawi
Malaysia
Maldives
Mali
Malta
Martinique
Mauritania
Mauritius
Mayotte
Mexico
Moldova
Mongolia
Montenegro
Morocco
Mozambique
Myanmar
Namibia
Ne

In [7]:
# Each scraped record becomes one row; show the last five rows as a quick check.
global_alexa_rankings = pd.DataFrame(data=all_rankings)
global_alexa_rankings.tail(n=5)

Unnamed: 0,country_name,rank,site_info,url
9045,Zimbabwe,46,/siteinfo/thepiratebay.org,Thepiratebay.org
9046,Zimbabwe,47,/siteinfo/newsdzezimbabwe.co.uk,Newsdzezimbabwe.co.uk
9047,Zimbabwe,48,/siteinfo/xvideos.com,Xvideos.com
9048,Zimbabwe,49,/siteinfo/classifieds.co.zw,Classifieds.co.zw
9049,Zimbabwe,50,/siteinfo/sciencedirect.com,Sciencedirect.com


In [3]:
# Write the combined rankings to a date-stamped CSV under data/.
# NOTE(review): this needs `global_alexa_rankings` from the DataFrame cell; the
# NameError recorded below suggests the cell was run on a kernel where that
# cell had not been executed yet. Run the notebook top to bottom.
global_alexa_rankings.to_csv(dated_filename('data/global-alexa-rankings'))

NameError: name 'global_alexa_rankings' is not defined