In [1]:
from bs4 import BeautifulSoup
import os
import glob
import logging
import re
import pycountry

In [2]:
# Look up the pycountry record whose alpha_2 code is 'TR'. As the last
# expression of the cell, the returned object is displayed as the cell output.
pycountry.countries.get(alpha_2='TR')

Country(alpha_2='TR', alpha_3='TUR', name='Turkey', numeric='792', official_name='Republic of Turkey')

In [3]:
# Attach a default stream handler to the root logger so that the records
# emitted through `logger` are displayed. basicConfig() does nothing when the
# root logger already has handlers, so re-running this cell is harmless.
logging.basicConfig()

# Logger dedicated to this notebook, emitting INFO records and above.
logger = logging.getLogger("awstats")
logger.setLevel(logging.INFO)

# Log through `logger` rather than the root logger: the root logger keeps its
# default WARNING level, so logging.info("starting") was silently discarded.
logger.info("starting")

In [4]:
# Directory holding the AWStats data files. It can be overridden with the
# AWSTATS_DATADIR environment variable; the original location stays the default.
datadir = os.environ.get("AWSTATS_DATADIR",
                         "/home/ctroupin/Data/EMODnet/Chemistry/Awstats/")

# Files of `datadir` matching awstats1*2017.sdn-oceanbrowser-net.txt, sorted by name.
filelist = sorted(glob.glob(os.path.join(datadir, "awstats1*2017.sdn-oceanbrowser-net.txt")))
nfiles = len(filelist)
logger.info("Working on {} files".format(nfiles))

# Nothing matched: the following cells would silently produce empty results,
# so make the situation visible (a wrong `datadir` is a likely cause).
if nfiles == 0:
    logger.warning("No AWStats file found in {}".format(datadir))

INFO:awstats:Working on 0 files


In [110]:
# Accumulate the per-domain statistics (pages, hits, bandwidth) over all the
# files of `filelist` into `clist`, a list holding one CountryStat per name.
# Note: CountryStat and findcountry are defined in cells located further down
# in this notebook; those cells have to be executed before this one.
clist = []
for statfiles in filelist:
    logger.info("Working on file {}".format(os.path.basename(statfiles)))

    with open(statfiles) as f:

        # Read until BEGIN_DOMAIN is found. readline() returns an empty string
        # at end of file (a blank line still contains its newline), which is
        # used to stop instead of looping forever when the marker is absent.
        line = f.readline()
        while line and "BEGIN_DOMAIN" not in line:
            line = f.readline()

        if not line:
            logger.warning("No BEGIN_DOMAIN section in file {}".format(os.path.basename(statfiles)))
            continue

        # Get the number of domains from the line (first run of digits)
        count_match = re.search(r'\d+', line)
        if count_match is None:
            logger.warning("No domain count in file {}".format(os.path.basename(statfiles)))
            continue
        ncountries = int(count_match.group())
        logger.info("Countries: {}".format(ncountries))

        # Loop on the domains: each record is "name pages hits bandwidth"
        for _ in range(ncountries):
            lsplit = f.readline().split()

            # Fewer than 4 fields: the section is shorter than announced
            # (or the file is truncated); stop reading this file.
            if len(lsplit) < 4:
                logger.warning("Incomplete domain record in file {}".format(os.path.basename(statfiles)))
                break

            cname = lsplit[0]
            pages = int(lsplit[1])
            hits = int(lsplit[2])
            bw = int(lsplit[3])

            countryindex = findcountry(cname, clist)
            if countryindex is None:
                logger.debug("New country")
                clist.append(CountryStat(cname, pages, hits, bw))
            else:
                logger.debug("Country already encountered")
                clist[countryindex].pages += pages
                clist[countryindex].hits += hits
                clist[countryindex].bw += bw

INFO:awstats:Working on file awstats102017.sdn-oceanbrowser-net.txt
INFO:awstats:Countries: 31
INFO:awstats:Working on file awstats112017.sdn-oceanbrowser-net.txt
INFO:awstats:Countries: 27
INFO:awstats:Working on file awstats122017.sdn-oceanbrowser-net.txt
INFO:awstats:Countries: 22


In [114]:
# Rank the entries by decreasing number of pages; the ranking is done on a
# copy, so that `clist` keeps its original order.
clist_sorted = list(clist)
clist_sorted.sort(key=lambda stat: stat.pages, reverse=True)

[Country ip: 23762 pages, 32637 hits and 573952417 MB of bandwidth,
 Country es: 12554 pages, 16669 hits and 72482406 MB of bandwidth,
 Country net: 11078 pages, 14759 hits and 65811039 MB of bandwidth,
 Country be: 3742 pages, 4793 hits and 24352410 MB of bandwidth,
 Country in: 2608 pages, 2956 hits and 12153502 MB of bandwidth,
 Country si: 1900 pages, 1993 hits and 5915557 MB of bandwidth,
 Country com: 1125 pages, 1710 hits and 18366058 MB of bandwidth,
 Country ee: 861 pages, 1038 hits and 5251092 MB of bandwidth,
 Country it: 501 pages, 1685 hits and 18882390 MB of bandwidth,
 Country tr: 481 pages, 675 hits and 4416832 MB of bandwidth,
 Country nl: 462 pages, 721 hits and 5949976 MB of bandwidth,
 Country de: 395 pages, 762 hits and 9030367 MB of bandwidth,
 Country ru: 341 pages, 400 hits and 1826741 MB of bandwidth,
 Country fr: 295 pages, 769 hits and 13751274 MB of bandwidth,
 Country gr: 221 pages, 511 hits and 6937792 MB of bandwidth,
 Country edu: 216 pages, 336 hits and

In [124]:
# Write the first entries of `clist_sorted` (at most 10) to statsCountry.csv,
# one tab-separated line per entry: name, pages, hits.
# The slice copes with a list shorter than 10 entries (indexing raised an
# IndexError), and the `with` block closes the file even if a write fails.
# NOTE(review): every line ends with a tab before the newline, because "\n" is
# joined as a last field; kept as is so that the file content does not change.
with open("statsCountry.csv", 'w') as fo:
    for country in clist_sorted[:10]:
        fo.write("\t".join((country.name, str(country.pages), str(country.hits), "\n")))

In [47]:
class CountryStat(object):
    """
    Statistics attached to one name: number of pages, number of hits and
    bandwidth (bw).

    All the values are stored as given, without conversion or validation.
    """

    def __init__(self, name=None, pages=0, hits=0, bw=0):
        self.name, self.pages, self.hits, self.bw = name, pages, hits, bw

    def __repr__(self):
        # NOTE(review): bw is printed as stored, followed by the "MB" label;
        # confirm that the stored value really is expressed in megabytes.
        template = "Country {0}: {1} pages, {2} hits and {3} MB of bandwidth"
        fields = (self.name, self.pages, self.hits, self.bw)
        return template.format(*fields)

In [102]:
def findcountry(cname, clist):
    """
    Return the position of the first element of clist whose attribute 'name'
    is equal to cname, or None when no element matches.
    """
    # Lazily enumerate the matching positions and keep the first one only
    matches = (pos for pos, entry in enumerate(clist) if entry.name == cname)
    return next(matches, None)