In [2]:
import os
from os import path, listdir

from lxml import etree
from lxml.etree import XPath

import ujson as json
import pandas as pd

from itertools import filterfalse
from time import time
from datetime import datetime as dt

In [3]:
# hack to avoid passing the namespace all the time; from http://stackoverflow.com/a/17293795/786559
# etree.FunctionNamespace("http://exslt.org/regular-expressions").prefix = 're'
re_NS = {'re': "http://exslt.org/regular-expressions"}

In [4]:
def get_next_file(journal_path, min_year=None, max_year=None):
    """ Returns a the filename to process next, along with its year and month. 
        If the range is given, then only those years are considered """
    for year in listdir(journal_path):
        # filter range:
        if min_year and int(year) < min_year or \
           max_year and int(year) > max_year:
            continue
            
        year_path = path.join(journal_path, year)
        for filename in listdir(year_path):
            month = path.splitext(filename)[0]
            yield path.join(year_path, filename), int(year), int(month)

In [5]:
def get_xpath_search(names, with_words=None):
    """ Returns an XPath search function returning articles whose full_text matches any of their names
    and any of `with_words` but none of `without_words` (if given)"""

    names_regex_str = 're:test(., "{regex}", "i")'.format(regex='|'.join(names))
    if with_words:
        with_regex_str = ' and re:test(., "{regex}", "i")'.format(regex='|'.join(with_words))
    else:
        with_regex_str = ''
    
    match_string = './/full_text[{names_regex} {with_regex}]//ancestor::article'.format(
            names_regex = names_regex_str, with_regex=with_regex_str)
    return XPath(match_string, namespaces=re_NS)
    # no need to pass the namespaces, because of the 'hack' above
#     return etree.XPath(match_string)

In [6]:
result_path = '/home/tomoiaga/hum_dig/results/'

# set the searches
search_text = XPath('.//full_text/text()')
search_date = XPath('./entity[1]/meta/issue_date/text()')
search_journal = XPath('./entity[1]/meta/publication/text()')
search_num_words = XPath('./entity[1]/meta/updated_word_count/text()')
search_num_chars = XPath('./entity[1]/meta/updated_char_count/text()')

def export_iramu(articles, country):
    """ Appends all the articles to the country's file """
    # the current name is the first in the list
    country_name = country['all_names'][0] 
    txt_name = path.join(result_path, country_name + '.txt')
    
    # pays, date, date_an, date_mois, date_jour, journal, len_mots, len_chars, colonist, continent
    article_template = "**** *p_{country} *date_{date} *an_{year} *mois_{month} *jour_{day} *j_{journal}" + \
                           " *lenMots_{words} *lenChar_{chars} *col_{colonist} *con_{continent}\n"
    
    # prepare country_name for iramuteq
    country_name = country_name.replace('-', '_').replace(' ', '_').replace("'", '_')
        
    with open(txt_name, 'a') as f:
        for art in articles:
            date_str = search_date(art)[0].replace('/', '-') # put in iramu format
            date = dt.strptime(date_str, "%d-%m-%Y")
            journal = search_journal(art)[0]
            num_words = search_num_words(art)[0]
            num_chars = search_num_chars(art)[0]
            texts = search_text(art)
            
            # write the header line
            try:
                header = article_template.format(country=country_name, date=date_str, year=date.year, month=date.month, day=date.day,
                                             journal=journal, words=num_words, chars=num_chars, 
                                             colonist=country['colonist'], continent=country['continent'])
            except:
                Tracer()()
            f.write(header)
            
            # write the contents, removing any stars first
            f.write(' '.join(texts).replace('*', ' '))
            
            # write separation
            f.write('\n\n')

In [7]:
def prepare_countries(countries, span, with_words=[]):
    """ Inserts the needed fields for searching 
        Returns the smallest and biggest independence years """
    min_year, max_year = 3000, 0
    for name, country in countries.items():
        # build the word list
        country['all_names'] = [name]
        if country['other']:
            country['all_names'].append(country['other'])

        # setup search
        country['path'] = get_xpath_search(country['all_names'], with_words)
        country['articles'] = []
        
        # update years
        if country['ind_year'] < min_year:
            min_year = country['ind_year']
        if country['ind_year'] > max_year:
            max_year = country['ind_year']
    return min_year, max_year

def year_in_range(country, year, span):
    """ Whether we lookup the country in the given year, based on a span around its independence year """
    return country['ind_year'] - span <= year <= country['ind_year'] + span

In [8]:
def get_articles(countries, with_words, span=2):
    """ span = the period to consider around independence date
        countries = { 'Lybie' : {           # each country is a dict
                'other': 'Urundi',          # other names that it was known as 
                'ind_year' : 1960,          # year of independence
                'continent': 'Africa',
                'colonist' : 'France',
                
                # the function adds these:
                'names'    : [...]
                'path'     : XPath(names),  # an XLST search path for 'alt_names'
                'articles' : []             # the list of articles corresponding to this country
            }
        } """
    # check you didn't pass the params in wrong order by mistake
    assert(type(with_words) != 'number')
    
    min_year, max_year = prepare_countries(countries, span, with_words)
    
    # augment with the required range
    if span:
        min_year, max_year = min_year - span, max_year + span
    else:
        min_year, max_year = 1945, 1995

    root_path = "/mnt/le_temps_data/letempsdata/data4-month/"
    log_file  = open('./logs/' + dt.now().strftime('%m_%d_%H_%M') + '.log', 'w')
    for journal in ['GDL', 'JDG']:
        journal_path = path.join(root_path, journal)
        start = time()
        prev_file_year = min_year - 1 # make sure it's different
        year_articles = 0
        
        for file_path, file_year, file_month in get_next_file(journal_path, min_year, max_year):
            # prin stats when finishing an file_year
            if file_year != prev_file_year:
                duration = time() - start
                print(journal, file_year, duration, 'Processed: ', year_articles, flush=True)
                print(journal, file_year, duration, 'Processed: ', year_articles, flush=True, file=log_file)
                start = time()
                prev_file_year = file_year
                year_articles = 0

            with open(file_path) as f:
                tree = etree.parse(f)
                
                for name, country in countries.items():
                    # skip irrelevant countries for this year
                    if span and not year_in_range(country, file_year, span):
                        continue

                    articles = country['path'](tree)
                    year_articles += len(articles)
                    # save into the data structure
#                     country['articles'].extend(articles)
                    export_iramu(articles, country)
    log_file.close()
#     return countries
        

In [72]:
deco = {'décolonis' : {'colonist': 'tous', 'continent':'tous', 'ind_year': 1970, 'other':''}}

In [73]:
get_articles(deco, None, span=25)

GDL 1945 0.005806684494018555 Processed:  0
GDL 1946 1.9948046207427979 Processed:  0
GDL 1947 2.0686988830566406 Processed:  0
GDL 1948 1.7058167457580566 Processed:  0
GDL 1949 1.9859576225280762 Processed:  1
GDL 1950 1.8600895404815674 Processed:  1
GDL 1951 2.0267996788024902 Processed:  0
GDL 1952 1.8184974193572998 Processed:  0
GDL 1953 1.8471410274505615 Processed:  0
GDL 1954 1.7810282707214355 Processed:  0
GDL 1955 2.066211700439453 Processed:  0
GDL 1956 1.817195177078247 Processed:  0
GDL 1957 1.7490234375 Processed:  1
GDL 1958 2.0494437217712402 Processed:  1
GDL 1959 2.0210931301116943 Processed:  3
GDL 1960 2.060258150100708 Processed:  5
GDL 1961 2.326807737350464 Processed:  40
GDL 1962 2.509708881378174 Processed:  81
GDL 1963 2.3595192432403564 Processed:  59
GDL 1964 2.3196139335632324 Processed:  45
GDL 1965 2.6904420852661133 Processed:  35
GDL 1966 2.0535528659820557 Processed:  19
GDL 1967 2.6269397735595703 Processed:  41
GDL 1968 2.2165913581848145 Processe

In [74]:
countries = pd.read_excel('./Liste mots cles.xlsx', sheetname='Pays_2', index_col='name', convert_float=True).fillna('').to_dict('index')

In [75]:
countries

{'Algérie': {'colonist': 'France',
  'continent': 'Afrique',
  'ind_date': Timestamp('1962-07-05 00:00:00'),
  'ind_year': 1962,
  'other': '',
  'president': 'Ahmed Ben Bella'},
 'Angola': {'colonist': 'Portugal',
  'continent': 'Afrique',
  'ind_date': Timestamp('1975-11-11 00:00:00'),
  'ind_year': 1975,
  'other': '',
  'president': 'Agostinho Neto'},
 'Bangladesh': {'colonist': 'Pakistan',
  'continent': 'Asie',
  'ind_date': Timestamp('1971-03-26 00:00:00'),
  'ind_year': 1971,
  'other': '',
  'president': ''},
 'Birmanie': {'colonist': 'Grande-Bretagne',
  'continent': 'Asie',
  'ind_date': Timestamp('1948-01-04 00:00:00'),
  'ind_year': 1948,
  'other': '',
  'president': ''},
 'Botswana': {'colonist': 'Grande-Bretagne',
  'continent': 'Afrique',
  'ind_date': Timestamp('1966-09-30 00:00:00'),
  'ind_year': 1966,
  'other': 'Bechuanaland',
  'president': 'Seretse Khama'},
 'Burkina Faso': {'colonist': 'France',
  'continent': 'Afrique',
  'ind_date': Timestamp('1960-08-05 00:0

In [79]:
get_articles(countries, [], 2)

GDL 1944 0.005637407302856445 Processed:  0
GDL 1945 1.9117045402526855 Processed:  57
GDL 1946 3.7421064376831055 Processed:  1015
GDL 1947 5.296717643737793 Processed:  1300
GDL 1948 7.743750810623169 Processed:  1254
GDL 1949 8.55806040763855 Processed:  1382
GDL 1950 8.709542989730835 Processed:  1555
GDL 1951 6.514414072036743 Processed:  455
GDL 1952 6.849787712097168 Processed:  368
GDL 1953 5.731336355209351 Processed:  473
GDL 1954 6.122942924499512 Processed:  777
GDL 1955 8.757981538772583 Processed:  2344
GDL 1956 11.114814281463623 Processed:  1114
GDL 1957 9.978607892990112 Processed:  806
GDL 1958 9.422631740570068 Processed:  886
GDL 1959 34.894047498703 Processed:  1640
GDL 1960 33.89405703544617 Processed:  1188
GDL 1961 44.93664526939392 Processed:  3164
GDL 1962 45.990092039108276 Processed:  2572
GDL 1963 47.97602987289429 Processed:  2366
GDL 1964 18.895557403564453 Processed:  907
GDL 1965 22.76979160308838 Processed:  714
GDL 1966 10.86103892326355 Processed:  1

In [83]:
countries = pd.read_excel('./Liste mots cles.xlsx', sheetname='Pays_2', index_col='name', convert_float=True).fillna('').to_dict('index')

In [88]:
countries

{'Algérie': {'colonist': 'France',
  'continent': 'Afrique',
  'ind_date': Timestamp('1962-07-05 00:00:00'),
  'ind_year': 1962,
  'other': '',
  'president': 'Ahmed Ben Bella'},
 'Angola': {'colonist': 'Portugal',
  'continent': 'Afrique',
  'ind_date': Timestamp('1975-11-11 00:00:00'),
  'ind_year': 1975,
  'other': '',
  'president': 'Agostinho Neto'},
 'Bangladesh': {'colonist': 'Pakistan',
  'continent': 'Asie',
  'ind_date': Timestamp('1971-03-26 00:00:00'),
  'ind_year': 1971,
  'other': '',
  'president': ''},
 'Birmanie': {'colonist': 'Grande-Bretagne',
  'continent': 'Asie',
  'ind_date': Timestamp('1948-01-04 00:00:00'),
  'ind_year': 1948,
  'other': '',
  'president': ''},
 'Botswana': {'colonist': 'Grande-Bretagne',
  'continent': 'Afrique',
  'ind_date': Timestamp('1966-09-30 00:00:00'),
  'ind_year': 1966,
  'other': 'Bechuanaland',
  'president': 'Seretse Khama'},
 'Burkina Faso': {'colonist': 'France',
  'continent': 'Afrique',
  'ind_date': Timestamp('1960-08-05 00:0

In [122]:
get_articles(countries, ['décolonis', 'indépend'], 0)

GDL 1945 0.02464890480041504 Processed:  0
GDL 1946 72.52974724769592 Processed:  196
GDL 1947 86.55457782745361 Processed:  383
GDL 1948 72.93958163261414 Processed:  417
GDL 1949 81.36562395095825 Processed:  348
GDL 1950 82.39210176467896 Processed:  490
GDL 1951 83.25923609733582 Processed:  459
GDL 1952 69.3944046497345 Processed:  217
GDL 1953 71.64987540245056 Processed:  405
GDL 1954 75.30775880813599 Processed:  603
GDL 1955 82.45940351486206 Processed:  998
GDL 1956 79.0343017578125 Processed:  661
GDL 1957 71.34850692749023 Processed:  746
GDL 1958 84.59223532676697 Processed:  993
GDL 1959 81.58337593078613 Processed:  1033
GDL 1960 81.81172299385071 Processed:  839
GDL 1961 99.96001362800598 Processed:  1600
GDL 1962 101.86950445175171 Processed:  1168
GDL 1963 96.50038433074951 Processed:  967
GDL 1964 94.40223956108093 Processed:  609
GDL 1965 109.50357413291931 Processed:  755
GDL 1966 80.90409421920776 Processed:  460
GDL 1967 105.17990255355835 Processed:  585
GDL 196

In [9]:
countries = pd.read_excel('./Liste mots cles.xlsx', sheetname='Pays_2', index_col='name', convert_float=True).fillna('').to_dict('index')

In [11]:
t = etree.parse('/mnt/le_temps_data/letempsdata/data4-month/GDL/1962/08.xml')

In [13]:
cs = {'Burkina Faso': countries['Burkina Faso']}

In [15]:
prepare_countries(cs, 2)

(1960, 1960)

In [21]:
cs['Burkina Faso']['path']

.//full_text[re:test(., "Burkina Faso|Afrique-Occidentale française", "i") ]//ancestor::article

In [28]:
t.xpath('.//full_text[re:test(., "Burkina", "i")]//ancestor::article', namespaces=re_NS)

[]