In [13]:
import os
from os import path, listdir

from lxml import etree
from lxml.etree import XPath
import pickle as pkl
from itertools import filterfalse

In [2]:
# Prefix -> URI map for the EXSLT regular-expression functions; it is passed as
# `namespaces=` to the XPath expressions below that use the `re:` prefix.
# (Registering the prefix globally through etree.FunctionNamespace would avoid
#  passing it around, see http://stackoverflow.com/a/17293795/786559.)
re_NS = dict(re="http://exslt.org/regular-expressions")

In [3]:
def _to_int(text):
    """Return ``int(text)``, or None when `text` is not an integer literal."""
    try:
        return int(text)
    except ValueError:
        return None


def get_next_file(journal_path, min_year=None, max_year=None):
    """Yield the files of one journal, along with their year and month.

    `journal_path` is expected to hold one sub-directory per year, each
    containing files whose base name (extension stripped) is the month number.
    Entries that do not follow this layout (hidden files, checkpoint
    directories, ...) are skipped instead of aborting the whole walk.

    Args:
        journal_path: directory of the journal to walk.
        min_year: if given, years strictly below it are ignored.
        max_year: if given, years strictly above it are ignored.

    Yields:
        ``(file_path, year, month)`` tuples with `year` and `month` as ints,
        in chronological order (listdir() itself gives no ordering guarantee).
    """
    numbered_years = []
    for year_name in listdir(journal_path):
        year = _to_int(year_name)
        year_path = path.join(journal_path, year_name)
        if year is None or not path.isdir(year_path):
            continue
        # filter range:
        if (min_year and year < min_year) or (max_year and year > max_year):
            continue
        numbered_years.append((year, year_path))

    for year, year_path in sorted(numbered_years):
        numbered_months = []
        for filename in listdir(year_path):
            month = _to_int(path.splitext(filename)[0])
            if month is not None:
                numbered_months.append((month, filename))

        for month, filename in sorted(numbered_months):
            yield path.join(year_path, filename), year, month

In [14]:
def get_xpath_search(word_list):
    """ Compile an XPath query selecting the articles that mention any word of `word_list`.

    The words are joined with '|' and inserted verbatim as the pattern of an
    EXSLT `re:test` call (with the "i" flag) applied to `full_text` elements;
    the query returns the `article` ancestors of the matching elements. """
    alternatives = '|'.join(word_list)
    expression = './/full_text[re:test(text(), "{regex}", "i")]//ancestor::article'.format(regex=alternatives)
    # the `re` prefix used in the expression is resolved through re_NS
    return XPath(expression, namespaces=re_NS)

In [16]:
# directory in which export_iramu() writes its text files
result_path = '/home/tomoiaga/hum_dig/results/'

# pre-compiled searches (issue date, then full text)
search_date = XPath('./entity/issue_date/text()')
search_text = XPath('.//full_text/text()')

def export_iramu(articles, country_name):
    """ Appends all the articles to the country's file.

    Every article becomes one paragraph: the strings returned by
    `to_text(article)` joined with single spaces, with each '*' replaced by a
    space, followed by an empty line.

    Args:
        articles: iterable of articles accepted by `to_text`.
        country_name: base name of the output file inside `result_path`.
            The '.txt' extension is added here; a name that already ends in
            '.txt' is accepted as is, so it does not end up as 'x.txt.txt'.
    """
    # NOTE(review): `to_text` is not defined in the visible cells -- confirm it
    # exists before running this.
    base_name = country_name[:-len('.txt')] if country_name.endswith('.txt') else country_name
    file_name = path.join(result_path, base_name + '.txt')

    # TODO: the per-article header is not written yet. Planned fields:
    # country, year, month, day, journal, length in words, length in chars
    #   "**** *p_{country} *da_{year} *dm_{month} *dj_{day} j_{journal} *lm_{words} *lc_{chars}"

    # Explicit encoding: the output must not depend on the locale's default.
    with open(file_name, 'a', encoding='utf-8') as f:
        for art in articles:
            texts = to_text(art)
            # '*' is stripped from the body, presumably because it introduces
            # the metadata fields of the planned header -- TODO confirm
            f.write(' '.join(texts).replace('*', ' '))
            f.write('\n\n')

In [11]:
def get_articles(countries, with_words, without_words, min_year=None, max_year=None):
    """ Collects, for every country, the articles of the 'GDL' and 'JDG' journals that
    mention the country, contain at least one of `with_words` and none of `without_words`.

        countries = [                                   # list of countries
            {                                           # each country is a dict
                'name'     : 'Burundi'                  # the current name
                'alt_names': ['Urundi', 'Urundi-Ghana'],# other names that it was known as

                # the function adds these:
                'path'     : XPath(names),              # an XPath search for 'alt_names'
                'articles' : []                         # the list of articles corresponding to this country
            }
        ]

    Args:
        countries: list of dicts as described above; modified in place.
        with_words: words of which at least one must appear in an article
            (empty / None: no such constraint).
        without_words: words that must not appear in an article
            (empty / None: no exclusion).
        min_year, max_year: optional inclusive year range, forwarded to get_next_file.

    Returns:
        The same `countries` list, with 'path' and 'articles' filled in.
    """

    # prepare the structure
    for country in countries:
        country['path'] = get_xpath_search(country['alt_names'])
        country['articles'] = []

    # prepare the AND and EXCLUDE filters
    and_filter = get_xpath_search(with_words) if with_words else None
    not_filter = get_xpath_search(without_words) if without_words else None

    root_path = "/mnt/le_temps_data/letempsdata/data4-month/"
    for journal in ['GDL', 'JDG']:
        journal_path = path.join(root_path, journal)
        for file_path, _year, _month in get_next_file(journal_path, min_year, max_year):
            # Hand the path to lxml so the parser reads the bytes itself and
            # honours the encoding declared in the file, instead of decoding
            # through a text-mode handle with the locale's default encoding.
            tree = etree.parse(file_path)

            for country in countries:
                matching = country['path'](tree)
                if and_filter is not None:
                    # each search returns a (possibly empty) list, used as a truth value
                    matching = filter(and_filter, matching)
                if not_filter is not None:
                    # EXCLUDE: drop the articles for which the search finds something
                    matching = filterfalse(not_filter, matching)

                # save into the data structure
                country['articles'].extend(matching)

    return countries

In [60]:
t = etree.parse('./text.xml')

In [19]:
s = get_xpath_search(['12', '2'])

In [20]:
As = s(t)

In [22]:
a = As[0]

In [66]:
# t.xpath('.//full_text[re:test(text(), "text") and re:test(text(), "7") and not(re:test(text(), "1"))]/ancestor::article', namespaces=re_NS)
t.xpath('.//full_text[re:test(text(), "text") and re:test(text(), "[0-9]{2}")  and not(re:test(text(), "2"))]/ancestor::article', namespaces=re_NS)

[<Element article at 0x7f566c10dc08>]

In [38]:
a.xpath('./entity[1]/meta/issue_date[1]/text()')

['01/02/1798']

In [149]:
f = etree.XPath('.//full_text[re:test(text(),"12")]//ancestor::article', namespaces=re_NS)

In [154]:
list(filter(f, filter(f, As)))

[<Element article at 0x7fefaf939608>]

In [138]:
list(filter(lambda x: x, [2, 3, [4], {5}, '6', None, [], {}, ""]))

[2, 3, [4], {5}, '6']

In [136]:
if [2]:
    print("Yee")
else:
    print("noo")

Yee


In [12]:
# NOTE(review): this call follows an earlier signature. With the current
# get_articles(countries, with_words, without_words, min_year, max_year),
# 1965 and 1980 land in with_words / without_words and the countries are
# expected to be dicts with an 'alt_names' key, not plain strings.
arts = get_articles(['Angola'], 1965, 1980)

//full_text[re:match(text(), "Angola")]//ancestor::article


In [35]:
export_iramu(arts, 'angola.txt')

In [13]:
len(arts)

2460

In [24]:
a = arts[1]

In [None]:
# Sketch (never executed): keep only the articles that do not contain a word.
# NOTE(review): `words` and `articles` are not defined in the visible cells.
filter(without(words), articles)

In [87]:
def without(word):
    """ Returns a predicate that is true for the articles whose full_text does not contain `word`.

    The word is handed to the query as the XPath variable `$w` rather than
    being pasted into the expression, so a word containing quotes cannot
    break (or alter) the query. The match is case-sensitive. """
    # contains() converts text() to a string, i.e. it looks at the first text node only
    has_word = etree.XPath('.//full_text[contains(text(), $w)]')
    def predicate(article):
        return len(has_word(article, w=word)) == 0
    return predicate

In [94]:
def with_filter(word):
    """ Returns a predicate that is true for the articles whose full_text contains `word`.

    The word is handed to the query as the XPath variable `$w` rather than
    being pasted into the expression, so a word containing quotes cannot
    break (or alter) the query. The match is case-sensitive. """
    # contains() converts text() to a string, i.e. it looks at the first text node only
    has_word = etree.XPath('.//full_text[contains(text(), $w)]')
    def predicate(article):
        return len(has_word(article, w=word)) > 0
    return predicate

Angola AND décolonisation

In [72]:
len(list(filter(with_filter('décolonisation'), arts)))

131

In [76]:
# NOTE(review): this call follows an earlier signature. With the current
# get_articles(countries, with_words, without_words, min_year, max_year),
# 1945 and 1990 land in with_words / without_words and the first argument
# is expected to be a list of country dicts, not of search words.
tous_deco = get_articles(['décolonisation'], 1945, 1990)

//full_text[re:match(text(), "décolonisation")]//ancestor::article


In [78]:
export_iramu(tous_deco, 'decolonisation.txt')

In [77]:
len(tous_deco)

2012

In [80]:
# NOTE(review): this call follows an earlier signature. With the current
# get_articles(countries, with_words, without_words, min_year, max_year),
# 1952 and 1967 land in with_words / without_words and the countries are
# expected to be dicts with an 'alt_names' key, not plain strings.
burundi = get_articles(['burundi'], 1952, 1967)

//full_text[re:match(text(), "burundi", "i")]//ancestor::article


In [82]:
len(burundi)

310

In [83]:
export_iramu(burundi, 'burundi.txt')

In [96]:
burundi_congo = list(filter(with_filter('congo'), burundi))

In [93]:
# NOTE(review): this call follows an earlier signature. With the current
# get_articles(countries, with_words, without_words, min_year, max_year),
# 1952 and 1967 land in with_words / without_words and the countries are
# expected to be dicts with an 'alt_names' key, not plain strings.
algerie = get_articles(['algerie'], 1952, 1967)
print(len(algerie))
# export_iramu adds the '.txt' extension itself; the name passed here already carries one
export_iramu(algerie, 'algerie.txt')

265


In [238]:
find_articles = etree.XPath('//full_text[starts-with(text(), "que")]//ancestor::article')

In [244]:
find_articles = etree.XPath('//full_text[re:match(text(), "^que")]//ancestor::article', namespaces={"re": "http://exslt.org/regular-expressions"})

<Element article at 0x7fefafc81ec8>

In [275]:
find_articles = get_xpath_search(['points', 'armeé'])

In [264]:
len(t.findall('article'))

111