# Data mining

## Load paths

In [None]:
def umlaut(string):
    """Repair garbled German umlauts and sharp s in *string*.

    Each two-character sequence in the table below is replaced by the single
    character it stands for. The sequences look like UTF-8 encoded umlauts
    that were decoded with a Windows code page.

    Args:
        string: The text to repair.

    Returns:
        A copy of *string* with every known garbled sequence replaced.

    Raises:
        TypeError: If *string* is not a ``str``.
    """
    if not isinstance(string, str):
        raise TypeError("The object is a " + str(type(string)) + ", not a string!")
    # (garbled, repaired) pairs; the patterns do not overlap, so order is irrelevant
    repairs = (
        ("Ã¼", "ü"),
        ("Ã¤", "ä"),
        ("Ã¶", "ö"),
        ("Ãœ", "Ü"),
        ("Ã„", "Ä"),
        ("Ã–", "Ö"),
        ("ÃŸ", "ß"),
    )
    for garbled, repaired in repairs:
        string = string.replace(garbled, repaired)
    return string

def replace_umlauts(dictionary):
    """Recursively repair garbled umlauts in a nested structure.

    Strings are passed through ``umlaut``. Mutable mappings and mutable
    sequences are updated IN PLACE (values/items only, mapping keys are left
    untouched) and the same object is returned. Every other value (numbers,
    ``None``, booleans, tuples, ...) is returned unchanged, so data loaded
    from JSON can contain non-string leaves.

    Args:
        dictionary: A string, a (nested) dict/list, or any other value.

    Returns:
        The repaired string, or the same container object after its contents
        were repaired, or the unchanged value.
    """
    from collections.abc import MutableMapping, MutableSequence

    if isinstance(dictionary, str):
        return umlaut(dictionary)
    if isinstance(dictionary, MutableMapping):
        # only existing keys are reassigned, so iterating items() stays valid
        for key, value in dictionary.items():
            dictionary[key] = replace_umlauts(value)
    elif isinstance(dictionary, MutableSequence):
        for position, value in enumerate(dictionary):
            dictionary[position] = replace_umlauts(value)
    return dictionary

In [None]:
import json

# Read the configuration explicitly as UTF-8: without an encoding argument
# open() uses the platform default, which can garble non-ASCII characters.
with open('paths.json', encoding='utf-8') as f:
    data = json.load(f)

# Still repair any umlauts that are already garbled inside the file itself.
data = replace_umlauts(data)

data

In [None]:
# Path of the Word document that is opened below to read the keyword table
keywordFile = data['keywords']['file']
keywordFile

In [None]:
# Directories whose files are opened as Outlook items further down
dataDirs = data['data']['directory']
dataDirs

In [None]:
# Name of a single message file; it is later looked up inside the first data directory
testFile = data['data']['test']
testFile

## Load data

### Keywords
The keywords are stored in a table in an MS Word document.

In [None]:
# read the first table of the docx file into a list of rows of cell texts

from docx import Document
document = Document(keywordFile)
table = document.tables[0]
n_columns = len(table.columns)
text = []
for row in table.rows:
    # fetch the row's cells once; the original looked up table.rows[i].cells
    # again for every single cell, which is needlessly slow on large tables
    cells = row.cells
    text.append([cells[j].text for j in range(n_columns)])

# remove header
text = text[1:]

In [None]:
# split each row by language: columns 0-1 go to German, columns 2-3 to English
text_de = [row[0:2] for row in text]
text_en = [row[2:4] for row in text]
text_de[0]

In [None]:
def produce_dict(text):
    """Group veto words under the keyword they belong to.

    Each row holds a keyword in column 0 and a veto word in column 1. A
    non-empty keyword starts a new (empty) veto list; rows with an empty
    keyword continue the list of the most recent keyword. A keyword that
    occurs again later starts over with an empty list.

    Args:
        text: Iterable of rows, each indexable with at least two strings.

    Returns:
        dict mapping each keyword to the list of its veto words.
    """
    dictionary = {}
    key = None  # no keyword seen yet
    for row in text:
        word, veto = row[0], row[1]
        if word != '':
            key = word
            dictionary[key] = []
        # a veto before the first keyword has nothing to attach to; skip it
        # instead of failing with a KeyError
        if veto != '' and key is not None:
            dictionary[key].append(veto)
    return dictionary

In [None]:
# make dictionaries that map each keyword to its list of vetoes, one per language
keywords_de = produce_dict(text_de)
keywords_en = produce_dict(text_en)
# show the veto list of the first German keyword (raises StopIteration if there is none)
keywords_de[next(iter(keywords_de))]

### Documents

Documents are drawn from URLs in MS Outlook files

#### Read emails

In [None]:
import win32com.client
import os
import re
from collections import OrderedDict

outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

# everything between angle brackets in a mail body is treated as a link
linkPattern = re.compile(r'<(.*?)>')

URLs = []
maxMails = 200
openMails = 0  # number of mails actually opened
for directory in dataDirs:
    # stop scanning further directories once the limit is reached
    if openMails >= maxMails:
        break
    for filename in os.listdir(directory):
        if openMails >= maxMails:
            break
        openMails += 1
        filePath = os.path.abspath(os.path.join(directory, filename))
        msg = outlook.OpenSharedItem(filePath)
        links = linkPattern.findall(msg.Body)
        links = list(OrderedDict.fromkeys(links))  # remove duplicates, keep order
        links = [link for link in links if 'alerts' not in link]
        if links:
            print(filePath + " : " + str(len(links)) + " links")
            URLs.extend(links)
        else:
            print("No match!")

# open the configured test message so the notebook displays it
outlook.OpenSharedItem(os.path.abspath(os.path.join(dataDirs[0], testFile)))

In [None]:
# report how many links were collected from the mails
print('Found {} URLs'.format(len(URLs)))

#### Follow links and extract article text

In [None]:
from bs4.element import Comment

def tag_visible(element):
    """Tell whether a text node should count as visible page text.

    A node is rejected when its parent tag is one of the listed non-content
    tags, when it is a comment, or when its stripped text equals the stripped
    name of its parent tag.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]', 'footer', 'a')
    parent_name = element.parent.name
    if parent_name in hidden_parents or isinstance(element, Comment):
        return False
    return parent_name.strip() != element.strip()

def extract_info(soup):
    """Return the stripped, non-empty visible text fragments of a parsed page.

    Args:
        soup: A parsed BeautifulSoup document.

    Returns:
        list of str: the text of every node accepted by ``tag_visible``, with
        surrounding whitespace removed and empty results dropped, in document
        order.
    """
    fragments = []
    # find_all(string=True) is the current spelling of the legacy findAll(text=True)
    for node in soup.find_all(string=True):
        if not tag_visible(node):
            continue
        stripped = node.strip()
        if stripped:
            fragments.append(stripped)
    return fragments

In [None]:
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import functools

maxURLs = 1000
requestTimeout = 30  # seconds; without a timeout one stalled server blocks the whole run
margin = 10  # number of text entries kept on each side of the longest one
data = []
exceptions = []


def _fetch(url, label):
    """Return the response for *url*, or None if the request failed.

    Failures are recorded in ``exceptions`` as ``[exception, url]``; anything
    other than a missing URL schema is also printed, using *label* to say
    which kind of URL failed.
    """
    try:
        return requests.get(url, timeout=requestTimeout)
    except requests.exceptions.MissingSchema as e:
        exceptions.append([e, url])
    except Exception as e:
        # deliberate best effort: one bad link must not abort the whole crawl
        print('Exception ' + type(e).__name__ + ' in ' + label + ' ' + url)
        exceptions.append([e, url])
    return None


# slicing up front keeps the progress bar total in line with the work done
for URL in tqdm(URLs[:maxURLs]):
    # get googleLink
    r = _fetch(URL, 'google URL')
    if r is None:
        # never fall through with a stale or undefined response
        continue
    result = re.search('URL=(.*?)"', r.text)
    if result is None:
        continue
    articleURL = result.group(1)
    # follow redirect to get final article
    r = _fetch(articleURL, 'article URL')
    if r is None:
        continue
    soup = BeautifulSoup(r.text, 'html5lib')
    text = extract_info(soup)
    if not text:
        continue
    # restrict to text around longest continuous entry
    # TODO: improve this to cut away more junk
    longest = text.index(max(text, key=len))
    minIdx = max(longest - margin, 0)
    # slice ends are exclusive: +1 keeps the window symmetric and stops a
    # one-entry page from collapsing to an empty string
    maxIdx = min(longest + margin + 1, len(text))
    text = ' '.join(text[minIdx:maxIdx])
    data.append({'text': text, 'URL': articleURL})

In [None]:
# peek at the first scraped article (raises IndexError if nothing was scraped)
data[0]

In [None]:
import pandas as pd

# one row per scraped article; the columns come from the dict keys 'text' and 'URL'
df = pd.DataFrame(data)
df

In [None]:
# print the DataFrame summary
df.info()

#### Write data on disk

In [None]:
# NOTE(review): despite the ".h5" extension this writes a pickle file, not HDF5
df.to_pickle("articles.h5")
# read it back with df = pd.read_pickle("articles.h5"); only unpickle files you trust
# for very quick access use HDF5 (PyTables) instead

### Preprocessing