# Docx

In [42]:
import os
import psutil
import zipfile
import chardet
import numpy as np
import xml.etree.ElementTree as ET
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score

## 0 RAM usage monitoring

In [43]:
def get_ram_usage():
    memory_info = psutil.Process().memory_info()
    return memory_info.rss / (1024 * 1024 * 1024)  # Resident Set Size (RSS) in bytes

print(f"Current RAM usage: {get_ram_usage():.2f} / 24 GB ({get_ram_usage()/24*100:.1f} %)")

Current RAM usage: 0.37 / 24 GB (1.5 %)


## 1 Process data

### 1.1 Get files

In [69]:
def get_parsed_docx(docx_filenames_train, labels_train, decoded_index_train, train=True):
    print('Number of training .docx:', len(docx_filenames_train))
    errors_train = 0
    
    for i, docxname in enumerate(docx_filenames_train):
        try:
            # Get one document, that is a zip archive with many XML files, and its label
            docx = zipfile.ZipFile(docxname)
            docxroot = ET.Element('root')
            for xmlname in docx.namelist():
                if xmlname[-3:] == 'xml':
                    # For all the .xml files in ou docx, parse it and stores the parsed tree in docxroot, which will be an ElementTree
                    xmlfile = docx.read(xmlname)
                    encoding = chardet.detect(xmlfile)["encoding"]
                    xmlfile = xmlfile.decode(encoding if encoding != None else 'utf-8')
                    tree = ET.fromstring(xmlfile)
                    docxroot.append(tree)
            # Now each docx has to be one ElementTree, whose children are each single .xml included in the .docx 
            if train:
                labels_train += [int(docxname[-1])]
            docxtree = ET.ElementTree(docxroot)
            yield ET.tostring(docxtree.getroot(), encoding='unicode')
        except zipfile.BadZipFile as e:
            # Handle bad zipfile error
            print(i, docxname, 'zipfile.BadZipFile')
            errors_train += 1
            decoded_index_train[i] = 0
            continue
        except UnicodeDecodeError:
            # Deal with decoding error
            print(i, docxname, 'UnicodeDecodeError')
            errors_train += 1
            decoded_index_train[i] = 0
            continue
        except ET.ParseError:
            # Deal with parsing error
            print(i, docxname, 'ET.ParseError')
            errors_train += 1
            decoded_index_train[i] = 0
            continue
    # Print error number + return parsed list
    print('Number of non-decoded sequences:', errors_train)
    print('Percentage of non-decoded sequences:', round(errors_train/len(docx_filenames_train)*100, 2), '%')

# docx_filenames_train = zipfile.ZipFile('docx-train.zip').namelist()[:-1]
# # docx_filenames_train = zipfile.ZipFile('docx-train.zip').namelist()[:100]
# labels_train, decoded_index_train = [], np.ones(len(docx_filenames_train))
# docx_generator_train = get_parsed_docx(docx_filenames_train, labels_train, decoded_index_train)

### 1.2 Save and load parsed docx

In [4]:
def save_data(filename, array):
    """
    SAVE the elements to a file using pickle
    """
    with open(filename, 'wb') as file:
        pickle.dump(array, file)
def load_data(filename):
    """
    LOAD the elements from the file using pickle
    """
    with open(filename, 'rb') as file:
        array = pickle.load(file)
    return array

# save_data('docx-train-3000.pkl', parsed_docx_train)
# parsed_docx_train = load_data('docx-train-3000.pkl')
# save_data('labels-train-3000.pkl', labels_train)
# labels_train = load_data('labels-train-3000.pkl')

FileNotFoundError: [Errno 2] No such file or directory: 'docx-train-3000.pkl'

### 1.3 Feature extraction

Overview of the attributes of an `Element`:\
`tag`: The tag of the element.\
`attrib`: A dictionary containing the element's attributes.\
`text`: The text content of the element.\
`iter()`: An iterator that generates all the subelements of the element.\
`find()`: Finds the first subelement with the given tag.\
`findall()`: Finds all subelements with the given tag.

In [48]:
n = 3
vectorizer = CountVectorizer(strip_accents='unicode', lowercase=True, ngram_range=(n,n), analyzer='char', min_df=5)
features_train = vectorizer.fit_transform(docx_generator_train)
features_train.shape

Number of training .docx: 6301
224 data/docx-2017-01/uclvhtuckhtprhgn.1 zipfile.BadZipFile
294 data/docx-2017-01/lbfoxladmzymqvfj.1 zipfile.BadZipFile
337 data/docx-2017-01/vqesbgqyinhhnvfk.1 zipfile.BadZipFile
404 data/docx-2016-07/frriizhcbhhatlxq.0 UnicodeDecodeError
518 data/docx-2017-01/btnagxosptxudviw.1 zipfile.BadZipFile
929 data/docx-2017-01/wahhucrnhhaownvm.1 zipfile.BadZipFile
1186 data/docx-2016-07/sevvokgiaznvbfub.1 ET.ParseError
1256 data/docx-2017-01/thkjtswtmjlvlrcr.1 zipfile.BadZipFile
1337 data/docx-2016-07/pmwdrnsvonimrbyq.0 UnicodeDecodeError
1349 data/docx-2016-07/ijccvegzcbnbfjwo.0 UnicodeDecodeError
1657 data/docx-2017-01/uugkgbullwpupcox.1 zipfile.BadZipFile
1972 data/docx-2017-01/vvkdmlotszdqjfoz.1 zipfile.BadZipFile
2030 data/docx-2017-01/gzoyyidmbehydfpp.1 zipfile.BadZipFile
2038 data/docx-2016-07/vbntkivddaodqttn.0 zipfile.BadZipFile
2345 data/docx-2017-01/ckfayemcuarwfnlm.1 zipfile.BadZipFile
2357 data/docx-2016-07/uhssqmrzgeynsmxp.0 UnicodeDecodeError
2428

(6264, 105814)

In [None]:
# np.save('features_train_n3_bis.npy', features_train)
# features_train = np.load('features_train_n3.npy')

## 2 Train models

### 2.0 Train any model using 5-fold cross-validation

In [20]:
def train_evaluate(model, features, labels, model_name='base-model', n_splits=5, fit_whole_dataset=True):
    # K-Fold CV
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    accuracy_list, var_list = [], []
    split = 0
    
    # Training + metrics
    for idx_train, idx_eval in kf.split(features):
        X_train, X_eval, y_train, y_eval = features[idx_train], features[idx_eval], labels[idx_train], labels[idx_eval]
        # Fit the model
        model.fit(X_train, y_train)
        # Predict
        y_pred = model.predict(X_eval)
        # Evaluate (accuracy and F1-score)
        acc = balanced_accuracy_score(y_eval, y_pred)
        accuracy_list += [acc*len(y_eval)]
        var_list += [acc]
        print(f'Split {split} done')
        split += 1
        
    # Compute CV_score
    cvscore = sum(accuracy_list)/len(labels)
    variance = np.std(var_list)
    print(f'CV-score of {model_name}: CV-score = {cvscore:.3f}, Variance = {variance:.4f}\n')

    # Train the model on the whole Train dataset
    if fit_whole_dataset:
        model.fit(features, labels)
    
    return cvscore, variance

### 2.1 XGBoost

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()
%time _ = train_evaluate(gb_model, features_train, np.array(labels_train), model_name="GBC()", n_splits=5)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
CV-score of GBC(): CV-score = 0.992, Variance = 0.0027


In [49]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(random_state=0, warm_start=True)
%time _ = train_evaluate(gb_model, features_train, np.array(labels_train), model_name="GBC(warm_start=True)", n_splits=5, fit_whole_dataset=True)

Split 0 done
Split 1 done
Split 2 done
Split 3 done
Split 4 done
CV-score of GBC(warm_start=True): CV-score = 0.999, Variance = 0.0024

CPU times: user 5min 35s, sys: 6.32 s, total: 5min 42s
Wall time: 5min 42s


## 3 Predict on test dataset

In [70]:
docx_filenames_test = zipfile.ZipFile('docx-test.zip').namelist()[:-1]
# docx_filenames_test = zipfile.ZipFile('docx-test.zip').namelist()[:100]
labels_test, decoded_index_train = [], np.ones(len(docx_filenames_test))
docx_generator_test = get_parsed_docx(docx_filenames_test, labels_test, decoded_index_train, train=False)
decoded_index_test = decoded_index_train.copy()

In [71]:
# docx_string_test = [ET.tostring(docx.getroot(), encoding='unicode') for docx in parsed_docx_test]
features_test = vectorizer.transform(docx_generator_test)

Number of training .docx: 2968
200 data/docx-2017-09/hjxzjigfrcpqkmwc.x zipfile.BadZipFile
366 data/docx-2017-09/klecwwdgkwzyqudm.x UnicodeDecodeError
516 data/docx-2017-09/lyhfmwjzqltgztwi.x zipfile.BadZipFile
566 data/docx-2017-09/qgrzabkbbcbswbfv.x zipfile.BadZipFile
568 data/docx-2017-09/quzappguarpuodwu.x zipfile.BadZipFile
602 data/docx-2017-09/qwxbljpxglasxfnp.x zipfile.BadZipFile
854 data/docx-2017-09/lmawooifkluqjzcl.x UnicodeDecodeError
868 data/docx-2017-09/muexzdvjdtzkbqrb.x zipfile.BadZipFile
904 data/docx-2017-09/jjizjcnumcgnrvda.x zipfile.BadZipFile
937 data/docx-2017-09/amvmomyzjtsimsyb.x zipfile.BadZipFile
941 data/docx-2017-09/qhryecbrcxcmufpp.x zipfile.BadZipFile
1009 data/docx-2017-09/dmpiyfxfiytsafsf.x zipfile.BadZipFile
1207 data/docx-2017-09/zwsvmqtrunodfqxr.x UnicodeDecodeError
1233 data/docx-2017-09/tflajjtenmjesywc.x zipfile.BadZipFile
1340 data/docx-2017-09/qpfqebykcfediaja.x UnicodeDecodeError
1408 data/docx-2017-09/fvscslimbxolnxdb.x zipfile.BadZipFile
1484

In [73]:
##### Predict of test dataset with GB #####

X_test = features_test
y_pred = gb_model.predict(X_test)

In [89]:
submission = []
i_corr = 0
# Write the prediction as expected output
for i, filename in enumerate(docx_filenames_test):
    if decoded_index_test[i]:
    # if decoded_indices_test[i]:
        submission += [filename + ';' + y_pred[i-i_corr].astype(str)]
    else: # if email hasn't been decoded and thus predicted, we randomly choose its class
        submission += [filename + ';' + str(1)]
        i_corr += 1
print(f'Length of our submission: {len(submission)} | Length of zip file: {len(docx_filenames_test)}')
# Save the output as a text file
np.savetxt('output_docx_gb_n3_warm.csv', np.array(submission), fmt='%s', delimiter=',')

Length of our submission: 2968 | Length of zip file: 2968
