In [5]:
import pandas as pd
from scipy import stats
from collections import Counter
import xml.etree.ElementTree as ET
from scipy import stats

## Préparation des données

In [6]:
df_decisions = pd.read_csv("documents.csv")

In [7]:
print(df_decisions.shape)

(284630, 12)


On trie les décisions par date et ordre alphabétique. On groupe ensuite les statuts successifs pour chaque décisions.
On rappelle les valeurs des différents code statut :
   - 0 non traité
   - 1 ok
   - 2 en doute
   - 3 en erreur
   - 4 en modification
   - 5 corrigé

Les codes 0, 1, 2, 3 sont directements générés par le système legacy.

In [4]:
df_decisions = df_decisions.sort_values('timestamp_modification',ascending=True).sort_values('chemin_source',ascending=True)

In [6]:
validation_path = df_decisions.groupby('chemin_source')['statut'].apply(tuple).reset_index(name='validation')

## Comptage des différents statuts

Il existe 192 combinaisons de statuts différentes. Les 10 plus fréquentes représentent plus de 99% du total.  
Le code le plus présent est le 1-ok unique, donc jamais ni mis en doute ni modifié. Il représente la performance maximale du système (hypothèse que tous les codes 1-ok sont justes, hypothèse à challenger).

In [14]:
validation_paths = Counter(validation_path.validation.to_list())
print(len(validation_paths))

192


In [35]:
"Les 10 premières combinaisons de code représentent {0:.2f} % du total des fichiers".format(100*sum(n for _, n in validation_paths.most_common(10))/validation_path.shape[0])

'Les 10 premières combinaisons de code représentent 99.03 % du total des fichiers'

In [38]:
print("Prortions respectives des 10 combinaisons de statuts les plus fréquentes : \n")
for statuts, n in validation_paths.most_common(10):
    print(statuts, "{0:.2f} %".format((n/validation_path.shape[0])*100))

Prortions respectives des 10 combinaisons de statuts les plus fréquentes : 

(1,) 76.99 %
(5,) 11.59 %
(1, 1) 7.47 %
(5, 5) 0.85 %
(1, 1, 1) 0.72 %
(2,) 0.61 %
(4,) 0.42 %
(0,) 0.14 %
(3,) 0.13 %
(1, 1, 1, 1) 0.11 %


## Exploration des types de corrections

On selectionne uniquement les fichiers ayant plusieurs versions du xml de pseudonymisation avec des statuts successifs.

In [40]:
to_check = [combinaison for combinaison in validation_paths if len(combinaison) > 1]
files_to_check = {}
for i in to_check:
    
    files_to_check[i] = validation_path[validation_path["validation"] == i]["chemin_source"].to_list()
    

In [41]:
def make_utf8_corrections(text):
    """Corrections standards sur les XML"""
    
    return text.replace("\xa0;", b'\xe2\x80\x82'.decode('utf-8')).replace("&#x1F;",
                                                                          b'\xe2\x80\x82'.decode('utf-8')).replace(
        "&#xB;", b'\xe2\x80\x82'.decode('utf-8')).replace("&gt;", b'\xe2\x80\x82'.decode('utf-8')).replace("&lt;",
                                                                                                           b'\xe2\x80\x82'.decode(
                                                                                                               'utf-8')).replace(
        "&#x1E;", b'\xe2\x80\x82'.decode('utf-8')).replace("&#xF;", b'\xe2\x80\x82'.decode('utf-8')).replace("&#x1D;",
                                                                                                             b'\xe2\x80\x82'.decode(
                                                                                                                 'utf-8')).replace(
        "&#x1C;", b'\xe2\x80\x82'.decode('utf-8')).replace("&#x1B;", b'\xe2\x80\x82'.decode('utf-8'))


In [42]:
def get_xmls(chemin_source):
    """Récupération des XML de pseudonymisation pour un fichier"""
    
    sub_df = df_decisions[df_decisions["chemin_source"] == chemin_source]
    modifications = dict(zip(sub_df.timestamp_modification, sub_df.detail_anonymisation))
    return {k: modifications[k] for k in modifications if type(k) is str}



In [70]:
def get_xml_items(items):
    """ Récupération des items pour un xml donné"""
    
    pseudos = Counter([e.find("Mots").text for e in items])
            
    return pseudos
            
    

In [86]:
def xml_difference(previous_xml, new_xml):
    """Comparaison des différences entre deux versions d'un xml de pseudonymisation"""
    
    items_previous = ET.ElementTree(ET.fromstring(make_utf8_corrections(previous_xml))).findall(".//MotsAnonymises/MotAnonymise")
    items_new = ET.ElementTree(ET.fromstring(make_utf8_corrections(new_xml))).findall(".//MotsAnonymises/MotAnonymise")
    
    pseudos_previous = get_xml_items(items_previous)
    pseudos_new = get_xml_items(items_new)

    if pseudos_previous != pseudos_new:
        corrected = 1
        added_items = len([k for k in pseudos_new.keys() if k not in pseudos_previous])
        total_added = sum(pseudos_new.values()) - sum(pseudos_previous.values())
    else:
        corrected = 0
        added_items = 0
        total_added = 0
        
    return corrected, added_items,total_added


In [94]:
missed_items = []
deleted_items = []
additions = []
deletions = []
correcteds = 0
ct = 0
for statuts in files_to_check:
    for file in files_to_check[statuts]:
        try:
            ct+=1
            xmls = get_xmls(file)
            dates = [x for x in xmls]
            dates.sort() #plus vieux en premier
            corrected, added_items,total_added = xml_difference(xmls[dates[0]],xmls[dates[-1]])

            if added_items > 0:
                missed_items.append(added_items)
            elif added_items < 0:
                deleted_items.append(added_items)

            if total_added > 0:
                additions.append(added_items)
            elif total_added < 0:
                deletions.append(added_items)
                
            correcteds += corrected
                
        except Exception as e:
            pass

reference to invalid character number: line 30, column 72
'NoneType' object has no attribute 'text'
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index o

In [112]:
print(ct)
print(3559/correcteds)


25531
0.5316701523752614


In [104]:
print(stats.describe(missed_items))
print(stats.describe(additions))
print(stats.describe(deletions))

DescribeResult(nobs=2125, minmax=(1, 133), mean=2.2931764705882354, variance=14.253460950481887, skewness=21.97618777362818, kurtosis=698.4577796102368)
DescribeResult(nobs=2790, minmax=(0, 133), mean=1.5351254480286738, variance=11.129457764823663, skewness=23.56841113789913, kurtosis=875.1411677181997)
DescribeResult(nobs=3559, minmax=(0, 53), mean=0.10958134307389716, variance=0.9868643272066272, skewness=43.10685772855553, kurtosis=2259.3847468895715)


In [8]:
284630*0.77

219165.1

In [9]:
219165 + 25531

244696

In [10]:
244696/284630

0.8596985560200963

In [None]:

# pour un jour checker step by step l'évolutiond des status
def grouped(iterable, n):
    return zip(*[iter(iterable)]*n)

for x, y in grouped(xmls, 2):
   print(y,x)