In [1]:
import pandas as pd
import numpy as np
import itertools as it
import random as rd

In [2]:
def distElements(elements):
    """Return a normalised rank-based (Zipf-like) distribution.

    The item at 1-based rank ``r`` receives the weight ``1/r``; the weights are
    then divided by their total so that the returned list sums to 1. Only
    ``len(elements)`` is used, never the items themselves.
    """
    nbRangs=len(elements)
    poids=[]
    for rang in range(1,nbRangs+1):
        poids.append(1.0/rang)
    total=sum(poids)
    return [w/total for w in poids]

## Paramétrage du paradigme
Les marks définissent les cases du paradigme
- le nom de la case correspond à la concaténation des valeurs possibles
 - abc correspond à une case qui peut prendre comme valeur a, b ou c
 
Les classes sont calculées en faisant le produit cartésien des différentes valeurs x cases

In [3]:
# Paradigm cells: each name is the concatenation of the values the cell can take.
marks=["abc","def","ghi","klm","nop","qrs","tuv","wxy"]
# Companion column names, used further down for the per-cell draw counters.
marksN=[m+"N" for m in marks]
distMarks=distElements(marksN)

# One inflection class per element of the Cartesian product of the cell values.
# The frame is built in a single constructor call: growing it with
# DataFrame.append row by row is quadratic, and append was removed in pandas 2.0.
classes=pd.DataFrame(list(it.product(*marks)),columns=marks)

nbClasses=len(classes)
# Shuffle the rows and renumber them from 0, so that a class number (row
# position) is unrelated to the enumeration order of the product.
classes = classes.sample(frac=1).reset_index(drop=True)
#classes

In [192]:
# Number of inflection classes defined by the grammar (rows of `classes`).
nbGrammaireCF=classes.shape[0]

## Paramètres du lexique
On fixe le nombre de lexèmes présents dans le corpus d'apprentissage et le ratio de formes attestées
- nombre de lexèmes
- ratio des formes-cases attestées par rapport au potentiel complet
 - 25% correspond au ratio observé pour les verbes dans Lex3
- nbFormes est l'ordre de grandeur à respecter pour l'échantillon en nombre de types

In [4]:
# Lexicon size and share of the paradigm cells expected to be attested.
nbLexemes=7500
ratio=.25

# Target order of magnitude for the number of attested form types.
nbFormes=int(
    nbLexemes*len(marks)*ratio
)
nbFormes

15000

## Distribution des lexèmes et des CF
Les différents éléments suivent des distributions par Zipf.
- distElements renvoie une liste de probabilités correspondant au nombre d'éléments suivant une Zipf(x)

Chaque lexème a une fréquence de lemme qui correspond à son rang et reçoit une CF tirée au hasard suivant une loi de Zipf.
- np.random.choice(cfs,p=distCFs)

In [5]:
# Class numbers and their rank-based probabilities.
cfs=range(nbClasses)
distCFs=distElements(cfs)

# Lexeme ranks and their rank-based probabilities (lexeme l has rank l+1).
lexs=range(nbLexemes)
distLexs=distElements(lexs)

# Draw the class of every lexeme in one call, then pick the matching rows of
# `classes` by position. This replaces a loop that grew the frame with
# DataFrame.append (quadratic, and removed in pandas 2.0).
tiragesCF=np.random.choice(cfs,size=nbLexemes,p=distCFs)
lexemes=classes.iloc[tiragesCF].reset_index(drop=True)
# Same column layout as before: CF, P, then one column per paradigm cell.
lexemes.insert(0,"CF",tiragesCF)
lexemes.insert(1,"P",distLexs)
#lexemes.index+=1

In [6]:
# Display the generated lexicon: class number, lexeme probability and one column per paradigm cell.
lexemes

Unnamed: 0,CF,P,abc,def,ghi,klm,nop,qrs,tuv,wxy
0,2814,0.105264,b,d,g,l,n,q,u,y
1,13,0.052632,a,f,h,k,p,r,u,w
2,1,0.035088,c,f,i,m,p,r,t,w
3,5,0.026316,c,e,g,k,o,r,t,y
4,43,0.021053,b,f,h,m,p,r,t,y
5,773,0.017544,b,f,i,l,n,s,t,y
6,1,0.015038,c,f,i,m,p,r,t,w
7,413,0.013158,c,e,g,l,o,r,u,x
8,2575,0.011696,b,d,g,k,o,r,t,w
9,964,0.010526,b,e,g,m,p,r,v,y


In [202]:
# Total number of lexeme x cell slots in the lexicon.
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("%s %s" % ("Nombre de types potentiel",lexemes[marks].count().sum()))

Nombre de types potentiel 60000


## Nombre de CF tirées dans le lexique
Ce nombre représente le maximum de CF qui pourraient être nécessaires pour la description.
Le nombre de CF nécessaire est au moins égal au nombre de CF qui possèdent un paradigme exemplaire, mais les CF qui n'ont qu'une représentation partielle de leur paradigme ne sont pas obligatoirement nécessaires à la description.

In [197]:
# Number of distinct inflection classes actually drawn for the lexicon.
nbLexiqueCF=lexemes["CF"].nunique()
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("%s %s %s %s" % (nbLexiqueCF,"CF dans le lexique sur",nbGrammaireCF,"CF dans la grammaire"))

2039 CF dans le lexique sur 6561 CF dans la grammaire


## Constitution du DF pour le tirage

In [9]:
# Zero-initialised draw counters: one row per lexeme, one column per cell.
tiragesColonnes=pd.DataFrame(data=0,columns=marksN,index=np.arange(len(lexs)))
# Lexicon columns on the left, counters on the right, aligned on the row index.
tirages=pd.concat([lexemes,tiragesColonnes],sort=False,axis=1)
#tirages

### Tirage des lexèmes pour les formes attestées
- lexTirs contient le nombre de tokens à tirer pour chaque lexème

In [10]:
nbTokens=500000
# Draw all tokens in a single vectorised call instead of nbTokens separate
# np.random.choice calls over the full lexeme probability vector.
tiragesLexemes=np.random.choice(lexs,size=nbTokens,p=distLexs)
# lexTirs maps each lexeme drawn at least once to its number of tokens.
lexemesTires,effectifs=np.unique(tiragesLexemes,return_counts=True)
lexTirs={int(lex):int(nb) for lex,nb in zip(lexemesTires,effectifs)}
#lexTirs

### Tirage des formes-cases
- cellTirs contient le nombre de fois que chaque forme-case a été tirée

In [11]:
for l in lexTirs:
    # Distribute all the tokens of this lexeme over the cells in one draw.
    tiragesCases=np.random.choice(marksN,size=lexTirs[l],p=distMarks)
    cases,effectifs=np.unique(tiragesCases,return_counts=True)
    # cellTirs maps each cell drawn at least once to its number of tokens.
    cellTirs={str(c):int(nb) for c,nb in zip(cases,effectifs)}
    # Report lexemes whose paradigm is fully attested (every cell drawn).
    if len(cellTirs)==len(marksN):
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("%s %s %s" % (l,tirages.loc[l,"CF"],cellTirs))
    for c in cellTirs:
        tirages.loc[l,c]=cellTirs[c]
    

0 2814 {'wxyN': 2444, 'abcN': 19395, 'tuvN': 2816, 'ghiN': 6399, 'klmN': 4890, 'nopN': 3787, 'qrsN': 3298, 'defN': 9570}
1 13 {'wxyN': 1237, 'abcN': 9589, 'tuvN': 1399, 'ghiN': 3145, 'klmN': 2420, 'nopN': 2053, 'qrsN': 1597, 'defN': 4738}
2 1 {'wxyN': 812, 'abcN': 6324, 'tuvN': 954, 'ghiN': 2152, 'klmN': 1634, 'nopN': 1271, 'qrsN': 1036, 'defN': 3145}
3 5 {'wxyN': 616, 'abcN': 4853, 'tuvN': 676, 'ghiN': 1631, 'klmN': 1249, 'nopN': 985, 'qrsN': 801, 'defN': 2518}
4 43 {'wxyN': 434, 'abcN': 3856, 'tuvN': 561, 'ghiN': 1264, 'klmN': 949, 'nopN': 739, 'qrsN': 624, 'defN': 1915}
5 773 {'wxyN': 409, 'abcN': 3195, 'tuvN': 444, 'ghiN': 1054, 'klmN': 810, 'nopN': 592, 'qrsN': 502, 'defN': 1607}
6 1 {'wxyN': 336, 'abcN': 2772, 'tuvN': 386, 'ghiN': 968, 'klmN': 677, 'nopN': 569, 'qrsN': 489, 'defN': 1395}
7 413 {'wxyN': 337, 'abcN': 2484, 'tuvN': 361, 'ghiN': 804, 'klmN': 608, 'nopN': 498, 'qrsN': 363, 'defN': 1190}
8 2575 {'wxyN': 293, 'abcN': 2199, 'tuvN': 314, 'ghiN': 729, 'klmN': 543, 'nopN': 

77 33 {'wxyN': 40, 'abcN': 246, 'tuvN': 36, 'ghiN': 79, 'klmN': 42, 'nopN': 47, 'qrsN': 43, 'defN': 134}
78 71 {'wxyN': 28, 'abcN': 255, 'tuvN': 32, 'ghiN': 82, 'klmN': 57, 'nopN': 54, 'qrsN': 37, 'defN': 133}
79 2 {'wxyN': 39, 'abcN': 232, 'tuvN': 30, 'ghiN': 80, 'klmN': 53, 'nopN': 46, 'qrsN': 43, 'defN': 117}
80 0 {'wxyN': 27, 'abcN': 227, 'tuvN': 25, 'ghiN': 63, 'klmN': 77, 'nopN': 45, 'qrsN': 38, 'defN': 111}
81 2209 {'wxyN': 23, 'abcN': 250, 'tuvN': 33, 'ghiN': 71, 'klmN': 66, 'nopN': 52, 'qrsN': 47, 'defN': 137}
82 18 {'wxyN': 29, 'abcN': 240, 'tuvN': 33, 'ghiN': 90, 'klmN': 51, 'nopN': 48, 'qrsN': 33, 'defN': 114}
83 0 {'wxyN': 41, 'abcN': 194, 'tuvN': 28, 'ghiN': 83, 'klmN': 61, 'nopN': 54, 'qrsN': 28, 'defN': 99}
84 129 {'wxyN': 26, 'abcN': 227, 'tuvN': 34, 'ghiN': 70, 'klmN': 57, 'nopN': 45, 'qrsN': 34, 'defN': 130}
85 0 {'wxyN': 26, 'abcN': 232, 'tuvN': 34, 'ghiN': 82, 'klmN': 57, 'nopN': 44, 'qrsN': 37, 'defN': 111}
86 19 {'wxyN': 26, 'abcN': 226, 'tuvN': 33, 'ghiN': 76, '

167 0 {'wxyN': 7, 'abcN': 103, 'tuvN': 23, 'ghiN': 53, 'klmN': 32, 'nopN': 26, 'qrsN': 17, 'defN': 61}
168 9 {'wxyN': 20, 'abcN': 119, 'tuvN': 8, 'ghiN': 31, 'klmN': 25, 'nopN': 25, 'qrsN': 17, 'defN': 52}
169 25 {'wxyN': 17, 'abcN': 127, 'tuvN': 15, 'ghiN': 39, 'klmN': 22, 'nopN': 29, 'qrsN': 24, 'defN': 59}
170 19 {'wxyN': 19, 'abcN': 115, 'tuvN': 22, 'ghiN': 40, 'klmN': 25, 'nopN': 23, 'qrsN': 28, 'defN': 62}
171 2926 {'wxyN': 18, 'abcN': 108, 'tuvN': 18, 'ghiN': 41, 'klmN': 23, 'nopN': 20, 'qrsN': 18, 'defN': 54}
172 2179 {'wxyN': 10, 'abcN': 117, 'tuvN': 19, 'ghiN': 44, 'klmN': 26, 'nopN': 30, 'qrsN': 10, 'defN': 60}
173 0 {'wxyN': 20, 'abcN': 115, 'tuvN': 16, 'ghiN': 40, 'klmN': 23, 'nopN': 21, 'qrsN': 17, 'defN': 50}
174 2689 {'wxyN': 14, 'abcN': 113, 'tuvN': 13, 'ghiN': 41, 'klmN': 28, 'nopN': 20, 'qrsN': 12, 'defN': 65}
175 5536 {'wxyN': 16, 'abcN': 112, 'tuvN': 21, 'ghiN': 46, 'klmN': 31, 'nopN': 19, 'qrsN': 21, 'defN': 53}
176 0 {'wxyN': 21, 'abcN': 110, 'tuvN': 13, 'ghiN': 

265 1 {'wxyN': 7, 'abcN': 73, 'tuvN': 17, 'ghiN': 20, 'klmN': 17, 'nopN': 14, 'qrsN': 14, 'defN': 33}
266 0 {'wxyN': 11, 'abcN': 83, 'tuvN': 4, 'ghiN': 30, 'klmN': 14, 'nopN': 13, 'qrsN': 11, 'defN': 30}
267 212 {'wxyN': 10, 'abcN': 53, 'tuvN': 11, 'ghiN': 24, 'klmN': 10, 'nopN': 16, 'qrsN': 14, 'defN': 42}
268 1 {'wxyN': 8, 'abcN': 74, 'tuvN': 11, 'ghiN': 24, 'klmN': 20, 'nopN': 15, 'qrsN': 10, 'defN': 41}
269 332 {'wxyN': 4, 'abcN': 72, 'tuvN': 8, 'ghiN': 15, 'klmN': 14, 'nopN': 7, 'qrsN': 15, 'defN': 28}
270 1324 {'wxyN': 10, 'abcN': 71, 'tuvN': 7, 'ghiN': 27, 'klmN': 27, 'nopN': 9, 'qrsN': 12, 'defN': 41}
271 3 {'wxyN': 8, 'abcN': 71, 'tuvN': 12, 'ghiN': 22, 'klmN': 31, 'nopN': 15, 'qrsN': 11, 'defN': 30}
272 105 {'wxyN': 10, 'abcN': 65, 'tuvN': 10, 'ghiN': 16, 'klmN': 18, 'nopN': 17, 'qrsN': 3, 'defN': 35}
273 0 {'wxyN': 2, 'abcN': 75, 'tuvN': 10, 'ghiN': 33, 'klmN': 21, 'nopN': 20, 'qrsN': 10, 'defN': 42}
274 73 {'wxyN': 9, 'abcN': 78, 'tuvN': 9, 'ghiN': 20, 'klmN': 15, 'nopN': 1

351 63 {'wxyN': 8, 'abcN': 52, 'tuvN': 7, 'ghiN': 20, 'klmN': 14, 'nopN': 11, 'qrsN': 12, 'defN': 23}
352 5209 {'wxyN': 6, 'abcN': 66, 'tuvN': 7, 'ghiN': 18, 'klmN': 19, 'nopN': 6, 'qrsN': 15, 'defN': 29}
353 1 {'wxyN': 9, 'abcN': 65, 'tuvN': 5, 'ghiN': 18, 'klmN': 18, 'nopN': 16, 'qrsN': 2, 'defN': 26}
354 35 {'wxyN': 8, 'abcN': 59, 'tuvN': 9, 'ghiN': 15, 'klmN': 9, 'nopN': 10, 'qrsN': 9, 'defN': 33}
355 148 {'wxyN': 4, 'abcN': 57, 'tuvN': 5, 'ghiN': 20, 'klmN': 19, 'nopN': 12, 'qrsN': 10, 'defN': 29}
356 4 {'wxyN': 4, 'abcN': 55, 'tuvN': 7, 'ghiN': 18, 'klmN': 13, 'nopN': 10, 'qrsN': 15, 'defN': 22}
357 121 {'wxyN': 8, 'abcN': 45, 'tuvN': 8, 'ghiN': 20, 'klmN': 11, 'nopN': 16, 'qrsN': 8, 'defN': 25}
358 2807 {'wxyN': 2, 'abcN': 45, 'tuvN': 6, 'ghiN': 14, 'klmN': 12, 'nopN': 12, 'qrsN': 15, 'defN': 22}
359 0 {'wxyN': 9, 'abcN': 37, 'tuvN': 10, 'ghiN': 19, 'klmN': 15, 'nopN': 9, 'qrsN': 8, 'defN': 20}
360 1 {'wxyN': 6, 'abcN': 51, 'tuvN': 5, 'ghiN': 15, 'klmN': 16, 'nopN': 7, 'qrsN': 8

434 271 {'wxyN': 6, 'abcN': 49, 'tuvN': 4, 'ghiN': 18, 'klmN': 10, 'nopN': 15, 'qrsN': 4, 'defN': 31}
435 0 {'wxyN': 7, 'abcN': 51, 'tuvN': 5, 'ghiN': 8, 'klmN': 6, 'nopN': 8, 'qrsN': 8, 'defN': 31}
436 5077 {'wxyN': 9, 'abcN': 46, 'tuvN': 6, 'ghiN': 15, 'klmN': 8, 'nopN': 6, 'qrsN': 4, 'defN': 20}
437 4 {'wxyN': 8, 'abcN': 36, 'tuvN': 11, 'ghiN': 9, 'klmN': 9, 'nopN': 6, 'qrsN': 6, 'defN': 13}
438 398 {'wxyN': 6, 'abcN': 41, 'tuvN': 7, 'ghiN': 20, 'klmN': 12, 'nopN': 9, 'qrsN': 9, 'defN': 24}
439 48 {'wxyN': 4, 'abcN': 41, 'tuvN': 3, 'ghiN': 15, 'klmN': 8, 'nopN': 7, 'qrsN': 4, 'defN': 15}
440 1596 {'wxyN': 4, 'abcN': 36, 'tuvN': 6, 'ghiN': 10, 'klmN': 14, 'nopN': 8, 'qrsN': 4, 'defN': 26}
441 3 {'wxyN': 8, 'abcN': 35, 'tuvN': 3, 'ghiN': 16, 'klmN': 11, 'nopN': 15, 'qrsN': 5, 'defN': 24}
442 3281 {'wxyN': 4, 'abcN': 55, 'tuvN': 6, 'ghiN': 32, 'klmN': 9, 'nopN': 12, 'qrsN': 9, 'defN': 24}
443 389 {'wxyN': 2, 'abcN': 27, 'tuvN': 12, 'ghiN': 13, 'klmN': 2, 'nopN': 13, 'qrsN': 9, 'defN': 

521 0 {'wxyN': 5, 'abcN': 29, 'tuvN': 4, 'ghiN': 14, 'klmN': 17, 'nopN': 6, 'qrsN': 5, 'defN': 21}
522 49 {'wxyN': 2, 'abcN': 42, 'tuvN': 6, 'ghiN': 13, 'klmN': 9, 'nopN': 5, 'qrsN': 7, 'defN': 25}
523 192 {'wxyN': 3, 'abcN': 44, 'tuvN': 4, 'ghiN': 15, 'klmN': 8, 'nopN': 14, 'qrsN': 6, 'defN': 26}
524 0 {'wxyN': 8, 'abcN': 33, 'tuvN': 8, 'ghiN': 7, 'klmN': 11, 'nopN': 6, 'qrsN': 9, 'defN': 22}
525 2 {'wxyN': 4, 'abcN': 32, 'tuvN': 3, 'ghiN': 12, 'klmN': 8, 'nopN': 10, 'qrsN': 8, 'defN': 21}
526 0 {'wxyN': 7, 'abcN': 31, 'tuvN': 7, 'ghiN': 10, 'klmN': 8, 'nopN': 10, 'qrsN': 2, 'defN': 14}
527 1 {'wxyN': 5, 'abcN': 35, 'tuvN': 7, 'ghiN': 8, 'klmN': 17, 'nopN': 12, 'qrsN': 11, 'defN': 15}
528 74 {'wxyN': 4, 'abcN': 41, 'tuvN': 5, 'ghiN': 13, 'klmN': 7, 'nopN': 2, 'qrsN': 6, 'defN': 15}
529 39 {'wxyN': 1, 'abcN': 40, 'tuvN': 6, 'ghiN': 13, 'klmN': 8, 'nopN': 6, 'qrsN': 5, 'defN': 16}
530 3 {'wxyN': 8, 'abcN': 40, 'tuvN': 6, 'ghiN': 6, 'klmN': 14, 'nopN': 9, 'qrsN': 6, 'defN': 13}
531 1 {'w

616 1477 {'wxyN': 4, 'abcN': 37, 'tuvN': 4, 'ghiN': 10, 'klmN': 2, 'nopN': 3, 'qrsN': 3, 'defN': 17}
617 3 {'wxyN': 7, 'abcN': 27, 'tuvN': 3, 'ghiN': 10, 'klmN': 13, 'nopN': 8, 'qrsN': 3, 'defN': 20}
619 154 {'wxyN': 6, 'abcN': 25, 'tuvN': 6, 'ghiN': 9, 'klmN': 3, 'nopN': 5, 'qrsN': 9, 'defN': 10}
620 1315 {'wxyN': 1, 'abcN': 31, 'tuvN': 5, 'ghiN': 8, 'klmN': 8, 'nopN': 11, 'qrsN': 3, 'defN': 14}
621 1047 {'wxyN': 2, 'abcN': 39, 'tuvN': 2, 'ghiN': 17, 'klmN': 8, 'nopN': 8, 'qrsN': 6, 'defN': 12}
622 0 {'wxyN': 4, 'abcN': 33, 'tuvN': 2, 'ghiN': 12, 'klmN': 6, 'nopN': 7, 'qrsN': 4, 'defN': 17}
623 51 {'wxyN': 7, 'abcN': 40, 'tuvN': 4, 'ghiN': 4, 'klmN': 10, 'nopN': 7, 'qrsN': 4, 'defN': 17}
624 523 {'wxyN': 5, 'abcN': 25, 'tuvN': 6, 'ghiN': 12, 'klmN': 7, 'nopN': 4, 'qrsN': 7, 'defN': 19}
625 10 {'wxyN': 3, 'abcN': 29, 'tuvN': 1, 'ghiN': 7, 'klmN': 11, 'nopN': 5, 'qrsN': 5, 'defN': 15}
626 882 {'wxyN': 1, 'abcN': 24, 'tuvN': 5, 'ghiN': 5, 'klmN': 7, 'nopN': 8, 'qrsN': 2, 'defN': 18}
627 

721 11 {'wxyN': 4, 'abcN': 31, 'tuvN': 3, 'ghiN': 12, 'klmN': 8, 'nopN': 6, 'qrsN': 6, 'defN': 9}
722 274 {'wxyN': 8, 'abcN': 40, 'tuvN': 4, 'ghiN': 9, 'klmN': 5, 'nopN': 4, 'qrsN': 7, 'defN': 13}
723 4 {'wxyN': 1, 'abcN': 30, 'tuvN': 3, 'ghiN': 7, 'klmN': 6, 'nopN': 4, 'qrsN': 5, 'defN': 23}
724 1 {'wxyN': 4, 'abcN': 35, 'tuvN': 5, 'ghiN': 9, 'klmN': 6, 'nopN': 4, 'qrsN': 4, 'defN': 12}
725 189 {'wxyN': 2, 'abcN': 28, 'tuvN': 2, 'ghiN': 12, 'klmN': 5, 'nopN': 6, 'qrsN': 4, 'defN': 8}
726 61 {'wxyN': 4, 'abcN': 38, 'tuvN': 3, 'ghiN': 9, 'klmN': 4, 'nopN': 7, 'qrsN': 5, 'defN': 17}
727 580 {'wxyN': 4, 'abcN': 24, 'tuvN': 3, 'ghiN': 13, 'klmN': 1, 'nopN': 5, 'qrsN': 7, 'defN': 10}
728 5329 {'wxyN': 2, 'abcN': 19, 'tuvN': 5, 'ghiN': 9, 'klmN': 7, 'nopN': 3, 'qrsN': 5, 'defN': 10}
729 2624 {'wxyN': 3, 'abcN': 27, 'tuvN': 4, 'ghiN': 12, 'klmN': 6, 'nopN': 5, 'qrsN': 8, 'defN': 13}
730 25 {'wxyN': 1, 'abcN': 33, 'tuvN': 5, 'ghiN': 7, 'klmN': 2, 'nopN': 2, 'qrsN': 3, 'defN': 9}
731 6143 {'wxy

829 2593 {'wxyN': 5, 'abcN': 33, 'tuvN': 1, 'ghiN': 10, 'klmN': 10, 'nopN': 4, 'qrsN': 2, 'defN': 8}
830 1177 {'wxyN': 3, 'abcN': 20, 'tuvN': 4, 'ghiN': 12, 'klmN': 6, 'nopN': 9, 'qrsN': 4, 'defN': 15}
831 1772 {'wxyN': 1, 'abcN': 20, 'tuvN': 3, 'ghiN': 6, 'klmN': 6, 'nopN': 3, 'qrsN': 3, 'defN': 9}
832 5 {'wxyN': 5, 'abcN': 21, 'tuvN': 2, 'ghiN': 8, 'klmN': 6, 'nopN': 1, 'qrsN': 2, 'defN': 6}
833 0 {'wxyN': 4, 'abcN': 25, 'tuvN': 2, 'ghiN': 13, 'klmN': 4, 'nopN': 4, 'qrsN': 5, 'defN': 18}
834 1195 {'wxyN': 7, 'abcN': 24, 'tuvN': 2, 'ghiN': 9, 'klmN': 7, 'nopN': 4, 'qrsN': 5, 'defN': 8}
835 686 {'wxyN': 1, 'abcN': 14, 'tuvN': 2, 'ghiN': 10, 'klmN': 4, 'nopN': 5, 'qrsN': 4, 'defN': 8}
836 1398 {'wxyN': 3, 'abcN': 22, 'tuvN': 1, 'ghiN': 8, 'klmN': 8, 'nopN': 4, 'qrsN': 4, 'defN': 14}
838 1 {'wxyN': 7, 'abcN': 24, 'tuvN': 1, 'ghiN': 6, 'klmN': 6, 'nopN': 3, 'qrsN': 5, 'defN': 12}
839 28 {'wxyN': 4, 'abcN': 25, 'tuvN': 6, 'ghiN': 7, 'klmN': 11, 'nopN': 4, 'qrsN': 6, 'defN': 8}
840 0 {'wxyN

943 1 {'wxyN': 2, 'abcN': 20, 'tuvN': 6, 'ghiN': 8, 'klmN': 3, 'nopN': 1, 'qrsN': 4, 'defN': 6}
944 22 {'wxyN': 2, 'abcN': 24, 'tuvN': 1, 'ghiN': 10, 'klmN': 6, 'nopN': 1, 'qrsN': 7, 'defN': 14}
945 9 {'wxyN': 2, 'abcN': 25, 'tuvN': 1, 'ghiN': 6, 'klmN': 5, 'nopN': 10, 'qrsN': 7, 'defN': 16}
946 0 {'wxyN': 3, 'abcN': 22, 'tuvN': 3, 'ghiN': 5, 'klmN': 5, 'nopN': 3, 'qrsN': 1, 'defN': 7}
950 2708 {'wxyN': 1, 'abcN': 22, 'tuvN': 2, 'ghiN': 5, 'klmN': 7, 'nopN': 4, 'qrsN': 5, 'defN': 16}
951 0 {'wxyN': 1, 'abcN': 32, 'tuvN': 2, 'ghiN': 5, 'klmN': 6, 'nopN': 1, 'qrsN': 6, 'defN': 8}
953 7 {'wxyN': 2, 'abcN': 18, 'tuvN': 1, 'ghiN': 5, 'klmN': 7, 'nopN': 4, 'qrsN': 4, 'defN': 8}
955 0 {'wxyN': 3, 'abcN': 26, 'tuvN': 1, 'ghiN': 3, 'klmN': 4, 'nopN': 7, 'qrsN': 3, 'defN': 7}
956 4449 {'wxyN': 4, 'abcN': 18, 'tuvN': 7, 'ghiN': 12, 'klmN': 6, 'nopN': 4, 'qrsN': 4, 'defN': 6}
957 38 {'wxyN': 2, 'abcN': 24, 'tuvN': 2, 'ghiN': 6, 'klmN': 5, 'nopN': 2, 'qrsN': 3, 'defN': 6}
958 161 {'wxyN': 4, 'abcN'

1055 1153 {'wxyN': 2, 'abcN': 12, 'tuvN': 3, 'ghiN': 7, 'klmN': 5, 'nopN': 3, 'qrsN': 5, 'defN': 6}
1056 1 {'wxyN': 3, 'abcN': 19, 'tuvN': 9, 'ghiN': 4, 'klmN': 2, 'nopN': 5, 'qrsN': 1, 'defN': 12}
1057 5 {'wxyN': 3, 'abcN': 23, 'tuvN': 3, 'ghiN': 10, 'klmN': 7, 'nopN': 3, 'qrsN': 2, 'defN': 6}
1058 3092 {'wxyN': 1, 'abcN': 18, 'tuvN': 3, 'ghiN': 3, 'klmN': 4, 'nopN': 2, 'qrsN': 6, 'defN': 11}
1060 3800 {'wxyN': 1, 'abcN': 20, 'tuvN': 2, 'ghiN': 7, 'klmN': 5, 'nopN': 2, 'qrsN': 4, 'defN': 6}
1062 82 {'wxyN': 4, 'abcN': 15, 'tuvN': 3, 'ghiN': 2, 'klmN': 2, 'nopN': 3, 'qrsN': 3, 'defN': 14}
1063 0 {'wxyN': 2, 'abcN': 18, 'tuvN': 4, 'ghiN': 6, 'klmN': 7, 'nopN': 2, 'qrsN': 4, 'defN': 10}
1066 61 {'wxyN': 2, 'abcN': 14, 'tuvN': 2, 'ghiN': 4, 'klmN': 2, 'nopN': 7, 'qrsN': 3, 'defN': 2}
1067 213 {'wxyN': 4, 'abcN': 21, 'tuvN': 2, 'ghiN': 7, 'klmN': 5, 'nopN': 2, 'qrsN': 4, 'defN': 3}
1068 10 {'wxyN': 1, 'abcN': 15, 'tuvN': 3, 'ghiN': 4, 'klmN': 3, 'nopN': 3, 'qrsN': 2, 'defN': 9}
1069 0 {'wx

1178 1 {'wxyN': 2, 'abcN': 18, 'tuvN': 5, 'ghiN': 2, 'klmN': 4, 'nopN': 2, 'qrsN': 2, 'defN': 6}
1179 47 {'wxyN': 1, 'abcN': 14, 'tuvN': 2, 'ghiN': 8, 'klmN': 6, 'nopN': 3, 'qrsN': 2, 'defN': 6}
1180 3 {'wxyN': 3, 'abcN': 13, 'tuvN': 7, 'ghiN': 4, 'klmN': 1, 'nopN': 5, 'qrsN': 4, 'defN': 7}
1181 866 {'wxyN': 1, 'abcN': 15, 'tuvN': 5, 'ghiN': 2, 'klmN': 2, 'nopN': 6, 'qrsN': 1, 'defN': 10}
1182 214 {'wxyN': 1, 'abcN': 23, 'tuvN': 4, 'ghiN': 4, 'klmN': 2, 'nopN': 1, 'qrsN': 3, 'defN': 6}
1183 23 {'wxyN': 3, 'abcN': 29, 'tuvN': 3, 'ghiN': 5, 'klmN': 2, 'nopN': 5, 'qrsN': 2, 'defN': 7}
1184 26 {'wxyN': 1, 'abcN': 16, 'tuvN': 1, 'ghiN': 8, 'klmN': 4, 'nopN': 3, 'qrsN': 1, 'defN': 6}
1186 0 {'wxyN': 2, 'abcN': 11, 'tuvN': 1, 'ghiN': 4, 'klmN': 4, 'nopN': 3, 'qrsN': 2, 'defN': 10}
1187 114 {'wxyN': 3, 'abcN': 18, 'tuvN': 3, 'ghiN': 8, 'klmN': 6, 'nopN': 2, 'qrsN': 3, 'defN': 14}
1189 14 {'wxyN': 1, 'abcN': 10, 'tuvN': 5, 'ghiN': 6, 'klmN': 3, 'nopN': 3, 'qrsN': 4, 'defN': 7}
1190 2 {'wxyN': 2

1302 1 {'wxyN': 2, 'abcN': 7, 'tuvN': 3, 'ghiN': 6, 'klmN': 3, 'nopN': 7, 'qrsN': 1, 'defN': 8}
1303 1695 {'wxyN': 2, 'abcN': 14, 'tuvN': 2, 'ghiN': 4, 'klmN': 3, 'nopN': 3, 'qrsN': 1, 'defN': 5}
1305 795 {'wxyN': 1, 'abcN': 19, 'tuvN': 1, 'ghiN': 4, 'klmN': 2, 'nopN': 5, 'qrsN': 3, 'defN': 4}
1307 0 {'wxyN': 1, 'abcN': 9, 'tuvN': 2, 'ghiN': 5, 'klmN': 6, 'nopN': 3, 'qrsN': 2, 'defN': 12}
1308 51 {'wxyN': 2, 'abcN': 20, 'tuvN': 2, 'ghiN': 4, 'klmN': 3, 'nopN': 1, 'qrsN': 4, 'defN': 10}
1310 0 {'wxyN': 3, 'abcN': 13, 'tuvN': 1, 'ghiN': 4, 'klmN': 4, 'nopN': 4, 'qrsN': 6, 'defN': 8}
1311 2471 {'wxyN': 4, 'abcN': 8, 'tuvN': 5, 'ghiN': 1, 'klmN': 2, 'nopN': 2, 'qrsN': 1, 'defN': 7}
1315 5367 {'wxyN': 5, 'abcN': 15, 'tuvN': 2, 'ghiN': 9, 'klmN': 7, 'nopN': 4, 'qrsN': 3, 'defN': 10}
1317 0 {'wxyN': 1, 'abcN': 11, 'tuvN': 3, 'ghiN': 3, 'klmN': 1, 'nopN': 3, 'qrsN': 6, 'defN': 10}
1319 2035 {'wxyN': 1, 'abcN': 17, 'tuvN': 3, 'ghiN': 2, 'klmN': 5, 'nopN': 3, 'qrsN': 2, 'defN': 11}
1321 2 {'wxyN

1479 10 {'wxyN': 2, 'abcN': 8, 'tuvN': 2, 'ghiN': 2, 'klmN': 2, 'nopN': 2, 'qrsN': 3, 'defN': 14}
1480 1 {'wxyN': 4, 'abcN': 13, 'tuvN': 2, 'ghiN': 10, 'klmN': 2, 'nopN': 2, 'qrsN': 6, 'defN': 6}
1481 4 {'wxyN': 1, 'abcN': 14, 'tuvN': 2, 'ghiN': 5, 'klmN': 1, 'nopN': 2, 'qrsN': 2, 'defN': 4}
1486 300 {'wxyN': 4, 'abcN': 12, 'tuvN': 2, 'ghiN': 6, 'klmN': 4, 'nopN': 3, 'qrsN': 1, 'defN': 7}
1491 407 {'wxyN': 1, 'abcN': 15, 'tuvN': 4, 'ghiN': 5, 'klmN': 5, 'nopN': 2, 'qrsN': 2, 'defN': 3}
1492 4 {'wxyN': 1, 'abcN': 16, 'tuvN': 2, 'ghiN': 3, 'klmN': 2, 'nopN': 1, 'qrsN': 5, 'defN': 4}
1494 1256 {'wxyN': 2, 'abcN': 6, 'tuvN': 2, 'ghiN': 7, 'klmN': 4, 'nopN': 3, 'qrsN': 1, 'defN': 7}
1495 0 {'wxyN': 1, 'abcN': 11, 'tuvN': 2, 'ghiN': 2, 'klmN': 6, 'nopN': 1, 'qrsN': 4, 'defN': 5}
1496 6 {'wxyN': 3, 'abcN': 15, 'tuvN': 1, 'ghiN': 8, 'klmN': 2, 'nopN': 3, 'qrsN': 5, 'defN': 5}
1497 1377 {'wxyN': 3, 'abcN': 13, 'tuvN': 2, 'ghiN': 1, 'klmN': 5, 'nopN': 3, 'qrsN': 4, 'defN': 1}
1498 0 {'wxyN': 1, 

1687 0 {'wxyN': 2, 'abcN': 18, 'tuvN': 3, 'ghiN': 3, 'klmN': 1, 'nopN': 3, 'qrsN': 4, 'defN': 7}
1690 2 {'wxyN': 1, 'abcN': 16, 'tuvN': 1, 'ghiN': 2, 'klmN': 3, 'nopN': 2, 'qrsN': 2, 'defN': 4}
1692 25 {'wxyN': 3, 'abcN': 11, 'tuvN': 5, 'ghiN': 1, 'klmN': 1, 'nopN': 3, 'qrsN': 3, 'defN': 1}
1696 2983 {'wxyN': 1, 'abcN': 6, 'tuvN': 1, 'ghiN': 4, 'klmN': 1, 'nopN': 1, 'qrsN': 1, 'defN': 7}
1699 417 {'wxyN': 2, 'abcN': 7, 'tuvN': 2, 'ghiN': 4, 'klmN': 3, 'nopN': 2, 'qrsN': 1, 'defN': 3}
1709 11 {'wxyN': 2, 'abcN': 8, 'tuvN': 3, 'ghiN': 1, 'klmN': 3, 'nopN': 6, 'qrsN': 2, 'defN': 5}
1712 6300 {'wxyN': 2, 'abcN': 10, 'tuvN': 2, 'ghiN': 3, 'klmN': 4, 'nopN': 1, 'qrsN': 2, 'defN': 6}
1714 79 {'wxyN': 2, 'abcN': 13, 'tuvN': 1, 'ghiN': 4, 'klmN': 2, 'nopN': 1, 'qrsN': 1, 'defN': 5}
1716 0 {'wxyN': 3, 'abcN': 11, 'tuvN': 1, 'ghiN': 1, 'klmN': 3, 'nopN': 3, 'qrsN': 2, 'defN': 5}
1718 15 {'wxyN': 2, 'abcN': 11, 'tuvN': 1, 'ghiN': 5, 'klmN': 1, 'nopN': 4, 'qrsN': 1, 'defN': 6}
1720 81 {'wxyN': 1, '

1990 1453 {'wxyN': 1, 'abcN': 10, 'tuvN': 1, 'ghiN': 1, 'klmN': 5, 'nopN': 2, 'qrsN': 1, 'defN': 6}
1993 77 {'wxyN': 1, 'abcN': 10, 'tuvN': 3, 'ghiN': 4, 'klmN': 7, 'nopN': 1, 'qrsN': 5, 'defN': 3}
1995 1021 {'wxyN': 1, 'abcN': 7, 'tuvN': 3, 'ghiN': 4, 'klmN': 3, 'nopN': 1, 'qrsN': 1, 'defN': 3}
1996 3822 {'wxyN': 2, 'abcN': 9, 'tuvN': 4, 'ghiN': 5, 'klmN': 3, 'nopN': 4, 'qrsN': 1, 'defN': 5}
2002 38 {'wxyN': 3, 'abcN': 6, 'tuvN': 2, 'ghiN': 3, 'klmN': 1, 'nopN': 2, 'qrsN': 4, 'defN': 4}
2003 0 {'wxyN': 2, 'abcN': 9, 'tuvN': 1, 'ghiN': 4, 'klmN': 3, 'nopN': 1, 'qrsN': 2, 'defN': 2}
2007 14 {'wxyN': 3, 'abcN': 11, 'tuvN': 1, 'ghiN': 5, 'klmN': 1, 'nopN': 4, 'qrsN': 1, 'defN': 5}
2008 1 {'wxyN': 1, 'abcN': 6, 'tuvN': 1, 'ghiN': 2, 'klmN': 4, 'nopN': 1, 'qrsN': 1, 'defN': 5}
2012 18 {'wxyN': 1, 'abcN': 7, 'tuvN': 1, 'ghiN': 5, 'klmN': 1, 'nopN': 3, 'qrsN': 3, 'defN': 5}
2013 74 {'wxyN': 1, 'abcN': 10, 'tuvN': 1, 'ghiN': 3, 'klmN': 1, 'nopN': 2, 'qrsN': 1, 'defN': 6}
2014 0 {'wxyN': 4, 'ab

2325 500 {'wxyN': 1, 'abcN': 9, 'tuvN': 1, 'ghiN': 3, 'klmN': 2, 'nopN': 1, 'qrsN': 3, 'defN': 2}
2328 1384 {'wxyN': 1, 'abcN': 7, 'tuvN': 2, 'ghiN': 2, 'klmN': 1, 'nopN': 3, 'qrsN': 3, 'defN': 3}
2331 1562 {'wxyN': 1, 'abcN': 10, 'tuvN': 2, 'ghiN': 4, 'klmN': 2, 'nopN': 2, 'qrsN': 1, 'defN': 5}
2340 38 {'wxyN': 1, 'abcN': 7, 'tuvN': 1, 'ghiN': 4, 'klmN': 2, 'nopN': 2, 'qrsN': 1, 'defN': 2}
2341 4 {'wxyN': 3, 'abcN': 9, 'tuvN': 3, 'ghiN': 5, 'klmN': 1, 'nopN': 2, 'qrsN': 1, 'defN': 6}
2346 2427 {'wxyN': 1, 'abcN': 7, 'tuvN': 2, 'ghiN': 2, 'klmN': 2, 'nopN': 4, 'qrsN': 2, 'defN': 2}
2348 7 {'wxyN': 4, 'abcN': 7, 'tuvN': 1, 'ghiN': 2, 'klmN': 8, 'nopN': 2, 'qrsN': 1, 'defN': 3}
2354 3 {'wxyN': 1, 'abcN': 6, 'tuvN': 1, 'ghiN': 4, 'klmN': 4, 'nopN': 2, 'qrsN': 1, 'defN': 3}
2361 5 {'wxyN': 1, 'abcN': 5, 'tuvN': 1, 'ghiN': 7, 'klmN': 1, 'nopN': 5, 'qrsN': 1, 'defN': 4}
2363 661 {'wxyN': 3, 'abcN': 7, 'tuvN': 1, 'ghiN': 2, 'klmN': 2, 'nopN': 4, 'qrsN': 4, 'defN': 3}
2366 0 {'wxyN': 3, 'abcN'

2771 5 {'wxyN': 2, 'abcN': 11, 'tuvN': 1, 'ghiN': 2, 'klmN': 1, 'nopN': 1, 'qrsN': 2, 'defN': 4}
2774 16 {'wxyN': 2, 'abcN': 3, 'tuvN': 1, 'ghiN': 2, 'klmN': 5, 'nopN': 2, 'qrsN': 2, 'defN': 6}
2800 3 {'wxyN': 3, 'abcN': 11, 'tuvN': 1, 'ghiN': 2, 'klmN': 1, 'nopN': 1, 'qrsN': 1, 'defN': 3}
2803 6330 {'wxyN': 1, 'abcN': 4, 'tuvN': 2, 'ghiN': 1, 'klmN': 1, 'nopN': 1, 'qrsN': 1, 'defN': 2}
2805 3 {'wxyN': 1, 'abcN': 5, 'tuvN': 1, 'ghiN': 3, 'klmN': 4, 'nopN': 2, 'qrsN': 1, 'defN': 2}
2817 0 {'wxyN': 3, 'abcN': 8, 'tuvN': 1, 'ghiN': 1, 'klmN': 3, 'nopN': 1, 'qrsN': 1, 'defN': 4}
2818 26 {'wxyN': 1, 'abcN': 5, 'tuvN': 1, 'ghiN': 2, 'klmN': 2, 'nopN': 2, 'qrsN': 2, 'defN': 3}
2826 437 {'wxyN': 1, 'abcN': 8, 'tuvN': 1, 'ghiN': 3, 'klmN': 2, 'nopN': 2, 'qrsN': 1, 'defN': 3}
2833 1550 {'wxyN': 5, 'abcN': 8, 'tuvN': 1, 'ghiN': 4, 'klmN': 3, 'nopN': 2, 'qrsN': 2, 'defN': 5}
2843 1907 {'wxyN': 2, 'abcN': 8, 'tuvN': 2, 'ghiN': 4, 'klmN': 1, 'nopN': 2, 'qrsN': 3, 'defN': 6}
2844 9 {'wxyN': 2, 'abcN'

3824 17 {'wxyN': 1, 'abcN': 3, 'tuvN': 2, 'ghiN': 2, 'klmN': 2, 'nopN': 1, 'qrsN': 3, 'defN': 1}
3846 2826 {'wxyN': 1, 'abcN': 7, 'tuvN': 1, 'ghiN': 2, 'klmN': 1, 'nopN': 1, 'qrsN': 2, 'defN': 5}
3850 0 {'wxyN': 1, 'abcN': 4, 'tuvN': 2, 'ghiN': 1, 'klmN': 1, 'nopN': 2, 'qrsN': 2, 'defN': 2}
3883 0 {'wxyN': 1, 'abcN': 2, 'tuvN': 1, 'ghiN': 1, 'klmN': 3, 'nopN': 3, 'qrsN': 2, 'defN': 3}
3910 49 {'wxyN': 1, 'abcN': 4, 'tuvN': 2, 'ghiN': 1, 'klmN': 2, 'nopN': 2, 'qrsN': 3, 'defN': 2}
3987 7 {'wxyN': 1, 'abcN': 5, 'tuvN': 2, 'ghiN': 2, 'klmN': 1, 'nopN': 2, 'qrsN': 2, 'defN': 1}
3999 1666 {'wxyN': 1, 'abcN': 2, 'tuvN': 1, 'ghiN': 1, 'klmN': 1, 'nopN': 2, 'qrsN': 1, 'defN': 3}
4050 1 {'wxyN': 1, 'abcN': 4, 'tuvN': 1, 'ghiN': 3, 'klmN': 1, 'nopN': 2, 'qrsN': 1, 'defN': 2}
4247 8 {'wxyN': 1, 'abcN': 3, 'tuvN': 1, 'ghiN': 1, 'klmN': 1, 'nopN': 1, 'qrsN': 2, 'defN': 6}
4255 6238 {'wxyN': 2, 'abcN': 6, 'tuvN': 1, 'ghiN': 1, 'klmN': 1, 'nopN': 1, 'qrsN': 2, 'defN': 5}
4281 0 {'wxyN': 1, 'abcN': 2,

### Nombre de formes du tirage brut
Le nombre de formes du tirage brut découle du nombre de tokens paramétré. Si ce nombre est plus élevé que le nombre de formes calculé via le *ratio*, le tirage est réduit pour obtenir un nombre de formes de l'ordre de grandeur désiré.

In [12]:
# Number of lexeme x cell slots drawn at least once in the raw sample.
(
    tirages[marksN]
    .replace(0,np.nan)
    .count()
    .sum()
)

44200

## Réduction du nombre de types
Pour obtenir un nombre de types compatible avec l'ordre de grandeur fixé via *ratio*, on fixe un seuil de tokens pour inclure les formes dans le tirage.
- si le seuil est fixé à 3, par exemple, les formes ayant moins de 3 attestations sont éliminées
- le seuil est calculé pour s'approcher de l'ordre de grandeur par le haut

Les lexèmes qui n'ont aucune forme dans l'échantillon sont éliminés.
- result=result.dropna(thresh=len(marks)+2+1) => un lexème qui a au moins une forme doit avoir ses 2 colonnes CF, P remplies ainsi que toutes les marques, len(marks), plus au moins une forme tirée (+1)

In [224]:
def reduceTirages(df,seuil):
    """Return a copy of ``df`` without the forms drawn at most ``seuil`` times.

    The counters in the ``marksN`` columns that are <= ``seuil`` are set to
    NaN, then the lexemes (rows) left without any counter are dropped. ``df``
    itself is not modified. A negative ``seuil`` removes no counter.

    The counters are expected to be non-negative integers (or NaN): one
    vectorised comparison then gives the same result as replacing each of the
    values 0..seuil in turn.
    """
    result=df.copy()
    if seuil>=0:
        comptes=result[marksN]
        result[marksN]=comptes.where(comptes>seuil)
    # A kept row needs its 2 columns CF and P, every mark column, and at least
    # one remaining counter.
    result=result.dropna(thresh=len(marks)+2+1)
    return result

In [225]:
# Raise the threshold until the number of remaining types drops below the
# target, and keep the last reduction that was still at or above it. If even
# threshold 0 is below the target, the raw sample is kept unchanged.
tiragesReduits=tirages.copy()
for i in range(nbFormes):
    # Each candidate is computed once and reused if it turns out to be the one kept.
    candidat=reduceTirages(tirages,i)
    if candidat[marksN].count().sum()<nbFormes:
        break
    tiragesReduits=candidat
# Zeros are only present in the raw fallback; mapping them to NaN keeps the
# reported type count right in that case too.
nbTypesReduits=tiragesReduits[marksN].replace(0,np.nan).count().sum()
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("%s %s" % ("Nombre de types réduit pris en compte",nbTypesReduits))

Nombre de types réduit pris en compte 16171


In [228]:
#tiragesReduits

In [227]:
# Number of lexemes (rows) kept in the reduced sample.
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("%s %s" % ("Nombre de lexèmes dans l'échantillon",len(tiragesReduits)))

Nombre de lexèmes dans l'échantillon 5703


In [230]:
# Lexemes whose every cell keeps a non-zero, non-missing counter.
fullParadigms=tiragesReduits[marksN].replace(0,np.nan).dropna().index.tolist()
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("%s %s" % ("Nombre de lexèmes avec un paradigme complet dans l'échantillon",len(fullParadigms)))

Nombre de lexèmes avec un paradigme complet dans l'échantillon 518


### Liste des CF avec un paradigme exemplaire

In [231]:
# Sorted class numbers that have at least one lexeme with a complete paradigm.
fullCF=sorted(
    tiragesReduits.loc[tiragesReduits.index.isin(fullParadigms)]
    .groupby("CF")
    .groups
)

In [232]:
# Rows of `classes` whose class number is listed in fullCF.
examplaryCF=classes.loc[classes.index.isin(fullCF)].copy()
examplaryCF

Unnamed: 0,abc,def,ghi,klm,nop,qrs,tuv,wxy
0,c,f,g,l,p,q,v,x
1,c,f,i,m,p,r,t,w
2,c,f,i,l,o,r,u,y
3,a,e,i,l,p,s,u,y
4,c,f,g,l,p,s,u,w
5,c,e,g,k,o,r,t,y
6,b,e,h,m,p,q,v,x
7,c,d,i,l,n,r,t,w
8,c,d,h,m,n,s,u,w
9,c,d,h,l,o,r,v,w


In [234]:
# Lexemes with at least one cell whose counter is zero or missing.
partialParadigms=tiragesReduits[tiragesReduits[marksN].replace(0,np.nan).isnull().any(axis=1)].index.tolist()
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("%s %s" % ("Nombre de lexèmes avec un paradigme incomplet",len(partialParadigms)))

Nombre de lexèmes avec un paradigme incomplet 5185


In [236]:
# Incomplete-paradigm lexemes whose own class has no fully attested lexeme.
partialCFLexemes=tiragesReduits.loc[
    ~tiragesReduits["CF"].isin(fullCF)
    & tiragesReduits.index.isin(partialParadigms)
]

In [237]:
# One boolean mask over the rows of `classes` per cell value: mark['a'] is True
# for the classes whose 'abc' cell holds 'a', and so on. The masks are derived
# from `marks` so they follow any change to the paradigm definition.
mark={valeur:(classes[case]==valeur) for case in marks for valeur in case}

In [238]:
def filterPartialCF(ix):
    """Return the mask of the classes compatible with the attested cells of a lexeme.

    ``ix`` is the row position of the lexeme in ``tirages``. For every cell
    whose counter is > 0, the mask of the classes sharing the lexeme's value in
    that cell is and-ed into the result. The result is a boolean Series over
    the rows of ``classes``, or the plain ``True`` when no cell is attested.
    """
    # NOTE(review): this reads the raw `tirages`, not `tiragesReduits`, so forms
    # removed by the reduction threshold still count as attested - confirm intent.
    ligne=tirages.iloc[ix]  # fetch the row once instead of once per access
    result=(True)
    for champ in marks:
        if ligne[champ+"N"]>0:
            result=result&mark[ligne[champ]]
    return result

def getPartialCF(ix):
    """Describe the attested part of a lexeme's paradigm.

    ``ix`` is the row position of the lexeme in ``tirages``. The returned dict
    holds the lexeme's class number under ``"CF"`` plus, for every cell whose
    counter is > 0, the value of that cell; unattested cells are left out.
    """
    # NOTE(review): this reads the raw `tirages`, not `tiragesReduits`, so forms
    # removed by the reduction threshold still count as attested - confirm intent.
    ligne=tirages.iloc[ix]  # fetch the row once instead of once per access
    result={"CF":ligne["CF"]}
    for champ in marks:
        if ligne[champ+"N"]>0:
            result[champ]=ligne[champ]
    return result


In [239]:
# Spot check on lexeme 19: mask over `classes` of the classes matching its attested cells.
filterPartialCF(19)

0       False
1       False
2       False
3       False
4        True
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
6531    False
6532    False
6533    False
6534    False
6535    False
6536    False
6537    False
6538    False
6539    False
6540    False
6541    False
6542    False
6543    False
6544    False
6545    False
6546    False
6547    False
6548    False
6549    False
6550    False
6551    False
6552    False
6553    False
6554    False
6555    False
6556    False
6557    False
6558    False
6559    False
6560    False
Length: 6561, dtype: bool

In [240]:
#tirages.loc[19][marksN]!=0

## CF partiellement représentées
Les CF partielles qui peuvent être regroupées avec un paradigme exemplaire complet ne sont pas considérées
- si l'intersection avec *necessaryCF* n'est pas vide, la classe est comptée comme une sous-classe d'une CF entièrement attestée
 - le lexème en question est associé aux classes de l'intersection
- sinon, il s'agit d'une CF partielle à considérer

In [252]:
def getPartialLexemes(lexemes,necessaryCF):
    """Split partially attested lexemes between new partial classes and known classes.

    For each lexeme position in ``lexemes``, the classes compatible with its
    attested cells (see ``filterPartialCF``) are intersected with
    ``necessaryCF``:

    - empty intersection: the lexeme yields a partial-class description (the
      dict from ``getPartialCF`` plus a ``"neutre"`` entry listing the
      compatible class numbers, comma-separated);
    - one class: the lexeme is mapped to that class number;
    - several classes: the lexeme is mapped to the whole set and counted as
      an ambiguous mapping.

    Returns ``(partialLexemes, mappedCF, ambiguousMapping)``: the list of
    descriptions, the lexeme -> class (or set of classes) dict, and the number
    of ambiguous mappings.
    """
    necessaires=set(necessaryCF)  # built once, not once per lexeme
    partialLexemes=[]
    mappedCF={}
    ambiguousMapping=0
    for l in lexemes:
        # Compatible classes are computed once and reused in both branches.
        compatibles=classes[filterPartialCF(l)].index.tolist()
        includingCF=set(compatibles)&necessaires
        if not includingCF:
            descriptionPartialCF=getPartialCF(l)
            descriptionPartialCF["neutre"]=",".join(str(i) for i in compatibles)
            partialLexemes.append(descriptionPartialCF)
        elif len(includingCF)==1:
            mappedCF[l]=list(includingCF)[0]
        else:
            mappedCF[l]=includingCF
            ambiguousMapping+=1
    return partialLexemes,mappedCF,ambiguousMapping

In [253]:
partialLexemes,mappedCF,ambiguousMapping=getPartialLexemes(partialCFLexemes.index,fullCF)

# Number of mapped lexemes, number of ambiguous mappings, then the mapping itself.
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print("%s %s %s" % (len(mappedCF),ambiguousMapping,mappedCF))
#print (partialCF)

788 475 {4096: set([256, 306, 5164]), 6145: set([256, 129, 3, 1924, 4872, 5642, 11, 13, 15, 144, 274, 2179, 21, 408, 25, 159, 1952, 803, 561, 1324, 815, 1713, 306, 1462, 2487, 313, 2109, 71, 73, 859, 75, 81, 83, 2139, 220, 221, 3294, 96, 4449, 483, 1389, 376]), 5803: 53, 6150: 1324, 4103: set([256, 2179, 5642, 306, 25, 2139, 159]), 4107: set([2667, 2109]), 5805: set([256, 33, 3, 3969, 103, 3184, 42, 5164, 946, 687, 48, 81, 306, 4449, 441, 24, 2649, 335, 3294, 1713]), 6820: set([2442, 139, 13, 271, 274, 21, 919, 25, 154, 155, 2076, 159, 5536, 4513, 313, 315, 188, 707, 325, 1864, 5067, 3283, 4692, 982, 88, 2650, 350, 1631, 1390, 367, 83, 3583]), 2076: 182, 6180: set([256, 306, 5164]), 6181: 687, 4134: 125, 6185: 0, 7175: set([84, 2753, 43, 44]), 4143: 1946, 4145: set([88, 4513]), 5811: 825, 4149: set([2753, 427, 964, 294, 6, 41, 1431, 44, 208, 1016, 467, 84, 1847, 4312, 5209, 58, 4957, 345]), 4150: set([4513, 707, 2442, 3283, 4692, 155]), 4154: 36, 6697: set([2, 1390]), 4161: set([1863, 

In [254]:
# Table of the partial CF descriptions: indexed by "CF", columns limited to the
# paradigm cells plus "neutre", rows sorted by index label.
subExamplaryCF=(
    pd.DataFrame.from_dict(partialLexemes)
    .set_index("CF")[marks+["neutre"]]
    .sort_index()
)
subExamplaryCF

Unnamed: 0_level_0,abc,def,ghi,klm,nop,qrs,tuv,wxy,neutre
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27,b,e,i,m,p,q,v,x,27
27,b,e,i,m,,q,v,,2714041860204322532957583559336515
27,b,e,i,m,,q,v,x,2729575835
27,b,e,i,,p,q,v,x,2739954638
27,b,e,i,m,,q,v,x,2729575835
27,b,e,i,m,p,q,v,x,27
27,b,e,i,m,,q,,x,277271107295730224195583562276456
27,b,e,i,m,p,q,v,,2718605933
27,b,e,i,m,p,q,v,x,27
27,b,e,i,m,p,q,v,x,27


In [244]:
# Distinct index labels of subExamplaryCF, as a list (order is whatever set() yields).
partialCF=list(set(subExamplaryCF.index.tolist()))

In [245]:
def unify(x):
    """Merge the non-missing values of ``x`` into one string.

    Parameters
    ----------
    x : iterable
        Values to merge; the kept values must be strings.

    Returns
    -------
    str or float
        The distinct non-missing values concatenated in sorted order, or
        ``np.nan`` when no value is left.
    """
    # e==e is False only for NaN-like values (they fail self-equality),
    # so this keeps every value except the missing ones.
    unification=set(e for e in x if e==e)
    if not unification:
        return np.nan
    # Sort before joining: set iteration order is arbitrary, which made the
    # result for conflicting values differ from one run to the next.
    return "".join(sorted(unification))

def controleCF(x,allowedCF=None):
    """Collect the class labels listed in ``x`` that belong to ``allowedCF``.

    Parameters
    ----------
    x : iterable of str
        Comma-separated lists of integer class labels.
    allowedCF : iterable of int, optional
        Labels to keep. Defaults to the notebook-level ``partialCF``.

    Returns
    -------
    str
        The kept labels, de-duplicated, sorted numerically and comma-joined
        (empty string when nothing is kept).
    """
    if allowedCF is None:
        allowedCF=partialCF
    # Set lookup instead of scanning a list for every token.
    allowed=set(allowedCF)
    possibleCF=set()
    for entry in x:
        for token in entry.split(","):
            if not token:
                # "".split(",") yields [""]: an empty list contributes nothing.
                continue
            cf=int(token)
            if cf in allowed:
                possibleCF.add(cf)
    return ",".join(str(cf) for cf in sorted(possibleCF))

In [246]:
# One aggregator per column: unify for every paradigm cell, controleCF for "neutre".
dictAgg=dict((m,unify) for m in marks)
dictAgg["neutre"]=controleCF
# Collapse all rows sharing the same CF label into a single row.
groupSubExamplaryCF=subExamplaryCF.reset_index().groupby("CF").agg(dictAgg)

In [248]:
# Select the columns in the order: paradigm cells (marks) first, then "neutre".
groupSubExamplaryCF=groupSubExamplaryCF[marks+["neutre"]]
groupSubExamplaryCF

Unnamed: 0_level_0,abc,def,ghi,klm,nop,qrs,tuv,wxy,neutre
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27,b,e,i,m,p,q,v,x,271404
30,a,d,g,m,n,q,u,y,30771062
32,a,e,h,m,o,r,u,w,3211711979
38,b,e,i,l,o,q,u,y,3813017218349099928734427
40,a,d,h,l,p,q,u,x,"40,113,203,245,393,830,1053,1534,4137,5184,522..."
46,c,e,h,l,p,r,t,y,46851545176920093641
49,c,f,i,l,o,r,u,x,4987112261583
50,b,e,g,l,o,q,u,x,50141477
52,c,f,h,l,o,q,t,x,52160180501122617182576
55,b,d,g,l,p,q,t,x,5577712941315


In [249]:
# Grouped rows without any missing value; their index labels form cliquedCF.
fullyFilledGroups=groupSubExamplaryCF.dropna()
cliquedCF=fullyFilledGroups.index.tolist()
fullyFilledGroups

Unnamed: 0_level_0,abc,def,ghi,klm,nop,qrs,tuv,wxy,neutre
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27,b,e,i,m,p,q,v,x,271404
30,a,d,g,m,n,q,u,y,30771062
32,a,e,h,m,o,r,u,w,3211711979
38,b,e,i,l,o,q,u,y,3813017218349099928734427
40,a,d,h,l,p,q,u,x,"40,113,203,245,393,830,1053,1534,4137,5184,522..."
46,c,e,h,l,p,r,t,y,46851545176920093641
49,c,f,i,l,o,r,u,x,4987112261583
50,b,e,g,l,o,q,u,x,50141477
52,c,f,h,l,o,q,t,x,52160180501122617182576
55,b,d,g,l,p,q,t,x,5577712941315


In [250]:
# Rows of groupSubExamplaryCF whose index label is not listed in cliquedCF.
groupSubExamplaryCF[~groupSubExamplaryCF.index.isin(cliquedCF)]

Unnamed: 0_level_0,abc,def,ghi,klm,nop,qrs,tuv,wxy,neutre
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
65,a,d,h,k,p,r,v,,651053
107,b,d,g,m,p,,,,1071294429551806142
117,a,d,h,,o,,,y,1175412033
131,c,f,,k,p,s,t,,131
141,b,e,g,l,n,q,t,,1411907
150,b,d,g,,n,q,,w,150
151,c,d,h,m,o,,u,y,151605163422955328
178,c,d,h,k,n,,v,y,1781254
180,c,f,g,l,o,q,,x,180
183,b,e,i,l,o,q,v,,1833514427


# Changements en cours => RegEx

In [65]:
# Keep only the first row for each distinct "neutre" string.
# NOTE(review): inplace=True mutates subExamplaryCF itself, so tables displayed
# by earlier cells no longer reflect its content after this cell has run.
subExamplaryCF.drop_duplicates(subset=["neutre"],keep="first",inplace=True)
subExamplaryCF

Unnamed: 0_level_0,abc,def,ghi,klm,nop,qrs,tuv,wxy,neutre
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
27,b,e,i,m,,q,,x,277271107295730224195583562276456
27,b,e,i,,p,q,v,x,2739954638
27,b,e,i,m,,q,v,x,2729575835
27,b,e,i,m,,q,v,,2714041860204322532957583559336515
27,b,e,i,m,p,q,v,,2718605933
30,a,d,g,m,n,q,u,,30772141
30,a,d,g,,n,q,u,,30772141360641784382517755986485
30,a,d,g,,,q,u,y,303291997248828273736440251776485
30,a,d,g,m,,q,u,,30772141282737364071543755135849
30,a,d,g,,n,q,u,y,3051776485


In [64]:
# NOTE(review): the value of this first expression is not displayed -- a cell
# only shows its last expression.
subExamplaryCF.iloc[300:350]
# Rows of classes selected by and-ing the masks stored under "c", "f", "h", "k"
# and "r" in `mark` (`mark` is defined outside this part of the notebook).
classes[mark["c"]&mark["f"]&mark["h"]&mark["k"]&mark["r"]]

Unnamed: 0,abc,def,ghi,klm,nop,qrs,tuv,wxy
99,c,f,h,k,o,r,t,y
258,c,f,h,k,n,r,t,y
290,c,f,h,k,p,r,v,y
606,c,f,h,k,n,r,u,y
1075,c,f,h,k,o,r,t,w
1183,c,f,h,k,p,r,t,w
1405,c,f,h,k,p,r,v,w
1725,c,f,h,k,n,r,v,w
1836,c,f,h,k,n,r,v,x
2119,c,f,h,k,o,r,v,w


In [28]:
# Stack subExamplaryCF and examplaryCF row-wise into one frame
# (sort=False: the columns are not re-sorted during the concatenation).
df=pd.concat([subExamplaryCF,examplaryCF],sort=False)
# NOTE(review): this bare expression shows nothing -- only the last expression
# of the cell is displayed.
df
# Display a copy sorted by index label; df itself is left in concatenation order.
df.sort_index()

Unnamed: 0,abc,def,ghi,klm,nop,qrs,tuv,wxy,neutre
0,c,f,g,l,p,q,v,x,
1,c,f,i,m,p,r,t,w,
2,c,f,i,l,o,r,u,y,
3,a,e,i,l,p,s,u,y,
4,c,f,g,l,p,s,u,w,
5,c,e,g,k,o,r,t,y,
6,b,e,h,m,p,q,v,x,
7,c,d,i,l,n,r,t,w,
8,c,d,h,m,n,s,u,w,
9,c,d,h,l,o,r,v,w,


In [29]:
# Keys of the groups obtained by grouping df on the paradigm columns:
# one tuple of cell values per group.
df.groupby(by=marks).groups.keys()

[('a', 'f', nan, 'k', 'n', 'r', 't', nan),
 ('c', 'd', 'g', 'k', nan, 's', nan, nan),
 ('b', 'd', 'h', nan, 'n', 'q', nan, nan),
 ('b', 'f', 'g', 'm', 'o', 'r', nan, nan),
 ('b', 'f', 'g', 'l', 'p', 'q', 'v', 'w'),
 ('a', 'd', 'h', 'm', 'p', 'r', 'v', 'x'),
 ('c', 'e', 'g', 'k', 'n', 's', 'u', 'x'),
 ('c', 'd', 'i', 'm', nan, 'q', 'u', nan),
 ('c', 'f', 'i', 'm', 'p', 'r', 't', 'w'),
 ('b', 'e', 'h', nan, 'p', 's', 't', 'x'),
 ('b', 'd', 'g', 'k', 'o', 'r', 'v', 'y'),
 ('c', 'f', 'i', 'l', nan, nan, 't', 'y'),
 ('a', 'd', 'i', 'k', 'p', 'r', 't', 'w'),
 ('c', 'd', 'h', 'k', 'p', 'r', nan, 'x'),
 ('b', 'e', 'i', 'l', nan, nan, nan, 'x'),
 ('b', 'e', 'h', 'm', 'o', 's', 'u', nan),
 ('a', 'd', 'i', nan, 'n', nan, 't', nan),
 ('b', 'd', nan, 'l', 'n', 'q', 'v', 'w'),
 ('b', 'd', 'h', 'm', 'n', 'q', nan, nan),
 ('c', 'd', nan, 'm', 'p', 'r', 'u', 'y'),
 ('c', 'e', 'g', 'l', 'p', nan, 'v', 'y'),
 ('a', 'f', 'g', 'k', 'o', nan, 't', nan),
 ('a', 'f', 'g', 'l', 'n', 's', 't', 'w'),
 ('a', 'd',