In [25]:
import pandas as pd
import numpy as np
import itertools as it
import random as rd
import networkx as nx
import re

In [2]:
def distElements(elements):
    """Return Zipf-like probabilities, one per item of ``elements``.

    The item at rank r (1-based position) receives a weight of 1/r; the
    weights are then divided by their sum so that the result adds up to 1.
    Only the length of ``elements`` is used, not its content.
    """
    weights = []
    for rank in range(1, len(elements) + 1):
        weights.append(1.0 / rank)
    total = sum(weights)
    return [weight / total for weight in weights]

## Paramétrage du paradigme
Les marks définissent les cases du paradigme
- le nom de la case correspond à la concaténation des valeurs possibles
 - abc correspond à une case qui peut prendre comme valeur a, b ou c
 
Les classes sont calculées en faisant le produit cartésien des différentes valeurs x cases

In [3]:
# Paradigm cells: each name lists the values the cell can take
# ("abc" is a cell whose value is a, b or c).
marks=["abc","def","ghi","klm","nop","qrs","tuv","wxy"]
# One count column per cell, and the Zipf probabilities used to draw a cell.
marksN=[m+"N" for m in marks]
distMarks=distElements(marksN)

# One inflection class per combination of cell values (Cartesian product).
# The frame is built in a single call: appending one row at a time copies the
# whole frame on every iteration.
classes=pd.DataFrame(list(it.product(*marks)), columns=marks)

nbClasses=len(classes)
# Shuffle the rows; the row position is what the later CF draw indexes into.
classes = classes.sample(frac=1).reset_index(drop=True)
#classes

In [4]:
# Number of inflection classes (CF) defined by the grammar: one per row of `classes`.
nbGrammaireCF = classes.shape[0]

## Paramètres du lexique
On fixe le nombre de lexèmes présents dans le corpus d'apprentissage et le ratio de formes attestées
- nombre de lexèmes
- ratio des formes-cases attestées par rapport au potentiel complet
 - 25% correspond au ratio observé pour les verbes dans Lex3
- nbFormes est l'ordre de grandeur à respecter pour l'échantillon en nombre de types

In [5]:
# Share of the potential form-cells that should be attested, and lexicon size.
ratio = 0.25
nbLexemes = 10000

# Target number of attested forms (types): every lexeme times every cell,
# scaled by the ratio.
# NOTE(review): the recorded output of this cell shows 28000, which these
# values do not produce (they give 20000) - the output looks stale; confirm
# the intended ratio.
nbCasesPotentielles = nbLexemes * len(marks)
nbFormes = int(nbCasesPotentielles * ratio)
nbFormes

28000

## Distribution des lexèmes et des CF
Les différents éléments suivent des distributions par Zipf.
- distElements renvoie une liste de probabilités, une par élément, suivant une loi de Zipf : la probabilité de l'élément de rang $r$ est proportionnelle à $1/r$

Chaque lexème a une fréquence de lemme qui correspond à son rang et reçoit une CF tirée au hasard suivant une loi de Zipf.
- np.random.choice(cfs,p=distCFs)

In [6]:
# Zipf probabilities over the inflection classes (CF) and over the lexemes.
cfs=range(nbClasses)
distCFs=distElements(cfs)

lexs=range(nbLexemes)
distLexs=distElements(lexs)

# Draw the CF of every lexeme in one call (each draw is independent and follows
# distCFs), then look the drawn classes up by position. Building the frame in
# one go avoids re-copying it for each of the nbLexemes rows.
# NOTE(review): no random seed is set in the visible cells, so each run
# produces a different lexicon.
tiragesCF=np.random.choice(cfs,size=nbLexemes,p=distCFs)
lexemes=classes.iloc[tiragesCF].reset_index(drop=True)
# Column layout: CF, P (lemma probability given by the lexeme's rank), then the cells.
lexemes.insert(0,"CF",tiragesCF.astype(int))
lexemes.insert(1,"P",distLexs)
#lexemes.index+=1

In [7]:
# Preview the first rows only rather than rendering the whole lexicon frame.
lexemes.head(10)

Unnamed: 0,CF,P,abc,def,ghi,klm,nop,qrs,tuv,wxy
0,1465,0.102170,c,e,i,l,p,r,t,y
1,6408,0.051085,a,e,g,m,n,q,v,y
2,4,0.034057,b,e,h,k,p,r,t,y
3,1,0.025543,a,e,i,l,p,s,u,x
4,266,0.020434,c,d,i,l,o,s,v,w
5,397,0.017028,c,f,g,m,o,q,u,w
6,1889,0.014596,a,f,i,l,p,r,t,w
7,62,0.012771,b,f,i,l,o,q,t,w
8,0,0.011352,b,e,h,k,p,r,t,x
9,16,0.010217,b,e,i,m,o,s,v,w


In [8]:
# Potential number of types: every filled cell of every lexeme.
# A single parenthesised argument keeps the output identical whether `print`
# is a statement or a function.
nbTypesPotentiels = lexemes[marks].count().sum()
print(" ".join(["Nombre de types potentiel", str(nbTypesPotentiels)]))

Nombre de types potentiel 80000


## Nombre de CF tirées dans le lexique
Ce nombre représente le maximum de CF qui pourraient être nécessaires pour la description.
Le nombre de CF nécessaire est au moins égal au nombre de CF qui possèdent un paradigme exemplaire, mais les CF qui n'ont qu'une représentation partielle de leur paradigme ne sont pas obligatoirement nécessaires à la description.

In [9]:
# Number of distinct inflection classes actually drawn for the lexicon.
nbLexiqueCF = lexemes["CF"].nunique()
# Single-argument print: same output as a statement or as a function call.
print(" ".join([str(nbLexiqueCF), "CF dans le lexique sur", str(nbGrammaireCF), "CF dans la grammaire"]))

2478 CF dans le lexique sur 6561 CF dans la grammaire


## Constitution du DF pour le tirage

In [10]:
# Zero-initialised count columns, one row per lexeme and one column per cell.
tiragesColonnes = pd.DataFrame(
    data=0,
    index=np.arange(len(lexs)),
    columns=marksN,
)
# Lexicon and counts side by side, aligned on the row index.
tirages = pd.concat(objs=[lexemes, tiragesColonnes], axis=1, sort=False)
#tirages

### Tirage des lexèmes pour les formes attestées
- lexTirs contient le nombre de tokens à tirer pour chaque lexème

In [11]:
#tirages=pd.DataFrame(0, index=np.arange(len(lexs)), columns=marks)
#tirages.index+=1

nbTokens=500000
# Draw all tokens in one call: calling np.random.choice once per token would
# re-process the full probability vector nbTokens times.
tokensTires=np.random.choice(lexs,size=nbTokens,p=distLexs)
# lexTirs maps each lexeme drawn at least once to its number of tokens.
lexemesTires,effectifs=np.unique(tokensTires,return_counts=True)
lexTirs=dict(zip(lexemesTires.tolist(),effectifs.tolist()))
#lexTirs

### Tirage des formes-cases
- cellTirs contient le nombre de fois que chaque forme-case a été tirée

In [12]:
# Start from empty counts so that re-running this cell does not keep counts
# left over from a previous draw.
tirages[marksN]=0
nbCases=len(marksN)
for l in lexTirs:
    # Distribute the lexeme's tokens over the cells in a single draw.
    cases,effectifs=np.unique(
        np.random.choice(marksN,size=lexTirs[l],p=distMarks),
        return_counts=True)
    # cellTirs maps each drawn cell to the number of times it was drawn.
    cellTirs=dict(zip(cases.tolist(),effectifs.tolist()))
    # Report the lexemes whose every cell was drawn at least once.
    if len(cellTirs)==nbCases:
        print(" ".join([str(l),str(tirages.loc[l,"CF"]),str(cellTirs)]))
    tirages.loc[l,cases.tolist()]=effectifs.tolist()
    

0 1465 {'wxyN': 2277, 'abcN': 18438, 'tuvN': 2675, 'ghiN': 6293, 'klmN': 4581, 'nopN': 3782, 'qrsN': 3142, 'defN': 9436}
1 6408 {'wxyN': 1230, 'abcN': 9609, 'tuvN': 1388, 'ghiN': 3081, 'klmN': 2275, 'nopN': 1933, 'qrsN': 1670, 'defN': 4697}
2 4 {'wxyN': 800, 'abcN': 6160, 'tuvN': 943, 'ghiN': 2115, 'klmN': 1570, 'nopN': 1249, 'qrsN': 1064, 'defN': 3091}
3 1 {'wxyN': 598, 'abcN': 4607, 'tuvN': 666, 'ghiN': 1571, 'klmN': 1193, 'nopN': 947, 'qrsN': 767, 'defN': 2300}
4 266 {'wxyN': 466, 'abcN': 3726, 'tuvN': 520, 'ghiN': 1242, 'klmN': 920, 'nopN': 737, 'qrsN': 620, 'defN': 1850}
5 397 {'wxyN': 396, 'abcN': 3187, 'tuvN': 403, 'ghiN': 1016, 'klmN': 801, 'nopN': 633, 'qrsN': 517, 'defN': 1584}
6 1889 {'wxyN': 361, 'abcN': 2714, 'tuvN': 417, 'ghiN': 919, 'klmN': 665, 'nopN': 555, 'qrsN': 451, 'defN': 1370}
7 62 {'wxyN': 300, 'abcN': 2381, 'tuvN': 317, 'ghiN': 821, 'klmN': 562, 'nopN': 460, 'qrsN': 403, 'defN': 1201}
8 0 {'wxyN': 256, 'abcN': 2116, 'tuvN': 317, 'ghiN': 666, 'klmN': 516, 'nopN'

77 0 {'wxyN': 35, 'abcN': 219, 'tuvN': 36, 'ghiN': 77, 'klmN': 65, 'nopN': 49, 'qrsN': 49, 'defN': 121}
78 0 {'wxyN': 20, 'abcN': 248, 'tuvN': 45, 'ghiN': 80, 'klmN': 58, 'nopN': 38, 'qrsN': 36, 'defN': 120}
79 49 {'wxyN': 30, 'abcN': 214, 'tuvN': 30, 'ghiN': 66, 'klmN': 56, 'nopN': 45, 'qrsN': 40, 'defN': 109}
80 3 {'wxyN': 22, 'abcN': 225, 'tuvN': 32, 'ghiN': 76, 'klmN': 64, 'nopN': 47, 'qrsN': 43, 'defN': 113}
81 1378 {'wxyN': 29, 'abcN': 221, 'tuvN': 35, 'ghiN': 62, 'klmN': 51, 'nopN': 50, 'qrsN': 28, 'defN': 128}
82 6 {'wxyN': 32, 'abcN': 223, 'tuvN': 41, 'ghiN': 76, 'klmN': 64, 'nopN': 43, 'qrsN': 34, 'defN': 107}
83 0 {'wxyN': 25, 'abcN': 210, 'tuvN': 38, 'ghiN': 79, 'klmN': 60, 'nopN': 45, 'qrsN': 42, 'defN': 103}
84 18 {'wxyN': 22, 'abcN': 222, 'tuvN': 26, 'ghiN': 63, 'klmN': 52, 'nopN': 40, 'qrsN': 40, 'defN': 109}
85 1925 {'wxyN': 22, 'abcN': 223, 'tuvN': 25, 'ghiN': 76, 'klmN': 59, 'nopN': 42, 'qrsN': 47, 'defN': 109}
86 800 {'wxyN': 27, 'abcN': 224, 'tuvN': 26, 'ghiN': 77,

162 1717 {'wxyN': 17, 'abcN': 122, 'tuvN': 16, 'ghiN': 43, 'klmN': 33, 'nopN': 25, 'qrsN': 12, 'defN': 62}
163 22 {'wxyN': 12, 'abcN': 97, 'tuvN': 21, 'ghiN': 44, 'klmN': 29, 'nopN': 20, 'qrsN': 27, 'defN': 52}
164 0 {'wxyN': 21, 'abcN': 114, 'tuvN': 19, 'ghiN': 44, 'klmN': 37, 'nopN': 27, 'qrsN': 12, 'defN': 48}
165 3 {'wxyN': 13, 'abcN': 113, 'tuvN': 11, 'ghiN': 29, 'klmN': 30, 'nopN': 27, 'qrsN': 21, 'defN': 57}
166 0 {'wxyN': 13, 'abcN': 109, 'tuvN': 15, 'ghiN': 37, 'klmN': 28, 'nopN': 24, 'qrsN': 18, 'defN': 56}
167 199 {'wxyN': 14, 'abcN': 110, 'tuvN': 19, 'ghiN': 40, 'klmN': 28, 'nopN': 23, 'qrsN': 18, 'defN': 58}
168 327 {'wxyN': 15, 'abcN': 87, 'tuvN': 24, 'ghiN': 40, 'klmN': 20, 'nopN': 26, 'qrsN': 25, 'defN': 50}
169 28 {'wxyN': 14, 'abcN': 113, 'tuvN': 16, 'ghiN': 40, 'klmN': 28, 'nopN': 24, 'qrsN': 16, 'defN': 54}
170 798 {'wxyN': 14, 'abcN': 124, 'tuvN': 11, 'ghiN': 31, 'klmN': 20, 'nopN': 21, 'qrsN': 16, 'defN': 60}
171 205 {'wxyN': 12, 'abcN': 96, 'tuvN': 21, 'ghiN': 36

252 67 {'wxyN': 5, 'abcN': 61, 'tuvN': 10, 'ghiN': 16, 'klmN': 13, 'nopN': 13, 'qrsN': 12, 'defN': 28}
253 997 {'wxyN': 10, 'abcN': 85, 'tuvN': 15, 'ghiN': 26, 'klmN': 20, 'nopN': 10, 'qrsN': 16, 'defN': 33}
254 2 {'wxyN': 4, 'abcN': 71, 'tuvN': 14, 'ghiN': 24, 'klmN': 18, 'nopN': 17, 'qrsN': 18, 'defN': 40}
255 2067 {'wxyN': 12, 'abcN': 72, 'tuvN': 18, 'ghiN': 26, 'klmN': 15, 'nopN': 14, 'qrsN': 14, 'defN': 41}
256 33 {'wxyN': 10, 'abcN': 70, 'tuvN': 6, 'ghiN': 24, 'klmN': 20, 'nopN': 15, 'qrsN': 10, 'defN': 44}
257 3596 {'wxyN': 10, 'abcN': 83, 'tuvN': 12, 'ghiN': 27, 'klmN': 18, 'nopN': 11, 'qrsN': 13, 'defN': 24}
258 158 {'wxyN': 7, 'abcN': 66, 'tuvN': 14, 'ghiN': 12, 'klmN': 19, 'nopN': 19, 'qrsN': 17, 'defN': 30}
259 104 {'wxyN': 15, 'abcN': 68, 'tuvN': 9, 'ghiN': 20, 'klmN': 16, 'nopN': 10, 'qrsN': 9, 'defN': 33}
260 34 {'wxyN': 11, 'abcN': 67, 'tuvN': 10, 'ghiN': 31, 'klmN': 8, 'nopN': 15, 'qrsN': 14, 'defN': 32}
261 3470 {'wxyN': 8, 'abcN': 65, 'tuvN': 11, 'ghiN': 21, 'klmN': 

352 3 {'wxyN': 10, 'abcN': 52, 'tuvN': 10, 'ghiN': 15, 'klmN': 11, 'nopN': 13, 'qrsN': 10, 'defN': 33}
353 17 {'wxyN': 9, 'abcN': 62, 'tuvN': 10, 'ghiN': 17, 'klmN': 9, 'nopN': 13, 'qrsN': 5, 'defN': 25}
354 11 {'wxyN': 8, 'abcN': 58, 'tuvN': 6, 'ghiN': 21, 'klmN': 15, 'nopN': 9, 'qrsN': 5, 'defN': 19}
355 2032 {'wxyN': 6, 'abcN': 35, 'tuvN': 10, 'ghiN': 14, 'klmN': 9, 'nopN': 15, 'qrsN': 10, 'defN': 19}
356 7 {'wxyN': 5, 'abcN': 56, 'tuvN': 7, 'ghiN': 15, 'klmN': 15, 'nopN': 4, 'qrsN': 2, 'defN': 32}
357 297 {'wxyN': 6, 'abcN': 49, 'tuvN': 8, 'ghiN': 18, 'klmN': 16, 'nopN': 14, 'qrsN': 8, 'defN': 17}
358 46 {'wxyN': 3, 'abcN': 57, 'tuvN': 6, 'ghiN': 18, 'klmN': 17, 'nopN': 11, 'qrsN': 9, 'defN': 38}
359 3671 {'wxyN': 4, 'abcN': 48, 'tuvN': 5, 'ghiN': 15, 'klmN': 14, 'nopN': 13, 'qrsN': 7, 'defN': 27}
360 3517 {'wxyN': 10, 'abcN': 53, 'tuvN': 7, 'ghiN': 15, 'klmN': 13, 'nopN': 15, 'qrsN': 12, 'defN': 25}
361 0 {'wxyN': 3, 'abcN': 48, 'tuvN': 6, 'ghiN': 17, 'klmN': 9, 'nopN': 14, 'qrsN'

435 332 {'wxyN': 5, 'abcN': 48, 'tuvN': 4, 'ghiN': 12, 'klmN': 9, 'nopN': 5, 'qrsN': 10, 'defN': 21}
436 4785 {'wxyN': 3, 'abcN': 47, 'tuvN': 6, 'ghiN': 13, 'klmN': 5, 'nopN': 6, 'qrsN': 4, 'defN': 20}
437 5 {'wxyN': 7, 'abcN': 47, 'tuvN': 4, 'ghiN': 14, 'klmN': 8, 'nopN': 10, 'qrsN': 10, 'defN': 30}
438 2974 {'wxyN': 4, 'abcN': 41, 'tuvN': 4, 'ghiN': 11, 'klmN': 18, 'nopN': 7, 'qrsN': 9, 'defN': 17}
439 7 {'wxyN': 9, 'abcN': 37, 'tuvN': 7, 'ghiN': 23, 'klmN': 19, 'nopN': 6, 'qrsN': 7, 'defN': 22}
440 38 {'wxyN': 4, 'abcN': 37, 'tuvN': 7, 'ghiN': 13, 'klmN': 17, 'nopN': 9, 'qrsN': 5, 'defN': 14}
441 0 {'wxyN': 10, 'abcN': 51, 'tuvN': 7, 'ghiN': 21, 'klmN': 7, 'nopN': 12, 'qrsN': 5, 'defN': 22}
442 125 {'wxyN': 7, 'abcN': 47, 'tuvN': 6, 'ghiN': 18, 'klmN': 11, 'nopN': 5, 'qrsN': 9, 'defN': 15}
443 5 {'wxyN': 7, 'abcN': 46, 'tuvN': 8, 'ghiN': 8, 'klmN': 9, 'nopN': 7, 'qrsN': 7, 'defN': 24}
444 44 {'wxyN': 5, 'abcN': 47, 'tuvN': 9, 'ghiN': 12, 'klmN': 16, 'nopN': 4, 'qrsN': 6, 'defN': 14}

522 303 {'wxyN': 7, 'abcN': 42, 'tuvN': 6, 'ghiN': 9, 'klmN': 11, 'nopN': 5, 'qrsN': 10, 'defN': 22}
523 2511 {'wxyN': 5, 'abcN': 26, 'tuvN': 3, 'ghiN': 10, 'klmN': 10, 'nopN': 9, 'qrsN': 5, 'defN': 25}
524 770 {'wxyN': 6, 'abcN': 34, 'tuvN': 5, 'ghiN': 11, 'klmN': 12, 'nopN': 9, 'qrsN': 7, 'defN': 15}
526 299 {'wxyN': 4, 'abcN': 30, 'tuvN': 5, 'ghiN': 7, 'klmN': 7, 'nopN': 5, 'qrsN': 5, 'defN': 14}
527 528 {'wxyN': 4, 'abcN': 32, 'tuvN': 3, 'ghiN': 17, 'klmN': 7, 'nopN': 6, 'qrsN': 3, 'defN': 17}
528 409 {'wxyN': 2, 'abcN': 37, 'tuvN': 5, 'ghiN': 18, 'klmN': 5, 'nopN': 10, 'qrsN': 2, 'defN': 19}
529 429 {'wxyN': 5, 'abcN': 36, 'tuvN': 5, 'ghiN': 14, 'klmN': 13, 'nopN': 9, 'qrsN': 4, 'defN': 21}
530 1056 {'wxyN': 3, 'abcN': 35, 'tuvN': 7, 'ghiN': 13, 'klmN': 12, 'nopN': 8, 'qrsN': 5, 'defN': 14}
531 1 {'wxyN': 3, 'abcN': 28, 'tuvN': 7, 'ghiN': 8, 'klmN': 5, 'nopN': 5, 'qrsN': 6, 'defN': 17}
532 3 {'wxyN': 6, 'abcN': 32, 'tuvN': 2, 'ghiN': 19, 'klmN': 4, 'nopN': 10, 'qrsN': 8, 'defN': 1

613 22 {'wxyN': 2, 'abcN': 37, 'tuvN': 7, 'ghiN': 7, 'klmN': 11, 'nopN': 7, 'qrsN': 4, 'defN': 10}
614 471 {'wxyN': 3, 'abcN': 29, 'tuvN': 6, 'ghiN': 11, 'klmN': 1, 'nopN': 6, 'qrsN': 4, 'defN': 20}
615 3 {'wxyN': 9, 'abcN': 22, 'tuvN': 7, 'ghiN': 5, 'klmN': 10, 'nopN': 5, 'qrsN': 7, 'defN': 12}
616 519 {'wxyN': 2, 'abcN': 36, 'tuvN': 1, 'ghiN': 8, 'klmN': 6, 'nopN': 4, 'qrsN': 4, 'defN': 15}
617 234 {'wxyN': 6, 'abcN': 25, 'tuvN': 6, 'ghiN': 7, 'klmN': 9, 'nopN': 4, 'qrsN': 6, 'defN': 13}
618 0 {'wxyN': 1, 'abcN': 35, 'tuvN': 6, 'ghiN': 4, 'klmN': 7, 'nopN': 4, 'qrsN': 4, 'defN': 22}
619 6 {'wxyN': 2, 'abcN': 29, 'tuvN': 7, 'ghiN': 9, 'klmN': 8, 'nopN': 6, 'qrsN': 7, 'defN': 18}
620 974 {'wxyN': 7, 'abcN': 33, 'tuvN': 6, 'ghiN': 13, 'klmN': 11, 'nopN': 2, 'qrsN': 4, 'defN': 15}
621 6 {'wxyN': 3, 'abcN': 26, 'tuvN': 4, 'ghiN': 8, 'klmN': 3, 'nopN': 8, 'qrsN': 5, 'defN': 15}
622 12 {'wxyN': 4, 'abcN': 31, 'tuvN': 7, 'ghiN': 4, 'klmN': 9, 'nopN': 1, 'qrsN': 5, 'defN': 18}
623 1127 {'wxyN

708 562 {'wxyN': 5, 'abcN': 39, 'tuvN': 2, 'ghiN': 7, 'klmN': 8, 'nopN': 8, 'qrsN': 5, 'defN': 9}
709 1 {'wxyN': 1, 'abcN': 30, 'tuvN': 4, 'ghiN': 11, 'klmN': 6, 'nopN': 7, 'qrsN': 2, 'defN': 10}
710 1556 {'wxyN': 3, 'abcN': 37, 'tuvN': 7, 'ghiN': 11, 'klmN': 3, 'nopN': 6, 'qrsN': 7, 'defN': 11}
711 0 {'wxyN': 1, 'abcN': 27, 'tuvN': 2, 'ghiN': 10, 'klmN': 2, 'nopN': 1, 'qrsN': 2, 'defN': 12}
712 10 {'wxyN': 3, 'abcN': 23, 'tuvN': 3, 'ghiN': 8, 'klmN': 6, 'nopN': 3, 'qrsN': 4, 'defN': 12}
713 59 {'wxyN': 7, 'abcN': 30, 'tuvN': 4, 'ghiN': 9, 'klmN': 5, 'nopN': 3, 'qrsN': 4, 'defN': 12}
714 16 {'wxyN': 1, 'abcN': 23, 'tuvN': 5, 'ghiN': 9, 'klmN': 11, 'nopN': 4, 'qrsN': 3, 'defN': 8}
715 3288 {'wxyN': 2, 'abcN': 25, 'tuvN': 6, 'ghiN': 9, 'klmN': 6, 'nopN': 5, 'qrsN': 4, 'defN': 13}
716 166 {'wxyN': 2, 'abcN': 19, 'tuvN': 4, 'ghiN': 9, 'klmN': 7, 'nopN': 4, 'qrsN': 4, 'defN': 12}
717 471 {'wxyN': 4, 'abcN': 23, 'tuvN': 3, 'ghiN': 9, 'klmN': 14, 'nopN': 3, 'qrsN': 3, 'defN': 23}
719 2351 {'w

808 2 {'wxyN': 2, 'abcN': 29, 'tuvN': 5, 'ghiN': 10, 'klmN': 8, 'nopN': 2, 'qrsN': 3, 'defN': 13}
809 99 {'wxyN': 2, 'abcN': 32, 'tuvN': 6, 'ghiN': 8, 'klmN': 8, 'nopN': 5, 'qrsN': 5, 'defN': 9}
810 94 {'wxyN': 5, 'abcN': 27, 'tuvN': 2, 'ghiN': 5, 'klmN': 7, 'nopN': 8, 'qrsN': 3, 'defN': 12}
811 38 {'wxyN': 2, 'abcN': 19, 'tuvN': 4, 'ghiN': 13, 'klmN': 5, 'nopN': 6, 'qrsN': 5, 'defN': 11}
812 248 {'wxyN': 8, 'abcN': 22, 'tuvN': 5, 'ghiN': 9, 'klmN': 5, 'nopN': 2, 'qrsN': 3, 'defN': 9}
813 11 {'wxyN': 1, 'abcN': 24, 'tuvN': 4, 'ghiN': 6, 'klmN': 6, 'nopN': 4, 'qrsN': 4, 'defN': 12}
814 1 {'wxyN': 1, 'abcN': 16, 'tuvN': 3, 'ghiN': 10, 'klmN': 4, 'nopN': 2, 'qrsN': 3, 'defN': 9}
816 54 {'wxyN': 2, 'abcN': 23, 'tuvN': 7, 'ghiN': 8, 'klmN': 8, 'nopN': 2, 'qrsN': 6, 'defN': 10}
817 330 {'wxyN': 1, 'abcN': 25, 'tuvN': 2, 'ghiN': 6, 'klmN': 10, 'nopN': 2, 'qrsN': 5, 'defN': 16}
818 407 {'wxyN': 3, 'abcN': 28, 'tuvN': 2, 'ghiN': 11, 'klmN': 7, 'nopN': 2, 'qrsN': 3, 'defN': 14}
819 1 {'wxyN': 3,

907 36 {'wxyN': 2, 'abcN': 17, 'tuvN': 4, 'ghiN': 2, 'klmN': 4, 'nopN': 4, 'qrsN': 6, 'defN': 11}
908 27 {'wxyN': 3, 'abcN': 26, 'tuvN': 5, 'ghiN': 6, 'klmN': 6, 'nopN': 4, 'qrsN': 10, 'defN': 11}
909 2902 {'wxyN': 4, 'abcN': 21, 'tuvN': 5, 'ghiN': 10, 'klmN': 5, 'nopN': 2, 'qrsN': 7, 'defN': 9}
910 1550 {'wxyN': 2, 'abcN': 25, 'tuvN': 6, 'ghiN': 5, 'klmN': 7, 'nopN': 6, 'qrsN': 5, 'defN': 10}
911 6210 {'wxyN': 2, 'abcN': 22, 'tuvN': 4, 'ghiN': 12, 'klmN': 3, 'nopN': 6, 'qrsN': 2, 'defN': 11}
912 190 {'wxyN': 1, 'abcN': 18, 'tuvN': 2, 'ghiN': 6, 'klmN': 8, 'nopN': 6, 'qrsN': 4, 'defN': 8}
913 174 {'wxyN': 4, 'abcN': 17, 'tuvN': 1, 'ghiN': 7, 'klmN': 6, 'nopN': 1, 'qrsN': 1, 'defN': 10}
914 0 {'wxyN': 1, 'abcN': 16, 'tuvN': 3, 'ghiN': 9, 'klmN': 5, 'nopN': 4, 'qrsN': 7, 'defN': 10}
915 121 {'wxyN': 4, 'abcN': 16, 'tuvN': 6, 'ghiN': 7, 'klmN': 2, 'nopN': 4, 'qrsN': 4, 'defN': 12}
916 11 {'wxyN': 1, 'abcN': 18, 'tuvN': 2, 'ghiN': 4, 'klmN': 3, 'nopN': 3, 'qrsN': 4, 'defN': 11}
917 99 {'wx

1046 15 {'wxyN': 1, 'abcN': 18, 'tuvN': 7, 'ghiN': 6, 'klmN': 1, 'nopN': 6, 'qrsN': 2, 'defN': 7}
1047 6 {'wxyN': 2, 'abcN': 15, 'tuvN': 3, 'ghiN': 5, 'klmN': 3, 'nopN': 4, 'qrsN': 3, 'defN': 7}
1048 426 {'wxyN': 1, 'abcN': 13, 'tuvN': 2, 'ghiN': 3, 'klmN': 5, 'nopN': 3, 'qrsN': 3, 'defN': 10}
1049 260 {'wxyN': 3, 'abcN': 14, 'tuvN': 1, 'ghiN': 4, 'klmN': 3, 'nopN': 5, 'qrsN': 2, 'defN': 4}
1050 0 {'wxyN': 1, 'abcN': 15, 'tuvN': 4, 'ghiN': 2, 'klmN': 1, 'nopN': 1, 'qrsN': 1, 'defN': 9}
1052 50 {'wxyN': 4, 'abcN': 22, 'tuvN': 4, 'ghiN': 6, 'klmN': 4, 'nopN': 4, 'qrsN': 1, 'defN': 3}
1053 2 {'wxyN': 1, 'abcN': 30, 'tuvN': 3, 'ghiN': 10, 'klmN': 4, 'nopN': 3, 'qrsN': 1, 'defN': 8}
1055 979 {'wxyN': 2, 'abcN': 20, 'tuvN': 2, 'ghiN': 5, 'klmN': 8, 'nopN': 5, 'qrsN': 1, 'defN': 9}
1057 838 {'wxyN': 2, 'abcN': 14, 'tuvN': 4, 'ghiN': 2, 'klmN': 5, 'nopN': 4, 'qrsN': 3, 'defN': 9}
1058 1264 {'wxyN': 2, 'abcN': 12, 'tuvN': 2, 'ghiN': 4, 'klmN': 4, 'nopN': 3, 'qrsN': 2, 'defN': 9}
1061 170 {'wxyN

1192 0 {'wxyN': 2, 'abcN': 11, 'tuvN': 4, 'ghiN': 7, 'klmN': 5, 'nopN': 3, 'qrsN': 2, 'defN': 6}
1195 1588 {'wxyN': 1, 'abcN': 10, 'tuvN': 2, 'ghiN': 5, 'klmN': 6, 'nopN': 8, 'qrsN': 3, 'defN': 9}
1198 1067 {'wxyN': 2, 'abcN': 8, 'tuvN': 3, 'ghiN': 9, 'klmN': 4, 'nopN': 2, 'qrsN': 4, 'defN': 8}
1200 5210 {'wxyN': 4, 'abcN': 10, 'tuvN': 5, 'ghiN': 11, 'klmN': 1, 'nopN': 6, 'qrsN': 2, 'defN': 8}
1201 72 {'wxyN': 3, 'abcN': 8, 'tuvN': 2, 'ghiN': 7, 'klmN': 3, 'nopN': 4, 'qrsN': 1, 'defN': 5}
1204 61 {'wxyN': 2, 'abcN': 21, 'tuvN': 3, 'ghiN': 2, 'klmN': 6, 'nopN': 2, 'qrsN': 1, 'defN': 7}
1205 1 {'wxyN': 4, 'abcN': 14, 'tuvN': 1, 'ghiN': 9, 'klmN': 3, 'nopN': 5, 'qrsN': 4, 'defN': 7}
1207 11 {'wxyN': 1, 'abcN': 22, 'tuvN': 2, 'ghiN': 2, 'klmN': 5, 'nopN': 1, 'qrsN': 2, 'defN': 6}
1208 0 {'wxyN': 5, 'abcN': 13, 'tuvN': 2, 'ghiN': 2, 'klmN': 3, 'nopN': 3, 'qrsN': 1, 'defN': 13}
1210 7 {'wxyN': 2, 'abcN': 19, 'tuvN': 2, 'ghiN': 6, 'klmN': 2, 'nopN': 3, 'qrsN': 2, 'defN': 8}
1211 28 {'wxyN': 1

1349 58 {'wxyN': 3, 'abcN': 15, 'tuvN': 3, 'ghiN': 6, 'klmN': 4, 'nopN': 2, 'qrsN': 3, 'defN': 7}
1350 2 {'wxyN': 5, 'abcN': 13, 'tuvN': 1, 'ghiN': 4, 'klmN': 3, 'nopN': 5, 'qrsN': 2, 'defN': 7}
1355 0 {'wxyN': 2, 'abcN': 16, 'tuvN': 3, 'ghiN': 3, 'klmN': 2, 'nopN': 1, 'qrsN': 2, 'defN': 7}
1356 8 {'wxyN': 4, 'abcN': 7, 'tuvN': 8, 'ghiN': 4, 'klmN': 2, 'nopN': 3, 'qrsN': 1, 'defN': 9}
1360 21 {'wxyN': 2, 'abcN': 10, 'tuvN': 2, 'ghiN': 2, 'klmN': 4, 'nopN': 4, 'qrsN': 2, 'defN': 6}
1362 4992 {'wxyN': 3, 'abcN': 20, 'tuvN': 4, 'ghiN': 3, 'klmN': 4, 'nopN': 2, 'qrsN': 3, 'defN': 5}
1363 298 {'wxyN': 1, 'abcN': 10, 'tuvN': 3, 'ghiN': 8, 'klmN': 4, 'nopN': 2, 'qrsN': 1, 'defN': 7}
1366 369 {'wxyN': 1, 'abcN': 12, 'tuvN': 3, 'ghiN': 6, 'klmN': 4, 'nopN': 2, 'qrsN': 3, 'defN': 5}
1368 500 {'wxyN': 1, 'abcN': 19, 'tuvN': 5, 'ghiN': 2, 'klmN': 5, 'nopN': 1, 'qrsN': 2, 'defN': 3}
1369 1491 {'wxyN': 2, 'abcN': 14, 'tuvN': 1, 'ghiN': 5, 'klmN': 2, 'nopN': 1, 'qrsN': 3, 'defN': 6}
1370 106 {'wxyN':

1506 61 {'wxyN': 2, 'abcN': 18, 'tuvN': 5, 'ghiN': 4, 'klmN': 1, 'nopN': 5, 'qrsN': 5, 'defN': 7}
1507 446 {'wxyN': 1, 'abcN': 13, 'tuvN': 4, 'ghiN': 4, 'klmN': 2, 'nopN': 5, 'qrsN': 2, 'defN': 6}
1508 90 {'wxyN': 2, 'abcN': 17, 'tuvN': 3, 'ghiN': 7, 'klmN': 1, 'nopN': 2, 'qrsN': 2, 'defN': 4}
1509 4055 {'wxyN': 4, 'abcN': 8, 'tuvN': 3, 'ghiN': 1, 'klmN': 7, 'nopN': 1, 'qrsN': 1, 'defN': 8}
1513 6060 {'wxyN': 2, 'abcN': 15, 'tuvN': 1, 'ghiN': 4, 'klmN': 3, 'nopN': 1, 'qrsN': 3, 'defN': 6}
1514 622 {'wxyN': 1, 'abcN': 15, 'tuvN': 1, 'ghiN': 5, 'klmN': 4, 'nopN': 3, 'qrsN': 3, 'defN': 7}
1517 14 {'wxyN': 3, 'abcN': 11, 'tuvN': 4, 'ghiN': 5, 'klmN': 4, 'nopN': 2, 'qrsN': 1, 'defN': 8}
1519 8 {'wxyN': 2, 'abcN': 11, 'tuvN': 2, 'ghiN': 4, 'klmN': 1, 'nopN': 1, 'qrsN': 1, 'defN': 4}
1521 20 {'wxyN': 2, 'abcN': 19, 'tuvN': 1, 'ghiN': 4, 'klmN': 1, 'nopN': 3, 'qrsN': 1, 'defN': 8}
1523 17 {'wxyN': 1, 'abcN': 18, 'tuvN': 1, 'ghiN': 3, 'klmN': 4, 'nopN': 4, 'qrsN': 2, 'defN': 6}
1525 21 {'wxyN':

1713 6376 {'wxyN': 1, 'abcN': 16, 'tuvN': 2, 'ghiN': 1, 'klmN': 3, 'nopN': 4, 'qrsN': 4, 'defN': 7}
1714 9 {'wxyN': 3, 'abcN': 10, 'tuvN': 3, 'ghiN': 3, 'klmN': 1, 'nopN': 5, 'qrsN': 2, 'defN': 5}
1717 816 {'wxyN': 3, 'abcN': 16, 'tuvN': 3, 'ghiN': 7, 'klmN': 1, 'nopN': 2, 'qrsN': 2, 'defN': 7}
1718 394 {'wxyN': 1, 'abcN': 17, 'tuvN': 1, 'ghiN': 3, 'klmN': 2, 'nopN': 2, 'qrsN': 3, 'defN': 4}
1729 5235 {'wxyN': 2, 'abcN': 10, 'tuvN': 5, 'ghiN': 13, 'klmN': 6, 'nopN': 3, 'qrsN': 1, 'defN': 7}
1734 433 {'wxyN': 2, 'abcN': 11, 'tuvN': 2, 'ghiN': 1, 'klmN': 2, 'nopN': 2, 'qrsN': 1, 'defN': 5}
1735 200 {'wxyN': 2, 'abcN': 13, 'tuvN': 1, 'ghiN': 3, 'klmN': 2, 'nopN': 2, 'qrsN': 2, 'defN': 4}
1736 48 {'wxyN': 1, 'abcN': 12, 'tuvN': 1, 'ghiN': 5, 'klmN': 3, 'nopN': 6, 'qrsN': 2, 'defN': 5}
1739 5 {'wxyN': 1, 'abcN': 11, 'tuvN': 3, 'ghiN': 4, 'klmN': 3, 'nopN': 2, 'qrsN': 1, 'defN': 7}
1741 399 {'wxyN': 1, 'abcN': 14, 'tuvN': 4, 'ghiN': 3, 'klmN': 3, 'nopN': 2, 'qrsN': 2, 'defN': 6}
1744 0 {'wxy

1946 32 {'wxyN': 1, 'abcN': 8, 'tuvN': 2, 'ghiN': 3, 'klmN': 2, 'nopN': 1, 'qrsN': 2, 'defN': 4}
1950 92 {'wxyN': 3, 'abcN': 9, 'tuvN': 1, 'ghiN': 1, 'klmN': 4, 'nopN': 3, 'qrsN': 3, 'defN': 3}
1953 241 {'wxyN': 3, 'abcN': 10, 'tuvN': 1, 'ghiN': 7, 'klmN': 1, 'nopN': 1, 'qrsN': 2, 'defN': 7}
1954 97 {'wxyN': 1, 'abcN': 10, 'tuvN': 3, 'ghiN': 5, 'klmN': 2, 'nopN': 3, 'qrsN': 2, 'defN': 7}
1955 2 {'wxyN': 2, 'abcN': 11, 'tuvN': 3, 'ghiN': 3, 'klmN': 4, 'nopN': 2, 'qrsN': 1, 'defN': 7}
1965 208 {'wxyN': 1, 'abcN': 4, 'tuvN': 1, 'ghiN': 2, 'klmN': 1, 'nopN': 1, 'qrsN': 2, 'defN': 1}
1967 0 {'wxyN': 1, 'abcN': 6, 'tuvN': 3, 'ghiN': 5, 'klmN': 1, 'nopN': 2, 'qrsN': 4, 'defN': 4}
1968 0 {'wxyN': 3, 'abcN': 7, 'tuvN': 1, 'ghiN': 1, 'klmN': 4, 'nopN': 3, 'qrsN': 2, 'defN': 1}
1973 167 {'wxyN': 2, 'abcN': 12, 'tuvN': 3, 'ghiN': 4, 'klmN': 2, 'nopN': 2, 'qrsN': 2, 'defN': 6}
1975 27 {'wxyN': 3, 'abcN': 12, 'tuvN': 2, 'ghiN': 4, 'klmN': 2, 'nopN': 1, 'qrsN': 1, 'defN': 4}
1980 8 {'wxyN': 1, 'abcN'

2287 3 {'wxyN': 2, 'abcN': 14, 'tuvN': 1, 'ghiN': 3, 'klmN': 2, 'nopN': 2, 'qrsN': 4, 'defN': 5}
2291 0 {'wxyN': 1, 'abcN': 9, 'tuvN': 1, 'ghiN': 6, 'klmN': 1, 'nopN': 1, 'qrsN': 1, 'defN': 3}
2309 3 {'wxyN': 4, 'abcN': 7, 'tuvN': 2, 'ghiN': 2, 'klmN': 1, 'nopN': 6, 'qrsN': 6, 'defN': 2}
2320 21 {'wxyN': 1, 'abcN': 12, 'tuvN': 2, 'ghiN': 4, 'klmN': 2, 'nopN': 1, 'qrsN': 1, 'defN': 3}
2321 0 {'wxyN': 1, 'abcN': 10, 'tuvN': 1, 'ghiN': 6, 'klmN': 3, 'nopN': 2, 'qrsN': 2, 'defN': 4}
2328 4 {'wxyN': 1, 'abcN': 10, 'tuvN': 2, 'ghiN': 3, 'klmN': 5, 'nopN': 1, 'qrsN': 1, 'defN': 3}
2329 40 {'wxyN': 1, 'abcN': 6, 'tuvN': 1, 'ghiN': 3, 'klmN': 4, 'nopN': 1, 'qrsN': 1, 'defN': 5}
2338 18 {'wxyN': 1, 'abcN': 8, 'tuvN': 2, 'ghiN': 3, 'klmN': 2, 'nopN': 1, 'qrsN': 2, 'defN': 5}
2339 606 {'wxyN': 1, 'abcN': 8, 'tuvN': 2, 'ghiN': 5, 'klmN': 1, 'nopN': 3, 'qrsN': 1, 'defN': 3}
2340 2459 {'wxyN': 1, 'abcN': 7, 'tuvN': 1, 'ghiN': 1, 'klmN': 3, 'nopN': 3, 'qrsN': 2, 'defN': 5}
2349 30 {'wxyN': 1, 'abcN': 

2926 4 {'wxyN': 1, 'abcN': 4, 'tuvN': 1, 'ghiN': 2, 'klmN': 1, 'nopN': 1, 'qrsN': 2, 'defN': 1}
2932 7 {'wxyN': 1, 'abcN': 3, 'tuvN': 1, 'ghiN': 2, 'klmN': 1, 'nopN': 2, 'qrsN': 2, 'defN': 2}
2952 224 {'wxyN': 2, 'abcN': 5, 'tuvN': 2, 'ghiN': 5, 'klmN': 2, 'nopN': 2, 'qrsN': 2, 'defN': 4}
2969 105 {'wxyN': 1, 'abcN': 7, 'tuvN': 1, 'ghiN': 2, 'klmN': 1, 'nopN': 2, 'qrsN': 1, 'defN': 4}
2971 1104 {'wxyN': 1, 'abcN': 6, 'tuvN': 2, 'ghiN': 4, 'klmN': 2, 'nopN': 1, 'qrsN': 2, 'defN': 4}
2979 0 {'wxyN': 1, 'abcN': 5, 'tuvN': 2, 'ghiN': 3, 'klmN': 3, 'nopN': 1, 'qrsN': 2, 'defN': 2}
2984 2950 {'wxyN': 1, 'abcN': 8, 'tuvN': 1, 'ghiN': 1, 'klmN': 1, 'nopN': 1, 'qrsN': 1, 'defN': 7}
2987 5 {'wxyN': 1, 'abcN': 9, 'tuvN': 2, 'ghiN': 5, 'klmN': 2, 'nopN': 1, 'qrsN': 1, 'defN': 4}
2994 25 {'wxyN': 2, 'abcN': 2, 'tuvN': 1, 'ghiN': 2, 'klmN': 2, 'nopN': 1, 'qrsN': 1, 'defN': 6}
2997 454 {'wxyN': 2, 'abcN': 7, 'tuvN': 1, 'ghiN': 5, 'klmN': 2, 'nopN': 1, 'qrsN': 2, 'defN': 5}
3007 87 {'wxyN': 2, 'abcN':

7013 0 {'wxyN': 1, 'abcN': 5, 'tuvN': 1, 'ghiN': 1, 'klmN': 1, 'nopN': 1, 'qrsN': 1, 'defN': 3}
7084 970 {'wxyN': 1, 'abcN': 2, 'tuvN': 1, 'ghiN': 1, 'klmN': 1, 'nopN': 1, 'qrsN': 1, 'defN': 1}
7948 253 {'wxyN': 1, 'abcN': 4, 'tuvN': 1, 'ghiN': 1, 'klmN': 2, 'nopN': 1, 'qrsN': 1, 'defN': 1}
9728 1123 {'wxyN': 1, 'abcN': 4, 'tuvN': 2, 'ghiN': 1, 'klmN': 1, 'nopN': 2, 'qrsN': 1, 'defN': 1}


### Nombre de formes du tirage brut
Le nombre de formes du tirage brut correspond au nombre de tokens paramétré. Si ce nombre de formes est plus élevé que le nombre de formes calculé via le *ratio*, le tirage est réduit pour obtenir un nombre de formes de l'ordre de grandeur désiré.

In [13]:
# Number of form-cells drawn at least once: blank out the zero counts, then
# count what is left.
tiragesBruts = tirages[marksN]
tiragesBruts.where(tiragesBruts != 0).count().sum()

53084

## Réduction du nombre de types
Pour obtenir un nombre de type compatible avec l'ordre de grandeur fixé via *ratio*, on fixe un seuil de tokens pour inclure les formes dans le tirage.
- si le seuil est fixé à 3, par exemple, les formes ayant 3 attestations ou moins sont éliminées (`reduceTirages` remplace par NaN toutes les valeurs de 0 à *seuil* inclus)
- le seuil est calculé pour s'approcher de l'ordre de grandeur par le haut

Les lexèmes qui n'ont aucune forme dans l'échantillon sont éliminés.
- result=result.dropna(thresh=len(marks)+2+1) => un lexème qui a au moins une forme doit avoir ses 2 colonnes CF, P remplies ainsi que toutes les marques, len(marks), plus au moins une forme tirée (+1)

In [14]:
def reduceTirages(df,seuil):
    """Return a copy of ``df`` without the form-cells drawn ``seuil`` times or fewer.

    In the count columns (``marksN``) every value equal to 0, 1, ..., ``seuil``
    is replaced by NaN. Rows are then kept only if they hold at least
    ``len(marks) + 2 + 1`` non-NaN values. ``df`` itself is left untouched.
    """
    result = df.copy()
    counts = result[marksN]
    # Cells whose count is one of 0..seuil (inclusive) are considered too rare.
    tooRare = counts.isin(list(range(seuil + 1)))
    result[marksN] = counts.mask(tooRare)
    return result.dropna(thresh=len(marks) + 2 + 1)

In [15]:
# Look for the highest threshold that still leaves at least nbFormes types.
# The starting point is threshold 0 (only the cells that were never drawn are
# blanked out): keeping the raw frame here would leave 0-count cells in place,
# and the counts below would treat them as attested forms.
tiragesReduits=reduceTirages(tirages,0)
for seuil in range(1,nbFormes):
    essai=reduceTirages(tirages,seuil)
    # Stop at the first threshold that drops below the target and keep the
    # previous result, so the target is approached from above.
    if essai[marksN].count().sum()<nbFormes:
        break
    tiragesReduits=essai
# Single-argument print: same output as a statement or as a function call.
print(" ".join(["Nombre de types réduit pris en compte",str(tiragesReduits[marksN].count().sum())]))

Nombre de types réduit pris en compte 33442


In [16]:
#tiragesReduits

In [17]:
# Number of lexemes (rows) left in the reduced sample.
# Single-argument print: same output as a statement or as a function call.
print(" ".join(["Nombre de lexèmes dans l'échantillon", str(len(tiragesReduits))]))

Nombre de lexèmes dans l'échantillon 9467


## Ajouter un champ pour regex

In [18]:
def ajouterChampParadigme(x):
    """Build the paradigm pattern of one row.

    For each cell of ``marks``, in order, the pattern holds the cell's value
    when its count (``x[cell + "N"]``) is greater than 0 and the value is not
    NaN (a NaN is the only value that differs from itself); otherwise it
    holds ".".
    """
    return "".join(
        x[case] if (x[case + "N"] > 0 and x[case] == x[case]) else "."
        for case in marks
    )

In [19]:
# One pattern per lexeme: attested cells keep their value, the others become ".".
regexParLexeme = tiragesReduits.apply(ajouterChampParadigme, axis=1)
tiragesReduits["regex"] = regexParLexeme

In [20]:
# Preview the first rows only, with the lexeme ids exposed as an "index" column.
tiragesReduits.reset_index().head(10)

Unnamed: 0,index,CF,P,abc,def,ghi,klm,nop,qrs,tuv,wxy,abcN,defN,ghiN,klmN,nopN,qrsN,tuvN,wxyN,regex
0,0,1465,0.102170,c,e,i,l,p,r,t,y,18438.0,9436.0,6293.0,4581.0,3782.0,3142.0,2675.0,2277.0,ceilprty
1,1,6408,0.051085,a,e,g,m,n,q,v,y,9609.0,4697.0,3081.0,2275.0,1933.0,1670.0,1388.0,1230.0,aegmnqvy
2,2,4,0.034057,b,e,h,k,p,r,t,y,6160.0,3091.0,2115.0,1570.0,1249.0,1064.0,943.0,800.0,behkprty
3,3,1,0.025543,a,e,i,l,p,s,u,x,4607.0,2300.0,1571.0,1193.0,947.0,767.0,666.0,598.0,aeilpsux
4,4,266,0.020434,c,d,i,l,o,s,v,w,3726.0,1850.0,1242.0,920.0,737.0,620.0,520.0,466.0,cdilosvw
5,5,397,0.017028,c,f,g,m,o,q,u,w,3187.0,1584.0,1016.0,801.0,633.0,517.0,403.0,396.0,cfgmoquw
6,6,1889,0.014596,a,f,i,l,p,r,t,w,2714.0,1370.0,919.0,665.0,555.0,451.0,417.0,361.0,afilprtw
7,7,62,0.012771,b,f,i,l,o,q,t,w,2381.0,1201.0,821.0,562.0,460.0,403.0,317.0,300.0,bfiloqtw
8,8,0,0.011352,b,e,h,k,p,r,t,x,2116.0,1045.0,666.0,516.0,406.0,324.0,317.0,256.0,behkprtx
9,9,16,0.010217,b,e,i,m,o,s,v,w,1864.0,975.0,607.0,470.0,354.0,286.0,292.0,253.0,beimosvw


In [119]:
# Lexemes without any missing value, i.e. with every cell attested.
fullParadigms = tiragesReduits.dropna()
# Number of distinct patterns among those complete paradigms.
nbCompleteCF = fullParadigms["regex"].nunique()
# Single-argument prints: same output as statements or as function calls
# (the empty string gives the blank line).
print("")
print(" ".join(["Nombre de lexèmes avec un paradigme complet dans l'échantillon", str(len(fullParadigms))]))
print(" ".join(["Nombre de CF exemplaires pleines dans l'échantillon", str(nbCompleteCF)]))


Nombre de lexèmes avec un paradigme complet dans l'échantillon 915
Nombre de CF exemplaires pleines dans l'échantillon 488


In [114]:
# Distinct paradigm patterns found in the sample.
groupesParRegex = tiragesReduits.groupby("regex").groups
paradigmsGroups = groupesParRegex.keys()
# Single-argument print: same output as a statement or as a function call.
print(" ".join([str(len(paradigmsGroups)), str(paradigmsGroups)]))

3027 ['cdi.n...', 'bfhl.s..', 'ae.lpr..', 'afgk.quy', '.f.k....', '...kn...', 'aehm.r..', 'bfhlnrtx', 'cdil...w', 'cfh.n.v.', 'afh.....', 'cf.m....', 'b.gkn.vy', '.egm....', 'beg.ostw', 'ae.l...y', 'aehm.r.y', 'cfhl.q..', 'cdilp...', 'cdil....', 'a.hk..t.', 'aegl.s..', 'aeg.pqt.', 'afhmp.t.', 'af..or..', 'ad..p.vx', 'cegmn.tw', 'aeg..rvx', 'afgk.qu.', 'c.h.o...', 'cdhkosvy', '..g.ps..', 'aehkor..', 'afglpr..', 'cf.m...x', 'afh....x', 'ce....v.', 'behknr..', 'cdgm....', 'bdh...u.', 'cdiko.u.', 'beh...vy', 'aeh.or..', 'adi.p...', 'cfi.nr..', 'b.gm.s..', 'cfil.q..', 'a.h.n...', 'adhk.r.y', '..im...w', 'adglo.uy', 'c.gm..v.', 'bf..pr..', 'cf.ko...', 'af.l....', 'af..p.t.', 'a.g..r..', 'begloqvx', 'cfil....', 'adi.p..y', 'afgknrvx', 'bdim..u.', 'aeg.pr.x', 'cdiko.uy', 'cdgmns.y', 'af.mp.u.', 'bei..r..', 'afg..quy', 'afhlnqv.', 'ae....u.', 'bd.k...y', 'c..kn...', 'b.i.n.v.', 'c.gm..vw', '.dh.....', 'beglosty', '..i..su.', 'a.i.prty', 'bd....u.', 'aeilp.u.', 'a...o.t.', 'cdglo...', 'bdgk..v.'

In [104]:
# For each pattern, collect every pattern of the sample that it matches when
# read as a regular expression ("." accepts any character, including ".").
paradigmMappings = {}
for motif in paradigmsGroups:
    # Compile once per pattern instead of going through re.match for each pair.
    correspond = re.compile(motif).match
    compatibles = set(candidat for candidat in paradigmsGroups if correspond(candidat))
    # Only patterns with at least one match get an entry.
    if compatibles:
        paradigmMappings[motif] = compatibles
paradigmMappings

{'cdi.n...': {'cdi.n...',
  'cdikn..x',
  'cdiknqvy',
  'cdiknrvy',
  'cdiknstx',
  'cdilnqty',
  'cdilnqux',
  'cdilnsvx',
  'cdimnq..',
  'cdimnrvx'},
 'bfhl.s..': {'bfhl.s..', 'bfhlosvw'},
 'ae.lpr..': {'ae.lpr..',
  'aeglpruy',
  'aeglprvw',
  'aehlpr..',
  'aehlpruw',
  'aeilpr..',
  'aeilpru.',
  'aeilprux'},
 'afgk.quy': {'afgk.quy', 'afgknquy'},
 '.f.k....': {'.f.k....',
  '.f.kn...',
  '.f.kn.u.',
  '.fgkn...',
  'af.k....',
  'af.k...w',
  'af.k..u.',
  'af.k..uy',
  'af.k..vy',
  'af.k.rty',
  'af.k.s..',
  'af.kn...',
  'af.kn.u.',
  'af.knq..',
  'af.knq.y',
  'af.ko..w',
  'af.kp...',
  'af.kpr..',
  'af.kpsvy',
  'afgk....',
  'afgk...w',
  'afgk...x',
  'afgk...y',
  'afgk..u.',
  'afgk..uy',
  'afgk..v.',
  'afgk.q..',
  'afgk.q.y',
  'afgk.qu.',
  'afgk.quy',
  'afgk.s..',
  'afgk.svx',
  'afgkn...',
  'afgkn..y',
  'afgkn.u.',
  'afgkn.uy',
  'afgknq..',
  'afgknq.y',
  'afgknqu.',
  'afgknquy',
  'afgknrvx',
  'afgknsux',
  'afgkoq..',
  'afgkoqux',
  'afgkor..',
  

In [105]:
# Directed graph that receives the links between paradigm patterns
# (edges are added by the following cells).
paradigmMap=nx.DiGraph()

In [120]:
# Level 0 holds, for each pattern, the set of patterns it matches.
sampleCF = []
mapLevels = {0: paradigmMappings, 1: {}}
# A pattern that matches nothing but itself is an exemplary CF: link it to the "CF" node.
for motif, compatibles in mapLevels[0].items():
    if len(compatibles) == 1 and motif in compatibles:
        sampleCF.append(motif)
        paradigmMap.add_edge(motif, "CF")
# Level 1: the same mapping with the exemplary CF removed from every entry.
exemplaires = set(sampleCF)
for motif, compatibles in mapLevels[0].items():
    mapLevels[1][motif] = [c for c in compatibles if c not in exemplaires]
# Single-argument print: same output as a statement or as a function call.
print(" ".join([
    "Nombre de CF exemplaires dans l'échantillon", str(len(sampleCF)),
    "complètes", str(nbCompleteCF),
    "et partielles", str(len(sampleCF) - nbCompleteCF),
]))

Nombre de CF exemplaires dans l'échantillon 758 complètes 488 et partielles 270


In [93]:
def graphTreeCF(level):
    """Resolve one level of the pattern hierarchy and prepare the next one.

    Reads ``mapLevels[level]`` (pattern -> list of patterns still attached to
    it). A pattern whose list holds exactly one item is resolved at this
    level: an edge is added in ``paradigmMap`` from it to every other pattern
    listed for it in ``mapLevels[level - 1]``. ``mapLevels[level + 1]`` is then
    filled with the same lists minus the patterns resolved here.

    Returns True when at least one pattern still has several items attached,
    i.e. when another level is needed. Entries that are already empty were
    resolved at an earlier level and do not count as pending; counting them
    would make the function report pending work forever.
    """
    more=False
    treeCF=set()  # patterns resolved at this level (set: fast membership tests)
    for p,remaining in mapLevels[level].items():
        if len(remaining)==1:
            treeCF.add(p)
            for c in mapLevels[level-1][p]:
                if c!=p:
                    paradigmMap.add_edge(p,c)
        elif len(remaining)>1:
            more=True
    mapLevels[level+1]={}
    for p,remaining in mapLevels[level].items():
        mapLevels[level+1][p]=[c for c in remaining if c not in treeCF]
    return more

In [94]:
# Only level 1 is processed here; the value returned by graphTreeCF is not used.
for niveau in range(1, 2):
    graphTreeCF(niveau)

In [112]:
# Adjacency mapping {node: {successor: edge attributes}}. `adj` is used instead
# of `edge`, which was removed from graphs in NetworkX 2.0.
paradigmMap.adj

{'CF': {},
 'a.gkn.uw': {'CF': {}},
 'a.h.ortw': {'CF': {}},
 'a.ilnq.x': {'CF': {}},
 'ad..p.vx': {'CF': {}},
 'ad.lorv.': {'CF': {}},
 'ad.losty': {'CF': {}},
 'adg.nr.x': {'CF': {}},
 'adg.ostx': {'CF': {}},
 'adgk.suy': {'CF': {}},
 'adgknsvw': {'CF': {}},
 'adgknsvx': {'CF': {}},
 'adgkoruw': {'CF': {}},
 'adgkorvx': {'CF': {}},
 'adgkost.': {'CF': {}},
 'adgkosux': {'CF': {}},
 'adgkpquy': {'CF': {}},
 'adgkprtw': {'CF': {}},
 'adglo.uy': {'CF': {}},
 'adgloqtw': {'CF': {}},
 'adglort.': {'CF': {}},
 'adgmn.v.': {'CF': {}},
 'adgmnrtw': {'CF': {}},
 'adgmnsty': {'CF': {}},
 'adgmortw': {'CF': {}},
 'adgmos.y': {'CF': {}},
 'adgmostw': {'CF': {}},
 'adgmosux': {'CF': {}},
 'adgmpquw': {'CF': {}},
 'adh.pqvw': {'CF': {}},
 'adhk.rty': {'CF': {}},
 'adhk.rv.': {'CF': {}},
 'adhkn.tw': {'CF': {}},
 'adhknruw': {'CF': {}},
 'adhknrux': {'CF': {}},
 'adhkoqtx': {'CF': {}},
 'adhkorux': {'CF': {}},
 'adhkosuy': {'CF': {}},
 'adhkpquw': {'CF': {}},
 'adhkpqux': {'CF': {}},
 'adhkpqvy': {

In [113]:
# Export the graph in GEXF format; the relative path resolves against the
# current working directory.
nx.write_gexf(paradigmMap, "paradigmMap.gexf")