In [1]:
import pandas as pd
import numpy as np
import itertools as it
import random as rd
import networkx as nx
import re
debug=False

In [2]:
%matplotlib

Using matplotlib backend: MacOSX


In [3]:
def probElements(elements):
    """Normalise a frequency mapping into a probability mapping.

    Parameters
    ----------
    elements : dict
        Maps each key to a numeric frequency.

    Returns
    -------
    dict
        Same keys, each value divided by the sum of all frequencies.
        An empty input yields an empty dict.

    Raises
    ------
    ZeroDivisionError
        If the mapping is non-empty and its frequencies sum to zero.
    """
    # values()/items() exist on both Python 2 and Python 3 dicts,
    # unlike the Python-2-only iteritems().
    sumFreq=sum(elements.values())
    return {k:float(v)/sumFreq for k,v in elements.items()}
    
def distElements(elements):
    """Return rank-based probabilities (weight 1/rank) for `elements`.

    Only the number of elements matters: the i-th entry (1-based rank)
    receives a weight 1/i, and the weights are normalised to sum to 1.
    An empty input yields an empty list.
    """
    poids=[]
    for rang in range(len(elements)):
        poids.append(1.0/(rang+1))
    total=sum(poids)
    return [w/total for w in poids]

## Paramétrage du paradigme
Les marks définissent les cases du paradigme
- le nom de la case correspond à la concaténation des valeurs possibles
 - abc correspond à une case qui peut prendre comme valeur a, b ou c
 
Les classes sont calculées en faisant le produit cartésien des différentes valeurs x cases

In [4]:
# Each mark names a paradigm cell; its characters are the possible values.
marks=["abc","def","ghi","klm","nop","qrs","tuv","wxy"]
# Companion column names holding the token counts of each cell.
marksN=[m+"N" for m in marks]
distMarks=distElements(marksN)

# One row per inflection class: the cartesian product of the cell values.
# The frame is built in a single call instead of appending one row at a
# time, which copies the whole frame on every iteration.
classes=pd.DataFrame(list(it.product(*marks)),columns=marks)

nbClasses=len(classes)
# Shuffle the classes so that their rank is unrelated to the product order.
classes = classes.sample(frac=1).reset_index(drop=True)
#classes

In [5]:
nbGrammaireCF=len(classes)

## Paramètres du lexique
On fixe le nombre de lexèmes présents dans le corpus d'apprentissage et le ratio de formes attestées
- nombre de lexèmes
- ratio des formes-cases attestées par rapport au potentiel complet
 - 25% correspond au ratio observé pour les verbes dans Lex3
- nbFormes est l'ordre de grandeur à respecter pour l'échantillon en nombre de types

In [6]:
ratio=0.25
nbLexemes=5000

nbFormes=int(nbLexemes*len(marks)*ratio)
nbFormes

10000

## Distribution des lexèmes et des CF
Les différents éléments suivent des distributions par Zipf.
- distElements renvoie une liste de probabilités correspondant au nombre d'éléments suivant une Zipf(x)

Chaque lexème a une fréquence de lemme qui correspond à son rang et reçoit une CF tirée au hasard suivant une loi de Zipf.
- np.random.choice(cfs,p=distCFs)

In [7]:
cfs=range(nbClasses)
distCFs=distElements(cfs)

lexs=range(nbLexemes)
distLexs=distElements(lexs)

# Draw the inflection class of every lexeme in one call, then assemble the
# lexicon in one shot (appending row by row re-copies the frame each time).
tiragesCF=np.random.choice(cfs,size=nbLexemes,replace=True,p=distCFs)
lexemes=classes.iloc[tiragesCF].reset_index(drop=True)
# Keep the original column order: CF, P, then one column per mark.
lexemes.insert(0,"CF",tiragesCF.astype(int))
lexemes.insert(1,"P",distLexs)
#lexemes.index+=1

In [8]:
lexemes

Unnamed: 0,CF,P,abc,def,ghi,klm,nop,qrs,tuv,wxy
0,405,0.109956,b,d,g,k,p,r,t,x
1,40,0.054978,c,e,g,k,n,q,t,w
2,15,0.036652,a,e,h,k,n,s,t,w
3,64,0.027489,a,f,i,l,p,s,t,x
4,93,0.021991,a,d,i,k,n,q,u,y
5,2,0.018326,a,f,h,l,o,q,v,y
6,13,0.015708,b,f,i,m,p,q,t,y
7,1870,0.013745,a,e,h,l,o,s,u,x
8,14,0.012217,a,f,g,k,o,q,t,x
9,592,0.010996,b,f,g,k,o,q,t,w


In [9]:
print "Nombre de types potentiel",len(lexemes.groupby(marks)[["CF"]].count())

Nombre de types potentiel 1589


## Nombre de CF tirées dans le lexique
Ce nombre représente le maximum de CF qui pourraient être nécessaires pour la description.
Le nombre de CF nécessaire est au moins égal au nombre de CF qui possèdent un paradigme exemplaire, mais les CF qui n'ont qu'une représentation partielle de leur paradigme ne sont pas obligatoirement nécessaires à la description.

In [10]:
nbLexiqueCF=len(lexemes.groupby("CF").groups.keys())
print nbLexiqueCF,"CF dans le lexique sur",nbGrammaireCF,"CF dans la grammaire"

1589 CF dans le lexique sur 6561 CF dans la grammaire


## Constitution du DF pour le tirage

In [11]:
tiragesColonnes=pd.DataFrame(0, index=np.arange(len(lexs)), columns=marksN)
tirages=pd.concat([lexemes, tiragesColonnes], axis=1, sort=False)
#tirages

### Tirage des lexèmes pour les formes attestées
- lexTirs contient le nombre de tokens à tirer pour chaque lexème

=> changer le random.choice pour faire le choix en une seule fois et ensuite calculer les tirs pour chaque lexème

In [12]:
#tirages=pd.DataFrame(0, index=np.arange(len(lexs)), columns=marks)
#tirages.index+=1

nbTokens=5000000
# Draw all tokens at once, then count the occurrences of each lexeme with
# numpy instead of a Python loop over every single draw.
listeTirs=np.random.choice(lexs,nbTokens,replace=True,p=distLexs)
valeursTirees,effectifsTires=np.unique(listeTirs,return_counts=True)
# lexTirs maps each drawn lexeme to its number of tokens; lexemes that were
# never drawn are absent, as before.
lexTirs=dict(zip(valeursTirees.tolist(),effectifsTires.tolist()))
#lexTirs

In [13]:
newLexFreq=probElements(lexTirs)

In [14]:
for l in newLexFreq:
    print l,"%0.2f"%((newLexFreq[l]-distLexs[l])/distLexs[l]*100)

0 -0.02
1 -0.36
2 -0.30
3 0.16
4 0.28
5 -0.33
6 -0.30
7 0.13
8 -0.94
9 0.30
10 -0.17
11 0.81
12 0.96
13 -0.07
14 0.53
15 0.59
16 0.18
17 0.39
18 -0.14
19 -0.31
20 -0.29
21 0.16
22 0.09
23 0.27
24 0.73
25 -1.01
26 0.01
27 1.15
28 0.36
29 -0.02
30 0.81
31 0.22
32 -0.01
33 0.03
34 0.46
35 -0.99
36 -0.07
37 -0.69
38 -0.15
39 0.77
40 0.51
41 -1.30
42 -0.08
43 0.61
44 -1.16
45 0.23
46 -0.97
47 -0.47
48 -0.88
49 1.62
50 -0.04
51 -1.33
52 -0.43
53 1.18
54 -0.43
55 -2.01
56 -0.11
57 0.13
58 -0.59
59 0.77
60 1.73
61 -0.75
62 -1.23
63 -1.17
64 -0.01
65 -0.18
66 -0.31
67 0.01
68 -0.48
69 -0.52
70 -0.37
71 -2.04
72 -0.48
73 0.95
74 0.44
75 0.24
76 1.39
77 1.31
78 0.30
79 -0.40
80 0.57
81 0.02
82 -0.15
83 -1.38
84 1.70
85 0.39
86 -1.83
87 -0.20
88 -0.05
89 1.45
90 0.22
91 0.27
92 -1.91
93 -0.66
94 -1.56
95 0.77
96 -0.30
97 0.14
98 1.87
99 -0.91
100 -0.26
101 1.47
102 -1.61
103 -0.54
104 1.32
105 0.70
106 1.38
107 0.42
108 -1.62
109 -0.66
110 -0.95
111 0.60
112 0.10
113 0.40
114 -1.21
115 0.41
116 -1

1464 2.86
1465 2.39
1466 2.20
1467 -3.34
1468 -2.47
1469 1.34
1470 8.90
1471 -2.01
1472 3.15
1473 3.49
1474 -10.39
1475 5.78
1476 10.68
1477 -2.95
1478 -2.35
1479 4.18
1480 -1.41
1481 -8.35
1482 -5.86
1483 -7.96
1484 -0.33
1485 3.79
1486 2.78
1487 -6.08
1488 -0.60
1489 7.05
1490 -8.88
1491 5.84
1492 -3.60
1493 2.18
1494 -0.48
1495 -5.58
1496 -0.34
1497 -4.36
1498 3.61
1499 2.31
1500 1.02
1501 -7.66
1502 1.97
1503 -7.26
1504 7.03
1505 6.28
1506 -1.87
1507 -5.10
1508 0.18
1509 2.45
1510 3.61
1511 -8.42
1512 1.00
1513 5.75
1514 3.89
1515 3.40
1516 6.51
1517 4.37
1518 -5.78
1519 2.85
1520 13.15
1521 -3.66
1522 -7.48
1523 1.73
1524 -0.42
1525 2.98
1526 -8.62
1527 13.95
1528 2.90
1529 -0.93
1530 -1.14
1531 3.38
1532 13.49
1533 -6.53
1534 -5.07
1535 7.56
1536 2.60
1537 0.43
1538 4.41
1539 2.80
1540 0.34
1541 9.95
1542 -15.80
1543 -12.66
1544 4.26
1545 -3.83
1546 13.68
1547 4.18
1548 6.78
1549 -0.20
1550 -4.65
1551 2.47
1552 15.81
1553 4.58
1554 0.41
1555 -13.11
1556 -0.03
1557 2.02
1558 1.52


2404 4.99
2405 0.22
2406 8.58
2407 -4.52
2408 -10.17
2409 4.33
2410 3.93
2411 4.42
2412 0.95
2413 -5.60
2414 1.47
2415 8.10
2416 6.39
2417 3.80
2418 6.04
2419 -2.28
2420 -0.92
2421 -11.01
2422 8.42
2423 5.38
2424 -2.52
2425 8.11
2426 18.31
2427 -6.82
2428 -5.01
2429 -15.58
2430 -1.84
2431 -3.12
2432 4.88
2433 2.27
2434 1.87
2435 -6.51
2436 -1.15
2437 -1.55
2438 -2.40
2439 -4.14
2440 -0.10
2441 -8.06
2442 -0.46
2443 12.02
2444 3.62
2445 -4.35
2446 5.93
2447 0.63
2448 0.67
2449 -0.62
2450 -1.48
2451 4.81
2452 -11.21
2453 6.68
2454 -9.35
2455 4.09
2456 -5.26
2457 14.01
2458 3.32
2459 1.57
2460 1.61
2461 1.21
2462 -0.54
2463 -4.99
2464 0.88
2465 -11.64
2466 2.76
2467 1.45
2468 3.74
2469 -5.65
2470 -9.66
2471 -6.93
2472 -1.94
2473 4.40
2474 5.79
2475 -13.08
2476 -8.54
2477 -3.09
2478 -8.92
2479 4.65
2480 1.99
2481 -10.61
2482 4.78
2483 -15.06
2484 14.36
2485 -4.14
2486 -14.96
2487 5.44
2488 7.75
2489 1.45
2490 4.66
2491 0.63
2492 -0.69
2493 5.24
2494 -11.96
2495 1.70
2496 10.82
2497 -6.86
2

3941 3.97
3942 -4.61
3943 9.04
3944 8.35
3945 -16.02
3946 -1.64
3947 -5.21
3948 -10.21
3949 4.18
3950 9.23
3951 -0.08
3952 -14.44
3953 6.44
3954 12.94
3955 -22.29
3956 -7.87
3957 6.55
3958 5.86
3959 -1.32
3960 -4.18
3961 17.47
3962 16.05
3963 -5.55
3964 -17.78
3965 -0.45
3966 -5.48
3967 -3.29
3968 13.34
3969 -0.35
3970 -3.94
3971 -5.36
3972 0.45
3973 -8.92
3974 -5.29
3975 -6.71
3976 1.27
3977 -12.45
3978 2.05
3979 7.86
3980 -8.04
3981 18.06
3982 -5.09
3983 0.73
3984 1.48
3985 0.05
3986 -9.35
3987 -3.52
3988 18.27
3989 -0.57
3990 -14.34
3991 16.18
3992 11.85
3993 -19.36
3994 -8.44
3995 0.30
3996 -14.21
3997 -1.10
3998 -1.80
3999 4.04
4000 -13.40
4001 1.91
4002 -3.89
4003 12.88
4004 -9.67
4005 1.28
4006 -0.88
4007 1.33
4008 -3.75
4009 15.97
4010 -0.78
4011 5.08
4012 -2.19
4013 3.68
4014 0.05
4015 3.73
4016 -23.28
4017 2.32
4018 0.88
4019 12.60
4020 -7.11
4021 2.42
4022 3.91
4023 -4.85
4024 -18.74
4025 -1.14
4026 -4.78
4027 4.77
4028 -4.73
4029 -17.17
4030 -9.08
4031 -3.19
4032 13.70
4033

### Tirage des formes-cases
- cellTirs contient le nombre de fois que chaque forme-case a été tirée

In [15]:
# For every drawn lexeme, distribute its tokens over the paradigm cells and
# store the per-cell counts in the matching *N columns of `tirages`.
for l in lexTirs:
    # cellTirs: cell column name -> number of tokens drawn for this lexeme
    cellTirs={}
    # One draw per token of the lexeme, weighted by distMarks.
    listeCases=np.random.choice(marksN,lexTirs[l],replace=True,p=distMarks)
    for c in listeCases:
        if not c in cellTirs:
            cellTirs[c]=0
        cellTirs[c]+=1
    # Debug trace for lexemes attested in every cell.
    # NOTE(review): 8 is hard-coded; it equals len(marks) for the marks defined above.
    if len(cellTirs)==8 and debug:
        print l,tirages.loc[l,"CF"], cellTirs
    # Cells that were never drawn keep the initial 0 of `tirages`.
    for c in cellTirs:
        tirages.loc[l,c]=cellTirs[c]
    

### Nombre de formes du tirage brut
Le nombre de formes du tirage brut correspond au nombre de tokens paramétré. Si ce nombre est plus élevé que le nombre de formes calculé via le *ratio*, le tirage est réduit pour obtenir un nombre de formes de l'ordre de grandeur désiré.

In [16]:
tirages[marksN].replace(0,np.nan).count().sum()

39982

## Nouvelles fréquences des cases

In [17]:
newDistMarks=probElements(tirages[marksN].sum().to_dict())
distMarks,newDistMarks

([0.3679369250985546,
  0.1839684625492773,
  0.12264564169951818,
  0.09198423127463864,
  0.07358738501971092,
  0.06132282084975909,
  0.05256241787122208,
  0.04599211563731932],
 {'abcN': 0.367998,
  'defN': 0.1841382,
  'ghiN': 0.122625,
  'klmN': 0.0920338,
  'nopN': 0.073578,
  'qrsN': 0.0611246,
  'tuvN': 0.05273,
  'wxyN': 0.0457724})

## Réduction du nombre de types
Pour obtenir un nombre de type compatible avec l'ordre de grandeur fixé via *ratio*, on fixe un seuil de tokens pour inclure les formes dans le tirage.
- si le seuil est fixé à 3, par exemple, les formes ayant 3 attestations ou moins sont éliminées (reduceTirages remplace par NaN les effectifs de 0 à seuil inclus)
- le seuil est calculé pour s'approcher de l'ordre de grandeur par le haut

Les lexèmes qui n'ont aucune forme dans l'échantillon sont éliminés.
- result=result.dropna(thresh=len(marks)+2+1) => un lexème qui a au moins une forme doit avoir ses 2 colonnes CF, P remplies ainsi que toutes les marques, len(marks), plus au moins une forme tirée (+1)

In [18]:
def reduceTirages(df,seuil):
    """Drop the forms attested at most `seuil` times, then the empty lexemes.

    Parameters
    ----------
    df : pandas.DataFrame
        Draw table holding the count columns listed in `marksN`.
    seuil : int
        Counts from 0 to `seuil` inclusive are replaced by NaN.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of `df`; the input frame is left untouched.
    """
    result=df.copy()
    effectifs=result[marksN]
    # Single vectorised mask instead of one replace() pass per value in
    # 0..seuil; equivalent for the non-negative integer counts stored here.
    result[marksN]=effectifs.where(effectifs>seuil)
    # Keep rows with CF, P and every mark filled in, plus at least one
    # remaining count.
    result=result.dropna(thresh=len(marks)+2+1)
    return result

In [19]:
# Default: keep the raw draw if no reduction is needed.
tiragesReduits=tirages.copy()
# Raise the threshold until the number of remaining forms falls below nbFormes.
for i in range(nbFormes):
    if reduceTirages(tirages,i)[marksN].count().sum()<nbFormes:
        break
# `i` is the first threshold that undershoots nbFormes, so i-1 is the last one
# that still yields at least nbFormes forms.
# NOTE(review): if the loop never breaks, `i` is simply the last value tried.
if i>0:
    tiragesReduits=reduceTirages(tirages,i-1)
print "Nombre de types réduit pris en compte",tiragesReduits[marksN].count().sum()

Nombre de types réduit pris en compte 10163


In [20]:
#tiragesReduits

In [21]:
print "Nombre de lexèmes dans l'échantillon",len(tiragesReduits)

Nombre de lexèmes dans l'échantillon 3734


## Ajouter un champ pour regex

In [22]:
def ajouterChampParadigme(x):
    """Build the paradigm pattern of one row of the draw table.

    For each mark, the pattern holds the row's value for that cell when the
    cell count is positive and the value is not NaN, and "." otherwise.
    """
    symboles=[]
    for c in marks:
        # x[c]==x[c] is False only for NaN.
        atteste=x[c+"N"]>0 and x[c]==x[c]
        symboles.append(x[c] if atteste else ".")
    return "".join(symboles)

In [23]:
tiragesReduits["regex"]=tiragesReduits.apply(ajouterChampParadigme,axis=1)

In [24]:
tiragesReduits[marksN]=tiragesReduits[marksN].replace({0:np.nan})

In [25]:
fullParadigms=tiragesReduits.dropna()
print
print "Nombre de lexèmes avec un paradigme complet dans l'échantillon",len(fullParadigms)
fullParadigms


Nombre de lexèmes avec un paradigme complet dans l'échantillon 448


Unnamed: 0,CF,P,abc,def,ghi,klm,nop,qrs,tuv,wxy,abcN,defN,ghiN,klmN,nopN,qrsN,tuvN,wxyN,regex
0,405,0.109956,b,d,g,k,p,r,t,x,201811.0,101399.0,67338.0,50525.0,40370.0,33663.0,29093.0,25462.0,bdgkprtx
1,40,0.054978,c,e,g,k,n,q,t,w,101009.0,50380.0,33585.0,25114.0,20066.0,16740.0,14509.0,12485.0,cegknqtw
2,15,0.036652,a,e,h,k,n,s,t,w,66954.0,33830.0,22348.0,16837.0,13580.0,11128.0,9676.0,8354.0,aehknstw
3,64,0.027489,a,f,i,l,p,s,t,x,50644.0,25116.0,17026.0,12810.0,10191.0,8353.0,7229.0,6300.0,afilpstx
4,93,0.021991,a,d,i,k,n,q,u,y,40481.0,20353.0,13534.0,10241.0,8115.0,6612.0,5862.0,5062.0,adiknquy
5,2,0.018326,a,f,h,l,o,q,v,y,33411.0,16920.0,11138.0,8412.0,6831.0,5583.0,4715.0,4320.0,afhloqvy
6,13,0.015708,b,f,i,m,p,q,t,y,29010.0,14479.0,9420.0,7109.0,5795.0,4774.0,4180.0,3541.0,bfimpqty
7,1870,0.013745,a,e,h,l,o,s,u,x,25212.0,12716.0,8450.0,6276.0,5158.0,4225.0,3605.0,3170.0,aehlosux
8,14,0.012217,a,f,g,k,o,q,t,x,22379.0,10961.0,7333.0,5588.0,4513.0,3649.0,3197.0,2892.0,afgkoqtx
9,592,0.010996,b,f,g,k,o,q,t,w,20265.0,10215.0,6769.0,5046.0,3953.0,3416.0,2926.0,2552.0,bfgkoqtw


In [26]:
nbCompleteCF=len(fullParadigms.groupby("regex"))
print "Nombre de CF exemplaires pleines dans l'échantillon",nbCompleteCF

Nombre de CF exemplaires pleines dans l'échantillon 260


In [27]:
#fullParadigms.groupby("regex").count()[["P"]].sort_values("P")

In [28]:
paradigmsCounts=tiragesReduits.groupby(["regex"]).count()
print len(paradigmsCounts)
paradigmsCounts=paradigmsCounts["P"].sort_values().to_dict()

593


In [29]:
paradigmsGroups=tiragesReduits.groupby(["regex"]).groups.keys()
print len(paradigmsGroups)

593


In [30]:
# Map every paradigm pattern to the set of patterns it matches when used as
# a regular expression ("." matches any single character).
paradigmMappings={}
for p in paradigmsGroups:
    motif=re.compile(p)
    for cfRegex in paradigmsGroups:
        if motif.match(cfRegex):
            paradigmMappings.setdefault(p,set()).add(cfRegex)
paradigmMappings

{'a.......': {'a.......',
  'ad......',
  'ad.k....',
  'adg.....',
  'adgk....',
  'adgkn.v.',
  'adgknq..',
  'adgknqt.',
  'adgknrux',
  'adgkns..',
  'adgkos..',
  'adgkosvw',
  'adgkp...',
  'adgkpqvw',
  'adgl....',
  'adglnrvx',
  'adgloqvx',
  'adglp...',
  'adglps..',
  'adglpst.',
  'adglpstx',
  'adgmn...',
  'adgmnqvx',
  'adgmnrtw',
  'adgmnruy',
  'adgmort.',
  'adgmosvx',
  'adh.....',
  'adh.o...',
  'adh.p...',
  'adhk....',
  'adhknrvx',
  'adhknst.',
  'adhkosuy',
  'adhkp...',
  'adhkpq..',
  'adhkpqt.',
  'adhl.q..',
  'adhlosuy',
  'adhlp...',
  'adhlpruw',
  'adhm....',
  'adhm.q..',
  'adhmn.t.',
  'adhmo...',
  'adhmorvw',
  'adhmosvy',
  'adhmpquy',
  'adi.....',
  'adik....',
  'adikn...',
  'adiknqu.',
  'adiknquy',
  'adikosux',
  'adikp...',
  'adikprvw',
  'adikpstw',
  'adikpsuw',
  'adikpsux',
  'adil....',
  'adil..t.',
  'adiln...',
  'adilnqty',
  'adilns..',
  'adilnst.',
  'adilnstx',
  'adilosty',
  'adilosu.',
  'adilpqvw',
  'adilpruy',
  'adim.

## Trouver les paradigmes restants

In [31]:
# A pattern that matches nothing but itself is a "top"; keep its count.
paradigmTops={p:paradigmsCounts[p] for p,cibles in paradigmMappings.items() if len(cibles)==1}
len(paradigmTops),paradigmTops

(325,
 {'adgkn.v.': 1,
  'adgknqt.': 1,
  'adgknrux': 1,
  'adgkns..': 1,
  'adgkosvw': 2,
  'adgkpqvw': 2,
  'adglnrvx': 1,
  'adgloqvx': 1,
  'adglpstx': 2,
  'adgmnqvx': 1,
  'adgmnrtw': 1,
  'adgmnruy': 1,
  'adgmort.': 1,
  'adgmosvx': 1,
  'adhknrvx': 1,
  'adhknst.': 1,
  'adhkosuy': 1,
  'adhkpqt.': 1,
  'adhl.q..': 1,
  'adhlosuy': 1,
  'adhlpruw': 1,
  'adhmn.t.': 1,
  'adhmorvw': 2,
  'adhmosvy': 2,
  'adhmpquy': 1,
  'adiknquy': 1,
  'adikosux': 1,
  'adikprvw': 1,
  'adikpstw': 1,
  'adikpsuw': 1,
  'adikpsux': 1,
  'adilnqty': 1,
  'adilnstx': 26,
  'adilosty': 1,
  'adilosu.': 1,
  'adilpqvw': 1,
  'adilpruy': 1,
  'adimnrvy': 1,
  'adimoqux': 1,
  'aegknrvy': 1,
  'aegknsvw': 1,
  'aegkoquw': 1,
  'aegkoqvx': 1,
  'aegkosux': 1,
  'aegln...': 1,
  'aegloq..': 1,
  'aeglprty': 1,
  'aegmorux': 1,
  'aegmosux': 1,
  'aegmpstx': 1,
  'aehknrvy': 1,
  'aehknstw': 3,
  'aehknstx': 1,
  'aehkosux': 2,
  'aehlnstx': 1,
  'aehlorvx': 1,
  'aehlosux': 1,
  'aehlpsux': 1,
  'aehl

## Trouver les correspondances des partiels vers les restants

=> mettre à jour le lexique avec les nouvelles classes flexionnelles

In [32]:
import operator
# paradigm2Top: pattern -> the most frequent top pattern it matches.
# top2Paradigms: top pattern -> list of (pattern, count) attached to it.
paradigm2Top={}
top2Paradigms={}
for p in paradigmMappings:
    # Counts of the top patterns reachable from p.
    lTops={m:paradigmTops[m] for m in paradigmMappings[p] if m in paradigmTops}
    # Key with the highest count; avoids the Python-2-only iteritems() while
    # keeping the same tie-breaking (first maximum in iteration order).
    topMax=max(lTops,key=lTops.get)
    top2Paradigms.setdefault(topMax,[]).append((p,paradigmsCounts[p]))
    paradigm2Top[p]=topMax
top2Paradigms

{'adgkn.v.': [('adgkn.v.', 1)],
 'adgknqt.': [('adgknq..', 1), ('adgknqt.', 1)],
 'adgknrux': [('adgknrux', 1)],
 'adgkns..': [('adgkns..', 1)],
 'adgkosvw': [('adgkosvw', 2), ('adgkos..', 1), ('adgk....', 1)],
 'adgkpqvw': [('ad.k....', 1),
  ('adgkp...', 1),
  ('adgkpqvw', 2),
  ('adg.....', 10)],
 'adglnrvx': [('adglnrvx', 1)],
 'adgloqvx': [('adgloqvx', 1)],
 'adglpstx': [('adglpst.', 2),
  ('adglpstx', 2),
  ('adgl....', 1),
  ('adglp...', 2),
  ('adglps..', 2)],
 'adgmnqvx': [('adgmnqvx', 1)],
 'adgmnrtw': [('adgmnrtw', 1)],
 'adgmnruy': [('adgmnruy', 1), ('adgmn...', 1)],
 'adgmort.': [('adgmort.', 1)],
 'adgmosvx': [('adgmosvx', 1)],
 'adhknrvx': [('adhknrvx', 1)],
 'adhknst.': [('adhknst.', 1)],
 'adhkosuy': [('adhk....', 1), ('adhkosuy', 1)],
 'adhkpqt.': [('adhkp...', 1), ('adhkpqt.', 1), ('adhkpq..', 2)],
 'adhl.q..': [('adhl.q..', 1)],
 'adhlosuy': [('adhlosuy', 1)],
 'adhlpruw': [('adhlp...', 1), ('adhlpruw', 1)],
 'adhmn.t.': [('adhmn.t.', 1)],
 'adhmorvw': [('adh.o...',

In [33]:
tiragesReduits["newRegex"]=tiragesReduits["regex"].apply(lambda x: paradigm2Top[x])

In [34]:
tiragesReduits["Freq"]=tiragesReduits[marksN].fillna(0).apply(lambda x: sum(x),axis=1)

In [35]:
for m in newDistMarks:
    tiragesReduits[m]=tiragesReduits[m].fillna(0)+newDistMarks[m]

In [36]:
tiragesReduits

Unnamed: 0,CF,P,abc,def,ghi,klm,nop,qrs,tuv,wxy,...,defN,ghiN,klmN,nopN,qrsN,tuvN,wxyN,regex,newRegex,Freq
0,405,0.109956,b,d,g,k,p,r,t,x,...,101399.184138,67338.122625,50525.092034,40370.073578,33663.061125,29093.05273,25462.045772,bdgkprtx,bdgkprtx,549661.0
1,40,0.054978,c,e,g,k,n,q,t,w,...,50380.184138,33585.122625,25114.092034,20066.073578,16740.061125,14509.05273,12485.045772,cegknqtw,cegknqtw,273888.0
2,15,0.036652,a,e,h,k,n,s,t,w,...,33830.184138,22348.122625,16837.092034,13580.073578,11128.061125,9676.05273,8354.045772,aehknstw,aehknstw,182707.0
3,64,0.027489,a,f,i,l,p,s,t,x,...,25116.184138,17026.122625,12810.092034,10191.073578,8353.061125,7229.05273,6300.045772,afilpstx,afilpstx,137669.0
4,93,0.021991,a,d,i,k,n,q,u,y,...,20353.184138,13534.122625,10241.092034,8115.073578,6612.061125,5862.05273,5062.045772,adiknquy,adiknquy,110260.0
5,2,0.018326,a,f,h,l,o,q,v,y,...,16920.184138,11138.122625,8412.092034,6831.073578,5583.061125,4715.05273,4320.045772,afhloqvy,afhloqvy,91330.0
6,13,0.015708,b,f,i,m,p,q,t,y,...,14479.184138,9420.122625,7109.092034,5795.073578,4774.061125,4180.05273,3541.045772,bfimpqty,bfimpqty,78308.0
7,1870,0.013745,a,e,h,l,o,s,u,x,...,12716.184138,8450.122625,6276.092034,5158.073578,4225.061125,3605.05273,3170.045772,aehlosux,aehlosux,68812.0
8,14,0.012217,a,f,g,k,o,q,t,x,...,10961.184138,7333.122625,5588.092034,4513.073578,3649.061125,3197.05273,2892.045772,afgkoqtx,afgkoqtx,60512.0
9,592,0.010996,b,f,g,k,o,q,t,w,...,10215.184138,6769.122625,5046.092034,3953.073578,3416.061125,2926.05273,2552.045772,bfgkoqtw,bfgkoqtw,55142.0


## Compter les effectifs des CF restantes

In [37]:
newParadigmsCount={}
for t in top2Paradigms:
    newParadigmsCount[t]=sum([c for r,c in top2Paradigms[t]])
dfNewParadigms=pd.DataFrame.from_dict(newParadigmsCount,orient="index")
dfNewParadigms.columns=dfNewParadigms.columns.astype(str)
dfNewParadigms.columns=["effectif"]
dfNewParadigms.sort_values("effectif",ascending=False)

Unnamed: 0,effectif
bdglortx,933
cegmnsux,770
adilnstx,708
afhloqvy,116
beimoqvx,107
cdhlosty,94
bfimpqty,72
cfgmnsuy,52
aehmosuy,49
cehknqux,33


In [38]:
newLexs=tiragesReduits.index.tolist()
# Normalise the frequencies directly on the Series so that the probabilities
# stay aligned with newLexs by construction, rather than through the
# iteration order of an intermediate dict.
freqLexs=tiragesReduits["Freq"]
newDistLexs=(freqLexs/float(freqLexs.sum())).tolist()
#newDistLexs

In [39]:
# Draw all tokens at once and count them per lexeme with numpy instead of a
# Python loop over every draw; lexemes never drawn are absent from the dict.
newListeTirs=np.random.choice(newLexs,nbTokens,replace=True,p=newDistLexs)
newValeursTirees,newEffectifsTires=np.unique(newListeTirs,return_counts=True)
newLexTirs=dict(zip(newValeursTirees.tolist(),newEffectifsTires.tolist()))

## Faire un nouveau tirage

=> mettre à jour les lexèmes et les classes flexionnelles

In [40]:
newTirages=pd.concat([lexemes, tiragesColonnes], axis=1, sort=False)
for l in newLexTirs:
    lMarksFreq=probElements(tiragesReduits.loc[l,marksN].to_dict())
    lDistMarks=[lMarksFreq[m] for m in sorted(lMarksFreq.keys())]
    
    newCellTirs={}
    listeCases=np.random.choice(marksN,newLexTirs[l],replace=True,p=lDistMarks)
    for c in listeCases:
        if not c in newCellTirs:
            newCellTirs[c]=0
        newCellTirs[c]+=1
    if len(newCellTirs)==8 and debug:
        print l,tirages.loc[l,"CF"], newCellTirs
    for c in newCellTirs:
        newTirages.loc[l,c]=newCellTirs[c]


In [43]:
newTirages#.groupby(marks).count()

Unnamed: 0,CF,P,abc,def,ghi,klm,nop,qrs,tuv,wxy,abcN,defN,ghiN,klmN,nopN,qrsN,tuvN,wxyN
0,405,0.109956,b,d,g,k,p,r,t,x,229217,115034,76394,57651,45990,38197,32859,28818
1,40,0.054978,c,e,g,k,n,q,t,w,114455,57594,37986,28449,23118,18951,16536,14260
2,15,0.036652,a,e,h,k,n,s,t,w,76017,38545,25223,19141,15437,12598,11032,9440
3,64,0.027489,a,f,i,l,p,s,t,x,57437,28320,19544,14416,11451,9581,8267,7116
4,93,0.021991,a,d,i,k,n,q,u,y,46253,23232,15420,11704,9223,7551,6593,5778
5,2,0.018326,a,f,h,l,o,q,v,y,37952,19162,12683,9633,7808,6229,5310,4898
6,13,0.015708,b,f,i,m,p,q,t,y,32963,16550,10775,8165,6556,5406,4603,4018
7,1870,0.013745,a,e,h,l,o,s,u,x,28480,14331,9537,7210,5956,4856,4218,3591
8,14,0.012217,a,f,g,k,o,q,t,x,25456,12447,8367,6338,5071,4211,3612,3315
9,592,0.010996,b,f,g,k,o,q,t,w,23064,11567,7641,5788,4491,3805,3319,2930


# Nouvelle architecture

1. initialiser le lexique
 - définir les cases, marques
    - nombre de cases
    - nombre de marques pour chaque case
 - définir le lexique-corpus
    - nombre de lexèmes
    - nombre de tokens
 - faire le tirage aléatoire
    - distribution des lexèmes
    - distribution des cases
    - distribution des marques
1. classer le lexique
 - emboîtement des classes flexionnelles
 - classes flexionnelles maximales
    - effectifs
 - rassemblement des paradigmes partiels
   - distribution dans les classes flexionnelles maximales
     - dans la classe maximale la plus fréquente
     - statistiquement entre les classes maximales 

# Initialiser le lexique


In [1]:
import pandas as pd
import numpy as np
import itertools as it
import random as rd
import networkx as nx
from matplotlib import pyplot as plt
import re,pickle
from IPython.display import display
debug=False

In [2]:
%matplotlib inline

## Choix des paramètres

In [3]:
nbCases=8
maxNbExposants=3
nbLexemes=6*1000
nbTokens=1000*1000*1000
epoques=25
initLexDist="c1.9"
initMarkDist="flat"
initCaseDist="c0.5"
epsilonFreq=0.1
lexProfileAleatoire=False
nbExposantsUniforme=True
sepPar="-"

### Fréquences et distributions

In [4]:
def distElements(elements,dist):
    """Return unnormalised rank-based weights for `elements`.

    Parameters
    ----------
    elements : sized collection
        Only its length is used.
    dist : str
        "flat"  -> every weight is 1;
        "zipf"  -> weight N/rank;
        "geo"   -> weight N/rank**2;
        "c<x>"  -> weight N/rank**x, with x parsed as a float (e.g. "c1.9"),
        where N is the number of elements and rank starts at 1.

    Returns
    -------
    list
        One weight per element, in rank order.

    Raises
    ------
    ValueError
        If `dist` is not one of the supported names, or if the exponent
        after "c" cannot be parsed as a float.
    """
    nbElements=len(elements)
    if dist=="flat":
        return [1 for n in range(nbElements)]
    if dist=="zipf":
        exposant=1
    elif dist=="geo":
        exposant=2
    elif dist.startswith("c"):
        exposant=float(dist[1:])
    else:
        # Previously fell through to an unbound local variable.
        raise ValueError("unknown distribution: %r"%(dist,))
    return [float(nbElements)/(n+1)**exposant for n in range(nbElements)]

def freq2prob(freq):
    """Convert a sequence of frequencies into probabilities summing to 1.

    Raises ZeroDivisionError when the sequence is non-empty and sums to zero.
    """
    totalFreq=sum(freq)
    probabilites=[]
    for f in freq:
        probabilites.append(float(f)/totalFreq)
    return probabilites


In [5]:
distElements(range(10000),"c1.8")[-10:]

[0.0006319807847581873,
 0.0006318669416964166,
 0.0006317531305317552,
 0.0006316393512520754,
 0.0006315256038452553,
 0.0006314118882991789,
 0.000631298204601736,
 0.0006311845527408221,
 0.0006310709327043387,
 0.000630957344480193]

In [6]:
def initialiserMarques(nbCases):
    """Return the number of exponents for each of the `nbCases` cells.

    With nbExposantsUniforme set, every cell gets maxNbExposants exponents.
    Otherwise the counts are drawn at random in 1..maxNbExposants, and the
    draw is repeated until at least one cell reaches maxNbExposants.
    """
    if nbExposantsUniforme:
        return [maxNbExposants]*nbCases
    valeursPossibles=range(1,maxNbExposants+1)
    # Sentinel that forces a first draw.
    listeMarques=[0]
    while max(listeMarques)<maxNbExposants:
        listeMarques=list(np.random.choice(valeursPossibles,nbCases))
    return listeMarques
    
def initialiserParadigme(nbCases,nbMarques=3,listeMarques=[]):
    """Map each cell index (0..nbCases-1) to its number of marks.

    When `listeMarques` is empty, every cell gets `nbMarques` marks;
    otherwise cell i gets listeMarques[i].
    """
    # The default list is only compared, never mutated.
    if listeMarques==[]:
        listeMarques=[nbMarques]*nbCases
    return dict((i,listeMarques[i]) for i in range(nbCases))

def initialiserListeDistMarques(listeMarques,dist=initMarkDist,distMarques=[]):
    """Associate a distribution name with each item of `listeMarques`.

    Item i receives distMarques[i] when that index exists, and `dist`
    otherwise. With an empty `distMarques`, every item receives `dist`.
    The items iterated from `listeMarques` are used both as result keys
    and as indices into `distMarques`.
    """
    if distMarques==[]:
        distMarques=[dist for _ in listeMarques]
    nbDist=len(distMarques)
    result={}
    for i in listeMarques:
        result[i]=distMarques[i] if i<nbDist else dist
    return result

def initialiserCasesDistMarques(nbMarkCase,listeDistMarques):
    """Return, for each cell, the probability of each of its marks.

    nbMarkCase maps a cell to its number of marks and listeDistMarques maps
    the same cell to the name of the distribution used to weight them.
    """
    return {
        case:freq2prob(distElements(range(nbMarkCase[case]),dist=listeDistMarques[case]))
        for case in nbMarkCase
    }

In [7]:
lexs=range(nbLexemes)
listeMarques=initialiserMarques(nbCases)
nbMarkCase=initialiserParadigme(nbCases,listeMarques=listeMarques)
listeDistMarques=initialiserListeDistMarques(nbMarkCase,initMarkDist,distMarques=[])
#casesDistMarques=initialiserCasesDistMarques(nbMarkCase,listeDistMarques)
probCasesMarques=initialiserCasesDistMarques(nbMarkCase,listeDistMarques)

In [8]:
nbPotentielClasses=reduce(lambda x, y: x * y, listeMarques)
print nbPotentielClasses,"classes flexionnelles potentielles"

6561 classes flexionnelles potentielles


In [9]:
probCasesMarques#=newProbCasesMarques
nbMarkCase

{0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3}

In [10]:
lexProbs=freq2prob(distElements(lexs,initLexDist))

In [11]:
marks=["c%02d"%i for i in range(nbCases)]
marksN=["n%02d"%i for i in range(nbCases)]
dfColumns=marks+marksN+["paradigme","CF"]

tirages=pd.DataFrame("", index=np.arange(len(lexs)), columns=dfColumns)
tirages[marksN]=0

In [12]:
distCases=freq2prob(distElements(marks,initCaseDist))

In [13]:
def tirerLexemes(lexProbs):
    """Draw nbTokens tokens among the lexemes and return the counts.

    Parameters
    ----------
    lexProbs : sequence of float
        Probability of each lexeme, in the order of `lexs`.

    Returns
    -------
    dict
        Maps each lexeme drawn at least once to its number of tokens;
        lexemes that were never drawn are absent.
    """
    # A multinomial draw yields the per-lexeme counts directly, without
    # materialising nbTokens individual draws and counting them in Python.
    effectifs=np.random.multinomial(nbTokens,lexProbs)
    return {lexs[i]:int(n) for i,n in enumerate(effectifs) if n>0}

In [14]:
def calculerChampParadigme(x):
    """Build the paradigm string of one row of `tirages`.

    Each cell contributes its exponent (as text) when the stored value is
    one of 0..maxNbExposants-1, and "." otherwise; the pieces are joined
    with sepPar.
    """
    exposantsValides=range(maxNbExposants)
    return sepPar.join(
        str(x[c]) if x[c] in exposantsValides else "."
        for c in marks
    )

In [15]:
def getLDistCases(paradigme):
    """Return the cell distribution restricted to the cells of `paradigme`.

    Cells marked "." in the paradigm get a weight of 0, the others keep their
    weight from the module-level distCases; the result is renormalised.
    """
    lMarks=paradigme.split(sepPar)
    poids=[0 if m=="." else distCases[n] for n,m in enumerate(lMarks)]
    return freq2prob(poids)

In [16]:
def getTirages(lexProbs,distCases):
    """Run one draw and fill the module-level `tirages` table in place.

    Lexemes are drawn with tirerLexemes(lexProbs); for each drawn lexeme the
    tokens are spread over the cells, the per-cell counts are written to the
    n* columns and the exponent of each drawn cell to the c* columns.
    Finally the "paradigme" column is recomputed for every row.

    Parameters
    ----------
    lexProbs : probabilities passed to tirerLexemes.
    distCases : cell probabilities used when the lexeme has no partial CF.

    Returns nothing; the result is the mutation of `tirages`.
    """
    lexTirs=tirerLexemes(lexProbs)

    for l in lexTirs:
        # Progress marker, one dot per lexeme whose id is a multiple of 1000.
        if l%1000==0:
            print ".",
        # cellTirs: count column -> number of tokens; cellValue: value column -> exponent
        cellTirs={}
        cellValue={}
        # Optional random cell profile; note that it rebinds the parameter and
        # therefore also applies to the following lexemes until redrawn.
        if lexProfileAleatoire:
            distCases=freq2prob(list(np.random.randint(1,101,size=nbCases)))
        # A CF containing "." is partial: only its filled cells may be drawn.
        # NOTE(review): getLDistCases reads the module-level distCases, not this
        # function's parameter — confirm this is intended.
        if tirages.loc[l,"CF"] and "." in tirages.loc[l,"CF"]:
            lDistCases=getLDistCases(tirages.loc[l,"CF"])
        else:
            lDistCases=distCases
        listeCases=np.random.choice(marksN,lexTirs[l],replace=True,p=lDistCases)
        for c in listeCases:
            # "n03" -> case number 3 and value column "c03"
            numCase=int(c[1:])
            v=c.replace("n","c")
            if not c in cellTirs:
                cellTirs[c]=0
            cellTirs[c]+=1
            # The exponent of a cell is fixed the first time the cell is drawn.
            if not v in cellValue:
                if tirages.loc[l,"CF"]:
                    # Known CF: reuse the exponent recorded for this cell.
                    cellValue[v]=int(tirages.loc[l,"CF"].split(sepPar)[numCase])
                else:
                    # No CF yet: draw an exponent for this cell.
                    cellValue[v]=np.random.choice(range(nbMarkCase[numCase]),p=probCasesMarques[numCase])
        for c in cellTirs:
            tirages.loc[l,c]=cellTirs[c]
        for c in cellValue:
            tirages.loc[l,c]=cellValue[c]
    tirages["paradigme"]=tirages.apply(calculerChampParadigme,axis=1)

In [17]:
def matchParadigmes(p1,p2):
    """Tell whether paradigm `p1` is compatible with paradigm `p2`.

    Both strings are split on sepPar; `p1` matches `p2` when, cell by cell,
    the two values are equal or the value in `p1` is the wildcard ".".
    Paradigms with a different number of cells never match.
    """
    chunks1=p1.split(sepPar)
    chunks2=p2.split(sepPar)
    if len(chunks1)!=len(chunks2):
        return False
    # Compare the cells rather than the raw characters, so that values
    # longer than one character are handled as a whole.
    return all(c1==c2 or c1=="." for c1,c2 in zip(chunks1,chunks2))

In [18]:
getTirages(lexProbs,distCases)

. . . . . .


In [19]:
def getParadimCountsGroups(tirages):
    """Count the lexemes of each paradigm and list the distinct paradigms.

    Parameters
    ----------
    tirages : pandas.DataFrame
        Must hold the columns "paradigme" and "CF".

    Returns
    -------
    (dict, keys)
        The dict maps each paradigm to its number of rows with a non-null
        "CF"; the second item holds the distinct paradigm values.

    Also prints the number of distinct paradigms.
    """
    # Group once, by the plain column name, and reuse the grouping for both
    # the counts and the group keys.
    groupes=tirages.groupby("paradigme")
    dictParadigmsCounts=groupes["CF"].count().sort_values().to_dict()
    paradigmsGroups=groupes.groups.keys()
    # Single pre-formatted string: same output under Python 2 and Python 3.
    print("nombre de paradigmes différents %d"%len(dictParadigmsCounts))
    return dictParadigmsCounts,paradigmsGroups

In [20]:
#dictParadigmsCounts

In [21]:
def getParadigmMappings(paradigmsGroups):
    """Map each paradigm to the set of paradigms it matches.

    Every pair of paradigms is tested with matchParadigmes; a paradigm with
    no match at all gets no entry.
    """
    paradigmMappings={}
    for p in paradigmsGroups:
        compatibles=set(p2 for p2 in paradigmsGroups if matchParadigmes(p,p2))
        if compatibles:
            paradigmMappings[p]=compatibles
    return paradigmMappings

In [22]:
dictParadigmsCounts,paradigmsGroups=getParadimCountsGroups(tirages)
paradigmMappings=getParadigmMappings(paradigmsGroups)

nombre de paradigmes différents 3998


In [23]:
def getParadigmTops(paradigmMappings,dictParadigmsCounts):
    """Return the count of each paradigm whose mapping holds a single entry."""
    return dict(
        (p,dictParadigmsCounts[p])
        for p in paradigmMappings
        if len(paradigmMappings[p])==1
    )

In [24]:
paradigmTops=getParadigmTops(paradigmMappings,dictParadigmsCounts)

In [25]:
import operator
def getParadigm2Top(paradigmMappings,paradigmTops,dictParadigmsCounts):
    """Attach every paradigm to the most frequent top paradigm it matches.

    Parameters
    ----------
    paradigmMappings : dict
        Paradigm -> set of paradigms it matches.
    paradigmTops : dict
        Top paradigm -> count.
    dictParadigmsCounts : dict
        Paradigm -> count.

    Returns
    -------
    (top2Paradigms, paradigm2Top)
        top2Paradigms maps a top paradigm to the list of (paradigm, count)
        attached to it; paradigm2Top maps each paradigm to its chosen top.

    Raises
    ------
    ValueError
        If a paradigm matches no top paradigm (max() of an empty mapping).
    """
    paradigm2Top={}
    top2Paradigms={}
    for p in paradigmMappings:
        lTops={m:paradigmTops[m] for m in paradigmMappings[p] if m in paradigmTops}
        # Key with the highest count; works on Python 2 and 3 (no iteritems)
        # and keeps the same tie-breaking: first maximum in iteration order.
        topMax=max(lTops,key=lTops.get)
        top2Paradigms.setdefault(topMax,[]).append((p,dictParadigmsCounts[p]))
        paradigm2Top[p]=topMax
    return top2Paradigms,paradigm2Top

In [26]:
top2Paradigms,paradigm2Top=getParadigm2Top(paradigmMappings,paradigmTops,dictParadigmsCounts)

In [27]:
def updateTirages():
    """Update the module-level `tirages` table after a classification pass.

    "CF" receives the top paradigm of each row (looked up in paradigm2Top)
    and "freq" the sum of the row's count columns.
    """
    tirages["CF"]=tirages["paradigme"].map(lambda paradigme: paradigm2Top[paradigme])
    tirages["freq"]=tirages[marksN].apply(lambda ligne: sum(ligne),axis=1)

In [28]:
updateTirages()

In [29]:
def updateProbCases():
    """Re-estimate, for each cell, the probability of each exponent.

    For every value column in `marks`, the rows of the module-level `tirages`
    table are counted per exponent value (empty-string values are ignored)
    and the counts are normalised with freq2prob.

    Returns a dict keyed by the column names in `marks`.
    NOTE(review): getTirages indexes probCasesMarques by integer case number,
    whereas the keys here are column names — confirm the intended key type.
    """
    newProbCasesMarques={}
    for c in marks:
        # Number of rows per exponent value for this cell.
        lExpCount=tirages.groupby(c).count()["paradigme"]
        # Size the frequency list from the highest exponent observed.
        # NOTE(review): an exponent above the highest observed one gets no slot.
        lNbExp=max([i for i in lExpCount.index.tolist() if i!=""])+1
        lExpFreq=[0 for f in range(lNbExp)]
        # Same grouping as lExpCount, recomputed.
        for k,v in tirages.groupby(c).count()["paradigme"].iteritems():
            if k!='':
                lExpFreq[k]=v
        # Always true here: each column name is visited once.
        if c not in newProbCasesMarques:
            newProbCasesMarques[c]=freq2prob(lExpFreq)
    return newProbCasesMarques

In [30]:
def updateLexProbs():
    """Return lexeme probabilities from the "freq" column of `tirages`.

    epsilonFreq is added to every frequency before normalising, so that a
    lexeme with a frequency of 0 keeps a non-zero probability.
    """
    freqLissees=[]
    for f in tirages["freq"].tolist():
        freqLissees.append(f+epsilonFreq)
    return freq2prob(freqLissees)

In [31]:
def updateDistCases():
    """Return the cell probabilities from the column totals of the counts."""
    totauxCases=tirages[marksN].sum()
    return freq2prob(totauxCases.tolist())

In [32]:
probCasesMarques=updateProbCases()
lexProbs=updateLexProbs()
distCases=updateDistCases()

In [33]:
def archiveTirages(historique=None):
    """Snapshot the global ``tirages`` into ``historique``, then blank it for the next round.

    Parameters
    ----------
    historique : list, optional
        List the snapshot is appended to. When omitted, a new empty list is
        created for this call. (A ``[]`` default would be one single list
        shared by every call, so re-running the cell would keep piling
        snapshots onto it.)

    Returns
    -------
    list
        ``historique`` with a copy of ``tirages`` appended.

    Side effects
    ------------
    In the global ``tirages``, the ``marks`` columns and ``paradigme`` are
    reset to ``""`` and the ``marksN`` columns to ``0``; other columns are
    left untouched.
    """
    if historique is None:
        historique=[]
    # Copy first: the resets below would otherwise alter the archived frame.
    historique.append(tirages.copy())
    tirages[marks+["paradigme"]]=""
    tirages[marksN]=0
    return historique

In [34]:
# Start the history with the initial round and reset tirages.
historique=archiveTirages()

In [35]:
# Show tirages right after archiving: draw columns are blanked, CF and freq are kept.
tirages

Unnamed: 0,c00,c01,c02,c03,c04,c05,c06,c07,n00,n01,n02,n03,n04,n05,n06,n07,paradigme,CF,freq
0,,,,,,,,,0,0,0,0,0,0,0,0,,1-1-0-0-2-0-1-1,571678701
1,,,,,,,,,0,0,0,0,0,0,0,0,,1-2-1-1-1-0-0-0,153170873
2,,,,,,,,,0,0,0,0,0,0,0,0,,1-0-1-0-2-1-0-1,70892461
3,,,,,,,,,0,0,0,0,0,0,0,0,,1-0-2-1-0-2-0-2,41036784
4,,,,,,,,,0,0,0,0,0,0,0,0,,0-1-2-1-2-2-1-0,26855541
5,,,,,,,,,0,0,0,0,0,0,0,0,,1-1-2-0-0-0-1-0,18991433
6,,,,,,,,,0,0,0,0,0,0,0,0,,1-2-1-1-2-0-2-1,14173086
7,,,,,,,,,0,0,0,0,0,0,0,0,,1-2-2-1-2-0-1-2,10998247
8,,,,,,,,,0,0,0,0,0,0,0,0,,2-1-2-1-2-0-2-2,8797076
9,,,,,,,,,0,0,0,0,0,0,0,0,,0-0-1-0-0-2-2-0,7195166


In [36]:
# Run `epoques` rounds. Each round derives the paradigm -> top assignment and the
# three distributions from the current tirages, then archives and resets it.
for i in range(epoques):
    # NOTE(review): getTirages is defined outside this excerpt; presumably it
    # refills the global tirages from the current distributions - confirm.
    getTirages(lexProbs,distCases)

    # Paradigm counts/groups, then the mapping between paradigms.
    dictParadigmsCounts,paradigmsGroups=getParadimCountsGroups(tirages)
    paradigmMappings=getParadigmMappings(paradigmsGroups)

    # Tops, then the top chosen for every paradigm.
    paradigmTops=getParadigmTops(paradigmMappings,dictParadigmsCounts)
    top2Paradigms,paradigm2Top=getParadigm2Top(paradigmMappings,paradigmTops,dictParadigmsCounts)

    # Write CF and freq back into tirages.
    updateTirages()

    # Distributions handed to the next round.
    probCasesMarques=updateProbCases()
    lexProbs=updateLexProbs()
    distCases=updateDistCases()

    # Must come last: archiving blanks the columns the updates above read.
    historique=archiveTirages(historique)

. . . . . . nombre de paradigmes différents 3976
. . . . . . nombre de paradigmes différents 3985
. . . . . . nombre de paradigmes différents 3960
. . . . . . nombre de paradigmes différents 3915
. . . . . . nombre de paradigmes différents 3909
. . . . . . nombre de paradigmes différents 3901
. . . . . . nombre de paradigmes différents 3900
. . . . . . nombre de paradigmes différents 3863
. . . . . . nombre de paradigmes différents 3875
. . . . . . nombre de paradigmes différents 3861
. . . . . . nombre de paradigmes différents 3867
. . . . . . nombre de paradigmes différents 3852
. . . . . . nombre de paradigmes différents 3827
. . . . . . nombre de paradigmes différents 3827
. . . . . . nombre de paradigmes différents 3828
. . . . . . nombre de paradigmes différents 3802
. . . . . . nombre de paradigmes différents 3810
. . . . . . nombre de paradigmes différents 3819
. . . . . . nombre de paradigmes différents 3787
. . . . . . nombre de paradigmes différents 3802
. . . . . . nombre d

In [37]:
# Per-epoch summary by CF, restricted to rows with freq >= 1: number of rows,
# then sum, max, min, median and mean of freq.
for n,h in enumerate(historique):
    hLex=h[h["freq"]>=1]
    # Group once and aggregate only the two columns that are reported, instead
    # of running six groupbys that each aggregate every column of the frame.
    cfGroups=hLex.groupby("CF")
    df=cfGroups["paradigme"].count().to_frame().sort_values("paradigme")
    cfFreqStats=cfGroups["freq"].agg(["sum","max","min","median","mean"])
    # Column assignment aligns on the CF index, so the sort order of df is kept.
    df["freq"]=cfFreqStats["sum"]
    df["maxF"]=cfFreqStats["max"]
    df["minF"]=cfFreqStats["min"]
    df["medF"]=cfFreqStats["median"]
    df["meanF"]=cfFreqStats["mean"]
    # Epoch number and number of distinct CFs, then the table and two blank lines.
    print("%s %s"%(n,len(df)))
    display (df)
    print("")
    print("")

0 3909


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0-0-0-0-0-0-0-0,1,45,45,45,45.0,45.000000
1-2-0-2-1-1-2-1,1,187,187,187,187.0,187.000000
1-2-0-2-1-1-2-2,1,367,367,367,367.0,367.000000
1-2-0-2-1-2-0-0,1,625,625,625,625.0,625.000000
1-2-0-2-1-2-1-0,1,67,67,67,67.0,67.000000
1-2-0-2-1-2-1-1,1,76,76,76,76.0,76.000000
1-2-0-2-1-2-2-2,1,282,282,282,282.0,282.000000
1-2-0-2-2-0-2-0,1,2043,2043,2043,2043.0,2043.000000
1-2-0-2-2-0-2-1,1,500,500,500,500.0,500.000000
1-2-0-2-2-1-0-0,1,91,91,91,91.0,91.000000




1 3879


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0-0-0-0-0-0-0-0,1,49,49,49,49.0,49.000000
1-2-0-2-0-2-2-2,1,55,55,55,55.0,55.000000
1-2-0-2-1-0-0-1,1,28575,28575,28575,28575.0,28575.000000
1-2-0-2-1-0-0-2,1,134,134,134,134.0,134.000000
1-2-0-2-1-1-0-0,1,162,162,162,162.0,162.000000
1-2-0-2-1-1-1-2,1,379,379,379,379.0,379.000000
1-2-0-2-1-1-2-1,1,166,166,166,166.0,166.000000
1-2-0-2-1-1-2-2,1,355,355,355,355.0,355.000000
1-2-0-2-1-2-0-0,1,637,637,637,637.0,637.000000
1-2-0-2-0-2-2-0,1,12117,12117,12117,12117.0,12117.000000




2 3831


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0-0-0-0-0-0-0-0,1,40,40,40,40.0,40.000000
1-2-0-2-1-0-0-2,1,127,127,127,127.0,127.000000
1-2-0-2-1-1-0-0,1,168,168,168,168.0,168.000000
1-2-0-2-1-1-1-2,1,376,376,376,376.0,376.000000
1-2-0-2-1-1-2-1,1,171,171,171,171.0,171.000000
1-2-0-2-1-1-2-2,1,384,384,384,384.0,384.000000
1-2-0-2-1-2-0-0,1,613,613,613,613.0,613.000000
1-2-0-2-1-2-1-0,1,67,67,67,67.0,67.000000
1-2-0-2-1-2-2-2,1,271,271,271,271.0,271.000000
1-2-0-2-1-0-0-1,1,28235,28235,28235,28235.0,28235.000000




3 3784


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0-0-0-0-0-0-0-0,1,43,43,43,43.0,43.000000
1-2-0-2-0-2-2-2,1,51,51,51,51.0,51.000000
1-2-0-2-1-0-0-1,1,28251,28251,28251,28251.0,28251.000000
1-2-0-2-1-0-0-2,1,121,121,121,121.0,121.000000
1-2-0-2-1-1-0-0,1,158,158,158,158.0,158.000000
1-2-0-2-1-1-1-2,1,371,371,371,371.0,371.000000
1-2-0-2-1-1-2-1,1,190,190,190,190.0,190.000000
1-2-0-2-1-1-2-2,1,395,395,395,395.0,395.000000
1-2-0-2-1-2-0-0,1,588,588,588,588.0,588.000000
1-2-0-2-0-2-2-0,1,12257,12257,12257,12257.0,12257.000000




4 3757


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0-0-0-0-0-0-0-0,1,45,45,45,45.0,4.500000e+01
1-0-2-1-1-1-2-2,1,136,136,136,136.0,1.360000e+02
1-0-2-1-1-2-0-0,1,116,116,116,116.0,1.160000e+02
1-0-2-1-1-2-1-0,1,100,100,100,100.0,1.000000e+02
2-1-0-1-2-0-0-0,1,3281,3281,3281,3281.0,3.281000e+03
2-1-0-1-1-2-2-2,1,81,81,81,81.0,8.100000e+01
1-0-2-1-2-0-0-2,1,32,32,32,32.0,3.200000e+01
2-1-0-1-2-0-1-1,1,1322,1322,1322,1322.0,1.322000e+03
1-0-2-1-2-0-1-0,1,1159,1159,1159,1159.0,1.159000e+03
1-0-2-1-2-0-2-1,1,306,306,306,306.0,3.060000e+02




5 3713


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2-1-1-2-0-1-0-2,1,1548,1548,1548,1548.0,1.548000e+03
2-1-0-1-2-0-0-0,1,3310,3310,3310,3310.0,3.310000e+03
1-0-2-1-1-1-1-0,1,73,73,73,73.0,7.300000e+01
1-0-2-1-1-1-1-2,1,186,186,186,186.0,1.860000e+02
2-1-0-1-1-2-2-2,1,81,81,81,81.0,8.100000e+01
1-0-2-1-1-1-2-2,1,127,127,127,127.0,1.270000e+02
1-0-2-1-1-2-0-0,1,123,123,123,123.0,1.230000e+02
2-1-0-1-2-0-1-1,1,1360,1360,1360,1360.0,1.360000e+03
1-0-2-1-1-2-1-0,1,107,107,107,107.0,1.070000e+02
1-0-2-1-2-0-0-2,1,30,30,30,30.0,3.000000e+01




6 3687


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-0-1-.-2,1,62,62,62,62.0,6.200000e+01
1-0-2-1-2-1-2-2,1,106,106,106,106.0,1.060000e+02
2-1-0-1-1-2-2-2,1,80,80,80,80.0,8.000000e+01
1-0-2-1-2-1-0-0,1,156,156,156,156.0,1.560000e+02
2-1-0-1-2-0-0-0,1,3351,3351,3351,3351.0,3.351000e+03
1-0-2-1-2-0-2-1,1,344,344,344,344.0,3.440000e+02
1-0-2-1-2-0-1-2,1,136,136,136,136.0,1.360000e+02
2-1-0-1-2-0-1-1,1,1309,1309,1309,1309.0,1.309000e+03
1-0-2-1-2-0-1-0,1,1211,1211,1211,1211.0,1.211000e+03
1-0-2-1-2-0-0-2,1,24,24,24,24.0,2.400000e+01




7 3660


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-0-1-.-2,1,56,56,56,56.0,56.000000
1-2-0-2-1-0-0-1,1,27902,27902,27902,27902.0,27902.000000
1-2-0-2-1-0-0-2,1,110,110,110,110.0,110.000000
1-2-0-2-1-1-0-0,1,185,185,185,185.0,185.000000
1-2-0-2-1-1-0-2,1,9634,9634,9634,9634.0,9634.000000
1-2-0-2-1-1-1-2,1,376,376,376,376.0,376.000000
1-2-0-2-1-1-2-1,1,221,221,221,221.0,221.000000
1-2-0-2-1-1-2-2,1,427,427,427,427.0,427.000000
1-2-0-2-1-2-0-0,1,510,510,510,510.0,510.000000
1-2-0-2-1-2-1-0,1,52,52,52,52.0,52.000000




8 3636


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-0-2-2-0,1,3003,3003,3003,3003.0,3003.000000
1-2-0-2-1-1-0-2,1,9815,9815,9815,9815.0,9815.000000
1-2-0-2-1-1-1-2,1,375,375,375,375.0,375.000000
1-2-0-2-1-1-2-1,1,185,185,185,185.0,185.000000
1-2-0-2-1-1-2-2,1,415,415,415,415.0,415.000000
1-2-0-2-1-2-0-0,1,539,539,539,539.0,539.000000
1-2-0-2-1-2-1-0,1,57,57,57,57.0,57.000000
1-2-0-2-1-2-2-2,1,255,255,255,255.0,255.000000
1-2-0-2-2-0-2-0,1,2147,2147,2147,2147.0,2147.000000
1-2-0-2-2-0-2-1,1,520,520,520,520.0,520.000000




9 3614


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-0-2-2-0,1,3078,3078,3078,3078.0,3078.000000
1-2-0-2-1-1-0-2,1,9702,9702,9702,9702.0,9702.000000
1-2-0-2-1-1-1-2,1,356,356,356,356.0,356.000000
1-2-0-2-1-1-2-1,1,194,194,194,194.0,194.000000
1-2-0-2-1-1-2-2,1,416,416,416,416.0,416.000000
1-2-0-2-1-2-0-0,1,556,556,556,556.0,556.000000
1-2-0-2-1-2-1-0,1,65,65,65,65.0,65.000000
1-2-0-2-1-2-2-2,1,263,263,263,263.0,263.000000
1-2-0-2-2-0-2-0,1,2175,2175,2175,2175.0,2175.000000
1-2-0-2-2-0-2-1,1,485,485,485,485.0,485.000000




10 3580


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-0-2-2-1,1,1189,1189,1189,1189.0,1189.000000
1-2-0-2-0-2-2-0,1,12481,12481,12481,12481.0,12481.000000
1-2-0-2-1-0-0-1,1,28035,28035,28035,28035.0,28035.000000
1-2-0-2-1-0-0-2,1,106,106,106,106.0,106.000000
1-2-0-2-1-1-0-0,1,191,191,191,191.0,191.000000
1-2-0-2-1-1-0-2,1,9668,9668,9668,9668.0,9668.000000
1-2-0-2-1-1-1-2,1,389,389,389,389.0,389.000000
1-2-0-2-1-1-2-1,1,205,205,205,205.0,205.000000
1-2-0-2-0-2-0-1,1,741,741,741,741.0,741.000000
1-2-0-2-1-1-2-2,1,406,406,406,406.0,406.000000




11 3556


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-1-0-1-1,1,967,967,967,967.0,967.000000
1-2-0-2-1-0-0-2,1,117,117,117,117.0,117.000000
1-2-0-2-1-1-0-0,1,210,210,210,210.0,210.000000
1-2-0-2-1-1-0-2,1,9804,9804,9804,9804.0,9804.000000
1-2-0-2-1-1-1-2,1,381,381,381,381.0,381.000000
1-2-0-2-1-1-2-1,1,194,194,194,194.0,194.000000
1-2-0-2-1-1-2-2,1,415,415,415,415.0,415.000000
1-2-0-2-1-2-0-0,1,506,506,506,506.0,506.000000
1-2-0-2-1-0-0-1,1,28292,28292,28292,28292.0,28292.000000
1-2-0-2-1-2-1-0,1,68,68,68,68.0,68.000000




12 3540


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-1-0-1-1,1,975,975,975,975.0,975.000000
1-2-0-2-1-0-0-1,1,28194,28194,28194,28194.0,28194.000000
1-2-0-2-1-0-0-2,1,120,120,120,120.0,120.000000
1-2-0-2-1-1-0-0,1,187,187,187,187.0,187.000000
1-2-0-2-1-1-0-2,1,9736,9736,9736,9736.0,9736.000000
1-2-0-2-1-1-1-2,1,390,390,390,390.0,390.000000
1-2-0-2-1-1-2-1,1,175,175,175,175.0,175.000000
1-2-0-2-1-1-2-2,1,397,397,397,397.0,397.000000
1-2-0-2-0-2-2-0,1,12404,12404,12404,12404.0,12404.000000
1-2-0-2-1-2-0-0,1,521,521,521,521.0,521.000000




13 3518


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-1-0-1-1,1,997,997,997,997.0,997.000000
1-2-0-2-0-1-2-1,1,67,67,67,67.0,67.000000
1-2-0-2-0-2-0-1,1,792,792,792,792.0,792.000000
1-2-0-2-0-2-2-0,1,12388,12388,12388,12388.0,12388.000000
1-2-0-2-1-0-0-1,1,28232,28232,28232,28232.0,28232.000000
1-2-0-2-1-0-0-2,1,118,118,118,118.0,118.000000
1-2-0-2-1-1-0-0,1,200,200,200,200.0,200.000000
1-2-0-2-1-1-0-2,1,9803,9803,9803,9803.0,9803.000000
1-2-0-2-0-1-1-2,1,57,57,57,57.0,57.000000
1-2-0-2-1-1-1-2,1,397,397,397,397.0,397.000000




14 3501


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2-2-2-2-2-2-1-1,1,711,711,711,711.0,711.000000
1-1-1-2-0-2-1-0,1,28832,28832,28832,28832.0,28832.000000
1-1-1-2-0-2-2-0,1,229,229,229,229.0,229.000000
1-1-1-2-0-2-2-1,1,221,221,221,221.0,221.000000
1-1-1-2-1-0-0-1,1,170,170,170,170.0,170.000000
2-2-0-1-1-0-2-2,1,541,541,541,541.0,541.000000
1-1-1-2-1-0-1-2,1,1827,1827,1827,1827.0,1827.000000
2-2-0-1-1-0-2-1,1,120,120,120,120.0,120.000000
1-1-1-2-1-0-2-1,1,1584,1584,1584,1584.0,1584.000000
1-1-1-2-1-0-2-2,1,62,62,62,62.0,62.000000




15 3482


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-1-0-1-1,1,992,992,992,992.0,992.000000
1-2-0-2-0-1-2-1,1,46,46,46,46.0,46.000000
1-2-0-2-0-2-0-1,1,746,746,746,746.0,746.000000
1-2-0-2-0-2-2-0,1,12097,12097,12097,12097.0,12097.000000
1-2-0-2-1-0-0-1,1,28170,28170,28170,28170.0,28170.000000
1-2-0-2-1-0-0-2,1,121,121,121,121.0,121.000000
1-2-0-2-1-1-0-0,1,172,172,172,172.0,172.000000
1-2-0-2-1-1-0-2,1,10011,10011,10011,10011.0,10011.000000
1-2-0-2-0-1-1-2,1,67,67,67,67.0,67.000000
1-2-0-2-1-1-1-2,1,429,429,429,429.0,429.000000




16 3466


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2-2-2-2-2-2-1-1,1,647,647,647,647.0,647.000000
1-0-2-2-0-1-0-1,1,3225,3225,3225,3225.0,3225.000000
1-0-2-2-0-0-1-2,1,93709,93709,93709,93709.0,93709.000000
1-0-2-2-0-0-0-2,1,74,74,74,74.0,74.000000
1-0-2-1-2-2-2-2,1,43,43,43,43.0,43.000000
2-1-0-1-1-0-1-2,1,208,208,208,208.0,208.000000
1-0-2-1-2-2-0-1,1,2362,2362,2362,2362.0,2362.000000
1-0-2-1-2-2-.-0,1,28,28,28,28.0,28.000000
1-0-2-1-2-1-2-2,1,104,104,104,104.0,104.000000
2-1-0-1-1-0-2-1,1,37,37,37,37.0,37.000000




17 3450


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2-2-2-2-2-2-1-1,1,668,668,668,668.0,6.680000e+02
1-0-2-2-0-0-0-2,1,70,70,70,70.0,7.000000e+01
1-0-2-1-2-2-2-2,1,40,40,40,40.0,4.000000e+01
2-1-0-1-1-0-1-2,1,210,210,210,210.0,2.100000e+02
1-0-2-1-2-2-0-1,1,2262,2262,2262,2262.0,2.262000e+03
1-0-2-1-2-2-.-0,1,28,28,28,28.0,2.800000e+01
1-0-2-1-2-1-2-2,1,108,108,108,108.0,1.080000e+02
1-0-2-1-2-1-0-0,1,161,161,161,161.0,1.610000e+02
1-0-2-1-2-0-2-1,1,241,241,241,241.0,2.410000e+02
1-0-2-1-2-0-1-2,1,112,112,112,112.0,1.120000e+02




18 3420


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-1-0-1-1,1,1007,1007,1007,1007.0,1007.000000
1-2-0-2-0-1-1-2,1,70,70,70,70.0,70.000000
1-2-0-2-0-2-0-1,1,717,717,717,717.0,717.000000
1-2-0-2-0-2-2-0,1,12479,12479,12479,12479.0,12479.000000
1-2-0-2-1-0-0-1,1,28494,28494,28494,28494.0,28494.000000
1-2-0-2-1-0-0-2,1,136,136,136,136.0,136.000000
1-2-0-2-1-1-0-0,1,167,167,167,167.0,167.000000
1-2-0-2-1-1-0-2,1,10045,10045,10045,10045.0,10045.000000
1-2-0-2-1-1-1-2,1,466,466,466,466.0,466.000000
1-2-0-2-1-1-2-1,1,214,214,214,214.0,214.000000




19 3404


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-1-0-1-1,1,1009,1009,1009,1009.0,1009.000000
1-2-0-2-0-1-1-2,1,76,76,76,76.0,76.000000
1-2-0-2-0-2-0-1,1,733,733,733,733.0,733.000000
1-2-0-2-0-2-2-0,1,12495,12495,12495,12495.0,12495.000000
1-2-0-2-1-0-0-1,1,28591,28591,28591,28591.0,28591.000000
1-2-0-2-1-0-0-2,1,130,130,130,130.0,130.000000
1-2-0-2-1-1-0-0,1,164,164,164,164.0,164.000000
1-2-0-2-1-1-0-2,1,10171,10171,10171,10171.0,10171.000000
1-2-0-2-1-1-1-2,1,492,492,492,492.0,492.000000
1-2-0-2-1-1-2-1,1,203,203,203,203.0,203.000000




20 3388


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2-2-2-2-2-2-1-1,1,662,662,662,662.0,6.620000e+02
2-1-0-1-2-0-0-0,1,3423,3423,3423,3423.0,3.423000e+03
1-0-2-1-2-2-0-1,1,2251,2251,2251,2251.0,2.251000e+03
1-0-2-1-2-1-2-2,1,98,98,98,98.0,9.800000e+01
2-1-0-1-2-0-1-1,1,1305,1305,1305,1305.0,1.305000e+03
1-0-2-1-2-1-0-0,1,150,150,150,150.0,1.500000e+02
1-0-2-1-2-0-2-1,1,255,255,255,255.0,2.550000e+02
1-0-2-1-2-0-1-2,1,110,110,110,110.0,1.100000e+02
2-1-0-1-2-0-2-2,1,264,264,264,264.0,2.640000e+02
1-0-2-1-2-0-1-0,1,1035,1035,1035,1035.0,1.035000e+03




21 3376


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-0-2-2-2,1,681,681,681,681.0,681.000000
1-2-0-2-1-0-0-1,1,28592,28592,28592,28592.0,28592.000000
1-2-0-2-1-0-0-2,1,144,144,144,144.0,144.000000
1-2-0-2-1-1-0-0,1,168,168,168,168.0,168.000000
1-2-0-2-1-1-0-2,1,10070,10070,10070,10070.0,10070.000000
1-2-0-2-1-1-1-2,1,489,489,489,489.0,489.000000
1-2-0-2-1-1-2-1,1,220,220,220,220.0,220.000000
1-2-0-2-1-1-2-2,1,358,358,358,358.0,358.000000
1-2-0-2-1-2-0-0,1,601,601,601,601.0,601.000000
1-2-0-2-1-2-1-0,1,95,95,95,95.0,95.000000




22 3363


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-0-2-0-2,1,176,176,176,176.0,176.000000
1-2-0-2-1-0-0-2,1,155,155,155,155.0,155.000000
1-2-0-2-1-1-0-0,1,185,185,185,185.0,185.000000
1-2-0-2-1-1-0-2,1,9907,9907,9907,9907.0,9907.000000
1-2-0-2-1-1-1-2,1,471,471,471,471.0,471.000000
1-2-0-2-1-1-2-1,1,226,226,226,226.0,226.000000
1-2-0-2-1-1-2-2,1,327,327,327,327.0,327.000000
1-2-0-2-1-2-0-0,1,601,601,601,601.0,601.000000
1-2-0-2-1-2-1-0,1,85,85,85,85.0,85.000000
1-2-0-2-1-2-2-2,1,237,237,237,237.0,237.000000




23 3352


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-0-2-0-2,1,161,161,161,161.0,161.000000
1-2-0-2-1-0-0-1,1,28492,28492,28492,28492.0,28492.000000
1-2-0-2-1-0-0-2,1,171,171,171,171.0,171.000000
1-2-0-2-1-1-0-0,1,191,191,191,191.0,191.000000
1-2-0-2-1-1-0-2,1,9891,9891,9891,9891.0,9891.000000
1-2-0-2-1-1-1-2,1,504,504,504,504.0,504.000000
1-2-0-2-1-1-2-1,1,240,240,240,240.0,240.000000
1-2-0-2-1-1-2-2,1,322,322,322,322.0,322.000000
1-2-0-2-1-2-0-0,1,622,622,622,622.0,622.000000
1-2-0-2-1-2-1-0,1,71,71,71,71.0,71.000000




24 3340


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-0-1-1-0,1,906,906,906,906.0,906.000000
1-2-0-2-0-2-2-0,1,11953,11953,11953,11953.0,11953.000000
1-2-0-2-1-0-0-1,1,28150,28150,28150,28150.0,28150.000000
1-2-0-2-1-0-0-2,1,181,181,181,181.0,181.000000
1-2-0-2-1-1-0-0,1,201,201,201,201.0,201.000000
1-2-0-2-1-1-0-2,1,10084,10084,10084,10084.0,10084.000000
1-2-0-2-1-1-1-2,1,511,511,511,511.0,511.000000
1-2-0-2-1-1-2-1,1,232,232,232,232.0,232.000000
1-2-0-2-1-1-2-2,1,338,338,338,338.0,338.000000
1-2-0-2-1-2-0-0,1,625,625,625,625.0,625.000000




25 3330


Unnamed: 0_level_0,paradigme,freq,maxF,minF,medF,meanF
CF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-1-1-1-0-0-2-1,1,268,268,268,268.0,268.000000
1-2-0-2-0-0-2-2,1,247,247,247,247.0,247.000000
1-2-0-2-0-1-1-2,1,81,81,81,81.0,81.000000
1-2-0-2-0-2-0-1,1,663,663,663,663.0,663.000000
1-2-0-2-0-2-2-0,1,11858,11858,11858,11858.0,11858.000000
1-2-0-2-1-0-0-1,1,27674,27674,27674,27674.0,27674.000000
1-2-0-2-1-0-0-2,1,208,208,208,208.0,208.000000
1-2-0-2-1-1-0-0,1,236,236,236,236.0,236.000000
1-2-0-2-1-1-0-2,1,9990,9990,9990,9990.0,9990.000000
1-2-0-2-1-1-1-2,1,483,483,483,483.0,483.000000






In [None]:
# Persist every archived round. The file name combines nbCases, maxNbExposants and
# the upper-cased first elements of initLexDist, initCaseDist and initMarkDist
# (all defined outside this excerpt).
with open("historique-C%02d-E%02d-%s.pkl"%(nbCases,maxNbExposants,(initLexDist[0]+initCaseDist[0]+initMarkDist[0]).upper()),"wb") as outFile:
    pickle.dump(historique, outFile, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Summary by CF for the last archived round, restricted to rows with freq >= 1:
# number of rows, then sum, max, min, median and mean of freq.
h=historique[-1]
hLex=h[h["freq"]>=1]
# Group once and aggregate only the two columns that are reported, instead of
# running six groupbys that each aggregate every column of the frame.
cfGroups=hLex.groupby("CF")
df=cfGroups["paradigme"].count().to_frame().sort_values("paradigme")
cfFreqStats=cfGroups["freq"].agg(["sum","max","min","median","mean"])
# Column assignment aligns on the CF index, so the sort order of df is kept.
df["freq"]=cfFreqStats["sum"]
df["maxF"]=cfFreqStats["max"]
df["minF"]=cfFreqStats["min"]
df["medF"]=cfFreqStats["median"]
df["meanF"]=cfFreqStats["mean"]

In [None]:
# CFs counted exactly once, ordered by mean frequency.
df[df["paradigme"]==1].sort_values("meanF")

In [None]:
# Total of the per-CF counts.
df["paradigme"].sum()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist

In [None]:
# CF labels in descending label order. Index.sort_values takes no column name:
# a positional "paradigme" is consumed as a truthy return_indexer, which makes
# the call return an (index, indexer) pair and forces a [0] to unpack it.
df.index.sort_values(ascending=False).tolist()

In [None]:
def categoricalExponents(cf,markCounts=None):
    """One-hot encode a CF label such as ``"1-0-.-2"``.

    Parameters
    ----------
    cf : str
        Dash-separated label. Each field is either an integer or ``"."``,
        the latter matching every value of its position.
    markCounts : mapping, optional
        Position in the label -> number of slots emitted for that position.
        Defaults to the global ``nbMarkCase``.

    Returns
    -------
    list of int
        For every position, in the iteration order of ``markCounts``, one
        slot per value ``i`` in ``range(count)``: 1 when the field is ``"."``
        or equals ``i``, 0 otherwise.
    """
    if markCounts is None:
        markCounts=nbMarkCase
    cfMarks=cf.split("-")
    result=[]
    # items() works on Python 2 and 3 alike, unlike iteritems().
    for k,v in markCounts.items():
        wildcard=cfMarks[k]=="."
        for i in range(v):
            result.append(1 if wildcard or int(cfMarks[k])==i else 0)
    return result

In [None]:
#categoricalExponents(".-2-2-2-2-2-2-2")

In [None]:
# One encoded row per CF label, in the same order as labelCF (descending label
# order). Index.sort_values takes no column name: a positional "paradigme" is
# consumed as a truthy return_indexer, hence the [0] it used to require.
labelCF=df.index.sort_values(ascending=False).tolist()
classesFlexionnelles=[categoricalExponents(cf) for cf in labelCF]

In [None]:
#classesFlexionnelles,labelCF

In [None]:
# Hierarchical clustering of the encoded CFs with scipy's default method and metric.
Z = linkage(classesFlexionnelles)

In [None]:
# Draw the full dendrogram, one leaf per CF label (hence the very wide figure).
plt.figure(figsize=(250, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
    labels=labelCF
)
plt.show()



In [None]:
# Draw a condensed view of the same clustering, cut down to the last p merges.
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    Z,
    truncate_mode='lastp',  # truncation mode: keep only the last p merged clusters
    p=12,  # number of merged clusters kept
    show_leaf_counts=False,  # otherwise numbers in brackets are counts
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,  # to get a distribution impression in truncated branches
    labels=labelCF
)
plt.show()

In [None]:
# Reload an archive using the same file-name recipe as the dump cell.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load archives that this notebook produced.
with open("historique-C%02d-E%02d-%s.pkl"%(nbCases,maxNbExposants,(initLexDist[0]+initCaseDist[0]+initMarkDist[0]).upper()),"rb") as inFile:
    history=pickle.load(inFile)

In [None]:
# Last reloaded round, rows with a positive freq only, counted per CF.
h=history[-1]
h=h[(h["freq"]>0)]
dfGroup=h.groupby("CF")
# Non-null paradigme entries per CF, kept as a one-column DataFrame.
countCF=dfGroup.count()[["paradigme"]]

In [None]:
# Counts per CF, largest first, for CFs counted more than once.
# Size, title and axis labels are applied to the Axes returned by .plot():
# calling plt.figure() after plotting opens a second, empty figure, and the
# pyplot title/label calls would then decorate that one instead of the curve.
ax=countCF[countCF["paradigme"]>1].sort_values("paradigme",ascending=False).plot(figsize=(25, 10))
ax.set_title(u'Lexème par CF')
ax.set_xlabel('CF')
ax.set_ylabel(u'Nb de lexèmes')
