In [1]:
import pandas as pd
import numpy as np
import itertools as it
import random as rd
import networkx as nx
import re
debug=False

In [2]:
%matplotlib

Using matplotlib backend: MacOSX


In [3]:
def probElements(elements):
    dictResult={}
    sumFreq=sum([v for k,v in elements.iteritems()])
    for k,v in elements.iteritems():
        dictResult[k]=float(v)/sumFreq
    return dictResult
    
def distElements(elements):
    prop=[float(1)/c for c in range(1,len(elements)+1)]
    sumProp=sum(prop)
    dist=[p/sumProp for p in prop]
    return dist

## Paramétrage du paradigme
Les marks définissent les cases du paradigme
- le nom de la case correspond à la concaténation des valeurs possibles
 - abc correspond à une case qui peut prendre comme valeur a, b ou c
 
Les classes sont calculées en faisant le produit cartésien des différentes valeurs x cases

In [4]:
marks=["abc","def","ghi","klm","nop","qrs","tuv","wxy"]
marksN=[m+"N" for m in marks]
distMarks=distElements(marksN)

classes=pd.DataFrame(columns=marks)

cf=0
for element in it.product(*marks):
    cf+=1
    row = pd.Series({m:element[n] for n,m in enumerate(marks)},name=cf)
    classes=classes.append(row)

nbClasses=len(classes)
classes = classes.sample(frac=1).reset_index(drop=True)
#classes

In [5]:
nbGrammaireCF=len(classes)

## Paramètres du lexique
On fixe le nombre de lexèmes présents dans le corpus d'apprentissage et le ratio de formes attestées
- nombre de lexèmes
- ratio des formes-cases attestées par rapport au potentiel complet
 - 25% correspond au ratio observé pour les verbes dans Lex3
- nbFormes est l'ordre de grandeur à respecter pour l'échantillon en nombre de types

In [6]:
ratio=1.0
nbLexemes=25000

nbFormes=int(nbLexemes*len(marks)*ratio)
nbFormes

200000

## Distribution des lexèmes et des CF
Les différents éléments suivent des distributions par Zipf.
- distElements renvoie une liste de probabilités correspondant au nombre d'éléments suivant une Zipf(x)

Chaque lexème a une fréquence de lemme qui correspond à son rang et reçoit une CF tirée au hasard suivant une loi de Zipf.
- np.random.choice(cfs,p=distCFs)

In [7]:
cfs=range(nbClasses)
distCFs=distElements(cfs)

lexs=range(nbLexemes)
distLexs=distElements(lexs)

lexemes=pd.DataFrame(columns=["CF","P"]+marks)
for l in range(nbLexemes):
    cf=np.random.choice(cfs,p=distCFs)
    dictL={"CF":int(cf), "P":distLexs[l]}
    dictL.update(classes.iloc[cf].to_dict())
    row=pd.Series(dictL,name=l)
    lexemes=lexemes.append(row)
#lexemes.index+=1

In [8]:
lexemes

Unnamed: 0,CF,P,abc,def,ghi,klm,nop,qrs,tuv,wxy
0,1,0.093424,c,e,h,m,o,q,u,w
1,1478,0.046712,b,f,i,m,n,r,v,y
2,86,0.031141,c,f,i,l,n,r,t,y
3,0,0.023356,c,d,g,k,o,q,u,y
4,11,0.018685,a,d,i,k,n,q,t,w
5,120,0.015571,c,d,h,k,n,s,v,w
6,2318,0.013346,a,e,g,l,o,q,u,x
7,3910,0.011678,c,d,h,l,n,s,u,w
8,11,0.010380,a,d,i,k,n,q,t,w
9,2980,0.009342,c,f,g,k,o,q,u,w


In [9]:
print "Nombre de types potentiel",lexemes[marks].count().sum()

Nombre de types potentiel 200000


## Nombre de CF tirées dans le lexique
Ce nombre représente le maximum de CF qui pourraient être nécessaires pour la description.
Le nombre de CF nécessaire est au moins égal au nombre de CF qui possèdent un paradigme exemplaire, mais les CF qui n'ont qu'une représentation partielle de leur paradigme ne sont pas obligatoirement nécessaires à la description.

In [10]:
nbLexiqueCF=len(lexemes.groupby("CF").groups.keys())
print nbLexiqueCF,"CF dans le lexique sur",nbGrammaireCF,"CF dans la grammaire"

4025 CF dans le lexique sur 6561 CF dans la grammaire


## Constitution du DF pour le tirage

In [11]:
tiragesColonnes=pd.DataFrame(0, index=np.arange(len(lexs)), columns=marksN)
tirages=pd.concat([lexemes, tiragesColonnes], axis=1, sort=False)
#tirages

### Tirage des lexèmes pour les formes attestées
- lexTirs contient le nombre de token à tirer pour chaque lexème

=> changer le random.choice pour faire le choix en une seule fois et ensuite calculer les tirs pour chaque lexème

In [12]:
#tirages=pd.DataFrame(0, index=np.arange(len(lexs)), columns=marks)
#tirages.index+=1

nbTokens=5000000
lexTirs={}
listeTirs=np.random.choice(lexs,nbTokens,replace=True,p=distLexs)
for t in listeTirs:
    if not t in lexTirs:
        lexTirs[t]=0
    lexTirs[t]+=1
#lexTirs

In [13]:
newLexFreq=probElements(lexTirs)

In [14]:
for l in newLexFreq:
    print l,"%0.2f"%((newLexFreq[l]-distLexs[l])/distLexs[l]*100)

0 -0.06
1 0.04
2 -0.10
3 -0.61
4 -0.07
5 -0.37
6 -0.09
7 0.22
8 0.46
9 -0.51
10 -0.81
11 0.66
12 0.09
13 0.02
14 -0.72
15 0.27
16 -0.20
17 0.31
18 -1.09
19 -0.22
20 0.03
21 -1.00
22 -0.40
23 0.82
24 -1.25
25 0.14
26 1.15
27 0.02
28 0.95
29 0.73
30 0.81
31 -0.33
32 -0.73
33 1.28
34 -0.47
35 0.43
36 1.57
37 0.41
38 1.47
39 0.23
40 -0.73
41 0.04
42 0.68
43 -0.83
44 0.36
45 -0.16
46 0.10
47 -0.99
48 0.19
49 -1.92
50 0.83
51 -0.82
52 -0.63
53 0.15
54 1.25
55 1.39
56 0.58
57 2.08
58 -1.03
59 0.29
60 1.10
61 1.64
62 1.06
63 1.61
64 -0.24
65 1.39
66 1.25
67 -1.85
68 1.72
69 -1.16
70 1.62
71 0.06
72 1.49
73 0.75
74 -2.83
75 0.53
76 -1.62
77 -0.78
78 1.56
79 1.28
80 -2.22
81 0.02
82 -1.74
83 -1.51
84 0.21
85 1.30
86 -0.84
87 -0.66
88 1.04
89 -0.22
90 -0.02
91 0.03
92 4.34
93 -0.25
94 -0.53
95 -0.14
96 -1.49
97 2.76
98 4.00
99 2.69
100 -0.67
101 -1.35
102 0.72
103 0.50
104 0.95
105 -1.79
106 0.24
107 0.39
108 0.83
109 1.26
110 2.18
111 3.32
112 3.85
113 0.28
114 -1.38
115 2.61
116 -3.09
117 1.78


1756 6.82
1757 -4.41
1758 2.80
1759 10.77
1760 -5.38
1761 -4.57
1762 -5.65
1763 2.34
1764 2.40
1765 1.70
1766 1.38
1767 2.19
1768 -7.60
1769 3.07
1770 -8.25
1771 18.36
1772 9.69
1773 1.02
1774 5.26
1775 0.75
1776 4.61
1777 -5.98
1778 -1.36
1779 -2.45
1780 -6.97
1781 6.05
1782 -5.72
1783 -2.61
1784 -3.32
1785 -0.21
1786 0.99
1787 0.67
1788 1.49
1789 5.76
1790 3.52
1791 -4.86
1792 2.49
1793 -5.91
1794 1.06
1795 -2.34
1796 10.02
1797 -4.54
1798 0.90
1799 4.81
1800 -0.53
1801 9.17
1802 3.06
1803 0.41
1804 11.67
1805 -2.57
1806 0.58
1807 9.15
1808 -13.64
1809 2.68
1810 -0.75
1811 7.06
1812 13.72
1813 -3.69
1814 -10.24
1815 -3.59
1816 -4.31
1817 -8.54
1818 5.14
1819 -11.56
1820 -0.20
1821 -1.71
1822 -7.90
1823 8.16
1824 0.80
1825 5.15
1826 -1.44
1827 -7.25
1828 -13.86
1829 -0.10
1830 7.79
1831 -6.27
1832 7.52
1833 -4.59
1834 -10.04
1835 2.19
1836 3.43
1837 -2.81
1838 -5.91
1839 2.02
1840 9.56
1841 1.34
1842 -1.76
1843 2.64
1844 0.32
1845 -6.34
1846 3.60
1847 -8.61
1848 0.54
1849 14.46
1850 -

2968 -3.39
2969 6.18
2970 -9.05
2971 -8.38
2972 15.83
2973 6.32
2974 -3.19
2975 -1.89
2976 -5.04
2977 2.64
2978 3.31
2979 -5.58
2980 11.04
2981 12.35
2982 -10.60
2983 -2.90
2984 -4.15
2985 -1.56
2986 9.35
2987 5.54
2988 -8.50
2989 15.22
2990 -1.39
2991 -3.28
2992 -8.38
2993 1.91
2994 17.97
2995 10.96
2996 -4.40
2997 14.88
2998 9.14
2999 -10.09
3000 0.86
3001 -13.88
3002 -2.93
3003 13.18
3004 -13.80
3005 -7.98
3006 1.71
3007 -8.56
3008 12.08
3009 -7.85
3010 -8.47
3011 -13.60
3012 -10.34
3013 10.33
3014 4.56
3015 0.72
3016 -0.54
3017 -4.38
3018 -2.41
3019 -14.01
3020 -3.64
3021 11.92
3022 6.13
3023 -5.48
3024 0.38
3025 2.35
3026 0.44
3027 1.12
3028 -4.68
3029 -12.43
3030 5.12
3031 -1.34
3032 -9.75
3033 -4.52
3034 0.71
3035 3.99
3036 -7.03
3037 10.56
3038 13.85
3039 4.13
3040 15.23
3041 3.54
3042 6.84
3043 4.92
3044 1.04
3045 16.07
3046 -6.07
3047 -2.12
3048 -0.79
3049 10.35
3050 -3.99
3051 7.81
3052 3.92
3053 9.18
3054 0.72
3055 -0.56
3056 4.06
3057 -4.42
3058 6.09
3059 -2.39
3060 2.88
3

4068 0.17
4069 9.78
4070 -5.88
4071 12.45
4072 2.02
4073 -4.06
4074 9.05
4075 0.35
4076 -2.25
4077 -10.95
4078 2.17
4079 -10.04
4080 4.84
4081 -7.37
4082 4.02
4083 -16.07
4084 17.18
4085 -1.16
4086 4.12
4087 -1.11
4088 6.79
4089 -3.69
4090 10.35
4091 16.51
4092 16.54
4093 0.79
4094 -4.45
4095 6.98
4096 -7.91
4097 7.91
4098 1.79
4099 6.20
4100 0.08
4101 -12.19
4102 -3.38
4103 16.85
4104 -4.21
4105 -3.31
4106 -12.08
4107 -1.50
4108 -13.79
4109 8.22
4110 14.41
4111 -5.81
4112 -18.11
4113 17.13
4114 5.71
4115 -13.65
4116 4.00
4117 0.50
4118 -8.29
4119 -14.45
4120 -1.19
4121 -10.87
4122 0.62
4123 -12.60
4124 7.73
4125 -16.09
4126 -4.58
4127 -3.68
4128 0.77
4129 4.33
4130 -3.61
4131 4.38
4132 12.37
4133 10.62
4134 0.91
4135 -15.88
4136 -7.01
4137 -14.07
4138 -3.42
4139 3.69
4140 2.83
4141 -3.35
4142 -20.18
4143 -3.30
4144 -2.39
4145 -12.13
4146 8.31
4147 3.01
4148 7.47
4149 14.61
4150 -16.47
4151 -2.23
4152 4.02
4153 -17.30
4154 -4.82
4155 7.65
4156 1.45
4157 25.51
4158 3.28
4159 -0.26
4160 

5236 -1.34
5237 -18.14
5238 3.18
5239 9.93
5240 14.44
5241 -18.08
5242 -3.47
5243 -3.45
5244 -14.66
5245 8.94
5246 3.34
5247 -3.38
5248 6.75
5249 14.64
5250 -5.57
5251 -2.18
5252 4.58
5253 28.22
5254 -29.13
5255 1.27
5256 3.54
5257 8.06
5258 -3.18
5259 -17.80
5260 8.12
5261 22.79
5262 -4.23
5263 2.55
5264 4.82
5265 -18.83
5266 -7.54
5267 3.75
5268 -6.38
5269 -6.36
5270 2.68
5271 10.60
5272 4.98
5273 7.26
5274 -11.92
5275 15.21
5276 3.93
5277 -1.70
5278 -2.81
5279 2.86
5280 0.62
5281 4.03
5282 -19.70
5283 1.81
5284 6.35
5285 -10.60
5286 7.52
5287 -3.78
5288 -4.89
5289 -10.53
5290 5.34
5291 -0.31
5292 -16.15
5293 7.67
5294 6.55
5295 14.51
5296 -10.42
5297 -2.46
5298 -24.00
5299 7.79
5300 -6.94
5301 4.42
5302 -1.23
5303 -15.98
5304 16.98
5305 2.23
5306 -12.52
5307 -9.09
5308 -2.26
5309 -11.33
5310 -9.04
5311 8.03
5312 2.37
5313 11.49
5314 -5.56
5315 0.15
5316 -5.53
5317 -24.86
5318 -6.63
5319 -15.72
5320 -5.45
5321 9.37
5322 15.09
5323 0.30
5324 12.86
5325 -0.80
5326 16.32
5327 7.22
5328 

6350 1.97
6351 -2.09
6352 -8.88
6353 -10.22
6354 -15.65
6355 18.38
6356 3.43
6357 -8.81
6358 2.10
6359 0.75
6360 -3.32
6361 6.23
6362 2.16
6363 -19.62
6364 -7.34
6365 2.21
6366 -7.31
6367 -3.21
6368 -4.56
6369 -14.09
6370 2.29
6371 3.67
6372 -16.78
6373 -8.58
6374 -37.22
6375 -1.72
6376 -0.34
6377 -5.79
6378 9.25
6379 7.90
6380 21.58
6381 -0.26
6382 -11.18
6383 12.07
6384 6.62
6385 -0.20
6386 1.18
6387 1.20
6388 5.32
6389 -9.72
6390 1.24
6391 -2.84
6392 -11.04
6393 4.03
6394 4.05
6395 6.80
6396 -19.20
6397 -8.23
6398 -2.74
6399 20.57
6400 -10.93
6401 6.90
6402 12.40
6403 -10.89
6404 23.40
6405 11.08
6406 13.84
6407 5.63
6408 26.23
6409 9.78
6410 7.05
6411 -13.52
6412 5.71
6413 -20.36
6414 1.62
6415 -7.97
6416 -3.84
6417 14.04
6418 -9.31
6419 4.45
6420 4.47
6421 7.23
6422 4.50
6423 8.64
6424 -25.73
6425 7.30
6426 -6.44
6427 14.22
6428 5.98
6429 4.62
6430 15.65
6431 19.79
6432 11.55
6433 15.70
6434 0.56
6435 17.11
6436 -14.56
6437 -15.93
6438 11.65
6439 13.05
6440 -2.10
6441 0.67
6442 7.

7511 -8.34
7512 6.15
7513 -5.09
7514 11.01
7515 1.37
7516 -5.06
7517 14.27
7518 -5.03
7519 19.13
7520 -14.67
7521 -3.38
7522 25.62
7523 4.70
7524 1.49
7525 16.00
7526 -8.15
7527 6.36
7528 4.77
7529 6.39
7530 -1.65
7531 -1.64
7532 8.05
7533 12.90
7534 -4.83
7535 1.64
7536 6.49
7537 0.05
7538 -12.85
7539 9.76
7540 -6.37
7541 -4.74
7542 -11.19
7543 -6.33
7544 4.99
7545 6.62
7546 -17.60
7547 -4.66
7548 0.20
7549 17.99
7550 -11.09
7551 -1.38
7552 5.10
7553 9.97
7554 6.75
7555 -14.27
7556 10.01
7557 10.02
7558 -7.76
7559 14.91
7560 1.97
7561 -6.11
7562 11.72
7563 8.49
7564 -4.45
7565 -6.06
7566 10.15
7567 23.13
7568 -2.78
7569 3.72
7570 0.49
7571 -10.85
7572 10.24
7573 -10.82
7574 15.14
7575 -28.64
7576 11.92
7577 11.94
7578 3.84
7579 -5.88
7580 -5.87
7581 -7.48
7582 13.63
7583 13.65
7584 -20.43
7585 13.68
7586 -15.54
7587 3.96
7588 5.60
7589 5.62
7590 8.88
7591 0.77
7592 -0.85
7593 26.80
7594 17.07
7595 7.32
7596 -0.79
7597 18.74
7598 15.50
7599 9.01
7600 25.29
7601 -15.37
7602 12.31
7603 -

8698 -3.16
8699 0.57
8700 2.45
8701 0.60
8702 13.65
8703 -10.56
8704 -6.82
8705 13.69
8706 -16.12
8707 -17.98
8708 8.14
8709 -8.63
8710 -10.49
8711 11.90
8712 8.18
8713 2.60
8714 30.60
8715 4.49
8716 6.37
8717 8.25
8718 28.79
8719 -1.06
8720 10.15
8721 -30.91
8722 4.57
8723 10.19
8724 -1.01
8725 28.89
8726 -27.14
8727 12.11
8728 -4.70
8729 14.00
8730 21.49
8731 -6.53
8732 -15.87
8733 0.97
8734 -15.85
8735 -2.75
8736 -12.09
8737 4.75
8738 -2.72
8739 14.13
8740 2.92
8741 -17.66
8742 4.81
8743 -6.41
8744 6.71
8745 2.98
8746 8.61
8747 8.62
8748 -13.84
8749 -19.45
8750 -6.33
8751 -10.07
8752 6.81
8753 -26.91
8754 -6.29
8755 -4.40
8756 3.11
8757 -2.51
8758 3.13
8759 -32.49
8760 27.54
8761 -4.34
8762 -13.71
8763 -17.45
8764 20.09
8765 31.36
8766 -0.53
8767 6.99
8768 -8.02
8769 -2.37
8770 -4.24
8771 8.92
8772 -9.85
8773 8.94
8774 -9.83
8775 -0.43
8776 3.34
8777 -9.80
8778 -2.27
8779 29.69
8780 3.39
8781 -7.88
8782 3.41
8783 -0.34
8784 -17.25
8785 -2.19
8786 -19.11
8787 3.47
8788 -26.62
8789 7.

9754 14.86
9755 -12.28
9756 4.44
9757 31.60
9758 -20.61
9759 27.45
9760 -3.88
9761 2.40
9762 0.32
9763 -10.12
9764 4.52
9765 -16.37
9766 -8.00
9767 29.65
9768 -1.71
9769 10.85
9770 10.86
9771 -1.68
9772 -10.04
9773 8.80
9774 -28.85
9775 17.20
9776 2.56
9777 -3.71
9778 0.49
9779 -20.44
9780 0.51
9781 6.80
9782 -16.23
9783 -14.12
9784 -12.02
9785 6.84
9786 29.90
9787 -16.18
9788 -9.89
9789 -5.69
9790 2.71
9791 -9.86
9792 0.63
9793 -9.84
9794 -1.45
9795 -1.44
9796 9.06
9797 -14.00
9798 32.16
9799 -9.79
9800 -9.78
9801 -11.87
9802 -3.46
9803 9.14
9804 -3.44
9805 -1.34
9806 -13.92
9807 2.88
9808 -9.70
9809 -18.10
9810 0.82
9811 -11.78
9812 15.54
9813 11.35
9814 11.36
9815 28.18
9816 -20.14
9817 9.29
9818 34.53
9819 -13.81
9820 21.94
9821 -28.51
9822 7.25
9823 -36.91
9824 15.68
9825 -9.55
9826 -9.54
9827 7.30
9828 -13.73
9829 -3.20
9830 19.96
9831 9.45
9832 -1.06
9833 13.68
9834 24.22
9835 17.92
9836 3.19
9837 13.73
9838 -15.75
9839 -15.74
9840 5.34
9841 20.10
9842 1.14
9843 11.69
9844 -5.16

10962 -38.98
10963 -22.54
10964 -27.23
10965 -22.53
10966 -10.78
10967 24.44
10968 19.76
10969 3.33
10970 5.69
10971 -6.05
10972 -3.69
10973 12.77
10974 -3.67
10975 -24.81
10976 22.20
10977 1.06
10978 -8.34
10979 -13.03
10980 -5.97
10981 -3.61
10982 8.16
10983 5.81
10984 1.12
10985 -5.93
10986 -3.57
10987 -22.37
10988 -36.48
10989 -1.19
10990 -15.29
10991 -19.99
10992 15.31
10993 -8.21
10994 10.63
10995 -12.90
10996 -1.12
10997 27.14
10998 15.38
10999 24.81
11000 -10.51
11001 3.63
11002 -8.14
11003 -15.19
11004 24.86
11005 8.38
11006 -3.39
11007 10.76
11008 -19.87
11009 6.06
11010 1.36
11011 6.08
11012 17.88
11013 -10.40
11014 -10.39
11015 17.91
11016 15.57
11017 -29.24
11018 1.43
11019 6.16
11020 -5.63
11021 17.98
11022 -10.33
11023 6.20
11024 3.85
11025 3.86
11026 27.47
11027 -7.93
11028 1.53
11029 39.32
11030 29.88
11031 -3.17
11032 3.92
11033 6.30
11034 -19.68
11035 6.32
11036 -22.03
11037 22.88
11038 25.25
11039 -7.83
11040 -26.73
11041 13.46
11042 -5.44
11043 8.76
11044 13.50
110

12048 -9.72
12049 -22.61
12050 -17.44
12051 -1.96
12052 -7.11
12053 -12.26
12054 -14.84
12055 13.56
12056 10.99
12057 0.67
12058 -19.97
12059 18.76
12060 13.61
12061 -1.88
12062 11.04
12063 5.89
12064 0.73
12065 8.49
12066 8.50
12067 -6.99
12068 0.76
12069 13.69
12070 -6.97
12071 -25.05
12072 -30.22
12073 -1.78
12074 8.57
12075 21.50
12076 18.93
12077 11.18
12078 13.78
12079 -1.73
12080 34.49
12081 -14.65
12082 -24.99
12083 13.82
12084 21.59
12085 8.67
12086 -1.67
12087 -12.02
12088 -12.01
12089 0.94
12090 8.71
12091 -11.99
12092 16.50
12093 3.56
12094 13.93
12095 0.99
12096 -1.59
12097 -27.48
12098 6.19
12099 1.02
12100 -19.69
12101 13.99
12102 16.59
12103 -9.31
12104 -4.12
12105 -14.48
12106 19.22
12107 -24.83
12108 -1.49
12109 1.11
12110 -17.03
12111 14.09
12112 -1.46
12113 -6.64
12114 -17.01
12115 16.72
12116 -22.18
12117 -6.61
12118 -4.01
12119 -6.59
12120 -14.37
12121 8.99
12122 -6.57
12123 27.18
12124 3.83
12125 -1.36
12126 -3.94
12127 24.62
12128 -32.49
12129 -6.52
12130 16.86


13247 -12.08
13248 -0.73
13249 19.13
13250 -14.90
13251 4.97
13252 -26.23
13253 -3.53
13254 13.50
13255 -14.87
13256 -20.54
13257 -9.18
13258 -9.17
13259 5.03
13260 2.20
13261 -14.83
13262 19.25
13263 2.22
13264 27.79
13265 -9.12
13266 -31.84
13267 -31.83
13268 -20.46
13269 -3.41
13270 -6.25
13271 19.33
13272 16.50
13273 -26.12
13274 5.15
13275 2.32
13276 -9.05
13277 8.02
13278 -11.88
13279 -0.50
13280 10.88
13281 -23.23
13282 -28.91
13283 13.75
13284 -11.84
13285 10.93
13286 5.24
13287 -6.13
13288 -20.34
13289 -6.11
13290 -14.64
13291 28.05
13292 19.52
13293 -17.47
13294 -17.46
13295 -6.07
13296 5.32
13297 -20.29
13298 -28.82
13299 39.51
13300 -25.97
13301 -11.72
13302 -25.96
13303 5.38
13304 36.72
13305 -0.30
13306 2.55
13307 -8.83
13308 5.42
13309 28.22
13310 -0.26
13311 -0.26
13312 -8.80
13313 14.01
13314 -20.19
13315 -5.93
13316 -5.92
13317 -5.91
13318 5.50
13319 8.36
13320 5.51
13321 -0.18
13322 2.68
13323 -0.17
13324 22.66
13325 -20.12
13326 -20.12
13327 8.42
13328 -14.40
13329 

14435 -10.38
14436 -38.19
14437 26.72
14438 -28.91
14439 11.29
14440 -7.26
14441 8.21
14442 51.50
14443 -22.70
14444 -1.04
14445 8.24
14446 -1.03
14447 8.25
14448 2.08
14449 -16.48
14450 -16.47
14451 -25.75
14452 17.57
14453 8.30
14454 -4.07
14455 11.41
14456 14.51
14457 -4.05
14458 -0.95
14459 -25.71
14460 20.74
14461 39.32
14462 20.75
14463 11.47
14464 -31.87
14465 5.29
14466 -0.89
14467 2.21
14468 57.97
14469 23.91
14470 -3.96
14471 -31.84
14472 -19.44
14473 -0.85
14474 -3.94
14475 11.56
14476 -19.42
14477 11.58
14478 -0.81
14479 -0.81
14480 2.30
14481 36.41
14482 -16.29
14483 20.93
14484 5.43
14485 5.44
14486 -6.96
14487 -28.66
14488 2.36
14489 -13.14
14490 24.09
14491 -19.34
14492 8.59
14493 -16.22
14494 -10.01
14495 -22.42
14496 -3.79
14497 -37.93
14498 -6.88
14499 2.44
14500 -37.91
14501 39.70
14502 2.46
14503 2.46
14504 21.10
14505 33.53
14506 -0.62
14507 -6.82
14508 8.71
14509 8.72
14510 -3.70
14511 -3.69
14512 -6.79
14513 -6.79
14514 11.86
14515 2.55
14516 -12.98
14517 -6.76


15481 -0.57
15482 -7.19
15483 -50.28
15484 -3.87
15485 -17.12
15486 -10.48
15487 6.10
15488 6.11
15489 -40.31
15490 22.70
15491 26.03
15492 -0.50
15493 6.14
15494 -40.29
15495 22.74
15496 12.80
15497 6.17
15498 -10.41
15499 -30.32
15500 46.01
15501 16.15
15502 19.48
15503 22.81
15504 12.86
15505 29.46
15506 -0.41
15507 12.88
15508 6.24
15509 -13.67
15510 -36.91
15511 -20.30
15512 -26.94
15513 -7.01
15514 -23.61
15515 -13.64
15516 -0.34
15517 -3.66
15518 12.96
15519 -20.26
15520 6.33
15521 -0.31
15522 12.99
15523 -0.30
15524 -20.23
15525 -6.93
15526 9.69
15527 19.67
15528 6.38
15529 -13.56
15530 26.34
15531 -0.25
15532 6.41
15533 -30.16
15534 33.03
15535 -6.87
15536 19.74
15537 16.42
15538 -3.53
15539 -6.85
15540 -6.84
15541 23.11
15542 -0.18
15543 3.16
15544 9.82
15545 16.48
15546 3.18
15547 -6.80
15548 -20.11
15549 -6.79
15550 3.20
15551 9.87
15552 16.53
15553 16.54
15554 -20.08
15555 -10.08
15556 6.57
15557 6.58
15558 -16.73
15559 -33.38
15560 9.93
15561 -0.06
15562 13.28
15563 -33.3

16683 3.58
16684 -10.70
16685 3.59
16686 -10.69
16687 10.75
16688 -3.54
16689 25.05
16690 3.62
16691 -21.39
16692 -28.53
16693 -21.38
16694 -7.08
16695 39.40
16696 -3.49
16697 10.81
16698 -7.05
16699 -7.05
16700 -21.34
16701 0.11
16702 50.18
16703 0.13
16704 -7.02
16705 0.14
16706 21.60
16707 10.88
16708 18.04
16709 18.05
16710 -6.99
16711 32.37
16712 3.76
16713 3.76
16714 0.19
16715 -14.12
16716 18.10
16717 0.21
16718 -24.84
16719 -10.52
16720 -10.51
16721 18.13
16722 -14.08
16723 -3.33
16724 32.48
16725 7.42
16726 -10.48
16727 -6.89
16728 -10.47
16729 0.28
16730 -3.29
16731 7.46
16732 -28.36
16733 36.13
16734 -3.27
16735 -10.43
16736 7.49
16737 3.91
16738 -10.41
16739 -21.16
16740 11.10
16741 0.35
16742 -17.56
16743 14.70
16744 21.88
16745 -10.38
16746 -35.47
16747 -10.37
16748 -3.19
16749 11.16
16750 0.41
16751 -24.69
16752 39.87
16753 29.12
16754 0.43
16755 14.79
16756 -28.25
16757 -3.14
16758 0.46
16759 -35.42
16760 0.47
16761 -31.82
16762 4.07
16763 14.84
16764 4.08
16765 54.34
1

17634 1.93
17635 -20.72
17636 -24.49
17637 -9.38
17638 -16.93
17639 -28.25
17640 1.97
17641 -1.80
17642 9.53
17643 -1.79
17644 -5.57
17645 13.33
17646 -16.89
17647 -20.66
17648 -16.88
17649 -13.10
17650 20.92
17651 5.81
17652 13.37
17653 -5.52
17654 -9.29
17655 -5.51
17656 -35.74
17657 -16.84
17658 -9.27
17659 20.98
17660 9.64
17661 20.99
17662 21.00
17663 5.88
17664 -31.93
17665 21.02
17666 -9.23
17667 -35.70
17668 9.69
17669 -1.65
17670 -24.34
17671 -1.64
17672 -1.63
17673 -1.63
17674 -12.97
17675 -16.75
17676 -1.61
17677 -16.74
17678 -9.17
17679 -5.38
17680 9.77
17681 -1.58
17682 -16.72
17683 -20.50
17684 13.58
17685 24.94
17686 9.81
17687 13.60
17688 -28.05
17689 -5.32
17690 -20.47
17691 32.56
17692 17.42
17693 -12.88
17694 -1.51
17695 -28.02
17696 13.66
17697 -5.28
17698 13.67
17699 13.68
17700 -16.63
17701 -5.26
17702 32.64
17703 -5.25
17704 2.34
17705 -16.61
17706 2.35
17707 -20.39
17708 2.36
17709 -31.76
17710 6.16
17711 -5.21
17712 2.38
17713 -27.95
17714 -31.74
17715 9.99
177

18743 -19.75
18744 -11.72
18745 4.34
18746 24.41
18747 -23.74
18748 4.36
18749 -35.78
18750 24.44
18751 -27.74
18752 -39.78
18753 -3.64
18754 8.41
18755 -23.71
18756 8.42
18757 -39.77
18758 4.41
18759 -15.66
18760 -23.69
18761 4.43
18762 28.54
18763 4.44
18764 -11.62
18765 -11.62
18766 12.49
18767 4.46
18768 -15.62
18769 16.53
18770 8.50
18771 -3.55
18772 -39.72
18773 20.57
18774 -35.69
18775 12.55
18776 4.51
18777 -23.62
18778 4.52
18779 4.53
18780 4.54
18781 -15.56
18782 36.71
18783 16.62
18784 40.75
18785 8.58
18786 -23.58
18787 -31.62
18788 -11.51
18789 -23.57
18790 -39.66
18791 24.71
18792 24.72
18793 4.61
18794 -19.53
18795 -27.57
18796 -3.42
18797 0.61
18798 -23.54
18799 0.62
18800 -7.43
18801 -19.50
18802 -51.70
18803 4.66
18804 -7.41
18805 0.65
18806 4.68
18807 -15.45
18808 0.66
18809 -3.36
18810 12.76
18811 4.71
18812 12.77
18813 -19.45
18814 -23.47
18815 8.76
18816 28.91
18817 -7.34
18818 4.75
18819 0.72
18820 16.85
18821 -15.38
18822 24.92
18823 32.98
18824 -11.34
18825 -15

19960 -14.54
19961 -35.90
19962 15.39
19963 15.39
19964 15.40
19965 28.23
19966 32.51
19967 -1.68
19968 6.87
19969 -27.32
19970 -5.94
19971 23.99
19972 19.72
19973 -23.03
19974 2.63
19975 19.74
19976 -18.74
19977 15.47
19978 28.31
19979 -31.56
19980 -27.28
19981 15.50
19982 2.67
19983 2.67
19984 6.96
19985 32.64
19986 15.53
19987 24.09
19988 -40.09
19989 6.99
19990 -5.85
19991 -44.36
19992 -22.96
19993 24.13
19994 -27.23
19995 2.74
19996 32.71
19997 2.75
19998 -5.81
19999 28.45
20000 7.04
20001 -31.49
20002 -14.36
20003 -35.76
20004 -35.76
20005 19.92
20006 41.34
20007 -5.77
20008 -5.76
20009 19.94
20010 7.10
20011 58.51
20012 28.53
20013 15.68
20014 -1.45
20015 -1.45
20016 -1.44
20017 -18.58
20018 -14.29
20019 -10.00
20020 -40.00
20021 7.16
20022 -48.56
20023 2.88
20024 7.17
20025 -14.26
20026 20.05
20027 -39.97
20028 -27.11
20029 -14.24
20030 -35.68
20031 -18.52
20032 20.08
20033 -9.93
20034 -1.35
20035 11.52
20036 28.68
20037 20.11
20038 -9.91
20039 24.41
20040 -14.19
20041 2.97
200

21046 17.15
21047 -0.87
21048 3.64
21049 3.65
21050 -9.87
21051 8.16
21052 -9.86
21053 -36.90
21054 -36.90
21055 -32.39
21056 -18.86
21057 17.21
21058 -0.82
21059 30.75
21060 26.24
21061 -18.84
21062 8.22
21063 -32.36
21064 -14.32
21065 3.72
21066 -27.84
21067 -5.29
21068 -18.81
21069 3.74
21070 8.26
21071 17.29
21072 26.32
21073 -0.75
21074 3.77
21075 21.82
21076 -0.73
21077 -23.29
21078 -32.31
21079 -14.26
21080 44.41
21081 -36.82
21082 -23.27
21083 12.84
21084 17.36
21085 -23.26
21086 -9.72
21087 -14.23
21088 -14.22
21089 39.96
21090 -9.70
21091 3.85
21092 8.37
21093 -0.65
21094 -14.20
21095 -0.64
21096 17.43
21097 -0.63
21098 -5.15
21099 -14.18
21100 8.41
21101 -5.13
21102 -0.61
21103 -32.23
21104 17.47
21105 8.44
21106 -5.11
21107 -5.11
21108 -23.18
21109 -9.62
21110 -9.61
21111 8.47
21112 -9.60
21113 -23.16
21114 -18.64
21115 3.97
21116 40.14
21117 17.54
21118 -5.06
21119 -32.18
21120 13.04
21121 -41.22
21122 -23.13
21123 -14.08
21124 -23.12
21125 53.77
21126 8.55
21127 -23.11
21

22231 -4.81
22232 9.47
22233 -0.04
22234 4.72
22235 14.25
22236 -23.83
22237 -28.59
22238 -38.11
22239 4.74
22240 -42.86
22241 -19.05
22242 61.90
22243 23.81
22244 -23.81
22245 4.77
22246 -9.51
22247 4.78
22248 33.36
22249 -33.31
22250 19.09
22251 -19.02
22252 -23.78
22253 -19.01
22254 19.11
22255 -9.47
22256 14.35
22257 4.83
22258 -4.70
22259 4.84
22260 -4.69
22261 19.14
22262 4.85
22263 -9.44
22264 14.39
22265 14.40
22266 -18.96
22267 19.18
22268 -4.65
22269 19.19
22270 0.12
22271 0.13
22272 -14.17
22273 -4.63
22274 9.68
22275 28.76
22276 -18.93
22277 -18.92
22278 4.93
22279 33.55
22280 24.02
22281 -9.37
22282 -14.13
22283 -4.59
22284 -4.59
22285 38.36
22286 4.97
22287 9.74
22288 47.92
22289 14.52
22290 4.98
22291 -9.33
22292 -4.55
22293 -18.87
22294 0.23
22295 33.65
22296 -14.08
22297 9.79
22298 9.80
22299 -9.30
22300 14.58
22301 9.81
22302 43.24
22303 0.27
22304 -4.50
22305 24.16
22306 33.71
22307 -14.04
22308 -4.48
22309 -4.48
22310 5.08
22311 -14.02
22312 -4.47
22313 43.31
22314 

23483 10.60
23484 -34.64
23485 -19.55
23486 40.78
23487 -19.55
23488 -44.69
23489 20.69
23490 5.61
23491 -24.56
23492 -19.53
23493 20.71
23494 -14.49
23495 -24.55
23496 10.66
23497 5.64
23498 -4.42
23499 -14.48
23500 40.87
23501 -44.66
23502 10.69
23503 -29.56
23504 10.70
23505 -24.52
23506 10.71
23507 -9.41
23508 0.65
23509 15.76
23510 -29.54
23511 -4.37
23512 15.77
23513 -54.70
23514 -4.35
23515 -34.55
23516 10.76
23517 -19.45
23518 -14.41
23519 25.88
23520 -29.51
23521 -24.47
23522 5.75
23523 46.04
23524 -29.49
23525 -49.64
23526 15.84
23527 10.81
23528 -14.37
23529 -34.52
23530 20.90
23531 -4.28
23532 10.83
23533 10.84
23534 20.92
23535 25.96
23536 20.93
23537 46.13
23538 -9.30
23539 -19.37
23540 -24.41
23541 15.92
23542 -19.36
23543 46.17
23544 5.85
23545 10.89
23546 -54.63
23547 -24.38
23548 -14.30
23549 36.12
23550 21.00
23551 10.92
23552 -9.24
23553 21.02
23554 0.85
23555 -4.19
23556 10.95
23557 10.95
23558 26.09
23559 5.92
23560 31.14
23561 26.10
23562 41.24
23563 -9.20
23564 

24569 5.20
24570 10.46
24571 5.21
24572 42.03
24573 21.00
24574 26.26
24575 5.22
24576 10.49
24577 10.49
24578 -31.60
24579 21.03
24580 -26.33
24581 21.04
24582 26.30
24583 -0.01
24584 -5.26
24585 15.79
24586 36.85
24587 -10.52
24588 -10.51
24589 10.55
24590 -26.30
24591 10.56
24592 -5.23
24593 21.10
24594 -15.76
24595 -21.02
24596 -47.34
24597 5.32
24598 0.06
24599 31.66
24600 -42.07
24601 21.13
24602 0.07
24603 21.14
24604 0.08
24605 52.76
24606 10.62
24607 31.70
24608 31.71
24609 5.37
24610 15.91
24611 10.65
24612 -15.69
24613 -15.69
24614 -5.15
24615 21.20
24616 37.02
24617 0.13
24618 -15.67
24619 5.41
24620 15.96
24621 -31.48
24622 42.32
24623 21.24
24624 21.25
24625 10.71
24626 21.26
24627 63.44
24628 -5.09
24629 5.45
24630 -26.18
24631 5.46
24632 -15.63
24633 31.84
24634 31.84
24635 -20.89
24636 -20.89
24637 42.41
24638 21.32
24639 42.42
24640 -20.87
24641 37.16
24642 10.79
24643 -15.59
24644 -5.03
24645 -41.96
24646 16.08
24647 -41.96
24648 10.81
24649 -5.01
24650 47.76
24651 -

### Tirage des formes-cases
- cellTirs contient le nombre de fois que chaque forme-case a été tirée

In [15]:
for l in lexTirs:
    cellTirs={}
    listeCases=np.random.choice(marksN,lexTirs[l],replace=True,p=distMarks)
    for c in listeCases:
        if not c in cellTirs:
            cellTirs[c]=0
        cellTirs[c]+=1
    if len(cellTirs)==8 and debug:
        print l,tirages.loc[l,"CF"], cellTirs
    for c in cellTirs:
        tirages.loc[l,c]=cellTirs[c]
    

### Nombre de formes du tirage brut
Le nombre de formes du tirage brut correspond au nombre de tokens paramétré si ce nombre est plus élevé que le nombre de formes calculé via le *ratio*, le tirage est réduit pour obtenir un nombre de forme de l'ordre de grandeur désiré

In [16]:
tirages[marksN].replace(0,np.nan).count().sum()

184261

## Nouvelles fréquences des cases

In [17]:
newDistMarks=probElements(tirages[marksN].sum().to_dict())
distMarks,newDistMarks

([0.3679369250985546,
  0.1839684625492773,
  0.12264564169951818,
  0.09198423127463864,
  0.07358738501971092,
  0.06132282084975909,
  0.05256241787122208,
  0.04599211563731932],
 {'abcN': 0.3680168,
  'defN': 0.1838878,
  'ghiN': 0.122598,
  'klmN': 0.091913,
  'nopN': 0.0736738,
  'qrsN': 0.0613472,
  'tuvN': 0.0526744,
  'wxyN': 0.045889})

## Réduction du nombre de types
Pour obtenir un nombre de type compatible avec l'ordre de grandeur fixé via *ratio*, on fixe un seuil de tokens pour inclure les formes dans le tirage.
- si le seuil est fixé à 3, par exemple, les formes ayant moins de 3 attestations sont éliminées
- le seuil est calculé pour s'approcher de l'ordre de grandeur par le haut

Les lexèmes qui n'ont aucune forme dans l'échantillon sont éliminés.
- result=result.dropna(thresh=len(marks)+2+1) => un lexème qui a au moins une forme doit avoir ses 2 colonnes CF, P remplies ainsi que toutes les marques, len(marks), plus au moins une forme tirée (+1)

In [18]:
def reduceTirages(df,seuil):
    result=df.copy()
    for n in range(seuil+1):
        result[marksN]=result[marksN].replace(n,np.nan)
    result=result.dropna(thresh=len(marks)+2+1)
    return result

In [19]:
tiragesReduits=tirages.copy()
for i in range(nbFormes):
    if reduceTirages(tirages,i)[marksN].count().sum()<nbFormes:
        break
if i>0:
    tiragesReduits=reduceTirages(tirages,i-1)
print "Nombre de types réduit pris en compte",tiragesReduits[marksN].count().sum()

Nombre de types réduit pris en compte 200000


In [20]:
#tiragesReduits

In [21]:
print "Nombre de lexèmes dans l'échantillon",len(tiragesReduits)

Nombre de lexèmes dans l'échantillon 25000


## Ajouter un champ pour regex

In [22]:
def ajouterChampParadigme(x):
    result=""
    for c in marks:
        if x[c+"N"]>0 and x[c]==x[c]:
            result+=x[c]
        else:
            result+="."
    return result

In [23]:
tiragesReduits["regex"]=tiragesReduits.apply(ajouterChampParadigme,axis=1)

In [24]:
tiragesReduits[marksN]=tiragesReduits[marksN].replace({0:np.nan})

In [25]:
fullParadigms=tiragesReduits.dropna()
print
print "Nombre de lexèmes avec un paradigme complet dans l'échantillon",len(fullParadigms)
fullParadigms


Nombre de lexèmes avec un paradigme complet dans l'échantillon 14763


Unnamed: 0,CF,P,abc,def,ghi,klm,nop,qrs,tuv,wxy,abcN,defN,ghiN,klmN,nopN,qrsN,tuvN,wxyN,regex
0,1,0.093424,c,e,h,m,o,q,u,w,171850.0,85801.0,57295.0,43058.0,34190.0,28717.0,24448.0,21467.0,cehmoquw
1,1478,0.046712,b,f,i,m,n,r,v,y,85795.0,42992.0,28603.0,21516.0,17233.0,14399.0,12303.0,10821.0,bfimnrvy
2,86,0.031141,c,f,i,l,n,r,t,y,57201.0,28472.0,19188.0,14415.0,11422.0,9513.0,8209.0,7130.0,cfilnrty
3,0,0.023356,c,d,g,k,o,q,u,y,42719.0,21205.0,14121.0,10667.0,8608.0,7143.0,6233.0,5370.0,cdgkoquy
4,11,0.018685,a,d,i,k,n,q,t,w,34475.0,17043.0,11549.0,8595.0,6810.0,5771.0,4968.0,4144.0,adiknqtw
5,120,0.015571,c,d,h,k,n,s,v,w,28427.0,14433.0,9428.0,7233.0,5824.0,4679.0,3936.0,3604.0,cdhknsvw
6,2318,0.013346,a,e,g,l,o,q,u,x,24516.0,12076.0,8078.0,6184.0,4992.0,4189.0,3599.0,3037.0,aegloqux
7,3910,0.011678,c,d,h,l,n,s,u,w,21541.0,10891.0,7100.0,5279.0,4368.0,3528.0,3120.0,2690.0,cdhlnsuw
8,11,0.010380,a,d,i,k,n,q,t,w,19237.0,9666.0,6348.0,4770.0,3881.0,3079.0,2720.0,2439.0,adiknqtw
9,2980,0.009342,c,f,g,k,o,q,u,w,17251.0,8525.0,5657.0,4269.0,3370.0,2886.0,2447.0,2067.0,cfgkoquw


In [26]:
nbCompleteCF=len(fullParadigms.groupby("regex"))
print "Nombre de CF exemplaires pleines dans l'échantillon",nbCompleteCF

Nombre de CF exemplaires pleines dans l'échantillon 3092


In [27]:
#fullParadigms.groupby("regex").count()[["P"]].sort_values("P")

In [28]:
paradigmsCounts=tiragesReduits.groupby(["regex"]).count()
print len(paradigmsCounts)
paradigmsCounts=paradigmsCounts["P"].sort_values().to_dict()

8178


In [29]:
paradigmsGroups=tiragesReduits.groupby(["regex"]).groups.keys()
print len(paradigmsGroups)

8178


In [30]:
paradigmMappings={}
for p in paradigmsGroups:
    for cfRegex in paradigmsGroups:
        m=re.match(p,cfRegex)
        if m:
            if not p in paradigmMappings:
                paradigmMappings[p]=set()
            paradigmMappings[p].add(cfRegex)
paradigmMappings

{'beimnsu.': {'beimnsu.', 'beimnsux', 'beimnsuy'},
 'bdg.nst.': {'bdg.nst.',
  'bdg.nstw',
  'bdgknst.',
  'bdgknstw',
  'bdgknstx',
  'bdgknsty',
  'bdglnst.',
  'bdglnstx',
  'bdgmnst.',
  'bdgmnstw'},
 'cfik..v.': {'cfik..v.',
  'cfik.rv.',
  'cfikn.v.',
  'cfiknqvx',
  'cfiknrvx',
  'cfiknsv.',
  'cfiknsvw',
  'cfiknsvx',
  'cfikoqvw',
  'cfikoqvy',
  'cfikorvx',
  'cfikosv.',
  'cfikosvw',
  'cfikosvx',
  'cfikosvy',
  'cfikpqv.',
  'cfikpqvw',
  'cfikpqvx',
  'cfikpsvw',
  'cfikpsvx'},
 'cfgkpr.x': {'cfgkpr.x', 'cfgkprtx', 'cfgkprux', 'cfgkprvx'},
 'afgln.vy': {'afgln.vy'},
 'bdi.ortx': {'bdi.ortx', 'bdikortx', 'bdilortx'},
 'cfh.nrty': {'cfh.nrty', 'cfhlnrty', 'cfhmnrty'},
 'bfhlnrtx': {'bfhlnrtx'},
 'bfhlnrty': {'bfhlnrty'},
 'adgmpstx': {'adgmpstx'},
 'bdgk.rvx': {'bdgk.rvx', 'bdgkorvx', 'bdgkprvx'},
 'behknr.w': {'behknr.w', 'behknrtw', 'behknruw', 'behknrvw'},
 'cegkoqvy': {'cegkoqvy'},
 'adil.r..': {'adil.r..',
  'adil.ru.',
  'adil.rux',
  'adil.ruy',
  'adil.rvw',
  'adil

## Trouver les paradigmes restants

In [31]:
paradigmTops={}
for p in paradigmMappings:
    if len(paradigmMappings[p])==1:
        paradigmTops[p]=paradigmsCounts[p]
len(paradigmTops),paradigmTops

(3276,
 {'afgmoqtw': 2,
  'cdhknqtx': 8,
  'cdhkoqtx': 1,
  'ceiloruy': 1,
  'ceilorux': 1,
  'ceiloruw': 1,
  'afgln.vy': 1,
  'cdhkoqtw': 2,
  'bfhlosty': 1,
  'bdhmoqvx': 7,
  'bfhkosuy': 1,
  'beimoquw': 2,
  'afhmoqtw': 1,
  'bfhlnrtx': 1,
  'bfhlnrty': 2,
  'cegkosvx': 2,
  'adgmpstx': 1,
  'cegkp.ty': 1,
  'ceimpquw': 1,
  'afilosvx': 4,
  'cegkoqvy': 1,
  'aeilnsux': 1,
  'cegk.sux': 1,
  'afiknqtw': 1,
  'aeilnsuw': 16,
  'adhmprux': 60,
  'bfhlpqu.': 1,
  'cdgkpruw': 4,
  'adhknqvw': 5,
  'aegmnsvy': 2,
  'aegmnsvx': 1,
  'cdhlnrvy': 1,
  'cdhlnrvx': 1,
  'cehmnqux': 1,
  'aegmpqvx': 2,
  'afimnsux': 1,
  'bfhmpquy': 4,
  'bfhmpqux': 2,
  'cfilpqux': 1,
  'beimnsuy': 1,
  'beimnsux': 1,
  'cdgmnquy': 5,
  'cdhkosvx': 1,
  'begknrvw': 1,
  'behmnsuw': 1,
  'cegmorux': 3,
  'begmnruw': 2,
  'afglnruy': 48,
  'cdimosvy': 1,
  'adhmnruw': 2,
  'adhlnruw': 1,
  'adhmnruy': 1,
  'adhmnrux': 1,
  'aehkorvw': 1,
  'cfhlpr.y': 1,
  'cegkosvw': 1,
  'bdimoqv.': 1,
  'beikosuw': 2,
  'c

## Trouver les correspondances des partiels vers les restants

=> mettre à jour le lexique avec les nouvelles classes flexionnelles

In [32]:
import operator
paradigm2Top={}
top2Paradigms={}
for p in paradigmMappings:
    lTops={m:paradigmTops[m] for m in paradigmMappings[p] if m in paradigmTops}
    topMax=max(lTops.iteritems(), key=operator.itemgetter(1))[0]
    if topMax not in top2Paradigms:
        top2Paradigms[topMax]=[]
    top2Paradigms[topMax].append((p,paradigmsCounts[p]))
    paradigm2Top[p]=topMax
top2Paradigms

{'cdhknqtx': [('cdhkn.tx', 2),
  ('cdhkn.t.', 1),
  ('cdhknqtx', 8),
  ('cdh..qtx', 1)],
 'cdhkoqtx': [('cdhkoqtx', 1), ('cdhko.tx', 1)],
 'ceiloruy': [('ceiloruy', 1)],
 'ceilorux': [('ceilorux', 1), ('ceilo.ux', 1)],
 'ceiloruw': [('ceiloruw', 1)],
 'afgln.vy': [('afgln.vy', 1)],
 'cdhkoqtw': [('cdhkoqtw', 2), ('cdhkoq.w', 1)],
 'bfhlosty': [('bf.losty', 1), ('bfhlosty', 1), ('bfhlost.', 1)],
 'bdhmoqvx': [('bdhm...x', 1),
  ('bdhmoqvx', 7),
  ('bdh.oqvx', 1),
  ('bdhmoq..', 1),
  ('bdh..qvx', 1),
  ('bdhmo.vx', 2)],
 'bfhkosuy': [('bfhkosuy', 1)],
 'beimoquw': [('beimoquw', 2), ('beimoqu.', 1)],
 'afhmoqtw': [('afhmoq.w', 1), ('afhmoqtw', 1)],
 'bfhlnrtx': [('bfhlnrtx', 1)],
 'bfhlnrty': [('bfhlnrty', 2)],
 'cegkosvx': [('cegkosvx', 2)],
 'adgmpstx': [('adgmpstx', 1), ('adgmps.x', 1)],
 'cegkp.ty': [('cegkp.ty', 1)],
 'afilosvx': [('afil.sv.', 1), ('afilos.x', 1), ('afilosvx', 4)],
 'cegkoqvy': [('cegkoqvy', 1)],
 'aeilnsux': [('aeilnsux', 1)],
 'cegk.sux': [('cegk.sux', 1)],
 'afik

In [33]:
tiragesReduits["newRegex"]=tiragesReduits["regex"].apply(lambda x: paradigm2Top[x])

In [34]:
tiragesReduits["Freq"]=tiragesReduits[marksN].fillna(0).apply(lambda x: sum(x),axis=1)

In [35]:
for m in newDistMarks:
    tiragesReduits[m]=tiragesReduits[m].fillna(0)+newDistMarks[m]

In [36]:
tiragesReduits

Unnamed: 0,CF,P,abc,def,ghi,klm,nop,qrs,tuv,wxy,...,defN,ghiN,klmN,nopN,qrsN,tuvN,wxyN,regex,newRegex,Freq
0,1,0.093424,c,e,h,m,o,q,u,w,...,85801.183888,57295.122598,43058.091913,34190.073674,28717.061347,24448.052674,21467.045889,cehmoquw,cehmoquw,466826.0
1,1478,0.046712,b,f,i,m,n,r,v,y,...,42992.183888,28603.122598,21516.091913,17233.073674,14399.061347,12303.052674,10821.045889,bfimnrvy,bfimnrvy,233662.0
2,86,0.031141,c,f,i,l,n,r,t,y,...,28472.183888,19188.122598,14415.091913,11422.073674,9513.061347,8209.052674,7130.045889,cfilnrty,cfilnrty,155550.0
3,0,0.023356,c,d,g,k,o,q,u,y,...,21205.183888,14121.122598,10667.091913,8608.073674,7143.061347,6233.052674,5370.045889,cdgkoquy,cdgkoquy,116066.0
4,11,0.018685,a,d,i,k,n,q,t,w,...,17043.183888,11549.122598,8595.091913,6810.073674,5771.061347,4968.052674,4144.045889,adiknqtw,adiknqtw,93355.0
5,120,0.015571,c,d,h,k,n,s,v,w,...,14433.183888,9428.122598,7233.091913,5824.073674,4679.061347,3936.052674,3604.045889,cdhknsvw,cdhknsvw,77564.0
6,2318,0.013346,a,e,g,l,o,q,u,x,...,12076.183888,8078.122598,6184.091913,4992.073674,4189.061347,3599.052674,3037.045889,aegloqux,aegloqux,66671.0
7,3910,0.011678,c,d,h,l,n,s,u,w,...,10891.183888,7100.122598,5279.091913,4368.073674,3528.061347,3120.052674,2690.045889,cdhlnsuw,cdhlnsuw,58517.0
8,11,0.010380,a,d,i,k,n,q,t,w,...,9666.183888,6348.122598,4770.091913,3881.073674,3079.061347,2720.052674,2439.045889,adiknqtw,adiknqtw,52140.0
9,2980,0.009342,c,f,g,k,o,q,u,w,...,8525.183888,5657.122598,4269.091913,3370.073674,2886.061347,2447.052674,2067.045889,cfgkoquw,cfgkoquw,46472.0


## Compter les effectifs des CF restantes

In [38]:
newParadigmsCount={}
for t in top2Paradigms:
    newParadigmsCount[t]=sum([c for r,c in top2Paradigms[t]])
dfNewParadigms=pd.DataFrame.from_dict(newParadigmsCount,orient="index")
dfNewParadigms.columns=dfNewParadigms.columns.astype(str)
dfNewParadigms.columns=["effectif"]
dfNewParadigms.sort_values("effectif",ascending=False)

Unnamed: 0,effectif
cdgkoquy,2757
cehmoquw,1351
bfiknsuw,880
afhlosux,614
behlnqux,537
cdglnsvx,449
aehknrtx,396
cehmprtw,347
bdgmnstw,345
afhmpsvy,276


In [52]:
newLexs=tiragesReduits.index.tolist()
newDistLexs=[v for k,v in probElements(tiragesReduits["Freq"].to_dict()).iteritems()]
#newDistLexs

In [53]:
newLexTirs={}
newListeTirs=np.random.choice(newLexs,nbTokens,replace=True,p=newDistLexs)
for t in newListeTirs:
    if not t in newLexTirs:
        newLexTirs[t]=0
    newLexTirs[t]+=1

## Faire un nouveau tirage

=> mettre à jour les lexèmes et les classes flexionnelles

In [63]:
newTirages=pd.concat([lexemes, tiragesColonnes], axis=1, sort=False)
for l in newLexTirs:
    lMarksFreq=probElements(tiragesReduits.loc[l,marksN].to_dict())
    lDistMarks=[lMarksFreq[m] for m in sorted(lMarksFreq.keys())]
    
    newCellTirs={}
    listeCases=np.random.choice(marksN,newLexTirs[l],replace=True,p=lDistMarks)
    for c in listeCases:
        if not c in newCellTirs:
            newCellTirs[c]=0
        newCellTirs[c]+=1
    if len(newCellTirs)==8 and debug:
        print l,tirages.loc[l,"CF"], newCellTirs
    for c in newCellTirs:
        newTirages.loc[l,c]=newCellTirs[c]


In [64]:
newTirages

Unnamed: 0,CF,P,abc,def,ghi,klm,nop,qrs,tuv,wxy,abcN,defN,ghiN,klmN,nopN,qrsN,tuvN,wxyN
0,1,0.093424,c,e,h,m,o,q,u,w,172077,85587,57444,42898,34621,28993,24431,21396
1,1478,0.046712,b,f,i,m,n,r,v,y,85672,43025,28545,21595,17162,14243,12330,10923
2,86,0.031141,c,f,i,l,n,r,t,y,57489,28321,19376,14586,11460,9589,8159,7007
3,0,0.023356,c,d,g,k,o,q,u,y,42208,21263,14038,10695,8499,7176,6328,5377
4,11,0.018685,a,d,i,k,n,q,t,w,34903,16921,11557,8723,6721,5714,4999,4202
5,120,0.015571,c,d,h,k,n,s,v,w,28334,14320,9369,7241,5868,4688,3923,3519
6,2318,0.013346,a,e,g,l,o,q,u,x,24778,11960,8040,6229,5149,4170,3643,3137
7,3910,0.011678,c,d,h,l,n,s,u,w,21389,10821,7085,5374,4346,3540,3047,2628
8,11,0.010380,a,d,i,k,n,q,t,w,19134,9633,6339,4778,3834,3041,2700,2369
9,2980,0.009342,c,f,g,k,o,q,u,w,17197,8542,5638,4286,3298,2855,2455,2091


# Tentatives de résolution graphique