# Script pour manipuler un tirage et comparer des colonnes

In [1]:
# -*- coding: utf8 -*-
import pandas as pd
import pickle, glob
import itertools as it
import networkx as nx
import numpy as np

In [2]:
def tiragesNom(nom):
    result=[tirage for tirage in listeTirages if nom in tirage]
    return result

listeTirages=glob.glob("/Users/gilles/Box Sync/2015-Data/*-Tirage.pkl")
listeTirages200=tiragesNom("200Mo")
listeTirages20=tiragesNom("20Mo")
listeTirages1=tiragesNom("1Mo")
listeTirages50k=tiragesNom("50Ko")
listeTirages100k=tiragesNom("100Ko")


In [3]:
def lireLexique(nomLexique):
    try:
        with open(nomLexique, 'rb') as input:
            lexique=pickle.load(input)
    except:
        lexique=None
    return lexique

In [4]:
nomLexique=listeTirages1[0]
lexique=lireLexique(nomLexique)
lexique1=lexique[lexique["tir1"]>0]
taille=lexique[lexique["tir1"]>0]["tir1"].count()
paradigmes=pd.pivot_table(lexique[lexique["tir1"]>0], values='phono', index=['lexeme'], columns=['case'], aggfunc=lambda x: ",".join(x)).reset_index().reindex()


In [5]:
def compPaire(c1,c2):
    syncretisms=[]
    c1Val=paradigmes[c1].notnull()
    c2Val=paradigmes[c2].notnull()
    c1Sur=paradigmes[c1].str.contains(",")
    c2Sur=paradigmes[c2].str.contains(",")
    l1=len(paradigmes[paradigmes[c1].notnull()])
    l2=len(paradigmes[paradigmes[c2].notnull()])
    paire=paradigmes[c1Val & c2Val & (paradigmes[c1]!=paradigmes[c2])][[c1,c2]]
    lenDiff=len(paire[~paire[c1].str.contains(",") & ~paire[c2].str.contains(",")])
    if lenDiff>0:
        if debug:
            print u"%s ≠ %s"%(c1,c2)
            print "différence",lenDiff
            if lenDiff<12:
                print paire
    else:
        surAbondant=paire[paire[c1].str.contains(",") | paire[c2].str.contains(",")]
#            print "--------------------------------"
        if len(surAbondant)==0:
#                print u"%s = %s"%(c1,c2) 
            syncretisms.append(u"%s = %s"%(c1,c2))
        else:
            compatible=True
            for index,row in surAbondant.iterrows():
                if "," in row[c1]:
                    if "," in row[c2]:
                        if row[c1]!=row[c2]:
                            compatible=False
                    else:
                        if not row[c2] in row[c1].split(","):
                            compatible=False
                else:
                    if not row[c1] in row[c2].split(","):
                        compatible=False
            if compatible:
#                    print u"%s = %s"%(c1,c2)
                syncretisms.append(u"%s = %s"%(c1,c2))
            else:
                print u"%s ≠ %s"%(c1,c2)
                print surAbondant

In [6]:
def assignerTNS(row):
    if row[-2:] in [p+n for p in ["1","2","3"] for n in ["S","P"]]:
        return row[:2]
    else:
        return "NF"

def assignerPER(row):
    if row[-2:] in [p+n for n in ["S","P"] for p in ["1","2","3"] ]:
        return row[-2:]
    else:
        if row=="inf":
            return "1S"
        elif row=="pP":
            return "2S"
        elif row=="ppMS":
            return "3S"
        elif row=="ppMP":
            return "1P"
        elif row=="ppFS":
            return "2P"
        elif row=="ppFP":
            return "3P"

lexique["TNS"]=lexique["case"].apply(assignerTNS)
lexique["PER"]=lexique["case"].apply(assignerPER)

In [7]:
lexeme=u"être"
tableauTest=lexique[lexique["lexeme"]==lexeme].pivot_table(values='phono', index=['lexeme','TNS'], columns=["PER"], aggfunc=lambda x: ",".join(x))
tableauTest

Unnamed: 0_level_0,PER,1P,1S,2P,2S,3P,3S
lexeme,TNS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
être,NF,,Etr,,Etâ,,Ete
être,ai,fym,fy,fyt,fy,fyr,fy
être,fi,s6rô,s6rE,s6re,s6ra,s6rô,s6ra
être,ii,Etjô,EtE,Etje,EtE,EtE,EtE
être,is,fysjô,fys,fysje,fys,fys,fy
être,pI,swajô,,swaje,swa,,
être,pc,s6rjô,s6rE,s6rje,s6rE,s6rE,s6rE
être,pi,sOm,sHi,Et,E,sô,E
être,ps,swajô,swa,swaje,swa,swa,swa


In [8]:
ligTable=['pi', 'ii', 'fi','pc','ps','ai','is','pI','NF']
colTable=[p+n for n in ["S","P"] for p in ["1","2","3"] ]
colTuple=[("PER",col) for col in colTable]
ligTuple=[("TNS",lig) for lig in ligTable]

In [9]:
tableauTest=tableauTest.reindex(columns=colTable)

In [10]:
idxTableau = tableauTest.sortlevel(level='TNS').index
idxTableau=[idxTableau[i] for i in [7, 3, 2, 6, 1,4,5,0]]
tableauTest=tableauTest.reindex(idxTableau)
print tableauTest.to_latex().replace("None","")

\begin{tabular}{llllllll}
\toprule
     &    &    1S &    2S &    3S &     1P &     2P &    3P \\
lexeme & TNS &       &       &       &        &        &       \\
\midrule
être & pi &   sHi &     E &     E &    sOm &     Et &    sô \\
     & ii &   EtE &   EtE &   EtE &   Etjô &   Etje &   EtE \\
     & fi &  s6rE &  s6ra &  s6ra &   s6rô &   s6re &  s6rô \\
     & pc &  s6rE &  s6rE &  s6rE &  s6rjô &  s6rje &  s6rE \\
     & ai &    fy &    fy &    fy &    fym &    fyt &   fyr \\
     & is &   fys &   fys &    fy &  fysjô &  fysje &   fys \\
     & pI &   &   swa &   &  swajô &  swaje &   \\
     & NF &   Etr &   Etâ &   Ete &    &    &   \\
\bottomrule
\end{tabular}



# Nombre de formes différentes par taille d'échantillon

In [None]:
unsPoints=[]
for tirage in listeTirages100k:
    lexique=lireLexique(tirage)
    tirs=lexique[lexique["tir1"]>0]["tir1"].sum()
    taille=lexique[lexique["tir1"]>0]["tir1"].count()
    unsPoints.append((tirs, taille)) 

In [123]:
autresPoints=[(1000000,43563),
              (2000000,52654),
              (3000000,57648),
              (4000000,60647),
              (5000000,62926),
              (6000000,64507),
              (7000000,65694),
              (8000000,66597),
              (9000000,67395),
              (10000000,68049),
              (11000000,68590),
              (12000000,69007),
              (13000000,69382),
              (14000000,69690),
              (15000000,69959),
              (16000000,70178),
              (17000000,70379),
              (18000000,70547),
              (19000000,70704),
              (20000000,70859),  
              (200000000,74702),
              (400000000,76076),
              (600000000,77351),
              (800000000,78517),
              (1000000000,79565),
              (1200000000,80618),
              (1400000000,81578),
              (1600000000,82505),
             ]
points=autresPoints

In [125]:
pdPoints=pd.DataFrame(points)
pdPoints.columns=["Sample Size", "Number of Forms"]
pdPoints.set_index("Sample Size",inplace=True)
xPoints=list(pdPoints.index)
pdPoints

Unnamed: 0_level_0,Number of Forms
Sample Size,Unnamed: 1_level_1
1000000,43563
2000000,52654
3000000,57648
4000000,60647
5000000,62926
6000000,64507
7000000,65694
8000000,66597
9000000,67395
10000000,68049


In [136]:
%matplotlib

Using matplotlib backend: MacOSX


In [49]:
#pdPoints.plot(ylim=(0,300000))
pdPoints.plot(linewidth=5,marker="o",markersize=10,xlim=(0,2000e6))

<matplotlib.axes._subplots.AxesSubplot at 0x12eaa4750>

# Nombre de lexèmes par nombre de formes par taille d'échantillon
- nom du fichier échantillon => tirage
- tirage de l'échantillon => lexique
- formes tirées de l'échantillon => lexique1
- cases tirées de l'échantillon => nbCases
- nombre de token dans le tirage =>nbTokens
- distribution du nombre de lexème par nombre de forme => distNbFormes
- liste des distributions => distributions


In [11]:
lexiquePrefix="MGC-160104"

with open("/Users/gilles/Box Sync/2015-Data/MGC-Vienna/"+lexiquePrefix+'-Verbes2-NbForms.pkl', 'rb') as input:
    dictNbFormes = pickle.load(input)

In [12]:
def nbMax(row):
    if dictNbFormes[row["lexeme"]]==row["Number of Forms"]:
        return True
    else:
        return False

def nbFull(row):
    if dictNbFormes[row["lexeme"]]==row["Number of Forms"] and row["Number of Forms"] in [48,51]:
        return True
    else:
        return False
    

In [14]:
distributions=[]
for nTirage,tirage in enumerate(listeTirages):
    print tirage
    lexique=lireLexique(tirage)
    if isinstance(lexique, pd.DataFrame):
        lexique1=lexique[lexique["tir1"]>0]
        nbTokens=lexique1["tir1"].sum()
        nbCases=lexique1.groupby(by=["lexeme","case"],as_index=False).agg({"tir1":sum})
        nbFormes=nbCases.groupby(by=["lexeme"])[["case"]].count()
#        nbFormes=lexique1.groupby(by=["lexeme"])[["phono"]].count()
        nbFormes.columns=["Number of Forms"]
        distNbFormes=nbFormes.groupby(by=["Number of Forms"])[["Number of Forms"]].count()
        distNbFormes.columns=["Number of lexemes"]
        nbFormes.reset_index(inplace=True)
        lexemesMax=nbFormes[nbFormes.apply(nbMax,axis=1)]
        lexemesFull=nbFormes[nbFormes.apply(nbFull,axis=1)]
        distributions.append((nbTokens, distNbFormes,lexemesMax,lexemesFull))
        print tirage
        print "{:,}".format(nbTokens)
        print len(lexemesFull)
        print lexemesFull
        print distNbFormes[40:]
#        print lexemesMax
        print
        print
        if nTirage%5==0 and False:
            distNbFormes.plot(kind="bar",figsize=(20,10),ylim=(0,1000))

/Users/gilles/Box Sync/2015-Data/MGC-160427-100Mo-Tirage.pkl
/Users/gilles/Box Sync/2015-Data/MGC-160427-100Mo-Tirage.pkl
100,000,000
9
       lexeme  Number of Forms
159     aimer               51
175     aller               51
403     avoir               51
1239   devoir               51
1262     dire               51
2294    faire               51
4507   savoir               51
5115  vouloir               51
5369     être               48
                 Number of lexemes
Number of Forms                   
41                              36
42                              19
43                              15
44                              16
45                               8
46                               9
47                               9
48                               9
49                               2
50                               2
51                               8


/Users/gilles/Box Sync/2015-Data/MGC-160427-10Mo-Tirage.pkl
/Users/gilles/Box Sync/2015-Data/MGC-

/Users/gilles/Box Sync/2015-Data/MGC-160717-0-200Mo-Tirage.pkl
200,000,000
14
       lexeme  Number of Forms
175     aller               51
403     avoir               51
1156   croire               51
1240   devoir               51
1263     dire               51
2295    faire               51
3101   mettre               51
3404   parler               51
3414   partir               51
4508   savoir               51
4979  trouver               51
5106     voir               51
5117  vouloir               51
5371     être               48
                 Number of lexemes
Number of Forms                   
41                              30
42                              24
43                              24
44                              16
45                              12
46                              10
47                              11
48                               9
49                               3
50                               8
51                              13




/Users/gilles/Box Sync/2015-Data/MGC-170330-14-100Ko-Tirage.pkl
75,000
0
Empty DataFrame
Columns: [lexeme, Number of Forms]
Index: []
                 Number of lexemes
Number of Forms                   
42                               1
43                               1
44                               1


/Users/gilles/Box Sync/2015-Data/MGC-170330-15-100Ko-Tirage.pkl
/Users/gilles/Box Sync/2015-Data/MGC-170330-15-100Ko-Tirage.pkl
80,000
0
Empty DataFrame
Columns: [lexeme, Number of Forms]
Index: []
                 Number of lexemes
Number of Forms                   
41                               2
43                               2
44                               1


/Users/gilles/Box Sync/2015-Data/MGC-170330-16-100Ko-Tirage.pkl
/Users/gilles/Box Sync/2015-Data/MGC-170330-16-100Ko-Tirage.pkl
85,000
0
Empty DataFrame
Columns: [lexeme, Number of Forms]
Index: []
                 Number of lexemes
Number of Forms                   
41                               2
43         

/Users/gilles/Box Sync/2015-Data/MGC-170430-05-20000Ko-Tirage.pkl
6,000,000
1
     lexeme  Number of Forms
2210  faire               51
                 Number of lexemes
Number of Forms                   
41                              22
42                              19
43                              15
44                               6
45                              11
46                               3
47                               5
48                               1
49                               1
51                               1


/Users/gilles/Box Sync/2015-Data/MGC-170430-06-20000Ko-Tirage.pkl
/Users/gilles/Box Sync/2015-Data/MGC-170430-06-20000Ko-Tirage.pkl
7,000,000
1
     lexeme  Number of Forms
2226  faire               51
                 Number of lexemes
Number of Forms                   
41                              21
42                              19
43                              15
44                               6
45                            

/Users/gilles/Box Sync/2015-Data/MGC-170430-18-20000Ko-Tirage.pkl
19,000,000
2
     lexeme  Number of Forms
2265  faire               51
5306   être               48
                 Number of lexemes
Number of Forms                   
41                              22
42                              22
43                              18
44                               7
45                               9
46                               6
47                               3
48                               5
49                               3
51                               1


/Users/gilles/Box Sync/2015-Data/MGC-170430-19-20000Ko-Tirage.pkl
/Users/gilles/Box Sync/2015-Data/MGC-170430-19-20000Ko-Tirage.pkl
20,000,000
2
     lexeme  Number of Forms
2269  faire               51
5310   être               48
                 Number of lexemes
Number of Forms                   
41                              22
42                              19
43                              21
44   

In [54]:
nbFormes

Unnamed: 0_level_0,Number of Forms
lexeme,Unnamed: 1_level_1
abaisser,24
abandonner,41
abasourdir,4
abattre,38
abdiquer,15
aberrer,1
abhorrer,10
abjurer,9
abolir,17
abonder,11


In [73]:
    
nbFormes.reset_index(inplace=True)
nbFormes[nbFormes.apply(nbMax,axis=1)]

Unnamed: 0,index,lexeme,Number of Forms
56,56,accroire,1
319,319,assavoir,1
462,462,bayer,1
938,938,comparoir,1
1110,1110,courre,1
1319,1319,douer,5
1635,1635,démurger,1
2182,2182,ester,1
2269,2269,faire,51
2458,2458,férir,1


# Calculs sur les tirages de 200Mo cumulés

In [37]:
listeLexiques=listeTirages200
cumulLexique=lireLexique(listeLexiques[0])
cumulLexique["tir1"]=0
for nTirage,tirage in enumerate(listeLexiques):
#    print nTirage,
    lexique=lireLexique(tirage)
    cumulLexique["tir1"]=cumulLexique["tir1"]+lexique["tir1"]
    lexique1=cumulLexique[cumulLexique["tir1"]>0]
    nbTokens=lexique1["tir1"].sum()
    nbTypes=lexique1["tir1"].count()
    print "(%d,%d),"%(nbTokens,nbTypes)
    nbCases=lexique1.groupby(by=["lexeme","case"],as_index=False).agg({"tir1":sum})
    nbFormes=nbCases.groupby(by=["lexeme"])[["case"]].count()
    nbFormes.columns=["Number of Forms"]
    distNbFormes=nbFormes.groupby(by=["Number of Forms"])[["Number of Forms"]].count()
    distNbFormes.columns=["Number of lexemes"]
    distributions.append(distNbFormes)
    if nTirage%1==5:
        distNbFormes.plot(kind="bar",figsize=(20,10),ylim=(0,1000))    

(200000000,74702),
(400000000,76076),
(600000000,77351),
(800000000,78517),
(1000000000,79565),
(1200000000,80618),
(1400000000,81578),
(1600000000,82505),


In [144]:
for n in [int(1e6*k) for k in range(1,1600)]:
    if not n in pdPoints.index:
        pdPoints.loc[n]=np.NaN
pdPoints.sort_index(inplace=True)
pdPoints.interpolate(method="slinear", order=4, inplace=True)
derivativeNb=pdPoints.diff().reset_index()
derivativeNb.set_index("Sample Size",inplace=True)

In [130]:
funcDerivativeNb=derivativeNb[derivativeNb["Sample Size"].isin(xPoints)]
funcDerivativeNb.set_index("Sample Size",inplace=True)

In [149]:
funcDerivativeNb.ix["150000000":].plot()

<matplotlib.axes._subplots.AxesSubplot at 0x128ec3210>

In [141]:
funcDerivativeNb

Unnamed: 0_level_0,Number of Forms
Sample Size,Unnamed: 1_level_1
1000000,
2000000,9091.0
3000000,4994.0
4000000,2999.0
5000000,2279.0
6000000,1581.0
7000000,1187.0
8000000,903.0
9000000,798.0
10000000,654.0


In [28]:
for tirage in listeTirages:
    lexique=lireLexique(tirage)
    if isinstance(lexique, pd.DataFrame):
        lexique1=lexique[lexique["tir1"]>0]
        tokens=lexique[lexique["tir1"]>0]["tir1"].sum()
        print tokens,tirage
        print


100000000 /Users/gilles/Box Sync/2015-Data/MGC-160427-100Mo-Tirage.pkl

10000000 /Users/gilles/Box Sync/2015-Data/MGC-160427-10Mo-Tirage.pkl

250000000 /Users/gilles/Box Sync/2015-Data/MGC-160427-250Mo-Tirage.pkl

5000000 /Users/gilles/Box Sync/2015-Data/MGC-160427-2Mo-5Mo-Tirage.pkl

2000000 /Users/gilles/Box Sync/2015-Data/MGC-160427-2Mo-Tirage.pkl

200000000 /Users/gilles/Box Sync/2015-Data/MGC-160628-0-200Mo-Tirage.pkl

1000000 /Users/gilles/Box Sync/2015-Data/MGC-160628-1Mo-Tirage.pkl

20000000 /Users/gilles/Box Sync/2015-Data/MGC-160628-20Mo-Tirage.pkl

200000000 /Users/gilles/Box Sync/2015-Data/MGC-160716-0-200Mo-Tirage.pkl

200000000 /Users/gilles/Box Sync/2015-Data/MGC-160716-1-200Mo-Tirage.pkl

200000000 /Users/gilles/Box Sync/2015-Data/MGC-160716-2-200Mo-Tirage.pkl

200000000 /Users/gilles/Box Sync/2015-Data/MGC-160717-0-200Mo-Tirage.pkl

200000000 /Users/gilles/Box Sync/2015-Data/MGC-160718-0-200Mo-Tirage.pkl

200000000 /Users/gilles/Box Sync/2015-Data/MGC-160719-0-200Mo-Ti

In [29]:
lexique1.groupby(by=["lexeme"]).count()

Unnamed: 0_level_0,ortho,phono,freq,case,freqcum,tir1
lexeme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
abaisser,24,24,24,24,24,24
abandonner,41,41,41,41,41,41
abasourdir,4,4,4,4,4,4
abattre,38,38,38,38,38,38
abdiquer,15,15,15,15,15,15
aberrer,1,1,1,1,1,1
abhorrer,10,10,10,10,10,10
abjurer,9,9,9,9,9,9
abolir,17,17,17,17,17,17
abonder,11,11,11,11,11,11
