# Script pour manipuler un tirage et comparer des colonnes

In [1]:
# -*- coding: utf8 -*-
import pandas as pd
import pickle, glob
import itertools as it
import networkx as nx

In [2]:
def tiragesNom(nom):
    result=[tirage for tirage in listeTirages if nom in tirage]
    return result

listeTirages=glob.glob("/Users/gilles/Box Sync/2015-Data/*-Tirage.pkl")
listeTirages200=tiragesNom("200Mo")
listeTirages20=tiragesNom("20Mo")
listeTirages1=tiragesNom("1Mo")
listeTirages50k=tiragesNom("50Ko")
listeTirages100k=tiragesNom("100Ko")


In [3]:
def lireLexique(nomLexique):
    with open(nomLexique, 'rb') as input:
        lexique=pickle.load(input)
    return lexique

In [4]:
nomLexique=listeTirages100k[2]
lexique=lireLexique(nomLexique)
taille=lexique[lexique["tir1"]>0]["tir1"].count()
paradigmes=pd.pivot_table(lexique[lexique["tir1"]>0], values='phono', index=['lexeme'], columns=['case'], aggfunc=lambda x: ",".join(x)).reset_index().reindex()


In [5]:
def compPaire(c1,c2):
    syncretisms=[]
    c1Val=paradigmes[c1].notnull()
    c2Val=paradigmes[c2].notnull()
    c1Sur=paradigmes[c1].str.contains(",")
    c2Sur=paradigmes[c2].str.contains(",")
    l1=len(paradigmes[paradigmes[c1].notnull()])
    l2=len(paradigmes[paradigmes[c2].notnull()])
    paire=paradigmes[c1Val & c2Val & (paradigmes[c1]!=paradigmes[c2])][[c1,c2]]
    lenDiff=len(paire[~paire[c1].str.contains(",") & ~paire[c2].str.contains(",")])
    if lenDiff>0:
        if debug:
            print u"%s ≠ %s"%(c1,c2)
            print "différence",lenDiff
            if lenDiff<12:
                print paire
    else:
        surAbondant=paire[paire[c1].str.contains(",") | paire[c2].str.contains(",")]
#            print "--------------------------------"
        if len(surAbondant)==0:
#                print u"%s = %s"%(c1,c2) 
            syncretisms.append(u"%s = %s"%(c1,c2))
        else:
            compatible=True
            for index,row in surAbondant.iterrows():
                if "," in row[c1]:
                    if "," in row[c2]:
                        if row[c1]!=row[c2]:
                            compatible=False
                    else:
                        if not row[c2] in row[c1].split(","):
                            compatible=False
                else:
                    if not row[c1] in row[c2].split(","):
                        compatible=False
            if compatible:
#                    print u"%s = %s"%(c1,c2)
                syncretisms.append(u"%s = %s"%(c1,c2))
            else:
                print u"%s ≠ %s"%(c1,c2)
                print surAbondant

In [15]:
def assignerTNS(row):
    if row[-2:] in [p+n for p in ["1","2","3"] for n in ["S","P"]]:
        return row[:2]
    else:
        return "NF"

def assignerPER(row):
    if row[-2:] in [p+n for n in ["S","P"] for p in ["1","2","3"] ]:
        return row[-2:]
    else:
        if row=="inf":
            return "1S"
        elif row=="pP":
            return "2S"
        elif row=="ppMS":
            return "3S"
        elif row=="ppMP":
            return "1P"
        elif row=="ppFS":
            return "2P"
        elif row=="ppFP":
            return "3P"

lexique["TNS"]=lexique["case"].apply(assignerTNS)
lexique["PER"]=lexique["case"].apply(assignerPER)

In [59]:
lexeme=u"être"
tableauTest=lexique[lexique["lexeme"]==lexeme].pivot_table(values='phono', index=['lexeme','TNS'], columns=["PER"], aggfunc=lambda x: ",".join(x))
tableauTest

Unnamed: 0_level_0,PER,1P,1S,2P,2S,3P,3S
lexeme,TNS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
être,NF,,Etr,,Etâ,,Ete
être,ai,fym,fy,fyt,fy,fyr,fy
être,fi,s6rô,s6rE,s6re,s6ra,s6rô,s6ra
être,ii,Etjô,EtE,Etje,EtE,EtE,EtE
être,is,fysjô,fys,fysje,fys,fys,fy
être,pI,swajô,,swaje,swa,,
être,pc,s6rjô,s6rE,s6rje,s6rE,s6rE,s6rE
être,pi,sOm,sHi,Et,E,sô,E
être,ps,swajô,swa,swaje,swa,swa,swa


In [60]:
ligTable=['pi', 'ii', 'fi','pc','ps','ai','is','pI','NF']
colTable=[p+n for n in ["S","P"] for p in ["1","2","3"] ]
colTuple=[("PER",col) for col in colTable]
ligTuple=[("TNS",lig) for lig in ligTable]

In [61]:
tableauTest=tableauTest.reindex(columns=colTable)

In [65]:
idxTableau = tableauTest.sortlevel(level='TNS').index
idxTableau=[idxTableau[i] for i in [7, 3, 2, 6, 1,4,5,0]]
tableauTest=tableauTest.reindex(idxTableau)
print tableauTest.to_latex().replace("None","")

\begin{tabular}{llllllll}
\toprule
     &    &    1S &    2S &    3S &     1P &     2P &    3P \\
lexeme & TNS &       &       &       &        &        &       \\
\midrule
être & pi &   sHi &     E &     E &    sOm &     Et &    sô \\
     & ii &   EtE &   EtE &   EtE &   Etjô &   Etje &   EtE \\
     & fi &  s6rE &  s6ra &  s6ra &   s6rô &   s6re &  s6rô \\
     & pc &  s6rE &  s6rE &  s6rE &  s6rjô &  s6rje &  s6rE \\
     & ai &    fy &    fy &    fy &    fym &    fyt &   fyr \\
     & is &   fys &   fys &    fy &  fysjô &  fysje &   fys \\
     & pI &   &   swa &   &  swajô &  swaje &   \\
     & NF &   Etr &   Etâ &   Ete &    &    &   \\
\bottomrule
\end{tabular}



In [4]:
for tirage in listeTirages100k:
    lexique=lireLexique(tirage)
    taille=lexique[lexique["tir1"]>0]["tir1"].count()
    print tirage, taile


['/Users/gilles/Box Sync/2015-Data/MGC-170330-00-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-01-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-02-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-03-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-04-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-05-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-06-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-07-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-08-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-09-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-10-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-11-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-12-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-13-100Ko-Tirage.pkl',
 '/Users/gilles/Box Sync/2015-Data/MGC-170330-14