# Trouver et appliquer les règles à entropie nulle


## Importations
- codecs pour les encodages
- pandas et numpy pour les calculs sur tableaux
- matplotlib pour les graphiques
- itertools pour les itérateurs sophistiqués (paires sur liste, ...)

In [1]:
# -*- coding: utf8 -*-
import codecs,glob
import features
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools as it
import networkx as nx
import pickle,yaml
#%pylab inline
#pd.options.display.mpl_style = 'default'
debug=False

### Préparation des matrices de traits

In [2]:
features.add_config('bdlexique.ini')
fs=features.FeatureSystem('phonemes')

### Gestion du codage IPA

In [3]:
import unicodedata
def listerUnicode(chaine):
    result=[]
    comb=False
    prec=u""
    for char in chaine:
        print char,
        if unicodedata.combining(char):
            result.append(prec+char)
            prec=""
        else:
            result.append(prec)
            prec=char
    result.append(prec)
    return [r for r in result if r!=""]

In [4]:
ipaIn = listerUnicode(u"SZNêôârEHO")
ipaOut= [u"ʃ",u"ʒ",u"ŋ",u"ɛ̃",u"ɔ̃",u"ɑ̃",u"ʁ",u"ɛ",u"ɥ",u"ɔ"]
toipa = dict(zip(ipaIn, ipaOut))

S Z N ê ô â r E H O


In [5]:
def coderIPA(chaine,table=toipa):
    result=chaine
    for k in table:
        result=result.replace(k,table[k])
    return result

In [6]:
print coderIPA(u"SâZ piNpôN")

ʃɑ̃ʒ piŋpɔ̃ŋ


In [8]:
rep="/Users/gilles/ownCloud/Recherche/Boye/HDR/Data/Longitudinales/"
rep="/Volumes/gilles/Transfert/Copies-iMac-GB/2015-Data/Longitudinales/"
rep="/Volumes/gilles/Transfert/Copies-iMac-GB/2015-Data/StemSpaces/LongitudinalesRnd/"
rep="/Volumes/gilles/Transfert/Copies-iMac-GB/2015-Data/StemSpaces/Longitudinales/"
rep="/Volumes/gilles/Transfert/Copies-iMac-GB/2015-Data/lexique3/"
#rep="/Users/gilles/Desktop/lexique3/"
fichiers=glob.glob(rep+"*-X*-Regles.pkl")
samples=[f.rsplit("/",1)[-1].split("Regles")[0] for f in fichiers]
# samples={int(s.split("-")[1]):s for s in samples}
samples={int(s.split("-")[2]):s for s in samples}
samples

{0: 'Longitudinal-Lexique3-00-T150000000-F73539-X-',
 1: 'Longitudinal-Lexique3-01-T150000000-F254455-X-'}

In [9]:
sample=samples[1]
fRulesPMS="Regles.pkl"
fRulesPMO="Morphomes-Regles.pkl"

# Classe pour la gestion des patrons, des classes et des transformations

In [10]:
class paireClasses:
    def __init__(self,case1,case2):
        self.case1=case1
        self.case2=case2
        self.nom=case1+"-"+case2
        self.classes1=classesPaire(case1,case2)
        self.classes2=classesPaire(case2,case1)

    def ajouterPatron(self,n,patron,motif):
        if n==1:
            self.classes1.ajouterPatron(patron,motif)
        elif n==2:
            self.classes2.ajouterPatron(patron,motif)
        else:
            print "le numéro de forme n'est pas dans [1,2]",n

    def ajouterPaire(self,forme1,forme2):
        self.classes1.ajouterPaire(forme1,forme2)
        self.classes2.ajouterPaire(forme2,forme1)
        
    def calculerClasses(self):
        return(self.classes1,self.classes2)

    
class classesPaire:
    '''
    Gestion des patrons, des classes et des transformations
    
    ajouterPatron : ajoute un patron et son motif associé (MGL)
    ajouterPaire : ajoute une paire de formes, calcule la classe de la forme1 et la règle sélectionnée
    sortirForme : cacule les formes de sortie correspondant à la forme1 avec leurs coefficients respectifs
    '''
    def __init__(self,case1,case2):
        self.case1=case1
        self.case2=case2
        self.nom=case1+"-"+case2
        self.classe={}
        self.nbClasse={}
        self.patrons={}
        self.entree={}
        self.sortie={}
    
    def ajouterPatron(self,patron,motif):
        self.patrons[patron]=motif
        (entree,sortie)=patron.split("-")
        self.entree[patron]=entree.replace(u".",u"(.)")
        self.sortie[patron]=remplacementSortie(sortie)
    
    def ajouterPaire(self,forme1,forme2):
        '''
        on calcule la classe de la paire idClasseForme et la règle sélectionnée
        on incrémente le compteur de la classe et celui de la règle sélectionnée à l'intérieur de la classe
        '''
        classeForme=[]
        regleForme=""
        for patron in self.patrons:
            if re.match(self.patrons[patron],forme1):
                classeForme.append(patron)
                '''
                le +"$" permet de forcer l'alignement à droite pour les transformations suffixales
                '''
                if forme2==re.sub(self.entree[patron]+"$",self.sortie[patron],forme1):
                    regleForme=patron
        idClasseForme=", ".join(classeForme)
        if not idClasseForme in self.classe:
            self.classe[idClasseForme]={}
            self.nbClasse[idClasseForme]=0
        if not regleForme in self.classe[idClasseForme]:
            self.classe[idClasseForme][regleForme]=0
        self.nbClasse[idClasseForme]+=1
        self.classe[idClasseForme][regleForme]+=1

    def sortirForme(self,forme,contextFree=False):
        classeForme=[]
        sortieForme={}
        for patron in self.patrons:
            if contextFree:
                filterF1=".*"+patron.split("-")[0]+"$"
            else:
                filterF1=self.patrons[patron]
            if re.match(filterF1,forme):
                classeForme.append(patron)
        if classeForme:
            idClasseForme=", ".join(classeForme)
            if contextFree:
                nbClasse=self.nbClasseCF
                classe=self.classeCF
            else:
                nbClasse=self.nbClasse
                classe=self.classe
            if idClasseForme in nbClasse:
                nTotal=nbClasse[idClasseForme]
                for patron in classe[idClasseForme]:
                    sortie=re.sub(self.entree[patron]+"$",self.sortie[patron],forme)
                    sortieForme[sortie]=float(classe[idClasseForme][patron])/nTotal
            else:
#                if debug: 
#                    print (forme, file=logfile)
#                    print ("pas de classe",idClasseForme, file=logfile)
#                    print ("%.2f par forme de sortie" % (float(1)/len(classeForme)), file=logfile)
                nTotal=len(classeForme)
                for patron in classeForme:
                    sortie=re.sub(self.entree[patron]+"$",self.sortie[patron],forme)
                    sortieForme[sortie]=float(1)/nTotal
#        else:
#            if debug:
#                print (forme, file=logfile) 
#                print ("pas de patron", file=logfile)
        return sortieForme
        

## Ouvrir les fichiers de règles

In [11]:
def openRules(numero,typeRules="PMS"):
    rules=None
    if typeRules=="PMS" and numero in samples:
        with open(rep+samples[numero]+fRulesPMS, 'rb') as input:
            rules = pickle.load(input)
    elif typeRules=="PMO" and numero in samples:
        with open(rep+samples[numero]+fRulesPMO, 'rb') as input:
            rules = pickle.load(input)
    return rules

In [12]:
numero=1
typeRules="PMO"
avecContexte=True

In [13]:
sample

'Longitudinal-Lexique3-01-T150000000-F254455-X-'

In [14]:
#rules=openRules(numero,typeRules=typeRules)
with open(rep+sample+"Regles.pkl", 'rb') as input:
    rules = pickle.load(input)

In [None]:
rules[("ii1S","ii1P")].patrons

In [None]:
zeroEntropie=[]
for paire in rules:
    classe=rules[paire].classe
    c1,c2=paire
    if c1!=c2:
        zero=True
        print paire
        for transformation in classe:
            choix=classe[transformation]
            if len(choix)>1:
                print choix
                zero=False
        if zero:
            print "zéro",choix
            zeroEntropie.append(paire)
        print

In [None]:
zeroPop={}
for paire in zeroEntropie:
    for t in rules[paire].classe:
        pop=0
        for c in rules[paire].classe[t]:
            pop+=rules[paire].classe[t][c]
        zeroPop[paire]=pop
    

In [None]:
stemspaceThresh=100
stemspaceRules=[]
for k in sorted(zeroPop, key=zeroPop.get,reverse=True):
    print k, zeroPop[k]
    if zeroPop[k]>=stemspaceThresh:
        stemspaceRules.append(k)

In [None]:
len(stemspaceRules)

In [None]:
stemSpace=nx.Graph()
for c1,c2 in stemspaceRules:
    if (c2,c1) in stemspaceRules:
#        print c1,c2
        stemSpace.add_edge(c1,c2)
stemSpaceCliques=list(nx.find_cliques(stemSpace))
len(stemSpaceCliques),sorted(stemSpaceCliques,key=len,reverse=True)

In [None]:
fSamplePMS="paradigmes.csv"
fSamplePMO="Morphomes-paradigmes.csv"

In [None]:
def openSample(numero,typeSample="PMS"):
    result=None
    fSample=""
    if typeSample=="PMS" and numero in samples:
        fSample=rep+samples[numero]+fSamplePMS
    elif typeSample=="PMO" and numero in samples:
        fSample=rep+samples[numero]+fSamplePMO
    if fSample:
        result=pd.read_csv(rep+samples[numero]+fSamplePMS,sep=";",index_col=0,keep_default_na=False,encoding="utf8")
    return result    

In [None]:
df=openSample(2,typeRules)

In [None]:
def makeZeros(sampleFormes,inFormes=True,contextFree=True):
    series=sampleFormes.copy()
    series={k:v for k,v in series.iteritems() if v!=""}

    paires=rules.keys()
    formesC={}
    for c in series:
        if not c in formesC:
            formesC[c]=set()
        formesC[c].add(series[c])
        if debug: print c, series[c]
        formeC=series[c]
        pairesC=[(c1,c2) for (c1,c2) in paires if c1==c]
        for p in pairesC:
            formeP=rules[p].sortirForme(series[c],contextFree=contextFree)
            if formeP and len(formeP)==1:
                p1=p[1]
                if debug: print p1,formeP
                if p1 not in formesC:
                    formesC[p1]=set()
                formesC[p1].add(formeP.keys()[0])
    if inFormes:
        formesZero={k:list(v)[0] for k,v in formesC.iteritems() if len(v)==1}
    else:
        formesZero={k:list(v)[0] for k,v in formesC.iteritems() if len(v)==1 and k not in series}

    return formesZero

In [None]:
def fillStemSpace(sampleFormes,inFormes=True,contextFree=True):
    series=sampleFormes.copy()
    series={k:v for k,v in series.iteritems() if v!=""}

    paires=stemspaceRules
    formesC={}
    for c in series:
        if not c in formesC:
            formesC[c]=set()
        formesC[c].add(series[c])
        if debug: print c, series[c]
        formeC=series[c]
        pairesC=[(c1,c2) for (c1,c2) in paires if c1==c]
        for p in pairesC:
            formeP=rules[p].sortirForme(series[c],contextFree=contextFree)
            if formeP and len(formeP)==1:
                p1=p[1]
                if debug: print p1,formeP
                if p1 not in formesC:
                    formesC[p1]=set()
                formesC[p1].add(formeP.keys()[0])
    if inFormes:
        formesZero={k:list(v)[0] for k,v in formesC.iteritems() if len(v)==1}
    else:
        formesZero={k:list(v)[0] for k,v in formesC.iteritems() if len(v)==1 and k not in series}

    return formesZero

In [None]:
cases=[c for c in df.columns.tolist() if c!="lexeme"]
cases=sorted(cases)

In [None]:
lexemes=u"clore éclore falloir valoir".split(" ")
lexemes=[l for l in df["lexeme"].tolist() if not l.endswith(u"er")]
lexemes

In [None]:
dfNew=df.copy()
for lexeme in lexemes:
    index=df.loc[df['lexeme']==lexeme].index.values
    if index:
        i=index[0]
        sampleFormes=df.iloc[i].to_dict()
        sampleFormes={k:v for k,v in sampleFormes.iteritems() if v!=""}
        lexemeI=sampleFormes["lexeme"]
        print lexemeI,
#        formesZero=makeZeros(sampleFormes,inFormes=False)
        formesZero=fillStemSpace(sampleFormes,inFormes=False,contextFree=not avecContexte)
        for case in cases:
            if case in sampleFormes:
#                print "\t",case,"\t  ",sampleFormes[case]
                continue
            elif case in formesZero:
#                print "\t",case,"\t=>",formesZero[case]
                dfNew.iloc[i][case]=formesZero[case]
            else:
#                print "\t",case
                continue
#        print

## Ouvrir les fichiers de morphomes

In [None]:
morphomeCases=pd.read_pickle(rep+samples[54]+"Morphomes.pkl")[["case","morphome"]].drop_duplicates().to_dict()
casesMC=morphomeCases["case"]
morphomesMC=morphomeCases["morphome"]
listMorphomes=[]
for element in casesMC:
    listMorphomes.append(morphomesMC[element].split("/"))
cases

In [None]:
def makeTabularParadigme(lexeme,lDF,dictColours,title="",coulLim=False, cat="V",dictMorphomes={}):
    row=lDF[lDF["lexeme"]==lexeme]
    tabular=[]
    def makeValue(case):
        print case,row[case],len(row[case]),row[case].values
        if len(row[case])>0 and len(row[case].values[0])>0:
            result=coderIPA(row[case].values[0])
        elif dictMorphomes!={} and case in dictMorphomes:
            altCase=dictMorphomes[case][0]
            if len(row[altCase])>0 and len(row[altCase].values[0])>0:
                result=coderIPA(row[altCase].values[0])
            else:
                result="---"
        else:
            result="---"
    def makeLine6(tenseCode):
        line=[tabTemps[tenseCode]]
        for person in [per+nb for nb in ["S","P"] for per in ["1","2","3"]]:
            case=tenseCode+person
            if case in dictColours:
                line.append(r"\cellcolor{%s}%s"%(dictColours[case],makeValue(case)))
            else:
                line.append(r"\cellcolor{%s}%s"%("black",makeValue(case)))
        return r" & ".join(line)+r"\\"

    def makeLine3(tenseCode):
        line=[tabTemps[tenseCode]]
        for person in [per+nb for nb in ["S","P"] for per in ["1","2","3"]]:
            if person in ["2S","1P","2P"]:
                case=tenseCode+person
                if case in dictColours:
                    line.append(r"\cellcolor{%s}%s"%(dictColours[case],makeValue(case)))
                else:
                    line.append(r"\cellcolor{%s}%s"%("black",makeValue(case)))
#                line.append(r"\cellcolor{%s}%s"%(dictColours[case],case))
            else:
                line.append(r"---")
        return r" & ".join(line)+r"\\"
    
    def makeLineNF():
        line=["non-fini"]
        for case in ["inf","pP","ppMS","ppMP","ppFS","ppFP"]:
            if case in dictColours:
                line.append(r"\cellcolor{%s}%s"%(dictColours[case],makeValue(case)))
            else:
                line.append(r"\cellcolor{%s}%s"%("black",makeValue(case)))
#            line.append(r"\cellcolor{%s}%s"%(dictColours[case],case))
        return r" & ".join(line)+r"\\"

    def makeLineMF(nombre):
        line=[]
        for genre in "mf":
            case=genre+nombre
            if case in dictColours:
                line.append(r"\cellcolor{%s}%s"%(dictColours[case],makeValue(case)))
            else:
                line.append(r"\cellcolor{%s}%s"%("black",makeValue(case)))
        return r" & ".join(line)+r"\\"
    
    def makeLineCoulLim():
        line=[]
        for numLimite,limite in enumerate(listLimites):
            line.append(r"\cellcolor{%s}%s"%(listLimCoul[numLimite],"$<$"+str(limite)))
        return r"\hline\hline "+r" & ".join(line)+r"\\"
        
    if cat=="V":
        top=[
            r"\begin{center}",
            r"\begin{tabular}{ccccccc}",
            r"\toprule",
            " & ".join([lexeme]+[p+n for n in ["sg","pl"] for p in "123" ])+r"\\",
            r"\midrule"
            ]
        bottom=[
            r"\bottomrule",
            r"\end{tabular}\\",
            title,
            r"\end{center}",
            r"\bigskip",
            r""
            ]
        tabular.append("\n".join(top))
        for tenseCode in ["pi","ii","fi","pc", "ps","ai", "is"]:
            tabular.append(makeLine6(tenseCode))
        tabular.append(makeLine3("pI"))
        tabular.append(makeLineNF())
    elif cat=="A":
        top=[
            r"\begin{center}",
            r"\begin{tabular}{cc}",
            r"\toprule"
            ]
        bottom=[
            r"\bottomrule",
            r"\end{tabular}\\",
            title,
            r"\end{center}",
            r"\bigskip",
            r""
            ]
        tabular.append("\n".join(top))
        for number in "sp":
            tabular.append(makeLineMF(number))
    if coulLim:
        tabular.append(makeLineCoulLim())
    tabular.append("\n".join(bottom))
    return "\n".join(tabular)

In [None]:
etColours=["orange",
           "brown!50",
           "brown",
           "blue!10",
           "yellow!50",
           "blue!25",
           "teal!50",
           "blue!50",
           "cyan!50",
           "lime",
           "pink",
           "magenta!50",
          ]
etCells=[
    ["pi1P","pi2P"]+["ii"+p+n for p in "123" for n in "SP"],
    ["pi3P"],
    ["pi"+p+"S" for p in "123"],
    ["pP"],
    ["pI2S"],
    ["pI1P","pI2P"],
    ["ps"+p+"S" for p in "123"]+["ps3P"],
    ["ps1P","ps2P"],
    ["inf"],
    ["fi"+p+n for p in "123" for n in "SP"]+["pc"+p+n for p in "123" for n in "SP"],
    ["ai"+p+n for p in "123" for n in "SP"]+["is"+p+n for p in "123" for n in "SP"],
    ["pp"+g+n for g in "MF" for n in "SP"],
        ]
tabTemps={
    "pi":u"ind. prs",
    "ii":u"ind. ipf",
    "ai":u"ind. ps",
    "fi":u"ind. fut",
    "ps":u"subj. prs",
    "is":u"subj. ipf",
    "pc":u"cond. prs",
    "pI":u"imper. prs",
    "inf":u"non-fini"
    }
dictEtColours={}
for nET,ET in enumerate(etCells):
    for c in ET:
        dictEtColours[c]=etColours[nET]
dictEtColours

In [None]:
print makeTabularParadigme(u"clore",dfNew,dictEtColours,dictMorphomes=dictCaseMorphome)

In [None]:
lexeme=u"extraire"
ix=df.loc[df.lexeme==lexeme,:].T.columns.values[0]
pd.concat([df.loc[df.lexeme==lexeme,:].T.rename(columns={ix:'Old'}),
dfNew.loc[dfNew.lexeme==lexeme,:].T.rename(columns={ix:'New'})],
         axis=1)

In [None]:
dfNew.loc[dfNew.lexeme==u"valoir",:].T

In [None]:
for i in range(500):
    sampleFormes=df.loc[i].to_dict()
    lexemeI=sampleFormes["lexeme"]
    if not lexemeI.endswith("er"):
        print lexemeI
        numero=1
        for k,v in sampleFormes.iteritems():
            if k!="lexeme" and v!="":
                print numero,k,v
                numero+=1
        print
        formesZero=makeZeros(sampleFormes,inFormes=False)
        for k,v in formesZero.iteritems():
            if k!="lexeme":
                print numero,k,v
                numero+=1
        print

In [None]:
rules[("pi3S","ii1S")].sortirForme("Xi",contextFree=True)