# Script pour manipuler un tirage et comparer des colonnes

In [65]:
# -*- coding: utf8 -*-
import pandas as pd
import pickle, glob
import itertools as it
import networkx as nx
import numpy as np

In [2]:
def tiragesNom(nom):
    """Return the entries of the global ``listeTirages`` that contain ``nom``.

    The test is a plain substring match on each entry; the relative order
    of ``listeTirages`` is kept.
    """
    retenus = []
    for chemin in listeTirages:
        if nom in chemin:
            retenus.append(chemin)
    return retenus

# glob.glob returns its matches in arbitrary, filesystem-dependent order.
# Sort them so that positional picks made later (e.g. listeTirages100k[2])
# select the same file on every run and on every machine.
listeTirages=sorted(glob.glob("/Users/gilles/Box Sync/2015-Data/*-Tirage.pkl"))
# One sub-list per size tag; an entry is kept when its path contains the tag.
listeTirages200=tiragesNom("200Mo")
listeTirages20=tiragesNom("20Mo")
listeTirages1=tiragesNom("1Mo")
listeTirages50k=tiragesNom("50Ko")
listeTirages100k=tiragesNom("100Ko")


In [3]:
def lireLexique(nomLexique):
    """Load and return the object pickled in the file ``nomLexique``.

    The file is opened in binary mode and closed again before returning.

    NOTE: ``pickle.load`` can execute arbitrary code while unpickling, so
    this must only be pointed at files from a trusted source.
    """
    # The handle is deliberately not called ``input`` so that the builtin
    # of the same name is not shadowed.
    with open(nomLexique, 'rb') as fichier:
        lexique = pickle.load(fichier)
    return lexique

In [None]:
# Load the third "100Ko" draw and keep only the rows drawn at least once.
nomLexique = listeTirages100k[2]
lexique = lireLexique(nomLexique)
attestes = lexique[lexique["tir1"] > 0]
taille = attestes["tir1"].count()
# One row per lexeme, one column per case; when several "phono" values
# share a cell they are joined with commas.
paradigmes = pd.pivot_table(
    attestes,
    values='phono',
    index=['lexeme'],
    columns=['case'],
    aggfunc=lambda formes: ",".join(formes),
)
paradigmes = paradigmes.reset_index().reindex()


In [None]:
def _formesCompatibles(f1, f2):
    """Tell whether two differing cell values can still count as one form.

    A value containing a comma is a list of alternative forms.
    """
    if "," in f1:
        if "," in f2:
            # Both cells list several variants: only identical lists match.
            return f1 == f2
        return f2 in f1.split(",")
    return f1 in f2.split(",")


def compPaire(c1, c2, table=None, verbose=None):
    """Compare two columns of a paradigm table and report syncretism.

    Only rows where both columns are filled and hold different values are
    examined. The columns are treated as syncretic when none of those rows
    opposes two plain (comma-free) values and every remaining row is
    compatible, i.e. the plain value is one of the comma-separated
    alternatives of the other cell.

    Parameters:
        c1, c2: names of the two columns to compare.
        table: DataFrame of string cells to read; defaults to the global
            ``paradigmes``.
        verbose: print details when the columns differ on plain values;
            defaults to the global ``debug`` if it exists, else False.

    Returns:
        ``[u"c1 = c2"]`` when the columns are syncretic, otherwise ``[]``.
        (The list used to be built and then discarded, so every call
        returned None.)
    """
    if table is None:
        table = paradigmes
    if verbose is None:
        # Tolerate a missing global instead of raising NameError.
        verbose = globals().get("debug", False)

    syncretisms = []
    remplies = table[c1].notnull() & table[c2].notnull()
    paire = table[remplies & (table[c1] != table[c2])][[c1, c2]]
    formes = list(zip(paire[c1], paire[c2]))

    # Rows opposing two plain values are outright differences.
    lenDiff = sum(1 for f1, f2 in formes if "," not in f1 and "," not in f2)
    if lenDiff > 0:
        if verbose:
            print(u"%s ≠ %s" % (c1, c2))
            print("différence %d" % lenDiff)
            if lenDiff < 12:
                print(paire)
        return syncretisms

    # From here on every differing row has alternatives in at least one cell.
    surAbondant = paire
    if all(_formesCompatibles(f1, f2) for f1, f2 in formes):
        syncretisms.append(u"%s = %s" % (c1, c2))
    else:
        print(u"%s ≠ %s" % (c1, c2))
        print(surAbondant)
    return syncretisms

In [None]:
def assignerTNS(row):
    """Return the "TNS" code of a case label.

    When the label ends in a person digit (1-3) followed by S or P, its
    first two characters are returned; any other label yields "NF".
    """
    personnes = tuple(p + n for p in "123" for n in "SP")
    return row[:2] if row[-2:] in personnes else "NF"

def assignerPER(row):
    """Return the "PER" slot of a case label.

    Labels ending in a person digit (1-3) followed by S or P return that
    two-character ending. The six labels "inf", "pP", "ppMS", "ppMP",
    "ppFS" and "ppFP" are each assigned one fixed slot. Any other label
    yields None.
    """
    finies = ("1S", "2S", "3S", "1P", "2P", "3P")
    terminaison = row[-2:]
    if terminaison in finies:
        return terminaison
    # Fixed slot for each of the remaining recognised labels.
    casesFixes = (
        ("inf", "1S"),
        ("pP", "2S"),
        ("ppMS", "3S"),
        ("ppMP", "1P"),
        ("ppFS", "2P"),
        ("ppFP", "3P"),
    )
    for etiquette, case in casesFixes:
        if row == etiquette:
            return case
    return None

# Add the "TNS" and "PER" columns, each computed row by row from the "case"
# label with assignerTNS and assignerPER respectively.
lexique["TNS"]=lexique["case"].apply(assignerTNS)
lexique["PER"]=lexique["case"].apply(assignerPER)

In [None]:
# Build the table of one lexeme: one row per (lexeme, TNS) pair, one column
# per PER value; "phono" values sharing a cell are joined with commas.
lexeme = u"être"
formesLexeme = lexique[lexique["lexeme"] == lexeme]
tableauTest = formesLexeme.pivot_table(
    values='phono',
    index=['lexeme', 'TNS'],
    columns=["PER"],
    aggfunc=lambda formes: ",".join(formes),
)
tableauTest

In [None]:
# Row labels (TNS codes) in display order.
ligTable = ['pi', 'ii', 'fi', 'pc', 'ps', 'ai', 'is', 'pI', 'NF']
# Column labels: the three persons in the singular, then in the plural.
colTable = [personne + nombre for nombre in "SP" for personne in "123"]
# The same labels paired with the name of the level they belong to.
colTuple = list(zip(["PER"] * len(colTable), colTable))
ligTuple = list(zip(["TNS"] * len(ligTable), ligTable))

In [None]:
# Put the columns in the order listed in colTable.
tableauTest=tableauTest.reindex(columns=colTable)

In [None]:
# Order the rows as listed in ligTable. The previous version picked
# hard-coded positions out of the sorted index, which fails as soon as a TNS
# value is missing and silently drops any row beyond the eight listed
# positions (e.g. 'ps' when it is present).
niveauTNS = list(tableauTest.index.names).index('TNS')
idxTableau = [idx
              for tns in ligTable
              for idx in tableauTest.index
              if idx[niveauTNS] == tns]
tableauTest = tableauTest.reindex(idxTableau)
# Empty cells are rendered as "None" by to_latex; blank them out.
print(tableauTest.to_latex().replace("None", ""))

# Nombre de formes différentes par taille d'échantillon

In [None]:
# For every "100Ko" draw, record a (sum of counts, number of rows) pair
# computed over the rows whose "tir1" count is positive.
unsPoints = []
for tirage in listeTirages100k:
    lexique = lireLexique(tirage)
    effectifs = lexique[lexique["tir1"] > 0]["tir1"]
    tirs = effectifs.sum()
    taille = effectifs.count()
    unsPoints.append((tirs, taille))

In [123]:
# Hand-entered (sample size, number of forms) pairs; they become the
# "Sample Size" / "Number of Forms" columns of pdPoints below.
# The last eight pairs (200e6 to 1600e6) are the values printed by the
# cumulative 200Mo cell further down.
# NOTE(review): the origin of the first twenty pairs (1e6 to 20e6) is not
# visible in this file -- presumably an earlier run on smaller draws.
autresPoints=[(1000000,43563),
              (2000000,52654),
              (3000000,57648),
              (4000000,60647),
              (5000000,62926),
              (6000000,64507),
              (7000000,65694),
              (8000000,66597),
              (9000000,67395),
              (10000000,68049),
              (11000000,68590),
              (12000000,69007),
              (13000000,69382),
              (14000000,69690),
              (15000000,69959),
              (16000000,70178),
              (17000000,70379),
              (18000000,70547),
              (19000000,70704),
              (20000000,70859),  
              (200000000,74702),
              (400000000,76076),
              (600000000,77351),
              (800000000,78517),
              (1000000000,79565),
              (1200000000,80618),
              (1400000000,81578),
              (1600000000,82505),
             ]
# The points actually used by the following cells.
points=autresPoints

In [125]:
# Tabulate the points, indexed by sample size, and remember the sample
# sizes that were actually provided.
pdPoints = pd.DataFrame(points, columns=["Sample Size", "Number of Forms"])
pdPoints = pdPoints.set_index("Sample Size")
xPoints = list(pdPoints.index)
pdPoints

Unnamed: 0_level_0,Number of Forms
Sample Size,Unnamed: 1_level_1
1000000,43563
2000000,52654
3000000,57648
4000000,60647
5000000,62926
6000000,64507
7000000,65694
8000000,66597
9000000,67395
10000000,68049


In [136]:
%matplotlib

Using matplotlib backend: MacOSX


In [49]:
# Alternative kept for reference: default plot with a fixed y range.
#pdPoints.plot(ylim=(0,300000))
# Plot the number of forms against the sample size with thick lines and
# round markers, the x axis being limited to 0 .. 2000e6.
pdPoints.plot(linewidth=5,marker="o",markersize=10,xlim=(0,2000e6))

<matplotlib.axes._subplots.AxesSubplot at 0x12eaa4750>

# Nombre de lexèmes par nombre de formes par taille d'échantillon

In [6]:
# For each "200Mo" draw, count how many lexemes have 1, 2, 3, ... rows
# with a positive "tir1" count, and keep one such distribution per draw.
distributions = []
for nTirage, tirage in enumerate(listeTirages200):
    lexique = lireLexique(tirage)
    lexique1 = lexique[lexique["tir1"] > 0]
    # Number of retained rows per lexeme.
    nbFormes = lexique1.groupby(by=["lexeme"])[["phono"]].count()
    nbFormes.columns = ["Number of Forms"]
    # Number of lexemes for each row count.
    parNombre = nbFormes.groupby(by=["Number of Forms"])
    distNbFormes = parNombre[["Number of Forms"]].count()
    distNbFormes.columns = ["Number of lexemes"]
    distributions.append(distNbFormes)
    # Only the first five draws (nTirage 0 to 4) are plotted.
    if nTirage < 5:
        distNbFormes.plot(kind="bar", figsize=(20, 10), ylim=(0, 1000))

# Calculs sur les tirages de 200Mo cumulés

In [37]:
# Accumulate the "tir1" counts of the "200Mo" draws one after the other and,
# after each addition, print the cumulative (tokens, types) pair and compute
# the distribution of the number of attested cases per lexeme.
listeLexiques=listeTirages200
# The first lexicon only serves as a template: its counts are zeroed, then
# every draw (the first one included) is added inside the loop.
cumulLexique=lireLexique(listeLexiques[0])
cumulLexique["tir1"]=0
for nTirage,tirage in enumerate(listeLexiques):
    lexique=lireLexique(tirage)
    cumulLexique["tir1"]=cumulLexique["tir1"]+lexique["tir1"]
    lexique1=cumulLexique[cumulLexique["tir1"]>0]
    nbTokens=lexique1["tir1"].sum()
    nbTypes=lexique1["tir1"].count()
    # Printed as "(tokens,types)," so the lines can be pasted into a list.
    print("(%d,%d),"%(nbTokens,nbTypes))
    # One row per attested (lexeme, case) pair, then cases per lexeme.
    nbCases=lexique1.groupby(by=["lexeme","case"],as_index=False).agg({"tir1":"sum"})
    nbFormes=nbCases.groupby(by=["lexeme"])[["case"]].count()
    nbFormes.columns=["Number of Forms"]
    distNbFormes=nbFormes.groupby(by=["Number of Forms"])[["Number of Forms"]].count()
    distNbFormes.columns=["Number of lexemes"]
    # NOTE(review): this appends to the existing ``distributions`` list,
    # which must already be defined when this cell runs.
    distributions.append(distNbFormes)
    # The former test ``nTirage%1==5`` could never be true (x % 1 is always
    # 0), so nothing was ever plotted. Plot every fifth cumulative step.
    # NOTE(review): the intended period is a guess -- adjust if needed.
    if nTirage%5==0:
        distNbFormes.plot(kind="bar",figsize=(20,10),ylim=(0,1000))

(200000000,74702),
(400000000,76076),
(600000000,77351),
(800000000,78517),
(1000000000,79565),
(1200000000,80618),
(1400000000,81578),
(1600000000,82505),


In [144]:
# Put pdPoints on a regular grid of one million tokens (1e6 .. 1599e6) in
# addition to the measured sample sizes. A single reindex replaces the
# former row-by-row enlargement, which reallocated the frame 1599 times.
grille = range(1000000, 1600000000, 1000000)
indexComplet = pd.Index(sorted(set(pdPoints.index).union(grille)),
                        name=pdPoints.index.name)
pdPoints = pdPoints.reindex(indexComplet)
# Fill the new grid rows by linear interpolation over the index values.
# (The former ``order=4`` argument has no effect with method "slinear".)
pdPoints.interpolate(method="slinear", inplace=True)
# Increase in the number of forms from one grid row to the next; the
# result keeps "Sample Size" as its index.
derivativeNb = pdPoints.diff()

In [130]:
# Keep only the rows of derivativeNb that correspond to measured sample
# sizes (xPoints), indexed by "Sample Size".
# "Sample Size" may be either the index or a regular column of derivativeNb
# depending on which cells were run; indexing it as a column without this
# check raised KeyError when it was the index.
derivCols = (derivativeNb if "Sample Size" in derivativeNb.columns
             else derivativeNb.reset_index())
# set_index returns a new frame, which avoids modifying a filtered slice
# in place.
funcDerivativeNb = derivCols[derivCols["Sample Size"].isin(xPoints)].set_index("Sample Size")

In [149]:
# Plot the derivative for sample sizes of 150 million and above. ``.loc``
# with an integer bound replaces ``.ix`` with a string bound: ``.ix`` no
# longer exists in current pandas and the index holds integers.
funcDerivativeNb.loc[150000000:].plot()

<matplotlib.axes._subplots.AxesSubplot at 0x128ec3210>

In [141]:
# Display the derivative table restricted to the measured sample sizes.
funcDerivativeNb

Unnamed: 0_level_0,Number of Forms
Sample Size,Unnamed: 1_level_1
1000000,
2000000,9091.0
3000000,4994.0
4000000,2999.0
5000000,2279.0
6000000,1581.0
7000000,1187.0
8000000,903.0
9000000,798.0
10000000,654.0
