# Etude des préfixes exprimant le haut degré dans Le Monde (1985-2019)

liste des préfixes : ultra-, super-, supra-, hyper-, hypra-, sur-, extra-, méga-, giga-, archi-, maxi-

Dans ce travail, nous étudions d'abord les unigrammes (c.-à-d. les formes lexicales construites avec l'un des préfixes), puis les bigrammes (formes PREF-LEXIE ou PREF LEXIE).


In [1]:
import requests
import sys, csv,re, random, glob,os
import numpy as np                               # vectors and matrices
import pandas as pd                              # tables and data manipulations
import matplotlib.pyplot as plt
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6
import matplotlib
import seaborn as sns                            # more plots
from sklearn.metrics import mean_absolute_error
from matplotlib.backends.backend_pdf import PdfPages

# Chargement des stoplists

In [2]:
# Build the stoplist as a dict keyed by word (every value is 1) from all the
# .txt files found under ./stoplists/; blank lines are ignored.
stoplist = {}
files = glob.glob("./stoplists/*.txt")
for path in files:
    with open(path, mode="r", encoding="utf8") as handle:
        entries = (raw.strip() for raw in handle)
        stoplist.update((entry, 1) for entry in entries if entry)

print("Stoplist chargée : " + str(len(stoplist)) + " mots.")

Stoplist chargée : 50 mots.


# Récupération des formations préfixées avec annotations

format :

pref tab year tab count

superclasse/NOM	2017	1

supraterrestre/Adj	2017	1


In [3]:

# Tab-separated export with a header row; the cell output shows the columns
# pref, year and count.
prefixes_path = '../../lemonde1994-2006/prefixes_all.csv'
if not os.path.isfile(prefixes_path):
    # Raise instead of calling exit(): exit() ends the interpreter/kernel
    # session and discards its state, whereas an exception simply stops
    # "Run All" at this cell with a readable message.
    raise FileNotFoundError("Please first launch X.py to prefixes_all.csv")

print("Loading the Ngram big file ...")
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines='skip'`; it is kept because the rest of the notebook appears
# to target an older pandas -- confirm before upgrading.
df = pd.read_csv(prefixes_path, header=0, sep="\t", error_bad_lines=False)


df.info()
df.head()

Loading the Ngram big file ...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10451 entries, 0 to 10450
Data columns (total 3 columns):
pref     10451 non-null object
year     10451 non-null int64
count    10451 non-null int64
dtypes: int64(2), object(1)
memory usage: 245.0+ KB


Unnamed: 0,pref,year,count
0,superclasse/NOM,2017,1
1,supraterrestre/Adj,2017,1
2,hyperémotivité/Noun,2017,2
3,extraterrestre/Noun,2017,310
4,hyperémotivité/NOM,2017,1


# format standard

In [4]:
# Reshape the long (pref, year, count) table into one row per annotated form
# and one column per year; duplicate (pref, year) pairs are summed.
df = pd.pivot_table(df, values='count', index=['pref'], columns=['year'], aggfunc=np.sum)
print(df.info())
print(df.head())
print(type(df))
# A form that is unattested in a given year gets a count of 0 instead of NaN.
df = df.fillna(0)
df.to_csv("df_lemonde_pivot.csv")

<class 'pandas.core.frame.DataFrame'>
Index: 4648 entries, archi-Khanabad/NAM to ultraïste/NOM
Data columns (total 15 columns):
1987    765 non-null float64
1988    728 non-null float64
1989    817 non-null float64
1990    734 non-null float64
1991    730 non-null float64
1992    785 non-null float64
1994    777 non-null float64
2002    1002 non-null float64
2003    740 non-null float64
2004    839 non-null float64
2005    702 non-null float64
2006    630 non-null float64
2016    408 non-null float64
2017    399 non-null float64
2018    395 non-null float64
dtypes: float64(15)
memory usage: 581.0+ KB
None
year                1987  1988  1989  1990  1991  1992  1994  2002  2003  \
pref                                                                       
archi-Khanabad/NAM   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   
archi-Satan/NOM      NaN   NaN   1.0   NaN   NaN   NaN   NaN   NaN   NaN   
archi-Trading/NAM    NaN   NaN   1.0   1.0   NaN   NaN   NaN   NaN   NaN   
archi-b

# extraction composants d'information

In [5]:
# 
# Split every annotated form "PREFIX[-]BASE/POS" into its components.
df = df.reset_index()

# NOTE(review): the notebook introduction also lists giga-, which is absent
# from this alternation -- confirm whether that is intentional.
PREFIX_ALTERNATIVES = 'ultra|sur|super|hyper|hypra|extra|méga|archi|maxi|supra'

# A single pattern with named groups replaces four near-identical patterns that
# differed only in which group was capturing. "/" needs no escaping in a Python
# regex; the former '\/' inside non-raw strings was an invalid string escape.
form_re = (r'^(?P<prefix>' + PREFIX_ALTERNATIVES + r')'
           r'(?P<sep>-)?(?P<wordpart>.*)/(?P<pos>.+)$')
# `word` is everything before the last "/", whether or not a prefix matched.
word_re = r'^(.+)/(?:.+)$'

parts = df.pref.str.extract(form_re, expand=True)
df['prefix'] = parts['prefix']
df['word'] = df.pref.str.extract(word_re, expand=False)
df['wordpart'] = parts['wordpart']
df['sep'] = parts['sep']      # NaN when the form has no hyphen after the prefix
df['pos'] = parts['pos']

# Drop forms listed in the stoplist.
df = df[~df.word.isin(list(stoplist))]
# Drop forms whose base starts with an uppercase ASCII letter (e.g. "archi-Satan").
# na=False keeps rows where no prefix matched so that they show up in the NaN
# report below instead of breaking the boolean negation.
df = df[~df.wordpart.str.contains("^[A-Z]", na=False)]
# index=str turns the row labels into strings, as the original in-place rename did.
df = df.rename(index=str, columns={'pref':'string'})
print(df.info())
print(df.head())
# Rows for which no prefix could be extracted, if any.
print(df[df.prefix.isna()])
df = df.fillna('')
print(df.info())
print(df.head(10))

df.to_csv("df_lemonde.csv")

<class 'pandas.core.frame.DataFrame'>
Index: 4004 entries, 3 to 4647
Data columns (total 21 columns):
string      4004 non-null object
1987        4004 non-null float64
1988        4004 non-null float64
1989        4004 non-null float64
1990        4004 non-null float64
1991        4004 non-null float64
1992        4004 non-null float64
1994        4004 non-null float64
2002        4004 non-null float64
2003        4004 non-null float64
2004        4004 non-null float64
2005        4004 non-null float64
2006        4004 non-null float64
2016        4004 non-null float64
2017        4004 non-null float64
2018        4004 non-null float64
prefix      4004 non-null object
word        4004 non-null object
wordpart    4004 non-null object
sep         2091 non-null object
pos         4004 non-null object
dtypes: float64(15), object(6)
memory usage: 688.2+ KB
None
year              string  1987  1988  1989  1990  1991  1992  1994  2002  \
3        archi-battu/VER   1.0   1.0   0.0   0.0   0.0

In [9]:
# Inspect the numeric columns of `df` (the yearly count columns).
#print(df.info())
# dtype names passed to select_dtypes to keep integer and float columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = df.select_dtypes(include=numerics)
print(newdf.info())
# NOTE(review): the line below assigns a Series indexed by column name to a row-indexed
# frame; that is presumably what produced the all-null `full_count` column visible in
# the saved output of this cell -- confirm and delete.
#df['full_count'] = df.select_dtypes(include=numerics).sum(numeric_only=True)
print(df.head())


<class 'pandas.core.frame.DataFrame'>
Index: 4004 entries, 3 to 4647
Data columns (total 16 columns):
1987          4004 non-null float64
1988          4004 non-null float64
1989          4004 non-null float64
1990          4004 non-null float64
1991          4004 non-null float64
1992          4004 non-null float64
1994          4004 non-null float64
2002          4004 non-null float64
2003          4004 non-null float64
2004          4004 non-null float64
2005          4004 non-null float64
2006          4004 non-null float64
2016          4004 non-null float64
2017          4004 non-null float64
2018          4004 non-null float64
full_count    0 non-null float64
dtypes: float64(16)
memory usage: 531.8+ KB
None
year              string  1987  1988  1989  1990  1991  1992  1994  2002  \
3        archi-battu/VER   1.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   
4        archi-bondé/VER   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   
5      archi-bondées/ADJ   1.0   0.0   0.0   0.0

# Generate dataframes with relative frequency

In [None]:
# load yearly corpus stats (total words)
from datetime import datetime

def load_lemonde_counts(fn):
    """
    Read a tab-separated file of yearly totals and return them as a dict.

    Each usable line holds exactly two tab-separated fields: a four-digit
    year and an integer count. Lines with any other number of fields, and
    the header line whose first field is 'year', are skipped.

    :param fn: path of the UTF-8 encoded file to read
    :return: dict mapping the year (int) to its count (int)
    :raises ValueError: if a two-field line has a year that does not parse
        with '%Y' or a count that is not an integer
    """
    counts_by_year = {}
    with open(fn, mode="r", encoding="utf8") as handle:
        for raw in handle:
            fields = raw.strip().split("\t")
            if len(fields) != 2 or fields[0] == 'year':
                continue
            # strptime validates the year format before it is used as a key
            parsed = datetime.strptime(fields[0], '%Y')
            counts_by_year[parsed.year] = int(fields[1])
    return counts_by_year

# Load the yearly totals of the corpus; `totals` maps each year (int) to the
# integer read from the second column of frequency.csv.
totals = load_lemonde_counts("../../lemonde1994-2006/frequency.csv")
#print(totals)
# One row per year, with a single unnamed column holding the total.
dfcount = pd.DataFrame.from_dict(totals, orient='index')
#dfcount.index.sort_values()
#print(dfcount)
# Bar chart of the totals in chronological order.
dfcount.sort_index().plot(kind="bar", title="Evolution de la taille des corpus Le Monde")
#pdf0.savefig()
#plt.close()


# now create rel version for dataframe (df)

In [None]:
# Relative-frequency copy of `df`: every yearly count is divided by the corpus
# total of that year (`totals`) and multiplied by 1000.
df_rel = df.copy(deep=True)
df_rel = df_rel.fillna(0)

# Only the years present both in `totals` and in the table can be normalised.
# A year listed in frequency.csv but absent from `df` used to raise a KeyError.
shared_years = [year for year in totals.keys() if year in df_rel.columns]
skipped_years = [year for year in totals.keys() if year not in df_rel.columns]
if skipped_years:
    print("Years without prefix data (ignored): " + str(sorted(skipped_years)))

for year in shared_years:
    df_rel[str(year) + '_freqrel'] = (df_rel[year] / totals[year]) * 1000

# Remove the absolute frequencies (relative frequency is what gets plotted),
# then give the relative columns the plain year as a *string* label.
df_rel = df_rel.drop(shared_years, axis=1)
# rename() with a callable tolerates non-string labels, unlike the .str accessor.
df_rel = df_rel.rename(
    columns=lambda label: label.replace('_freqrel', "") if isinstance(label, str) else label
)
df_rel = df_rel.infer_objects()
print(df_rel.info())
print(df_rel.head(10))

df_rel.to_csv("dfrel_lemonde.csv")

# Génération synthèse

In [None]:
# for moving average
# definition of plot for all measures
def plot_rolling(df,title, window=10):
    """
    Plot three stacked panels for a time series and its transformations.

    Each panel shows one column of `df` together with its rolling mean and
    rolling standard deviation.

    :param df: DataFrame with the columns 'data', 'z_data' and 'zp_data'
    :param title: figure title
    :param window: size of the rolling window, also shown in the legend
    """
    # (column, legend label) for each of the three panels, top to bottom
    panels = (
        ('data', 'raw data'),
        ('z_data', 'de-trended data'),
        # NOTE(review): this label says "5 lag" but the lag is chosen by the
        # caller when it builds zp_data -- confirm the wording.
        ('zp_data', '5 lag differenced de-trended data'),
    )
    # The legend now reports the window actually used; with the default
    # window the text is unchanged.
    mean_label = "rolling mean (window={})".format(window)
    std_label = "rolling std (window={})".format(window)

    fig, ax = plt.subplots(3,figsize=(20, 10))
    for axis, (column, label) in zip(ax, panels):
        values = df[column]
        axis.plot(df.index, values, label=label)
        axis.plot(values.rolling(window=window).mean(), label=mean_label)
        axis.plot(values.rolling(window=window).std(), label=std_label)
        axis.legend()
    fig.suptitle(title, fontsize=13)
    plt.tight_layout()
    fig.autofmt_xdate()


In [None]:
print(list(df.select_dtypes(include=[np.number]).columns))
print(list(df_rel.select_dtypes(include=[np.number]).columns))
# Year columns of `df`. 'full_count' is excluded so that re-running this cell
# (after the total has been added below) does not treat the total as a year.
periods = [col for col in df.select_dtypes(include=[np.number]).columns if col != 'full_count']
# `df_rel` carries the same years as string labels.
periods2 = [str(i) for i in periods]
# Period actually covered by the data, used in the figure titles.
span = "entre {} et {}".format(min(periods), max(periods))

# The context manager finalises the PDF even if one of the plots fails.
with PdfPages('Prefix_synthesis_lemonde.pdf') as pdf0:
    # --- overview: all prefixes together --------------------------------
    # Selecting the year columns before sum() keeps string columns out of
    # the aggregation.
    group_data = df.groupby('prefix')[periods].sum()
    group_data2 = df_rel.groupby('prefix')[periods2].sum()
    fig, ax = plt.subplots(3, figsize=(45, 30))
    group_data2.T.plot(ax=ax[0],rot=45,figsize=(15,10),title="Evolution des fréquences relatives des préfixes (tous) " + span)
    ax[0].set_xticklabels([])
    group_data.T.plot(ax=ax[1],rot=45,figsize=(15,10),title="Evolution des fréquences absolues des préfixes (tous) " + span)
    group_data['full_count'] = group_data[periods].sum(axis=1)
    group_data['full_count'].plot(ax=ax[2],kind="bar",rot=45,figsize=(15,10),title="Distribution des fréquences totales entre préfixes")
    plt.tight_layout()
    pdf0.savefig()
    plt.close()

    # --- moving average / trend, one page per prefix --------------------
    # `df5` is reused by the stationarity cells further down.
    df5 = group_data2[periods2]
    for pref in df5.index.values:
        series = df5.loc[pref]
        ts = pd.DataFrame({'data':series.values}, index=series.index)

        # z-score against a 3-point rolling window, then a 3-step difference
        ts['z_data'] = (ts['data'] - ts.data.rolling(window=3).mean()) / ts.data.rolling(window=3).std()
        ts['zp_data'] = ts['z_data'] - ts['z_data'].shift(3)
        plot_rolling(ts,title= pref, window=10)
        pdf0.savefig()
        plt.close()

    # --- distribution by word for each prefix ---------------------------
    # Total occurrences per form: sum of the year columns only (summing the
    # whole row would mix in the string columns).
    df['full_count'] = df[periods].sum(axis=1)
    for pref in df.prefix.unique():
        if not pref:
            # rows for which no prefix was extracted have nothing to plot
            continue
        # Exact comparison: str.contains() would treat the prefix as a regex
        # and match substrings.
        subset = df[df.prefix == pref]
        ranked = subset.sort_values('full_count', ascending=False)

        fig, ax = plt.subplots(2, figsize=(20, 15))
        # keep the forms below the 0.9 quantile for the histogram
        q = ranked["full_count"].quantile(0.9)
        below_q = ranked.loc[ranked["full_count"] < q, 'full_count']
        # NOTE(review): distplot is deprecated in recent seaborn releases -- confirm the target version.
        sns.distplot(below_q, ax=ax[0]).set_title("Distribution des occurrences pour : " + pref + ' (90% quantiles)')
        sns.boxplot(x=ranked['full_count'],ax=ax[1]).set_title("Distribution des occurrences pour : " + pref + ' (global)')
        pdf0.savefig()
        plt.close()

        fig, ax = plt.subplots(2, figsize=(20, 15))
        yearly = subset.set_index('wordpart')[periods].T
        # Parse the labels as years; integer labels passed directly to
        # to_datetime are read as offsets from the epoch, not as years.
        yearly.index = pd.to_datetime(yearly.index.astype(str), format='%Y')
        five_year = yearly.resample('5AS').sum()
        # Both indicators are computed from the word columns only, so that
        # neither of them is counted as a word by the other.
        mean_diff = five_year.diff().mean(axis=1)
        distinct_words = five_year.astype(bool).sum(axis=1)
        mean_diff.plot(ax=ax[0], title=pref + " : accroissement / décroissement des fréquences pour les formations (mean of diff)")
        distinct_words.plot(ax=ax[1],title=pref + " : évolution du nombre de formations lexicales attestées")
        pdf0.savefig()
        plt.close()

        ranked.set_index('wordpart')[periods].head(50).T.plot(title= pref + " :  évolution des formations les plus fréquentes (50)")
        pdf0.savefig()
        plt.close()

In [None]:
# save dataframe df4
# save to excel
def save_report(report, key, outfile):
    """
    Write `report` to a single Excel workbook, one sheet per value of `key`.

    Within each sheet the rows are sorted by decreasing 'full_count' and
    indexed by 'wordpart'.

    :param report: DataFrame with at least the columns `key`, 'full_count'
        and 'wordpart'
    :param key: name of the column whose values define the sheets
    :param outfile: path of the workbook to write
    :return: True
    """
    # The context manager saves and closes the workbook, including when a
    # sheet fails to write; it replaces the explicit writer.save() call,
    # which recent pandas versions no longer provide.
    with pd.ExcelWriter(outfile) as writer:
        for k, grp in report.groupby(key):
            ordered = grp.sort_values('full_count', ascending=False).set_index('wordpart')
            ordered.to_excel(writer, sheet_name=str(k))
    return True

#print(df4.head())
# Export the whole table, one sheet per prefix.
# NOTE(review): the .xls extension relies on a legacy Excel writer engine that recent
# pandas releases may no longer support -- confirm, or switch the extension to .xlsx.
save_report(df, 'prefix','prefixes_lemonde1994-2018.xls')

## With Plotly

In [None]:
# with plotly
import plotly.plotly as py
import plotly.tools as pytools
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=False)
# SECURITY: a commented-out plotly credential (username and API key) used to
# sit here. It has been removed; the key should be revoked, and credentials
# should be read from the environment if they are ever needed again.
# NOTE(review): `plotly.plotly` (imported above as `py`) was moved to the
# separate chart_studio package in plotly 4, and `py`/`pytools` are unused in
# this cell -- confirm and drop those imports.

# Relative frequency per prefix, transposed to one row per year.
group_data = df_rel.groupby('prefix')[periods2].sum()
df6 = group_data.T
print(df6.head())
data = []
for pref in df6.columns.values:
    linegraph = go.Scatter(
        x=df6.index,      # year labels are strings here, which the date axis can parse
        y=df6[pref],
        name = pref,
        opacity = 0.8)
    data.append(linegraph)

layout = dict(
    # the title reports the period actually present in the data
    title='Evolution des fréquences relatives de {} à {}'.format(df6.index[0], df6.index[-1]),
    xaxis=dict(
        rangeselector=dict(
            # the data is yearly, so the range buttons step in years
            buttons=list([
                dict(count=5,
                     label='5a',
                     step='year',
                     stepmode='backward'),
                dict(count=10,
                     label='10a',
                     step='year',
                     stepmode='backward'),
                dict(step='all')
            ])
        ),
        rangeslider=dict(
            visible = True
        ),
        type='date'
    )
)

fig = dict(data=data, layout=layout)
# plot() does not create missing directories
os.makedirs("./plotly_graphs", exist_ok=True)
plot(fig, filename = "./plotly_graphs/Evolution_relative-prefixes.html")
iplot(fig)

In [None]:
# with plotly
import plotly.plotly as py
import plotly.tools as pytools
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=False)
# SECURITY: a commented-out plotly credential (username and API key) used to
# sit here. It has been removed; the key should be revoked, and credentials
# should be read from the environment if they are ever needed again.

# Absolute frequency per prefix, transposed to one row per year.
group_data = df.groupby('prefix')[periods].sum()
df6 = group_data.T
print(df6.head())
# The x axis is declared as a date axis: years are passed as strings so that
# they are parsed as years rather than as numeric timestamps.
year_labels = [str(year) for year in df6.index]
data = []
for pref in df6.columns.values:
    linegraph = go.Scatter(
        x=year_labels,
        y=df6[pref],
        name = pref,
        opacity = 0.8)
    data.append(linegraph)

layout = dict(
    # this figure shows absolute counts (the former title said "relatives")
    # over the period actually present in the data
    title='Evolution des fréquences absolues de {} à {}'.format(year_labels[0], year_labels[-1]),
    xaxis=dict(
        rangeselector=dict(
            # the data is yearly, so the range buttons step in years
            buttons=list([
                dict(count=5,
                     label='5a',
                     step='year',
                     stepmode='backward'),
                dict(count=10,
                     label='10a',
                     step='year',
                     stepmode='backward'),
                dict(step='all')
            ])
        ),
        rangeslider=dict(
            visible = True
        ),
        type='date'
    )
)

fig = dict(data=data, layout=layout)
# plot() does not create missing directories
os.makedirs("./plotly_graphs", exist_ok=True)
plot(fig, filename = "./plotly_graphs/Evolution_absolue-prefixes.html")
iplot(fig)

In [None]:
# with plotly -  for more flexibility, use Dash!
import plotly.plotly as py
import plotly.tools as pytools
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=False)
# SECURITY: a commented-out plotly credential (username and API key) used to
# sit here. It has been removed; the key should be revoked, and credentials
# should be read from the environment if they are ever needed again.

# One figure per prefix, with one line per base word (absolute yearly counts).
# Requires the 'full_count' column added to `df` by the synthesis cell.
figs = {}
prefs = ('ultra','super','hyper','hypra','extra','méga','archi','maxi','supra')
for pref in df.prefix.unique():
    # Exact comparison: str.contains() would treat the prefix as a regex and
    # match substrings (an empty prefix would match every row).
    subset = df[df.prefix == pref]
    if subset.wordpart.count() == 0:
        continue
    dfplot = subset.sort_values('full_count', ascending=False).set_index('wordpart')
    total_words = len(dfplot.index)
    total_occ = dfplot['full_count'].sum()
    dfplotT =  dfplot[periods].T
    # The x axis is declared as a date axis: years are passed as strings so
    # that they are parsed as years rather than as numeric timestamps.
    year_labels = [str(year) for year in dfplotT.index]
    data = []
    for word in dfplotT.columns.values:
        linegraph = go.Scatter(
            x=year_labels,
            y=dfplotT[word],
            name = word,
            opacity = 0.8)
        data.append(linegraph)

    layout = dict(
        # absolute counts are plotted (the former title said "relatives"),
        # over the period actually present in the data
        title= pref + " : évolution des fréquences absolues de " + year_labels[0] + " à " + year_labels[-1] + " ("  + str(total_words) + ' mots distincts, ' + str(int(total_occ)) + ' occurrences)',
        xaxis=dict(
            rangeselector=dict(
                # the data is yearly, so the range buttons step in years
                buttons=list([
                    dict(count=5,
                         label='5a',
                         step='year',
                         stepmode='backward'),
                    dict(count=10,
                         label='10a',
                         step='year',
                         stepmode='backward'),
                    dict(step='all')
                ])
            ),
            rangeslider=dict(
                visible = True
            ),
            type='date'
        )
    )

    figs[pref] = dict(data=data, layout=layout)

# Write one standalone HTML file per prefix; plot() does not create missing directories.
os.makedirs("./plotly_graphs", exist_ok=True)
for pref in figs.keys():
    plot(figs[pref], filename = "./plotly_graphs/Evolution-prefixes-" + pref + "-words2.html")

## get boxplot of frequency for each prefix
To be done : evolution of distribution (boxplot) + for every word

## Stationarity assessment

In [None]:
# This PDF stays open across the next cells (seasonality, autocorrelation);
# it is closed at the end of the autocorrelation cell.
pdf0 = PdfPages('Prefix_synthesis2.pdf')
# other means to detrend
from statsmodels.tsa.tsatools import detrend
for pref in df5.index.values:
    series = df5.loc[pref]
    data = pd.DataFrame({pref:series.values}, index=series.index)

    # detrend() with its default order: the fitted trend is recovered as the
    # difference between the series and its detrended version
    notrend = detrend(data[pref])
    data["notrend"] = notrend
    data["trend"] = data[pref] - notrend
    data.plot(y=[pref, "notrend", "trend"], figsize=(20,10), title=pref + ": detrending")
    pdf0.savefig()
    plt.close()

    # Order-2 trend, i.e. a fit of the form Y = α + βt + γt²
    notrend2 = detrend(data[pref], order=2)
    data["notrend2"] = notrend2
    data["trend2"] = data[pref] - data["notrend2"]
    data.plot(y=[pref, "notrend2", "trend2"], figsize=(20,10), title=pref + ": detrending and tendency")
    pdf0.savefig()
    plt.close()

    # Same on log(x + 1); computed on the whole column at once with the
    # module-level `np` instead of re-importing numpy on every iteration.
    data["logSess"] = np.log(data[pref] + 1)
    lognotrend = detrend(data['logSess'])
    data["lognotrend"] = lognotrend
    data["logtrend"] = data["logSess"] - data["lognotrend"]
    data.plot(y=["logSess", "lognotrend", "logtrend"], figsize=(20,10), title=pref + ": detrending and tendency (log)")
    pdf0.savefig()
    plt.close()
    

In [None]:
# composante saisonnière
from statsmodels.tsa.seasonal import seasonal_decompose
# Seasonal decomposition of each prefix series; pages are appended to the
# `pdf0` opened in the detrending cell.
for pref in df5.index.values:
    series = df5.loc[pref]
    data = pd.DataFrame({pref:series.values}, index=series.index)

    # `.values` replaces Series.as_matrix(), which was removed from pandas.
    # NOTE(review): recent statsmodels releases name this argument `period`
    # instead of `freq` -- confirm the installed version.
    res = seasonal_decompose(data[pref].values.ravel(), freq=7, two_sided=False)
    data["season"] = res.seasonal
    data["trendsea"] = res.trend
    data.plot(y=[pref, "season", "trendsea"], figsize=(20,10), title=pref + " : saisonnalité")
    pdf0.savefig()
    plt.close()
    # last 30 rows of the series (the whole series when it is shorter)
    data[-30:].plot(y=[pref, "season", "trendsea"], figsize=(20,10), title=pref + " : saisonnalité (30 dernières années)")
    pdf0.savefig()
    plt.close()
#pdf0.close()

In [None]:
# autocorrelation
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import periodogram
# Autocorrelation, partial autocorrelation and periodogram of each prefix
# series; pages are appended to the `pdf0` opened in the detrending cell.
for pref in df5.index.values:
    series = df5.loc[pref]
    data = pd.DataFrame({pref:series.values}, index=series.index)

    # The number of lags must stay below half the number of observations for
    # the partial autocorrelation; a fixed 40 is too many for short yearly
    # series, so it is capped by the series length.
    max_lags = max(1, min(40, len(data) // 2 - 1))

    fig = plt.figure(figsize=(12,8))
    ax1 = fig.add_subplot(211)
    fig = plot_acf(data[pref], lags=max_lags, ax=ax1)
    ax2 = fig.add_subplot(212)
    fig = plot_pacf(data[pref], lags=max_lags, ax=ax2)
    fig.suptitle(pref + " : autocorrelation")
    pdf0.savefig()
    plt.close()
    # NOTE(review): statsmodels' periodogram is deprecated in recent releases -- confirm the installed version.
    p = periodogram(data[pref])
    plt.plot(p)
    pdf0.savefig()
    plt.close()

pdf0.close()

## Decompose in trend, seasonality and residual

In [None]:
from random import randrange
from pandas import Series
from matplotlib import pyplot
#fig, ax = plt.subplots(1,1,figsize = (20,20))
from statsmodels.tsa.seasonal import seasonal_decompose


# Additive decomposition (trend / seasonal / residual) of each prefix series.
# `pdf0` is closed at the end of the autocorrelation cell, so saving into it
# here fails; these pages go to their own PDF, which the context manager
# finalises even if a decomposition fails.
with PdfPages('Prefix_synthesis_decomposition.pdf') as pdf_decomp:
    for pref in df5.index.values:
        series = df5.loc[pref]
        df2 = pd.DataFrame({'data':series.values}, index=series.index)

        # NOTE(review): recent statsmodels releases name this argument
        # `period` instead of `freq` -- confirm the installed version.
        res = seasonal_decompose(df2, model='additive', freq=1)
        res.plot()
        pdf_decomp.savefig()
        plt.close()

## Separate lines where you have POS info (word_POS, prefix_POS, _POS_)

In [None]:
# get row with pos info in another dataframe
# Separate the rows carrying "_"-style POS information from the others.
# NOTE(review): the forms loaded in this notebook are annotated as "form/POS";
# the "_" convention tested here may not match any row -- confirm.
has_pos_string = df.string.str.contains("_")
print(len(df[has_pos_string].index))
print("\n********************\n")
# named *_mask rather than `filter` to avoid shadowing the builtin
prefix_mask = df.prefix.str.contains("_")
print(len(df[prefix_mask].index))
print(df.prefix.unique())
print("\n********************\n")
word_mask = df.word.str.contains("_")
print(len(df[word_mask].index))
print(df[word_mask].word.unique())
print("\n********************\n")
print(len(df[prefix_mask & word_mask].index))
print("\n********************\n")

# Independent copies: later cells add columns to `dfpos`, which must not be
# a view on `df`.
dfpos = df[has_pos_string].copy()
dfraw = df[~has_pos_string].copy()
print(dfpos.info())
print(dfraw.info())


In [None]:
print(dfpos.info())
# Total 'full_count' per prefix, over the rows whose prefix contains no "_".
dfpos[~dfpos.prefix.str.contains('_')].groupby(['prefix'])[['full_count']].sum()
#print(dfpos[['string','prefix','word','full_count']].head(100))
#print(df.info())
# by prefix 
#dfpos.plot(subplots=True)
#sns.boxplot(x=grp.unstack().prefix, y=grp.unstack().full_count, data=grp.unstack())
    

In [None]:
# Column overview of the POS-annotated subset.
dfpos.info()

In [None]:
# fillna() returns a new frame: the result has to be assigned to take effect.
dfpos = dfpos.fillna(0)

# Split "form_POS" values at the first "_". partition() always yields three
# parts (left, separator, right), so the assignment also works when no value
# contains "_"; in that case the POS part is an empty string.
prefix_parts = dfpos['prefix'].str.partition('_')
dfpos['prefix1'] = prefix_parts[0]
dfpos['prefix_pos'] = prefix_parts[2]

word_parts = dfpos['word'].str.partition('_')
dfpos['word1'] = word_parts[0]
dfpos['word_pos'] = word_parts[2]

dfpos.head()

In [None]:
# No cell of this notebook defines `pdf`, so an unconditional pdf.close()
# raises NameError on a fresh kernel. Close it only if it exists.
if 'pdf' in globals():
    pdf.close()


In [None]:
columns = dfpos.columns
# Year columns: labels made of four digits. Labels are converted to str first
# because re.match() raises TypeError on integer labels.
periods = [elt for elt in columns if re.match("[0-9]{4}", str(elt))]
dfpos2 = dfpos[~dfpos.prefix.str.contains("_")]

# The context manager finalises the PDF even if one of the plots fails.
with PdfPages('Prefix-X_synthesis2.pdf') as pdf2:
    # Grouping on a plain column name keeps `k` a scalar, which the string
    # concatenations in the titles below rely on.
    for k, grp in dfpos2.groupby('prefix'):
        # yearly totals per POS tag for this prefix
        grp[grp.word_pos.str.contains("_")].sort_values(['word_pos','full_count'],ascending=False).groupby('word_pos')[periods].sum().T.plot(kind="line", title=k, rot=45, figsize=(20,10))
        pdf2.savefig()
        plt.close()
        # NOTE(review): these tag names may not match the tag set of this
        # corpus -- confirm.
        for k2, grp2 in grp[grp.word_pos.isin(['ADJ','ADV','NOUN','VERB'])].groupby('word_pos'):
            # yearly totals of the first 20 bases for this prefix and POS
            grp2.sort_values(['word1','full_count'],ascending=False).groupby(['word1'])[periods].sum().head(20).T.plot(kind="line", title=k + "-" + k2, figsize=(20,10) )
            pdf2.savefig()
            plt.close()

In [None]:
# Rows whose prefix contains no "_".
dfpos2 = dfpos[~dfpos.prefix.str.contains("_")]# & dfpos.word_pos.str.contains("_")
# Yearly totals per prefix (columns listed in `periods`), drawn as lines on a logarithmic y axis.
dfpos2.sort_values(['prefix','full_count'],ascending=False).groupby(['prefix'])[periods].sum().T.plot(kind="line", figsize=(20,10), logy=True)  # , subplots=True


In [None]:
# generate as bokeh dashboard : https://realpython.com/python-data-visualization-bokeh/
"""Bokeh Visualization Template
This template is a general outline for turning your data into a 
visualization using Bokeh.
"""
# Data handling
#import pandas as pd
#import numpy as np
from datetime import date
from random import randint

# Bokeh libraries
from bokeh.io import output_file, output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, CustomJS, HoverTool
from bokeh.layouts import row, column, gridplot # https://bokeh.pydata.org/en/latest/docs/user_guide/layout.html
from bokeh.models.widgets import Div, Tabs, Panel, Select,DataTable, DateFormatter, TableColumn

# Determine where the visualization will be rendered
# Determine where the visualization will be rendered
output_file('Prefixes_2grams.html')  # Render to static HTML, or
output_notebook()  # Render inline in a Jupyter Notebook

# Load and prepare the data: yearly totals of every prefix
dfpos2 = dfpos[~dfpos.prefix.str.contains("_")]
dfpos3 = dfpos2.sort_values(['prefix','full_count'],ascending=False).groupby(['prefix'])[periods].sum().T
print(dfpos2[['prefix','full_count']])
# Name the year axis 'index' explicitly so that reset_index() always produces
# the 'index' column used as x values below, whatever the axis was called.
df4 = dfpos3.rename_axis('index').reset_index()
data = df4.to_dict(orient='list')
source = ColumnDataSource(data)

# rendering
palette = ['#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd','#8c564b','#e377c2','#7f7f7f','#bcbd22','#17becf']
TOOLS = 'crosshair,hover,save,pan,box_zoom,reset,wheel_zoom'
TOOLTIPS = [
    ("Année", "@x"),
    ("Fréquence", "@y")]
# period actually present in the data, used in the figure titles
year_span = "de {} à {}".format(data['index'][0], data['index'][-1])


def _add_prefix_lines(target, values_of=lambda counts: counts):
    """Draw one line per prefix column of `data` on `target`.

    `values_of` transforms the yearly values before plotting. Colours cycle
    through `palette`, so more prefixes than colours is not an error.
    """
    drawn = 0
    for pref in data.keys():
        # 'index' holds the x values. NOTE(review): 'sur' is excluded from the
        # plots without explanation in the notebook -- confirm the reason.
        if pref in ('index','sur'):
            continue
        target.line(x=data['index'], y=values_of(data[pref]), legend=pref,
                    line_color=palette[drawn % len(palette)], line_width = 2)
        drawn += 1


# div element at the beginning of the page
div = Div(text="""<h1>Etude des préfixes exprimant la haute intensité à partir des données du journal Le Monde</h1>
<p>Le premier schéma propose les données générales : répartition temporelle des fréquences entre les différents préfixes.</p>
<p>Ensuite, vous pouvez voir le détail en choisissant un préfixe et éventuellement des restrictions (partie du discours et mot)</p><hr/><br/><hr/>

""", width=1000, height=100)

# general figure (overall distribution between prefixes)
# NOTE(review): fig0 is built but never added to the layout below -- confirm whether it is still needed.
# NOTE(review): np.log yields -inf for years with a zero count -- confirm this is acceptable.
fig0= figure(plot_height=300, plot_width=1000,title="Distribution des Prefixes (absolute frequency)", tooltips=TOOLTIPS, tools = TOOLS, toolbar_location='right')
_add_prefix_lines(fig0, np.log)

# general figure (absolute frequency)
fig2 = figure(plot_height=300, plot_width=1000,title="Evolution des Prefixes " + year_span + " (absolute frequency)", tooltips=TOOLTIPS, tools = TOOLS, toolbar_location='right')
_add_prefix_lines(fig2)
fig2.legend.click_policy = 'hide'
tab1 = Panel(child=fig2, title="Fréquence absolue")

# same series on a natural-log scale
fig1 = figure(plot_height=300, plot_width=1000,title="Evolution des Prefixes " + year_span + " (log frequency)", tooltips=TOOLTIPS, tools = TOOLS, toolbar_location='right')
_add_prefix_lines(fig1, np.log)
fig1.legend.click_policy = 'hide'
tab2 = Panel(child=fig1, title="Fréquence (log)")

tabs = Tabs(tabs=[ tab1, tab2 ],width=500)

# callback functions
# Defined after fig2: building it earlier referenced fig2 before it existed.
callback = CustomJS(args=dict(figure=fig2), code="""
    var f = cb_obj.value
    console.log(f)
    if (f == 'Log'){
    console.log(figure)
    figure.y_axis_type="log"}
    figure.trigger('change');
""")

# select
# NOTE(review): `legend=`, `callback=`, `Panel` and `plot_height`/`plot_width` belong to
# older bokeh APIs -- confirm the installed version before upgrading.
select0 = Select(title="Choissisez un préfixe", value="", options=list(dfpos3.columns.values), callback=callback)
select1 = Select(title="Choissisez une forme de mot", value="", options=[])

# one table column per column of df4
columns = []
for col in df4.columns.values:
    columns.append(TableColumn(field=col, title=col))
datatable = DataTable(source=source, columns=columns, width=1000, height=280, editable=True)


grid = gridplot([ [div],[tabs],[select0, select1, None], [datatable, None, None]])
show(grid)
# Preview and save 
#show(fig)  # See what I made, and save if I like it

In [None]:
# Horizontal bar charts of the first 20 bases of the prefix "extra", per POS
# tag and per snapshot year. (The body used to be indented under a
# commented-out `for`, which is an IndentationError.)
# NOTE(review): these snapshot years may not be columns of this corpus; years
# that are missing from the table are skipped -- confirm the years to show.
snapshot_years = ('1800','1850','1900','1950','2000')
for k, grp in dfpos[dfpos.prefix=='extra'].groupby('word_pos'):
    # skip POS values that end with "_"
    if re.search(r"_$", k) is None:
        for year in snapshot_years:
            if year not in grp.columns:
                continue
            grp.sort_values(['word1','full_count'],ascending=False).groupby(['word1'])[year].sum().head(20).plot(kind="barh", title=year + "-" + k )
            plt.show()

# TBD  : generate a bokeh interactive html dashboard with :
- list of prefix / list of words
- distribution by year for word_pos, word


In [None]:
# Display the first 100 rows of the POS-annotated subset.
dfpos.head(100)

In [None]:
# now plot timeline for each prefix
# Timeline of the total frequency of each prefix.
# Only the year columns are aggregated: 'full_count' is a total over all
# years and must not be plotted as if it were one more year.
year_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col != 'full_count']
span = "entre {} et {}".format(min(year_cols), max(year_cols))

df2 = df.drop(['word','string'],axis=1)

group_data = df2.groupby('prefix')[year_cols].sum()

group_data.T.plot(subplots=True,rot=45,figsize=(10,5),title="Evolution des fréquences totales des préfixes " + span)
group_data.T.plot(logy=True,rot=45,figsize=(10,5),title="Evolution des fréquences totales des préfixes " + span + " (Log Y)")
group_data.T.plot(rot=45,figsize=(10,5),title="Evolution des fréquences totales des préfixes " + span)


# just with POS

In [None]:
# Rows of the form "PREFIX _POS_": a bare prefix followed by a POS placeholder.
# Raw string so that \s reaches the regex engine unchanged; the group is
# non-capturing because str.contains() only tests for a match.
# NOTE(review): 'sur' is absent from this alternation, unlike the extraction
# patterns used earlier -- confirm whether that is intentional.
pref_re = r'^(?:ultra|super|hyper|hypra|extra|méga|archi|maxi|supra)\s+_.+?_$'
# regex takes a boolean; the former "True" string only worked because it is truthy
df2 = df[df.string.str.contains(pref_re, regex=True, case=False)]
print(df2.info())
print(df2.head())
