In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import re

### Chargement des données

In [None]:
def load(name):
    """Load a JSON-lines metrics file as a DataFrame indexed by relative time.

    The rows are ordered by their ``time`` value, ``time`` is rebased so that
    the first row sits at 0, and a ``step`` column holds the gap between each
    row and the one before it (missing for the first row). ``time`` is then
    moved to the index.
    """
    events = pd.read_json(name, lines=True)
    events = events.fillna(pd.NA)
    events = events.sort_values('time').reset_index(drop=True)
    # The index was just reset, so label 0 is the earliest row.
    events['time'] = events['time'] - events['time'][0]
    events['step'] = events['time'].diff()
    return events.set_index('time')

In [None]:
# One metrics file per run mode; both go through the same loader.
delay_df, fast_df = (
    load(path) for path in ('../metrics-delay.json', '../metrics-fast.json')
)

## Répartition des espacements de dates

In [None]:
# Bin edges shared by both `step` histograms so they can be compared directly.
bins = np.arange(start=0, stop=2e10, step=3e8)

In [None]:
# Histogram of the gaps between consecutive rows (delay run), log-scaled counts.
delay_df['step'].hist(bins=bins, log=True)

In [None]:
# Histogram of the gaps between consecutive rows (fast run), log-scaled counts.
fast_df['step'].hist(bins=bins, log=True)

## Répartition par type

In [None]:
# Log-scale bar chart of the number of rows per `name`: fast run first, then delay run.
[
    frame[['name']].assign(count=1).groupby('name').count().plot.bar(log=True)
    for frame in (fast_df, delay_df)
]

In [None]:
# Number of rows per `name` in the fast run.
fast_df.assign(count=1).groupby('name')[['count']].count()

In [None]:
# Per-`name` row count of the delay run minus that of the fast run.
(
    delay_df[['name']].assign(count=1).groupby('name').count()
    .sub(fast_df[['name']].assign(count=1).groupby('name').count())
    .rename(columns={'count': 'count_diff'})
)

On a 25 appels directs de moins en mode `delay`. Ce sont eux qui peuvent empêcher d'optimiser.

Y a-t-il un lien entre les -25 **direct** et les 26 **callSite** ?

# then : direct, fulfilled, pending
* **pending** arrive en attente d'I/O, particulièrement si on les rend longues.
* **fulfilled** arrive quand des résultats d'I/O sont déjà arrivés. Optimisation potentielle, mais on voit bien que c'est négligeable face à **direct**.

## direct

In [None]:
# Keep only the rows named 'direct' in each run.
fast_prim = fast_df.loc[fast_df['name'] == 'direct']
delay_prim = delay_df.loc[delay_df['name'] == 'direct']

In [None]:
# Non-null values per column for each `type` among the fast run's direct rows.
fast_prim.groupby(by='type').count()

In [None]:
# Non-null values per column for each `type` among the delay run's direct rows.
delay_prim.groupby(by='type').count()

> Beaucoup de primitives

In [None]:
# NOTE(review): this cell recomputes the same two selections as the first cell of
# the "direct" section; it yields identical values and looks like a leftover.
fast_prim = fast_df.loc[fast_df['name'] == 'direct']
delay_prim = delay_df.loc[delay_df['name'] == 'direct']

# callSite

In [None]:
# Number of 'callSite' rows per `apply` value in the fast run, most frequent first.
# After count(), the `name` column holds that number.
callSite_fast = (
    fast_df.loc[fast_df['name'] == 'callSite', ['name', 'apply']]
    .groupby('apply')
    .count()
    .sort_values(by='name', ascending=False)
)
callSite_fast

In [None]:
# Line plot of the per-site counts, in decreasing order.
callSite_fast.plot.line()

In [None]:
# Running total of the counts, divided by 10 — presumably so that it shares a
# readable scale with the raw counts on the same plot (TODO confirm).
callSite_fast_sum = callSite_fast.assign(
    count_sum=callSite_fast['name'].cumsum() / 10
)
callSite_fast_sum.plot()

Il n'y a que la moitié des **callSite** à optimiser

In [None]:
# Sites whose scaled running total is still below 4000.
callSite_fast_sum[callSite_fast_sum['count_sum'] < 4000]

# calledLambda

In [None]:
# Number of 'calledLambda' rows per `lambda` value in the fast run, most frequent first.
df = (
    fast_df.loc[fast_df['name'] == 'calledLambda', ['name', 'lambda']]
    .groupby('lambda')
    .count()
    .sort_values(by='name', ascending=False)
)
df

In [None]:
# Running total of the counts, divided by 5 — presumably a plot-scaling factor (TODO confirm).
df_sum = df.assign(count_sum=df['name'].cumsum() / 5)
df_sum.plot()

# pushStack

In [None]:
# Number of 'pushStack' rows per `expr` value in the fast run, most frequent first.
df = (
    fast_df.loc[fast_df['name'] == 'pushStack', ['name', 'expr']]
    .groupby('expr')
    .count()
    .sort_values(by='name', ascending=False)
)
df

In [None]:
# Running total of the counts, divided by 30 — presumably a plot-scaling factor (TODO confirm).
df_sum = df.assign(count_sum=df['name'].cumsum() / 30)
df_sum.plot()

# Conclusion
On voit bien, comme prévu, que certains éléments sont utilisés plus que d'autres. On a les éléments pour un compilateur...
Mais il faut être capable de prédire le type de **direct**.

# Éléments précédant un direct

In [None]:
# Boolean mask: rows named 'direct' whose `type` is anything other than 'then'.
direct_then_idx = (fast_df['name'] == 'direct') & (fast_df['type'] != 'then')

In [None]:
# Rows that come immediately before a row flagged in `direct_then_idx` and are
# not themselves named 'direct', counted per `name`.
# shift(fill_value=False) keeps the mask boolean; shift().fillna(False) goes
# through an object-dtype Series and relies on a downcast pandas has deprecated.
(
    fast_df[
        direct_then_idx.shift(periods=-1, fill_value=False)
        & (fast_df.name != "direct")
    ]
    .groupby('name')
    .count()
)

Un seul **popStack**

In [None]:
# 'callSite' rows that come immediately before a row flagged in `direct_then_idx`,
# counted per `apply` value and sorted with the most frequent first. Only the
# `name` column (which holds the count) is kept.
# shift(fill_value=False) keeps the mask boolean; shift().fillna(False) goes
# through an object-dtype Series and relies on a downcast pandas has deprecated.
hot_spots = (
    fast_df[
        direct_then_idx.shift(periods=-1, fill_value=False)
        & (fast_df.name == "callSite")
    ]
    .groupby('apply')
    .count()
    .sort_values('name', ascending=False)
)[['name']]
hot_spots

In [None]:
# Counts next to their running total; the total is divided by 4 — presumably a
# plot-scaling factor (TODO confirm).
hot_spots.assign(sum=lambda counts: counts['name'].cumsum() / 4).plot()

Quand on regarde le code associé aux appels directs, on peut voir que ce sont le plus souvent des appels de primitives.

In [None]:
# Label format: "<anything>@<line>:<column>-[<line>:]<column>".
# Groups: start line, start column, optional end line, end column.
PARSE_DEBUG_INFO_RE = re.compile(r".*@(\d+):(\d+)-(?:(\d+):)?(\d+)")

# Lines of the source file, line endings included, for lookup by line number.
with open('../tp/test.tp') as f:
    ORIGINAL_TEXT = list(f)

def find_orginal_text(s):
    """Return the part of ``ORIGINAL_TEXT`` designated by a debug-info label.

    ``s`` is matched against ``PARSE_DEBUG_INFO_RE``: ``...@L:C-C2`` for a span
    that stays on line ``L``, or ``...@L:C-L2:C2`` for a span running over
    several lines. Line and column numbers are treated as 1-based and the end
    column is excluded from the result.

    Raises:
        ValueError: if ``s`` does not have the expected label format.
    """
    m = PARSE_DEBUG_INFO_RE.match(s)
    if m is None:
        # Otherwise the failure shows up on the next line as an opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(f"not a debug-info label: {s!r}")
    start_line, start_column = int(m[1]), int(m[2])
    # The end line is absent from the label when the span stays on one line.
    end_line = int(m[3] or m[1])
    end_column = int(m[4])
    if start_line == end_line:
        return ORIGINAL_TEXT[start_line - 1][start_column - 1:end_column - 1]
    # Multi-line span: tail of the first line, whole lines in between, head of the last.
    first = ORIGINAL_TEXT[start_line - 1][start_column - 1:]
    middle = ORIGINAL_TEXT[start_line:end_line - 1]
    last = ORIGINAL_TEXT[end_line - 1][:end_column - 1]
    return ''.join([first, *middle, last])
    
    
# Quick check on a single-line label: line 20, from column 15 up to column 33.
find_orginal_text('Apply@20:15-33')

In [None]:
# Attach to each hot spot the source text its `apply` label points to.
hot_spots.assign(text=lambda spots: spots.index.map(find_orginal_text))

Les appels directs sont tous faits sur des constantes.

In [None]:
# Distinct `name` values among the rows that carry an `apply` value.
fast_df.loc[fast_df['apply'].notna(), 'name'].unique()

# Autres sites d'appels

In [None]:
# `apply` labels seen in the fast run that are not among the hot spots, with the
# source text each one points to.
# Filtering in pandas keeps the labels in order of first appearance; going
# through a `set` made the row order change from one kernel run to the next.
other_sites = fast_df['apply'].dropna().drop_duplicates()
cs = (
    other_sites[~other_sites.isin(hot_spots.index)]
    .to_frame()
    .reset_index(drop=True)
)
cs = cs.assign(text=cs['apply'].map(find_orginal_text))
cs