In [1]:
%matplotlib inline

from math import log

def to_bin_entropy(ev):
    """Return a copy of ``ev`` whose ``data['Entropy']`` column is divided by ``log(2)``.

    ``log`` is the natural logarithm, so the division rescales the column by
    ``1 / ln(2)``.  Presumably the stored entropies are in nats and this
    converts them to bits -- confirm against the producer of ``ev``.

    The argument is left untouched: the conversion is applied to a deep copy.
    Mutating ``ev`` in place would make a second call on the same object
    silently rescale the values again.

    Parameters
    ----------
    ev : object exposing a ``data`` DataFrame with an ``Entropy`` column.

    Returns
    -------
    A deep copy of ``ev`` with the rescaled ``Entropy`` column.
    """
    from copy import deepcopy

    converted = deepcopy(ev)
    frame = converted.data
    # Bracket access always targets the column, never a same-named attribute.
    frame['Entropy'] = frame['Entropy'] / log(2)
    return converted

# Evaluation after epoch=159

In [2]:
from langmodels.evaluation import EvaluationResult

# Load the stored test evaluation for epoch 159 and pass it straight through
# to_bin_entropy, which divides its Entropy column by log(2).
evaluation159 = to_bin_entropy(EvaluationResult.from_path('evaluation-test-159'))

2020-07-06 12:33:57,212 [numexpr.utils] INFO: NumExpr defaulting to 4 threads.


## Different Projects

In [3]:
# Entropy aggregated by project; kept in a variable because the box plot in the
# next cell reuses it.
per_project_result = evaluation159.aggregate(['Project'])
per_project_entropies = per_project_result.data

# Display the projects ordered by ascending entropy.
per_project_entropies.sort_values(by='Entropy')

Unnamed: 0_level_0,n_samples,example,Entropy
Project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
postmark-java,3917,<comment></t>,5.671923
ibatis-handling-joins,1984,sql|Session|Factory</t>,5.762738
logmx,373,(</t>,5.85284
mod-installer,5460,getLog|o|File</t>,6.134351
servletjspdemo,699,(</t>,6.188181
MCBans,37271,"""%|B|AD|WOR|D|%""</t>",6.211582
RPS,4113,"""|""</t>",6.334267
fasthat,51121,"""</|b|>|<|/|p|>|<|h|r|>""</t>",6.402121
Clara,9765,inf|l|ate|_d|uplic|ate|Id|_|exception|Throw|n</t>,6.404533
twitterdroid,5438,{</t>,6.405383


In [4]:
# Box plot of the per-project entropy values.
per_project_entropies['Entropy'].plot(kind='box')

<matplotlib.axes._subplots.AxesSubplot at 0x12f16f0b8>

## Different Token Types

In [5]:
# Entropy aggregated by token type for the epoch-159 evaluation.
per_token_type_result = evaluation159.aggregate(['TokenType'])
per_token_type_result.data

Unnamed: 0_level_0,n_samples,example,Entropy
TokenType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ClosingBracket,500373,)</t>,0.93146
ClosingCurlyBracket,142263,}</t>,1.256516
Identifier,1800496,GR|AP|H_|BU|S_|EN|ABLE|D_|DEFAULT</t>,19.047863
KeyWord,582531,new</t>,2.718242
MultilineComment,42781,<comment></t>,2.480811
NonCodeChar,324974,.</t>,0.544293
Number,14603,11|7|2|26|16|2|15|7|4|17|40|19|l</t>,13.101308
One,9690,1</t>,4.219729
OneLineComment,29569,<comment></t>,3.218779
OpeningBracket,500366,(</t>,0.524268


## Identifiers of different lengths 

In [6]:
# Aggregate by (number of subtokens, token type), then show identifiers only.
by_length_and_type = evaluation159.aggregate(['SubtokenNumber', 'TokenType'])
df = by_length_and_type.data
df.query("TokenType == 'Identifier'")


Unnamed: 0_level_0,Unnamed: 1_level_0,n_samples,example,Entropy
SubtokenNumber,TokenType,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Identifier,1012973,args</t>,7.960462
2,Identifier,345641,replace|First</t>,21.316411
3,Identifier,213559,ren|ame|Attributes</t>,31.727469
4,Identifier,119540,os|g|i|utils</t>,43.659296
5,Identifier,62441,get|Temp|S|id|Option</t>,54.098427
6,Identifier,21921,EX|IT_|ON|_C|LO|SE</t>,62.416725
7,Identifier,9928,getIn|st|all|er|Mod|s|Folder</t>,72.291458
8,Identifier,5406,f|Ad|dition|al|Info|Aff|ord|ance</t>,80.691314
9,Identifier,2922,H|IGH|L|IGHT_|B|G|_COL|OR_|NAME</t>,90.079935
10,Identifier,1929,test|ContentAssist|Struct|In|Class|Typed|ef|Re...,99.343847


# Improvement with training

In [7]:
# Load the evaluations of the earlier checkpoints and rescale each one with
# to_bin_entropy, in the same order as before (epoch 0, then 19, then 79).
evaluation0, evaluation19, evaluation79 = (
    to_bin_entropy(EvaluationResult.from_path(path))
    for path in ('evaluation-test-0', 'evaluation-test-19', 'evaluation-test-79')
)

## Overall improvement

In [8]:
import matplotlib.pyplot as plt

# Pair every epoch with its evaluation in one place so the x and y values of
# the curve cannot drift out of sync when a checkpoint is added or removed.
evaluations_by_epoch = {
    0: evaluation0,
    19: evaluation19,
    79: evaluation79,
    159: evaluation159,
}
epochs = list(evaluations_by_epoch)
ents = [evaluation.total()['Entropy'] for evaluation in evaluations_by_epoch.values()]

# Label the axes so the figure can be read on its own. All four evaluations
# were passed through to_bin_entropy before this cell.
plt.xlabel('Epoch')
plt.ylabel('Entropy')
plt.plot(epochs, ents)

[<matplotlib.lines.Line2D at 0x131277e48>]

## Improvement for different token types

In [9]:
import pandas as pd

# One column per checkpoint, one row per token type.
epoch_evaluations = {
    'Epoch0': evaluation0,
    'Epoch19': evaluation19,
    'Epoch79': evaluation79,
    'Epoch159': evaluation159,
}

# Build the frame in a single constructor call. Filling an empty frame column
# by column takes the index from the first column only, so token types that
# appear only in later epochs would be dropped silently; here the Series are
# aligned on the union of their indexes instead.
df = pd.DataFrame({
    label: evaluation.aggregate(['TokenType']).data['Entropy']
    for label, evaluation in epoch_evaluations.items()
})
df.T

TokenType,ClosingBracket,ClosingCurlyBracket,Identifier,KeyWord,MultilineComment,NonCodeChar,Number,One,OneLineComment,OpeningBracket,OpeningCurlyBracket,Operator,Semicolon,SpecialToken,StringLiteral,Zero
Epoch0,2.280373,2.50265,24.938707,6.184727,3.810939,2.071541,17.599946,7.09161,3.613184,2.943186,3.311761,4.433158,2.50656,10.655935,27.415949,4.828429
Epoch19,1.461864,1.459246,21.448082,3.21422,2.726966,0.676256,14.858365,4.689852,3.176057,0.837317,0.873929,2.155033,0.908638,4.070022,23.178581,3.655467
Epoch79,1.032758,1.231907,19.555792,2.768031,2.461105,0.581638,13.481422,4.34957,3.062756,0.573191,0.538985,1.71035,0.491704,3.315053,21.747907,3.303918
Epoch159,0.93146,1.256516,19.047863,2.718242,2.480811,0.544293,13.101308,4.219729,3.218779,0.524268,0.525828,1.633154,0.461564,3.157015,21.327894,3.143124


In [10]:
# One line per token type, with the epochs along the x axis.
df.transpose().plot.line()

<matplotlib.axes._subplots.AxesSubplot at 0x131362978>

### A subset of token types

In [11]:
# Restrict the plot to a handful of token types to keep it readable.
selected_token_types = ['Identifier', 'ClosingBracket', 'MultilineComment', 'Semicolon', 'StringLiteral']
df.loc[selected_token_types].transpose().plot.line()

<matplotlib.axes._subplots.AxesSubplot at 0x12f1ac550>

## Improvement for identifiers of different lengths

In [12]:
import pandas as pd

# One column per checkpoint, one row per (SubtokenNumber, TokenType) pair,
# restricted to identifiers.
epoch_evaluations = {
    'Epoch0': evaluation0,
    'Epoch19': evaluation19,
    'Epoch79': evaluation79,
    'Epoch159': evaluation159,
}

# Build the frame in a single constructor call. Filling an empty frame column
# by column takes the index from the first column only, so identifier lengths
# that appear only in later epochs would be dropped silently; here the Series
# are aligned on the union of their indexes instead.
df = pd.DataFrame({
    label: (
        evaluation.aggregate(['SubtokenNumber', 'TokenType'])
        .data.query("TokenType == 'Identifier'")['Entropy']
    )
    for label, evaluation in epoch_evaluations.items()
})

df.T

SubtokenNumber,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,34
TokenType,Identifier,Identifier,Identifier,Identifier,Identifier,Identifier,Identifier,Identifier,Identifier,Identifier,...,Identifier,Identifier,Identifier,Identifier,Identifier,Identifier,Identifier,Identifier,Identifier,Identifier
Epoch0,12.090073,26.411617,39.576831,53.225523,66.935981,79.167289,92.542924,105.287079,117.892763,130.916377,...,259.891245,271.720076,274.22562,292.410469,307.111333,308.067481,318.942509,341.943333,360.408377,422.135943
Epoch19,9.434444,23.617907,35.154513,47.949357,59.467702,69.505746,80.83761,91.666053,102.375819,113.538913,...,212.215031,238.55557,229.751671,227.729115,257.140716,263.068199,269.740472,289.996591,321.318679,377.083282
Epoch79,8.274626,21.845917,32.467962,44.543857,55.165303,63.741104,73.825598,82.706575,92.29229,101.894071,...,182.727532,222.887129,208.499286,182.828281,230.954792,250.178164,243.333681,255.043425,307.680233,357.769967
Epoch159,7.960462,21.316411,31.727469,43.659296,54.098427,62.416725,72.291458,80.691314,90.079935,99.343847,...,174.990449,217.832242,203.280168,174.408783,224.735088,242.935416,232.89702,245.800417,302.709118,351.855709


In [13]:
# One line per identifier length; the legend is suppressed because there are
# too many lines for it to be useful.
df.transpose().plot.line(legend=False)

<matplotlib.axes._subplots.AxesSubplot at 0x131465f60>