## CS/CG Dataset Exploration

**Ref:** https://github.com/wanyao1992/code_summarization_public

In [1]:
import os
import re
import codecs
import pprint

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for the 'retina' figure format.
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default plot styling.
sns.set()

# Shared pretty-printer (wide lines, non-compact) used to display samples below.
pp = pprint.PrettyPrinter(width=120, indent=4, compact=False)

In [None]:
def sanitize(line):
    """Expand the ``DCNL`` / ``DCSP`` placeholder markers found in *line*.

    The space-delimited markers are rewritten as follows:

    * ``' DCNL DCSP '`` becomes a newline followed by one space,
    * ``' DCNL '`` becomes a newline,
    * ``' DCSP '`` becomes a single space.

    Returns the rewritten string; *line* itself is not modified.
    """
    # Order matters: the combined marker has to be handled before its parts,
    # otherwise the single-marker rules would consume it first.
    substitutions = (
        (' DCNL DCSP ', '\n '),
        (' DCNL ', '\n'),
        (' DCSP ', ' '),
    )

    text = line
    for marker, replacement in substitutions:
        text = text.replace(marker, replacement)
    return text

# Single-character delimiters: . ( ) : space ; , ! = [ ] |
# The pattern is a raw string so the backslashes reach the regex engine
# untouched (the previous non-raw literal relied on invalid string escapes).
# The previous trailing alternative ``[|]`` was a character class matching
# only a literal pipe, so square brackets never acted as delimiters. Brackets
# are now split on; the pipe is kept so every earlier split still happens.
_TOKEN_DELIMITERS = re.compile(r'[.(): ;,!=\[\]|]')


def tokenize(line):
    """Split *line* into tokens on punctuation and space delimiters.

    Args:
        line: The text to split.

    Returns:
        A list of the non-empty, non-whitespace-only pieces of *line*, in
        their original order. The delimiters themselves are discarded.
    """
    pieces = _TOKEN_DELIMITERS.split(line)
    # Adjacent delimiters yield empty pieces; drop those and whitespace-only ones.
    return [piece for piece in pieces if piece.strip()]

In [None]:
# Dataset location under the user's home directory. expanduser('~') uses
# $HOME when it is set and falls back to the platform's lookup otherwise,
# instead of raising KeyError when HOME is missing from the environment.
root_dir = os.path.join(os.path.expanduser('~'), 'workspace/msc-research/raw-datasets/cscg')

# Two parallel text files: line i of one pairs with line i of the other.
anno_path = os.path.join(root_dir, 'data_ps.descriptions')
code_path = os.path.join(root_dir, 'data_ps.declbodies')

anno_data, code_data = [], []

# Context managers guarantee both streams are closed, even if reading fails.
# Undecodable bytes are dropped (errors='ignore').
with codecs.open(anno_path, 'r', 'utf-8', errors='ignore') as anno_stream, \
        codecs.open(code_path, 'r', 'utf-8', errors='ignore') as code_stream:
    # zip() stops as soon as either file is exhausted.
    for anno_line, code_line in zip(anno_stream, code_stream):
        a = anno_line.strip()
        c = code_line.strip()

        # Reading also stops at the first pair in which either side is blank.
        if not a or not c:
            break

        anno_data.append(a)
        code_data.append(c)

# Sanity check: the two lists are appended in lockstep, so they must match.
assert len(anno_data) == len(code_data)

# One row per (annotation, code) pair, plus token lists and token counts.
df = pd.DataFrame({'anno': anno_data, 'code': code_data})
df['code_tok'] = df['code'].apply(lambda x: tokenize(sanitize(x)))
df['code_len'] = df['code_tok'].apply(len)
df['anno_tok'] = df['anno'].apply(lambda x: tokenize(sanitize(x)))
df['anno_len'] = df['anno_tok'].apply(len)

# Longest annotations first; ties broken by longest code.
df.sort_values(by=['anno_len', 'code_len'], ascending=False, inplace=True)

print('dataset size:', df.shape)

In [None]:
def sample(n=1):
    """Draw *n* random rows from ``df`` and render their tokens as text.

    Each drawn row becomes a dict with two keys:

    * ``'anno'`` - the row's ``anno_tok`` tokens joined by single spaces,
    * ``'code'`` - the row's ``code_tok`` tokens joined by single spaces.

    Args:
        n: Number of rows to draw (passed to ``DataFrame.sample``).

    Returns:
        A single dict when ``n == 1``; otherwise a list with one dict per
        drawn row. Previously any ``n`` other than 1 raised ``TypeError``
        because the token lists were unpacked straight into ``str.join``.
    """
    rows = df.sample(n)

    records = [
        {'anno': ' '.join(anno_tok), 'code': ' '.join(code_tok)}
        for anno_tok, code_tok in zip(rows['anno_tok'], rows['code_tok'])
    ]

    # Keep the original return shape for the default single-row case.
    return records[0] if n == 1 else records


pp.pprint(sample(1))

### Length histograms

In [None]:
plt.figure(figsize=(16, 8))

# (token column, translucent RGBA fill) for each histogram drawn on the
# shared axes, in drawing order.
series_styles = (
    ('code_tok', (0, 0.5, 0.5, 0.3)),
    ('anno_tok', (0.1, 0, 0.8, 0.2)),
)

for key, face_color in series_styles:
    # Token count per row, as floats so the summary prints with decimals.
    d = df[key].apply(len).astype(float)
    print(f'{key} len: min {d.min():.3f} | max {d.max():.3f} | mean {d.mean():.3f}')

    # Only lengths in [0, 0.01 * number_of_rows] are plotted.
    # NOTE(review): the upper cutoff scales with the row count rather than
    # with the length distribution - presumably a tail-trimming heuristic;
    # confirm this is intended.
    xs = d[d.between(0, 0.01 * len(d))].values

    # hist() returns (counts, bin_edges, patches); only the edges are needed.
    _, x, _ = plt.hist(xs, bins=128, label=key, fc=face_color)

# Tick every 4th bin edge of the last histogram drawn. The edges are a NumPy
# array, so its own .round() is used; `np` is never imported in this notebook.
plt.xticks(x[::4].round())
plt.legend()

# Presumably kept so the cell ends on a statement and the notebook does not
# echo the return value of plt.legend().
pass