## CS/CG Dataset Exploration

**Ref:** https://github.com/wanyao1992/code_summarization_public

In [None]:
import os
import re
import codecs

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inside the notebook output cells.
%matplotlib inline
# Ask the inline backend for 'retina' (high-DPI) figure output.
%config InlineBackend.figure_format = 'retina'
# Activate seaborn's default plot theme for all subsequent figures.
sns.set()

In [None]:
def sanitize(line):
    """Replace the ` DCNL ` / ` DCSP ` markers in *line* with real whitespace.

    ` DCNL DCSP ` becomes a newline followed by one space, ` DCNL ` becomes a
    newline, and ` DCSP ` becomes a single space. Markers are only recognised
    when surrounded by spaces on both sides.
    """
    # Order matters: the combined marker has to be rewritten before its parts,
    # otherwise the ' DCNL ' rule would consume the space ' DCSP ' relies on.
    substitutions = (
        (' DCNL DCSP ', '\n '),
        (' DCNL ', '\n'),
        (' DCSP ', ' '),
    )
    for marker, replacement in substitutions:
        line = line.replace(marker, replacement)
    return line

# Single-character delimiters that separate tokens; the delimiters themselves
# are discarded. Written as a raw string so the backslashes reach the regex
# engine untouched, and with '[' and ']' escaped so they are treated as
# literal brackets (the previous `[|]` alternative was a character class that
# only matched '|', so brackets were never split on).
_TOKEN_DELIMITERS = re.compile(r'[.():;,!=\[\] ]')


def tokenize(line):
    """Split *line* into tokens on punctuation and spaces.

    The delimiters are ``. ( ) : ; , ! = [ ]`` and the space character; they
    are dropped from the output. Only the space character is a whitespace
    delimiter, so a token may still contain a newline or tab.

    Returns:
        A list of the non-empty, non-whitespace-only pieces, in order.
    """
    return [tok for tok in _TOKEN_DELIMITERS.split(line) if tok.strip()]

In [None]:
# Location of the parallel corpus: the two files are read in lockstep, so
# line i of the descriptions file is paired with line i of the code file.
root_dir = os.path.join(os.environ['HOME'], 'workspace/msc-research/raw-datasets/cscg')

anno_path = os.path.join(root_dir, 'data_ps.descriptions')
code_path = os.path.join(root_dir, 'data_ps.declbodies')

anno_data, code_data = [], []

# `with` guarantees both handles are closed, even if reading fails part-way.
# Undecodable bytes are dropped (errors='ignore') instead of raising.
with codecs.open(anno_path, 'r', 'utf-8', errors='ignore') as anno_stream, \
        codecs.open(code_path, 'r', 'utf-8', errors='ignore') as code_stream:
    # zip() ends at the shorter file, so only complete pairs are collected.
    for anno_line, code_line in zip(anno_stream, code_stream):
        a = anno_line.strip()
        c = code_line.strip()

        # A blank line is not end-of-file: skip the unusable pair and keep
        # reading, rather than silently truncating the rest of the dataset.
        if not a or not c:
            continue

        anno_data.append(a)
        code_data.append(c)

# Sanity check; holds by construction because both lists grow together.
assert len(anno_data) == len(code_data)

df = pd.DataFrame({'anno': anno_data, 'code': code_data})
# Token lists and token counts for both sides of each pair.
df['code_tok'] = df['code'].apply(lambda x: tokenize(sanitize(x)))
df['code_len'] = df['code_tok'].apply(len)
df['anno_tok'] = df['anno'].apply(lambda x: tokenize(sanitize(x)))
df['anno_len'] = df['anno_tok'].apply(len)

# Keep only the tokenised columns; the raw text is not used past this point.
df = df.drop(['anno', 'code'], axis=1)
# Longest annotations first, ties broken by longest code.
df.sort_values(by=['anno_len', 'code_len'], ascending=False, inplace=True)

print('dataset size:', df.shape)

In [None]:
# Show the code tokens of the first row of df, joined by single spaces.
print(' '.join(df['code_tok'].iloc[0]))

In [None]:
key = 'anno'
# The raw 'anno'/'code' text columns are dropped from df after tokenisation,
# so df[key] would raise KeyError. Use the precomputed '<key>_len' column
# (number of tokens per row) instead.
x = df[f'{key}_len']
print(f'{key} len: min {x.min()} | max {x.max()} | mean {x.mean():.3f}')
# Histogram restricted to rows whose length lies in [30, 50].
x[(30 <= x) & (x <= 50)].hist(bins=32)