In [1]:
import glob
import os
import pandas as pd

PATH = "data/OpenClinica_core_src_main_java/"
FILE_MATCHER = "**/*."
FILE_EXTENSION = "java"
PATH_MATCHER = os.path.join(PATH, FILE_MATCHER) + FILE_EXTENSION
FILES_TO_EXCLUDE = ["package-info.java"]
corpus = pd.DataFrame(glob.glob(PATH_MATCHER, recursive=True), columns=["filepath"])
corpus['filename'] = corpus['filepath'].str.split(os.path.sep).str[-1]
corpus['typename'] = corpus['filename'].str.replace("." + FILE_EXTENSION, "", regex=False)
corpus = corpus[~corpus["filename"].isin(FILES_TO_EXCLUDE)].copy()
corpus.head()

Unnamed: 0,filepath,filename,typename
0,data/OpenClinica_core_src_main_java/org/akaza/...,AuditBean.java,AuditBean
1,data/OpenClinica_core_src_main_java/org/akaza/...,AuditEventBean.java,AuditEventBean
2,data/OpenClinica_core_src_main_java/org/akaza/...,CRFBean.java,CRFBean
3,data/OpenClinica_core_src_main_java/org/akaza/...,DeletedEventCRFBean.java,DeletedEventCRFBean
4,data/OpenClinica_core_src_main_java/org/akaza/...,DisplayStudyBean.java,DisplayStudyBean


In [2]:
def read_content(path):
    with open(path) as source_code:
        return source_code.read()

corpus['code'] = corpus['filepath'].apply(read_content)
corpus.head()

Unnamed: 0,filepath,filename,typename,code
0,data/OpenClinica_core_src_main_java/org/akaza/...,AuditBean.java,AuditBean,package org.akaza.openclinica.bean.admin;\n\ni...
1,data/OpenClinica_core_src_main_java/org/akaza/...,AuditEventBean.java,AuditEventBean,/*\n * OpenClinica is distributed under the\n ...
2,data/OpenClinica_core_src_main_java/org/akaza/...,CRFBean.java,CRFBean,/*\n * OpenClinica is distributed under the\n ...
3,data/OpenClinica_core_src_main_java/org/akaza/...,DeletedEventCRFBean.java,DeletedEventCRFBean,package org.akaza.openclinica.bean.admin;\n\ni...
4,data/OpenClinica_core_src_main_java/org/akaza/...,DisplayStudyBean.java,DisplayStudyBean,/*\n * OpenClinica is distributed under the\n ...


In [3]:
corpus['lines'] = corpus['code'].str.count("\n")
corpus.head()

Unnamed: 0,filepath,filename,typename,code,lines
0,data/OpenClinica_core_src_main_java/org/akaza/...,AuditBean.java,AuditBean,package org.akaza.openclinica.bean.admin;\n\ni...,241
1,data/OpenClinica_core_src_main_java/org/akaza/...,AuditEventBean.java,AuditEventBean,/*\n * OpenClinica is distributed under the\n ...,320
2,data/OpenClinica_core_src_main_java/org/akaza/...,CRFBean.java,CRFBean,/*\n * OpenClinica is distributed under the\n ...,126
3,data/OpenClinica_core_src_main_java/org/akaza/...,DeletedEventCRFBean.java,DeletedEventCRFBean,package org.akaza.openclinica.bean.admin;\n\ni...,63
4,data/OpenClinica_core_src_main_java/org/akaza/...,DisplayStudyBean.java,DisplayStudyBean,/*\n * OpenClinica is distributed under the\n ...,55


In [4]:
import re
import os.path
from pygments.token import Token
from pygments.lexers.jvm import JavaLexer

DOT_PATTERN = re.compile(r'(.*)\.(.*)')
CAMEL_CASE_1_PATTERN = re.compile(r'(.)([A-Z][a-z]+)')
CAMEL_CASE_2_PATTERN = re.compile(r'([a-z0-9])([A-Z])')
WORD_BOUNDARY_PATTERN = re.compile(r'[^a-zA-Z]')

JAVA_STOP_WORDS = set(["byte", "short", "int", "long", "float", "double", "char", "string", "object", "java"])
PRODUCT_NAME_STOP_WORDS = set(["openclinica"])
START_NAMES_TO_AVOID = set(["package", "import"])
STOP_LIST = JAVA_STOP_WORDS | PRODUCT_NAME_STOP_WORDS


def extract_identifiers(source_code, lexer, min_token_length):

    identifiers = []
    
    filtered_source_code = ""
    
    for line in source_code.split("\n"):
        beginning = line.split(" ", maxsplit=1)[0]
        if beginning not in START_NAMES_TO_AVOID:
            filtered_source_code += line
               
    for token_type, value in lexer.get_tokens(filtered_source_code):

        if token_type in Token.Name or Token:
            tokens = str(value)
            tokens = CAMEL_CASE_1_PATTERN.sub(r'\1 \2', tokens)
            tokens = CAMEL_CASE_2_PATTERN.sub(r'\1 \2', tokens)
            tokens = WORD_BOUNDARY_PATTERN.sub(' ', tokens)
            tokens = tokens.split(' ')

            for token in tokens:
                if len(token) >= min_token_length and token.lower() not in STOP_LIST:
                    identifiers.append(token.lower())

    return " ".join(identifiers)

LEXER = JavaLexer()
MIN_TOKEN_LENGTH = 3
# 
def extract_java_identifiers(source_code):
    return extract_identifiers(source_code, LEXER, MIN_TOKEN_LENGTH)

corpus['tokens'] = corpus['code'].apply(extract_java_identifiers)
corpus.head()

Unnamed: 0,filepath,filename,typename,code,lines,tokens
0,data/OpenClinica_core_src_main_java/org/akaza/...,AuditBean.java,AuditBean,package org.akaza.openclinica.bean.admin;\n\ni...,241,audit bean entity bean date audit date audit t...
1,data/OpenClinica_core_src_main_java/org/akaza/...,AuditEventBean.java,AuditEventBean,/*\n * OpenClinica is distributed under the\n ...,320,audit event bean auditable entity bean
2,data/OpenClinica_core_src_main_java/org/akaza/...,CRFBean.java,CRFBean,/*\n * OpenClinica is distributed under the\n ...,126,crf bean auditable entity bean status descript...
3,data/OpenClinica_core_src_main_java/org/akaza/...,DeletedEventCRFBean.java,DeletedEventCRFBean,package org.akaza.openclinica.bean.admin;\n\ni...,63,deleted event crf bean study event crf name cr...
4,data/OpenClinica_core_src_main_java/org/akaza/...,DisplayStudyBean.java,DisplayStudyBean,/*\n * OpenClinica is distributed under the\n ...,55,display study bean auditable entity bean study...


In [5]:
corpus['tokens_count'] = corpus['tokens'].apply(len)
corpus.head(15)

Unnamed: 0,filepath,filename,typename,code,lines,tokens,tokens_count
0,data/OpenClinica_core_src_main_java/org/akaza/...,AuditBean.java,AuditBean,package org.akaza.openclinica.bean.admin;\n\ni...,241,audit bean entity bean date audit date audit t...,2477
1,data/OpenClinica_core_src_main_java/org/akaza/...,AuditEventBean.java,AuditEventBean,/*\n * OpenClinica is distributed under the\n ...,320,audit event bean auditable entity bean,38
2,data/OpenClinica_core_src_main_java/org/akaza/...,CRFBean.java,CRFBean,/*\n * OpenClinica is distributed under the\n ...,126,crf bean auditable entity bean status descript...,69
3,data/OpenClinica_core_src_main_java/org/akaza/...,DeletedEventCRFBean.java,DeletedEventCRFBean,package org.akaza.openclinica.bean.admin;\n\ni...,63,deleted event crf bean study event crf name cr...,589
4,data/OpenClinica_core_src_main_java/org/akaza/...,DisplayStudyBean.java,DisplayStudyBean,/*\n * OpenClinica is distributed under the\n ...,55,display study bean auditable entity bean study...,234
5,data/OpenClinica_core_src_main_java/org/akaza/...,JDBCType.java,JDBCType,package org.akaza.openclinica.bean.admin;\n\n\...,184,jdbc type bit types bit tinyint types tinyint ...,1060
6,data/OpenClinica_core_src_main_java/org/akaza/...,NewCRFBean.java,NewCRFBean,/*\n * OpenClinica is distributed under the\n ...,821,new crf bean serializable data source dao dige...,293
7,data/OpenClinica_core_src_main_java/org/akaza/...,QueryObject.java,QueryObject,package org.akaza.openclinica.bean.admin;\n\ni...,30,query sql array list sql parameter sql paramet...,255
8,data/OpenClinica_core_src_main_java/org/akaza/...,SqlParameter.java,SqlParameter,package org.akaza.openclinica.bean.admin;\n\n\...,41,sql parameter value jdbc type type sql paramet...,268
9,data/OpenClinica_core_src_main_java/org/akaza/...,StudyEventAuditBean.java,StudyEventAuditBean,/*\n * OpenClinica is distributed under the\n ...,101,study event audit bean study event definition ...,944


In [6]:
# some files seem to have no tokens, problem needs to be analyzed further!
corpus = corpus[corpus['tokens_count'] != 0].copy()

In [7]:
corpus.to_csv("temp/openclinica_corpus.bz2", columns=["filepath", "typename", "tokens"], compression='bz2')

In [8]:
import pandas as pd
corpus = pd.read_csv('temp/openclinica_corpus.bz2', index_col=0)
corpus.head()

Unnamed: 0,filepath,typename,tokens
0,data/OpenClinica_core_src_main_java/org/akaza/...,AuditBean,audit bean entity bean date audit date audit t...
1,data/OpenClinica_core_src_main_java/org/akaza/...,AuditEventBean,audit event bean auditable entity bean
2,data/OpenClinica_core_src_main_java/org/akaza/...,CRFBean,crf bean auditable entity bean status descript...
3,data/OpenClinica_core_src_main_java/org/akaza/...,DeletedEventCRFBean,deleted event crf bean study event crf name cr...
4,data/OpenClinica_core_src_main_java/org/akaza/...,DisplayStudyBean,display study bean auditable entity bean study...


In [9]:
corpus['splitted_tokens'] = corpus['tokens'].str.split()
corpus[:5]

Unnamed: 0,filepath,typename,tokens,splitted_tokens
0,data/OpenClinica_core_src_main_java/org/akaza/...,AuditBean,audit bean entity bean date audit date audit t...,"[audit, bean, entity, bean, date, audit, date,..."
1,data/OpenClinica_core_src_main_java/org/akaza/...,AuditEventBean,audit event bean auditable entity bean,"[audit, event, bean, auditable, entity, bean]"
2,data/OpenClinica_core_src_main_java/org/akaza/...,CRFBean,crf bean auditable entity bean status descript...,"[crf, bean, auditable, entity, bean, status, d..."
3,data/OpenClinica_core_src_main_java/org/akaza/...,DeletedEventCRFBean,deleted event crf bean study event crf name cr...,"[deleted, event, crf, bean, study, event, crf,..."
4,data/OpenClinica_core_src_main_java/org/akaza/...,DisplayStudyBean,display study bean auditable entity bean study...,"[display, study, bean, auditable, entity, bean..."


In [10]:
tokens_list = corpus['tokens'].str.split()
tokens_list[5:]

5      [jdbc, type, bit, types, bit, tinyint, types, ...
6      [new, crf, bean, serializable, data, source, d...
7      [query, sql, array, list, sql, parameter, sql,...
8      [sql, parameter, value, jdbc, type, type, sql,...
9      [study, event, audit, bean, study, event, defi...
                             ...                        
732    [item, validator, validator, item, dao, item, ...
733    [response, set, validator, validator, logger, ...
734    [api, security, filter, once, per, request, fi...
735    [open, clinica, jdbc, service, jdbc, dao, impl...
736    [open, clinica, ldap, authorities, populator, ...
Name: tokens, Length: 655, dtype: object

In [11]:
from gensim import corpora
corpus["splitted_tokens"] = corpus['tokens'].str.split()
corpus.head()

Unnamed: 0,filepath,typename,tokens,splitted_tokens
0,data/OpenClinica_core_src_main_java/org/akaza/...,AuditBean,audit bean entity bean date audit date audit t...,"[audit, bean, entity, bean, date, audit, date,..."
1,data/OpenClinica_core_src_main_java/org/akaza/...,AuditEventBean,audit event bean auditable entity bean,"[audit, event, bean, auditable, entity, bean]"
2,data/OpenClinica_core_src_main_java/org/akaza/...,CRFBean,crf bean auditable entity bean status descript...,"[crf, bean, auditable, entity, bean, status, d..."
3,data/OpenClinica_core_src_main_java/org/akaza/...,DeletedEventCRFBean,deleted event crf bean study event crf name cr...,"[deleted, event, crf, bean, study, event, crf,..."
4,data/OpenClinica_core_src_main_java/org/akaza/...,DisplayStudyBean,display study bean auditable entity bean study...,"[display, study, bean, auditable, entity, bean..."


In [12]:
dictionary = corpora.Dictionary(list(corpus['splitted_tokens'].values))
print(dictionary)

Dictionary<1234 unique tokens: ['audit', 'bean', 'change', 'class', 'code']...>


In [13]:
corpus['token_ids'] = corpus["splitted_tokens"].map(dictionary.doc2bow)
corpus.head()

Unnamed: 0,filepath,typename,tokens,splitted_tokens,token_ids
0,data/OpenClinica_core_src_main_java/org/akaza/...,AuditBean,audit bean entity bean date audit date audit t...,"[audit, bean, entity, bean, date, audit, date,...","[(0, 37), (1, 4), (2, 7), (3, 2), (4, 2), (5, ..."
1,data/OpenClinica_core_src_main_java/org/akaza/...,AuditEventBean,audit event bean auditable entity bean,"[audit, event, bean, auditable, entity, bean]","[(0, 1), (1, 2), (8, 1), (10, 1), (36, 1)]"
2,data/OpenClinica_core_src_main_java/org/akaza/...,CRFBean,crf bean auditable entity bean status descript...,"[crf, bean, auditable, entity, bean, status, d...","[(1, 2), (5, 1), (8, 1), (36, 1), (37, 1), (38..."
3,data/OpenClinica_core_src_main_java/org/akaza/...,DeletedEventCRFBean,deleted event crf bean study event crf name cr...,"[deleted, event, crf, bean, study, event, crf,...","[(1, 1), (5, 22), (7, 10), (10, 15), (12, 6), ..."
4,data/OpenClinica_core_src_main_java/org/akaza/...,DisplayStudyBean,display study bean auditable entity bean study...,"[display, study, bean, auditable, entity, bean...","[(1, 5), (8, 1), (12, 2), (29, 2), (30, 4), (3..."


In [14]:
id_words = [[(dictionary[id], count) for id, count in line] for line in corpus['token_ids'].values]
print(id_words)



In [15]:
id2word = corpora.Dictionary(list(corpus['splitted_tokens'].values))

corpus_new = []
for text in corpus['splitted_tokens'].values:
    new  = id2word.doc2bow(text)
    corpus_new.append(new)

print (corpus_new[0][0:20])

word = id2word[[0][:1][0]]
print (word)



[(0, 37), (1, 4), (2, 7), (3, 2), (4, 2), (5, 28), (6, 14), (7, 20), (8, 15), (9, 2), (10, 35), (11, 7), (12, 23), (13, 2), (14, 7), (15, 7), (16, 14), (17, 7), (18, 42), (19, 7)]
audit


In [16]:
from gensim.models import LdaModel

lda_model = LdaModel(
    corpus = corpus_new,
    num_topics=50,
    id2word=id2word,
    random_state=0,
    passes=10
)
print(lda_model.show_topics(formatted=False))

[(16, [('type', 0.19186799), ('response', 0.14353864), ('data', 0.08264495), ('list', 0.06635962), ('bean', 0.057494223), ('set', 0.055327695), ('item', 0.053487934), ('get', 0.037027813), ('name', 0.030206174), ('present', 0.023433693)]), (38, [('study', 0.17640059), ('subject', 0.0799226), ('dao', 0.07534365), ('bean', 0.054096825), ('get', 0.050389733), ('data', 0.047445957), ('event', 0.047034267), ('source', 0.02643958), ('audit', 0.021604793), ('logs', 0.0186219)]), (15, [('query', 0.13276853), ('get', 0.0889122), ('class', 0.0731385), ('domain', 0.061742578), ('session', 0.05843485), ('hibernate', 0.04075509), ('current', 0.038456574), ('create', 0.03443242), ('find', 0.03176347), ('org', 0.031228721)]), (6, [('set', 0.12511572), ('label', 0.09775967), ('get', 0.09390698), ('audit', 0.07425884), ('ref', 0.073409006), ('hint', 0.06750496), ('appearance', 0.050742205), ('item', 0.042754043), ('reference', 0.035800155), ('log', 0.021714337)]), (18, [('note', 0.09356384), ('discrepa

In [17]:
import warnings
# we use an old library for generating this visualization
warnings.filterwarnings("ignore")

import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus_new, id2word, mds="mmds", R=30)
vis

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
