In [15]:
import numpy as np
import pandas as pd
import os
import re

In [2]:
# Keyword vocabulary per language. The keyword-counting functions in this
# notebook iterate over these lists, and the dictionary keys are the same six
# language names that get_performance uses as folder names and matrix columns.
# NOTE(review): the 'python' list contains 'exec' and 'print', which are
# keywords only in Python 2, and omits Python 3 keywords such as 'nonlocal',
# 'None', 'True' and 'False' — confirm which Python version the data targets.
# NOTE(review): 'defined?' (ruby) is the only entry that ends in a non-word
# character, which matters for any word-boundary based matching.
KEYWORDS_DICT = {
    'go': ['break', 'default', 'func', 'interface', 'select', 'case', 'defer', 'go', 'map', 'struct', 'chan', 'else',
           'goto', 'package', 'switch', 'const', 'fallthrough', 'if', 'range', 'type', 'continue', 'for', 'import',
           'return', 'var'],
    'java': ["abstract", "assert", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue",
             "default", "do", "double", "else", "extends", "false", "final", "finally", "float", "for", "goto",
             "if", "implements", "import", "instanceof", "int", "interface", "long", "native", "new", "null",
             "package", "private", "protected", "public", "return", "short", "static", "strictfp", "super", "switch",
             "synchronized", "this", "throw", "throws", "transient", "true", "try", "void", "volatile", "while"],
    'javascript' : ['break', 'do', 'instanceof', 'typeof', 'case', 'else', 'new', 'var', 'catch', 'finally', 'return',
                    'void', 'continue', 'for', 'switch', 'while', 'debugger', 'function', 'this', 'with', 'default',
                    'if', 'throw', 'delete', 'in', 'try', 'abstract', 'export', 'interface', 'static', 'boolean',
                    'extends', 'long', 'super', 'byte', 'final', 'native', 'synchronized', 'char', 'float', 'package',
                    'throws', 'class', 'goto', 'private', 'transient', 'const', 'implements', 'protected', 'volatile',
                    'double', 'import', 'public', 'enum', 'int', 'short'],
    'php' : ['__halt_compiler', 'abstract', 'and', 'array', 'as', 'break', 'callable', 'case', 'catch', 'class',
             'clone', 'const', 'continue', 'declare', 'default', 'die', 'do', 'echo', 'else', 'elseif', 'empty',
             'enddeclare', 'endfor', 'endforeach', 'endif', 'endswitch', 'endwhile', 'eval', 'exit', 'extends',
             'final', 'for', 'foreach', 'function', 'global', 'goto', 'if', 'implements', 'include', 'include_once',
             'instanceof', 'insteadof', 'interface', 'isset', 'list', 'namespace', 'new', 'or', 'print', 'private',
             'protected', 'public', 'require', 'require_once', 'return', 'static', 'switch', 'throw', 'trait', 'try',
             'unset', 'use', 'var', 'while', 'xor'], 
    'python' : ['and', 'as', 'assert', 'break', 'class', 'continue', 'def', 'del', 'elif', 'else', 'except', 'exec',
                'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'not', 'or', 'pass',
                'print', 'raise', 'return', 'try', 'while', 'with', 'yield'], 
    'ruby' : ["BEGIN", "END", "__FILE__", "__LINE__", "alias", "and", "begin", "break", "case", "class", "def",
              "defined?", "do", "else", "elsif", "end", "ensure", "false", "for", "if", "in", "module", "next", "nil",
              "not", "or", "redo", "rescue", "retry", "return", "self", "super", "then", "true", "undef", "unless",
              "until", "when", "while", "yield"]
}

In [3]:
def count_keyword_occurence(file):
    """Tally keyword hits per language using plain substring counting.

    Every keyword of every language in ``KEYWORDS_DICT`` is looked up with
    ``str.count``, so a keyword embedded in a longer word is counted as well
    (for example ``'in'`` inside ``'print'``).

    Parameters
    ----------
    file : str
        Text content of the source file to score.

    Returns
    -------
    dict
        Maps each of the six language names to the summed number of
        non-overlapping substring hits over that language's keywords.
    """
    totals = dict.fromkeys(('go', 'java', 'javascript', 'php', 'python', 'ruby'), 0)
    # Flatten the vocabulary into (language, keyword) pairs and add up the hits.
    vocabulary = (
        (lang, keyword)
        for lang, keywords in KEYWORDS_DICT.items()
        for keyword in keywords
    )
    for lang, keyword in vocabulary:
        totals[lang] += file.count(keyword)
    return totals

In [13]:
def count_keyword_occurence_regex(file):
    """Count whole-token keyword occurrences per language using regexes.

    Unlike plain substring counting, a keyword is only counted when it is not
    glued to surrounding word characters, so ``'in'`` is not counted inside
    ``'print'``.

    Parameters
    ----------
    file : str
        Text content of the source file to score.

    Returns
    -------
    dict
        Maps every language in ``KEYWORDS_DICT`` to the total number of
        whole-token matches over that language's keywords.
    """
    lang_occur_count = {language: 0 for language in KEYWORDS_DICT}
    for language, keywords in KEYWORDS_DICT.items():
        for word in keywords:
            # A plain r'\b...\b' wrapper breaks for keywords that end (or
            # start) with a non-word character such as ruby's 'defined?':
            # '\b' after '?' demands a following word character, so
            # 'defined?(x)' would never match. Guard an edge with a lookaround
            # only when that edge of the keyword is itself a word character;
            # for ordinary keywords this is equivalent to '\b'.
            leading_guard = r'(?<!\w)' if re.match(r'\w', word) else ''
            trailing_guard = r'(?!\w)' if re.search(r'\w\Z', word) else ''
            pattern = leading_guard + re.escape(word) + trailing_guard
            lang_occur_count[language] += len(re.findall(pattern, file))
    return lang_occur_count

In [18]:
def get_performance(keyword_function, base_file_path):
    """Build a confusion matrix for a keyword-count based language predictor.

    Each sub-folder of ``base_file_path`` named after one of the six supported
    languages is treated as the true label of the files it contains. Every
    file is read, scored with ``keyword_function``, and assigned the language
    with the highest count.

    Parameters
    ----------
    keyword_function : callable
        Takes the file content as a string and returns a dict mapping
        language name to a count.
    base_file_path : str
        Directory containing one sub-folder per language.

    Returns
    -------
    pandas.DataFrame
        One row per true language (column ``'real_y'``) and one count column
        per predicted language. Overall accuracy can be derived from it as the
        trace of the six count columns divided by their total.
    """
    languages = ['go', 'java', 'javascript', 'php', 'python', 'ruby']

    # Rows are the real labels, columns the predicted labels.
    confusion_matrix = pd.DataFrame({
        'real_y': languages,
        **{language: [0] * len(languages) for language in languages},
    })

    for file_dir in os.listdir(base_file_path):
        print(file_dir)
        if file_dir not in languages:
            # Skip anything that is not a language folder (e.g. '.DS_Store').
            continue

        is_real_row = confusion_matrix['real_y'] == file_dir
        language_dir = os.path.join(base_file_path, file_dir)
        for test_file in os.listdir(language_dir):
            # Use a context manager so the handle is closed right away rather
            # than leaking one open file per test sample.
            with open(os.path.join(language_dir, test_file), 'r') as source_file:
                file = source_file.read()

            # Count how many times each language's keywords occur.
            language_keyword_counts = keyword_function(file)

            # Predict the language with the highest keyword count. On ties
            # (including all-zero counts) max() keeps the first key it sees.
            pred_language = max(language_keyword_counts, key=language_keyword_counts.get)

            confusion_matrix.loc[is_real_row, pred_language] += 1

    return confusion_matrix

In [19]:
# Score the 'sample_data/test' folders with the substring-based counter; the
# bare last expression displays the returned confusion matrix.
print('Performance on full function files using string search')
get_performance(count_keyword_occurence, 'sample_data/test')

Performance on full function files using string search
go
python
java
php
javascript
ruby


Unnamed: 0,real_y,go,java,javascript,php,python,ruby
0,go,69,7,153,157,315,299
1,java,3,112,480,125,252,28
2,javascript,56,22,314,222,313,73
3,php,7,103,297,257,308,28
4,python,6,1,15,24,657,297
5,ruby,13,2,26,44,267,648


In [11]:
# Score the 'sample_data_augmented/test' folders with the substring-based counter.
# NOTE(review): the saved output of this cell is a 3-tuple (matrix, a scalar,
# a per-language array), whereas get_performance as defined in this notebook
# returns only the matrix. The output looks stale (this cell's execution count
# is lower than that of the function definition) — re-run to refresh it.
print('Performance on augmented function files using string search')
get_performance(count_keyword_occurence, 'sample_data_augmented/test')

Performance on augmented function files using string search
go
.DS_Store
python
java
php
javascript
ruby


(       real_y   go  java  javascript  php  python  ruby
 0          go  248    29         136  190     184   213
 1        java  134   170         296  139     225    36
 2  javascript  215    70         206  158     274    77
 3         php  137   186         163  190     290    34
 4      python   90     7          54   68     559   222
 5        ruby  112    31          69  109     239   440,
 0.30216666666666664,
 array([0.248, 0.17 , 0.206, 0.19 , 0.559, 0.44 ]))

In [20]:
# Score the 'sample_data/test' folders with the regex-based counter; the bare
# last expression displays the returned confusion matrix.
print('Performance on full function files using regex search')
get_performance(count_keyword_occurence_regex, 'sample_data/test')

Performance on full function files using regex search
go
python
java
php
javascript
ruby


Unnamed: 0,real_y,go,java,javascript,php,python,ruby
0,go,681,33,26,14,18,228
1,java,5,913,39,31,8,4
2,javascript,59,113,688,97,19,24
3,php,4,363,411,205,3,14
4,python,20,13,21,30,469,447
5,ruby,9,5,9,15,23,939


In [17]:
# Score the 'sample_data_augmented/test' folders with the regex-based counter.
# NOTE(review): the saved output of this cell is a 3-tuple (matrix, a scalar,
# a per-language array), whereas get_performance as defined in this notebook
# returns only the matrix. The output looks stale (this cell's execution count
# is lower than that of the function definition) — re-run to refresh it.
print('Performance on augmented function files using regex search')
get_performance(count_keyword_occurence_regex, 'sample_data_augmented/test')

Performance on augmented function files using regex search
go
.DS_Store
python
java
php
javascript
ruby


(       real_y   go  java  javascript  php  python  ruby
 0          go  695    56          23    7      25   194
 1        java  243   669          23   41      12    12
 2  javascript  368   221         294   59      30    28
 3         php  206   492         123  148       8    23
 4      python  207    38          71   66     352   266
 5        ruby  217    99          15   47      83   539,
 0.4495,
 array([0.695, 0.669, 0.294, 0.148, 0.352, 0.539]))