In [17]:
### Imports
import pandas as pd
import numpy as np
import re2 as re

import string

from typing import List

from libsvm.svmutil import *

In [15]:
# STR TO INT class mapping
class_mapping = {
    'TGLANG_LANGUAGE_OTHER': 0,
    'TGLANG_LANGUAGE_1S_ENTERPRISE': 1,
    'TGLANG_LANGUAGE_ABAP': 2,
    'TGLANG_LANGUAGE_ACTIONSCRIPT': 3,
    'TGLANG_LANGUAGE_ADA': 4,
    'TGLANG_LANGUAGE_APACHE_GROOVY': 5,
    'TGLANG_LANGUAGE_APEX': 6,
    'TGLANG_LANGUAGE_APPLESCRIPT': 7,
    'TGLANG_LANGUAGE_ASP': 8,
    'TGLANG_LANGUAGE_ASSEMBLY': 9,
    'TGLANG_LANGUAGE_AUTOHOTKEY': 10,
    'TGLANG_LANGUAGE_AWK': 11,
    'TGLANG_LANGUAGE_BASIC': 12,
    'TGLANG_LANGUAGE_BATCH': 13,
    'TGLANG_LANGUAGE_BISON': 14,
    'TGLANG_LANGUAGE_C': 15,
    'TGLANG_LANGUAGE_CLOJURE': 16,
    'TGLANG_LANGUAGE_CMAKE': 17,
    'TGLANG_LANGUAGE_COBOL': 18,
    'TGLANG_LANGUAGE_COFFESCRIPT': 19,
    'TGLANG_LANGUAGE_COMMON_LISP': 20,
    'TGLANG_LANGUAGE_CPLUSPLUS': 21,
    'TGLANG_LANGUAGE_CRYSTAL': 22,
    'TGLANG_LANGUAGE_CSHARP': 23,
    'TGLANG_LANGUAGE_CSS': 24,
    'TGLANG_LANGUAGE_CSV': 25,
    'TGLANG_LANGUAGE_D': 26,
    'TGLANG_LANGUAGE_DART': 27,
    'TGLANG_LANGUAGE_DELPHI': 28,
    'TGLANG_LANGUAGE_DOCKER': 29,
    'TGLANG_LANGUAGE_ELIXIR': 30,
    'TGLANG_LANGUAGE_ELM': 31,
    'TGLANG_LANGUAGE_ERLANG': 32,
    'TGLANG_LANGUAGE_FIFT': 33,
    'TGLANG_LANGUAGE_FORTH': 34,
    'TGLANG_LANGUAGE_FORTRAN': 35,
    'TGLANG_LANGUAGE_FSHARP': 36,
    'TGLANG_LANGUAGE_FUNC': 37,
    'TGLANG_LANGUAGE_GAMS': 38,
    'TGLANG_LANGUAGE_GO': 39,
    'TGLANG_LANGUAGE_GRADLE': 40,
    'TGLANG_LANGUAGE_GRAPHQL': 41,
    'TGLANG_LANGUAGE_HACK': 42,
    'TGLANG_LANGUAGE_HASKELL': 43,
    'TGLANG_LANGUAGE_HTML': 44,
    'TGLANG_LANGUAGE_ICON': 45,
    'TGLANG_LANGUAGE_IDL': 46,
    'TGLANG_LANGUAGE_INI': 47,
    'TGLANG_LANGUAGE_JAVA': 48,
    'TGLANG_LANGUAGE_JAVASCRIPT': 49,
    'TGLANG_LANGUAGE_JSON': 50,
    'TGLANG_LANGUAGE_JULIA': 51,
    'TGLANG_LANGUAGE_KEYMAN': 52,
    'TGLANG_LANGUAGE_KOTLIN': 53,
    'TGLANG_LANGUAGE_LATEX': 54,
    'TGLANG_LANGUAGE_LISP': 55,
    'TGLANG_LANGUAGE_LOGO': 56,
    'TGLANG_LANGUAGE_LUA': 57,
    'TGLANG_LANGUAGE_MAKEFILE': 58,
    'TGLANG_LANGUAGE_MARKDOWN': 59,
    'TGLANG_LANGUAGE_MATLAB': 60,
    'TGLANG_LANGUAGE_NGINX': 61,
    'TGLANG_LANGUAGE_NIM': 62,
    'TGLANG_LANGUAGE_OBJECTIVE_C': 63,
    'TGLANG_LANGUAGE_OCAML': 64,
    'TGLANG_LANGUAGE_OPENEDGE_ABL': 65,
    'TGLANG_LANGUAGE_PASCAL': 66,
    'TGLANG_LANGUAGE_PERL': 67,
    'TGLANG_LANGUAGE_PHP': 68,
    'TGLANG_LANGUAGE_PL_SQL': 69,
    'TGLANG_LANGUAGE_POWERSHELL': 70,
    'TGLANG_LANGUAGE_PROLOG': 71,
    'TGLANG_LANGUAGE_PROTOBUF': 72,
    'TGLANG_LANGUAGE_PYTHON': 73,
    'TGLANG_LANGUAGE_QML': 74,
    'TGLANG_LANGUAGE_R': 75,
    'TGLANG_LANGUAGE_RAKU': 76,
    'TGLANG_LANGUAGE_REGEX': 77,
    'TGLANG_LANGUAGE_RUBY': 78,
    'TGLANG_LANGUAGE_RUST': 79,
    'TGLANG_LANGUAGE_SAS': 80,
    'TGLANG_LANGUAGE_SCALA': 81,
    'TGLANG_LANGUAGE_SCHEME': 82,
    'TGLANG_LANGUAGE_SHELL': 83,
    'TGLANG_LANGUAGE_SMALLTALK': 84,
    'TGLANG_LANGUAGE_SOLIDITY': 85,
    'TGLANG_LANGUAGE_SQL': 86,
    'TGLANG_LANGUAGE_SWIFT': 87,
    'TGLANG_LANGUAGE_TCL': 88,
    'TGLANG_LANGUAGE_TEXTILE': 89,
    'TGLANG_LANGUAGE_TL': 90,
    'TGLANG_LANGUAGE_TYPESCRIPT': 91,
    'TGLANG_LANGUAGE_UNREALSCRIPT': 92,
    'TGLANG_LANGUAGE_VALA': 93,
    'TGLANG_LANGUAGE_VBSCRIPT': 94,
    'TGLANG_LANGUAGE_VERILOG': 95,
    'TGLANG_LANGUAGE_VISUAL_BASIC': 96,
    'TGLANG_LANGUAGE_WOLFRAM': 97,
    'TGLANG_LANGUAGE_XML': 98,
    'TGLANG_LANGUAGE_YAML': 99
}

In [16]:
### Helper functions and regexes
SPECIAL_SYMBOLS_REGEX = r"([.,;:\\\/{}\[\]\|!\"#\$%&\'\(\)\*\+\-\<\=\>\?@\^\`\)])"
SPECIAL_SYMBOLS_REGEX_2 = r"(\b\w+\b|[.,;:\\\/{}\[\]\|!\"#\$%&\'\(\)\*\+\-\<\=\>\?@\^\`\~)])"

INT_REGEX = "-?\d+"
FLOAT_REGEX = "-?\d*[.,]\d+"
HEX_REGEX = "0[xX]([0-9a-fA-F])+"
OCTAL_REGEX = "0[oO]([0-7])+"
BINARY_REGEX = "0[bB]([01])+"
EXP_REGEX = "-?\d+[eE]-?\d+"

config = [
    {'regex': BINARY_REGEX, 'change_to': '<num_binary>'},
    {'regex': OCTAL_REGEX, 'change_to': '<num_octal>'},
    {'regex': HEX_REGEX, 'change_to': '<num_hex>'},
    {'regex': EXP_REGEX, 'change_to': '<num_exp>'},
    {'regex': FLOAT_REGEX, 'change_to': '<num_float>'},
    {'regex': INT_REGEX, 'change_to': '<num_int>'},
]



def add_spaces(text: str) -> str:
    return re.sub(SPECIAL_SYMBOLS_REGEX, r' \1 ', text)


def tokenize(text: str) -> List[str]:
    results = re.findall(SPECIAL_SYMBOLS_REGEX_2, text)
    return results

leave_only_ascii = lambda text: "".join([symbol for symbol in text if symbol in string.printable]).strip()


def change_nums_to_tokens(config: List, text: str) -> str:

    for config_record in config:
        text = re.sub(config_record['regex'], config_record['change_to'], text)

    return text


def preprocess_text_to_ascii(text: str) -> List[str]:

    # assert all([column in df.columns for column in ["text"]])

    text = add_spaces(text)
    text = change_nums_to_tokens(config, text)
    text =  leave_only_ascii(text)
    string_arr = tokenize(text)
    
    return string_arr

from typing import Dict, List, Tuple
def generate_vector(words: List[str], tfidf_mapping: Dict[str, Tuple[float, int]]) -> List[float]:

    n_len_vector = len(tfidf_mapping)
    vector = np.zeros(n_len_vector) # = [0 for _ in range(n_len_vector)]
    for word in words:
        tuple_to_unpack = tfidf_mapping.get(word)
        if tuple_to_unpack:
            tf_idf_val, idx = tuple_to_unpack
            if vector[idx] == 0:
                vector[idx] = tf_idf_val
            else:
                vector[idx] += tf_idf_val

    return vector


In [6]:
### load tfidf and svm
import json

TFIDF_PATH = "../tfidf_ascii_mapping.json"
MODEL_PATH = "../model_libsvm_ascii.model"


reverse_class_mapping = {value: idx for idx, value in class_mapping.items()}


with open(TFIDF_PATH) as f:
    mapping = json.load(f)


model = svm_load_model(MODEL_PATH)

In [13]:
# inference
inference_string = """
public static void main(String[] args) {
    System.out.println("Hello, World!");
}
"""

prepared_tokens = preprocess_text_to_ascii(inference_string)
vector = [generate_vector(prepared_tokens, mapping)]
label, _, _ = svm_predict(y=[1], x=vector, m=model)
label_pred = int(label[0])
label_lang = reverse_class_mapping.get(label_pred)

print(label_lang)

Accuracy = 0% (0/1) (classification)
