In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import pandas as pd
import gc
import os
import json
from pathlib import Path
from os.path import join as path_join
import matplotlib.pyplot as plt

In [4]:
# Parquet inputs for the forest experiment; both paths are relative to the
# directory the notebook is executed from.
train_path = Path("../dataset_v1/data/tglang_dataset/Forest_Train.parquet")
eval_path  = Path("../dataset_v1/data/tglang_dataset/Eval.parquet")

In [5]:
def load_dataset(path, max_samples_per_class=1000, random_state=137):
    """Load a parquet dataset and cap the number of samples per language.

    Parameters
    ----------
    path : str or Path
        Parquet file that contains at least the columns ``code``, ``tglang``
        and ``target``.
    max_samples_per_class : int
        Upper bound on the rows kept for each ``tglang`` value. Classes with
        fewer rows are kept whole (in shuffled order).
    random_state : int
        Seed handed to ``DataFrame.sample`` for every class; the default keeps
        the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``code``, ``tglang``, ``target`` with a fresh ``RangeIndex``;
        rows are grouped by ``tglang``.
    """
    columns = ["code", "tglang", "target"]

    # Only the three needed columns are read; reset_index() guarantees the
    # frame carries a plain positional index before grouping.
    raw = pd.read_parquet(path, columns=columns).reset_index()

    def _cap(group):
        # Never ask for more rows than the class actually has.
        n_rows = min(max_samples_per_class, group.shape[0])
        return group.sample(n=n_rows, random_state=random_state)

    # Selecting the payload columns before apply() keeps the grouping column
    # out of the frames passed to _cap; it comes back through the group keys.
    sampled = raw.groupby("tglang")[["target", "code"]].apply(_cap)

    return sampled.reset_index()[columns]

In [6]:
from pathlib import Path

# Directory holding the keyword lists; the loop below reads every regular file
# in it, one keyword per line.
keywords_root = Path("../dataset_v1/data/raw/keywords2")
keywords_set = set()

# sorted() makes the read order independent of the filesystem listing order.
for tglang_keywords in sorted(keywords_root.iterdir()):
    if not tglang_keywords.is_file():
        # Sub-directories cannot be opened as keyword lists.
        continue
    with open(tglang_keywords, encoding="utf-8") as f:
        # Blank lines are skipped so the empty string never becomes a keyword.
        keywords_set.update(line.strip() for line in f if line.strip())

# De-duplicated list view of the same keywords (order is arbitrary).
keywords = list(keywords_set)

keywords_set

{'uri',
 'last',
 'library',
 'geoip_city_continent_code',
 'rp',
 'double',
 'tab',
 'tcpinfo_rtt',
 'mode',
 'div',
 'lua_iscfunction',
 'lua_checkstack',
 'error',
 'underline',
 'embed',
 'EXISTS',
 'upstream_bytes_received',
 'var',
 'lua_gettop',
 'Protocol',
 'int',
 'li',
 'auto',
 'proxy_protocol_server_port',
 'fileprivate',
 'bool',
 '__property',
 'cushort',
 'dyn',
 'clone',
 'time_iso8601',
 'fileID',
 'CHECK',
 'snap',
 'include_once',
 'alias',
 'out',
 'background',
 'on',
 'behavior',
 'transient',
 'mut',
 'left',
 'bdi',
 'otel_trace_id',
 'ssl_client_i_dn_legacy',
 'builder',
 'trait',
 'overscroll',
 'modern_browser',
 'macro',
 'redo',
 'integer',
 'ulimit',
 'begin',
 'connections_writing',
 'trap',
 'arg_',
 'ssl_client_s_dn_legacy',
 '__forceinline',
 '__m128',
 'TRUNCATE',
 'search',
 'spacing',
 'memcached_key',
 'block',
 'hr',
 'sent_trailer_',
 'ssl_server_name',
 'uint32',
 'table',
 'hanging',
 'ligatures',
 'geoip_city',
 'cont',
 'crate',
 'union',
 '

In [7]:
# Build the numeric target -> language name mapping from the evaluation set.
df_for_maps = pd.read_parquet(eval_path)

label_pairs = df_for_maps[["target", "tglang"]].drop_duplicates()
target_2_lang = dict(zip(label_pairs["target"], label_pairs["tglang"]))

# (target, name) pairs; sorting them orders the names by numeric target.
d = list(target_2_lang.items())

tglang_names = [pair[1] for pair in sorted(d)]
tglang_names

['TGLANG_LANGUAGE_OTHER',
 'TGLANG_LANGUAGE_C',
 'TGLANG_LANGUAGE_CPLUSPLUS',
 'TGLANG_LANGUAGE_CSHARP',
 'TGLANG_LANGUAGE_CSS',
 'TGLANG_LANGUAGE_DART',
 'TGLANG_LANGUAGE_DOCKER',
 'TGLANG_LANGUAGE_FUNC',
 'TGLANG_LANGUAGE_GO',
 'TGLANG_LANGUAGE_HTML',
 'TGLANG_LANGUAGE_JAVA',
 'TGLANG_LANGUAGE_JAVASCRIPT',
 'TGLANG_LANGUAGE_JSON',
 'TGLANG_LANGUAGE_KOTLIN',
 'TGLANG_LANGUAGE_LUA',
 'TGLANG_LANGUAGE_NGINX',
 'TGLANG_LANGUAGE_OBJECTIVE_C',
 'TGLANG_LANGUAGE_PHP',
 'TGLANG_LANGUAGE_POWERSHELL',
 'TGLANG_LANGUAGE_PYTHON',
 'TGLANG_LANGUAGE_RUBY',
 'TGLANG_LANGUAGE_RUST',
 'TGLANG_LANGUAGE_SHELL',
 'TGLANG_LANGUAGE_SOLIDITY',
 'TGLANG_LANGUAGE_SQL',
 'TGLANG_LANGUAGE_SWIFT',
 'TGLANG_LANGUAGE_TL',
 'TGLANG_LANGUAGE_TYPESCRIPT',
 'TGLANG_LANGUAGE_XML']

In [8]:
# Character classes, keyed by code point. They are built once at definition
# time instead of on every call.
_SINGLE_CHAR_IDS = frozenset([9, 10, 32, 34, 39, 40, 41, 44, 59, 91, 93, 96, 123, 125])
_WORD_IDS = frozenset(list(range(65, 91)) + list(range(97, 123)) + [95])
_NUMBER_IDS = frozenset(range(48, 58))
# 95 ('_') is deliberately absent: the word class is tested first, so it could
# never be reached as a symbol.
_SYMBOL_IDS = frozenset([33, 35, 36, 37, 38, 42, 43, 45, 46, 47, 58, 60, 61, 62, 63, 64, 92, 94, 124, 126])

# States of the scanner: the kind of run currently accumulated in the cache.
_NO_RUN, _WORD_RUN, _SYMBOL_RUN, _NUMBER_RUN = 0, 1, 2, 3


def my_tokenizer(s):
    """Split ``s`` into a list of string tokens.

    Rules, applied character by character:

    * characters in the single-character class (tab, newline, space, quotes,
      brackets, comma, semicolon, backtick) each become their own token;
    * consecutive word characters (ASCII letters and ``_``) form one token,
      and digits that directly continue a word stay part of it;
    * consecutive symbol characters form one token;
    * consecutive digits that do not continue a word form one token;
    * any other character ends the current run and is dropped.

    Returns an empty list for an empty string.
    """
    tokens = []
    cache = []
    token_type = _NO_RUN

    for c in s:
        code = ord(c)
        is_single = code in _SINGLE_CHAR_IDS

        # Classify the character; the order of the tests matters because a
        # digit is a word character only while a word run is in progress.
        if is_single:
            char_type = _NO_RUN
        elif code in _WORD_IDS or (token_type == _WORD_RUN and code in _NUMBER_IDS):
            char_type = _WORD_RUN
        elif code in _SYMBOL_IDS:
            char_type = _SYMBOL_RUN
        elif code in _NUMBER_IDS:
            char_type = _NUMBER_RUN
        else:
            char_type = _NO_RUN

        if char_type != _NO_RUN and char_type == token_type:
            # Same kind of run continues.
            cache.append(c)
            continue

        # The run (if any) ends here: emit it before handling the character.
        if cache:
            tokens.append("".join(cache))

        if char_type == _NO_RUN:
            cache = []
            if is_single:
                tokens.append(c)
        else:
            cache = [c]
        token_type = char_type

    # Emit the run that was still open when the input ended.
    if cache:
        tokens.append("".join(cache))

    return tokens

def filter_tokens(tokens):
    """Return a new list without the single-space and empty-string tokens."""
    return [token for token in tokens if token != " " and token != ""]

In [9]:
from time import time

# Sample Python source used only as input for the tokenizer timing below.
# SECURITY: the snippet originally embedded what looked like a live API key.
# It is replaced by a placeholder of the same length so len(code) is
# unchanged; the original key should be treated as leaked and rotated.
code = """
import openai
import os
import pandas as pd
from tqdm import tqdm
from sys import argv
from time import sleep
from typing import List
 
 
def walk_directory(directory):
    for root, dirs, files in os.walk(directory):
        for file in files:
            if os.path.splitext(file)[0][-4:] != "CODE":
                continue
            file_path = os.path.join(root, file)
            with open(file_path, "r") as f:
                content = f.read()
            yield (content, file_path)
 
 
openai.api_key = "sk-REDACTEDREDACTEDREDACTEDREDACTEDREDACTEDREDACTED"
openai.api_base = "https://neuroapi.host/v1"
LANGS = ['CPLUSPLUS', 'JAVA', 'CSS', 'OBJECTIVE_C', 'GO', 'NGINX', 'LUA',
         'KOTLIN', 'DOCKER', 'JAVASCRIPT', 'PYTHON', 'SHELL', 'SOLIDITY',
         'HTML', 'RUST', 'PHP', 'DART', 'C', 'TYPESCRIPT', 'SWIFT', 'SQL',
         'TL', 'POWERSHELL', 'JSON', 'FUNC', 'XML', 'RUBY', 'CSHARP']
request_template = "\n".join([
    f"You must answer with ONLY ONE 'word', an element from this list: {LANGS}",
    "Under no circumstances should you write anything other than one word from the list above.",
    "Your task is to guess in which programming language from those presented this piece of code is written:",
    "```\n{}\n```",
    "Even if you recognize a language and it is not in the list presented, still find the most suitable one, for example, instead of BASH write SHELL, etc. If nothing at all fits, indicate a random language, but do not write anything that is not on the list. You must follow the spelling of the words on the list exactly."
    "If your answer is even one character different from the correct element from the list, I will suffer terribly for many days.",
 
    # "If enone of the languages in the list is suitable, write `OTHER`. UNDER NO CIRCUMSTANCES WRITE ANYTHING BUT ONE WORD. IT MUST BE AN ELEMENT FROM THE LIST UP TO EACH CHARACTER."
])
 
 
def get_content(request: str) -> str:
    chat_completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature = 0.2,
        messages=[{"role": "user", "content": request}],
        stream=False,
        timeout=5,
    )
 
    if isinstance(chat_completion, dict):
        # not stream
        content = chat_completion.choices[0].message.content
        return content
    else:
        res = []
        for token in chat_completion:
            content = token["choices"][0]["delta"].get("content")
            if content != None:
                res.append(content)
        return "".join(res)
 
 
def make_files(samples: list, start: int, pb: tqdm = None):
    results = []
    for code, file_path in samples:
        try:
            label = get_content(request_template.format(code))
            print(label)
        except Exception as e:
            sleep(1)
            print(e)
            # label = get_content(request_template.format(code))
            label = "OTHER"
            continue
        if len(label) > 12:
            label = "OTHER"
        results.append([file_path, label])
        pb.update()
    return pd.DataFrame(results, columns=["path", "label"])
 
WORK_DIR = "/home/kama/pythonProjects/telegram-ML-Competition-2023/datasets/test/"
def read_file(path):
    with open(WORK_DIR + path, "r") as f:
        return f.read(), path
 
def get_text(path):
    df = pd.read_csv(path)
    df = df[~df["label"].isin(set(LANGS))]
    return [
        list(read_file(p)) for p in df["path"] 
    ]
 
def main():
    start_num = int(argv[1]) if len(argv) > 1 else 0
    # dir_1 = "/home/kama/pythonProjects/telegram-ML-Competition-2023/datasets/test/"
    # save_dir = "gpt_cls_part_2"
    # samples = get_text("merged_1_prefinal.csv")
    df = pd.read_csv("merged_1_prefinal_2.csv")
    # df = df
    samples = [
        list(read_file(p)) for p in df["path"][df["label"].isin({"OTHER", "ERROR"})]
    ][::-1]
    print(len(samples))
    d = 1
 
    pb = tqdm(total=len(samples))
    for k in range(start_num, len(samples), d):
        df_2 = make_files(samples[k:k+d], k, pb)
        m = pd.merge(df, df_2, on='path', how='left')
        m['label'] = m['label_y'].fillna(m['label_x']).astype(str)
        m = m[['path', 'label']]
        m.to_csv(f"merged_1_prefinal_2.csv", index=False)
        df = m
 
if __name__ == "__main__":
    main()
"""


from time import perf_counter

import numpy as np

# Per-call durations (seconds) of my_tokenizer on the sample snippet.
# perf_counter() is a monotonic, high-resolution clock, so sub-millisecond
# calls are not distorted by wall-clock adjustments the way time() can be.
a = []

for i in range(1000):
    start = perf_counter()
    code_tokens = my_tokenizer(code)
    a.append(perf_counter() - start)

# Mean seconds per call, alongside the input size in characters.
np.mean(a), len(code)

(0.0009090790748596192, 4288)

In [10]:
# Code points of the characters that may start an identifier-like token:
# ASCII letters and the underscore.
word_ids          = set(list(range(65, 91)) + list(range(97, 123)) + [95])

def enrich_with_tranformed_code_column(dataset):
    """Add a ``code_enrichment`` column with identifiers stripped from the code.

    Each snippet in ``dataset["code"]`` is tokenized with ``my_tokenizer``;
    tokens that start with a word character are kept only when they are in
    ``keywords_set``, every other token is kept as is. The surviving tokens
    are joined with single spaces.

    The column is added to ``dataset`` in place and the same object is
    returned.
    """
    def keep(token):
        # word_ids holds code points, so the first character must go through
        # ord(); comparing the character itself never matches and would let
        # every token through.
        return (ord(token[0]) not in word_ids) or (token in keywords_set)

    dataset["code_enrichment"] = dataset["code"].apply(
        lambda snippet: " ".join(filter(keep, my_tokenizer(snippet)))
    )

    return dataset

In [11]:
# Load both splits with a per-language cap: 4000 rows per class for training,
# 100000 per class for evaluation.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest
# of the notebook; later cells depend on it, so it is left unchanged here.
train = load_dataset(train_path, 4000)
eval  = load_dataset(eval_path, 100000)

In [12]:
# import math

# train = train.sample(frac=1).reset_index(drop=True)
# split_index = math.floor(train.shape[0] * 0.8)
# print("split_index", split_index)

# train_ = train.iloc[:split_index]
# move_to_eval_ = train.iloc[split_index:]

# train = train_
# eval  = pd.concat([move_to_eval_, eval]).reset_index()

In [13]:
# Sanity check: (rows, columns) of the training and evaluation frames.
train.shape, eval.shape

((109757, 3), (11246, 3))

In [14]:
# Adds the `code_enrichment` column to both frames (the helper assigns the
# column on the frame it receives and returns that same frame).
train = enrich_with_tranformed_code_column(train)
eval = enrich_with_tranformed_code_column(eval)

In [15]:
import numpy as np
import joblib

def char2num(c):
    """Map a single character to a bucket index in ``0..96``.

    * newline (code point 10) -> 0
    * printable ASCII (code points 32..126) -> 1..95
    * everything else (other control characters, DEL, non-ASCII) -> 96
    """
    code_point = ord(c)
    if code_point == 10:
        return 0
    if 32 <= code_point <= 126:
        return code_point - 31
    return 96


def get_embedding(s: str):
    """Return a 96-dimensional character-frequency vector for ``s``.

    Every character is mapped to a bucket with ``char2num``; the bucket counts
    are divided by ``len(s)``. The last bucket (index 96) is dropped from the
    result, so characters landing there only contribute to the denominator.
    An empty string yields an all-zero vector.
    """
    length = len(s)
    if length == 0:
        return np.zeros((96, ))

    buckets = [char2num(ch) for ch in s]
    frequencies = np.bincount(buckets, minlength=97).astype(float) / length
    return frequencies[:-1]

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report

def get_baseline_score(X_train, y_train, X_test, y_test, random_state=None):
    """Fit a random forest on the training data and print test-set metrics.

    Parameters
    ----------
    X_train, y_train : training features and integer targets.
    X_test, y_test : evaluation features and integer targets.
    random_state : int or None
        Seed passed to ``RandomForestClassifier``. ``None`` (the default)
        keeps the previous unseeded behaviour; pass an integer to make the
        fitted model and the printed metrics reproducible.

    Prints accuracy, macro F1 / precision / recall, the per-class report
    (labelled with the module-level ``tglang_names``) and balanced accuracy.

    Returns
    -------
    The fitted ``RandomForestClassifier``.
    """
    from sklearn.ensemble import RandomForestClassifier

    manual_best_params = {
        'criterion': 'gini',
        'max_features': 'sqrt',
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'n_estimators': 300,
        'max_depth': 80
    }

    clf = RandomForestClassifier(n_jobs=4, random_state=random_state, **manual_best_params)
    clf.fit(X_train, y_train)

    # Predict once and derive every metric from the same predictions;
    # clf.score() would run a second full prediction for the same accuracy.
    y_pred = clf.predict(X_test)

    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

    print("f1_score", f1_score(y_test, y_pred, average="macro"))
    print("precision_score", precision_score(y_test, y_pred, average="macro"))
    print("recall_score", recall_score(y_test, y_pred, average="macro"))
    # NOTE(review): target_names assumes every class in tglang_names occurs in
    # y_test or y_pred; confirm when evaluating on a reduced label set.
    print(classification_report(y_test, y_pred, target_names=tglang_names))
    print("balanced_accuracy_score", balanced_accuracy_score(y_test, y_pred))

    return clf

In [17]:
import re
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import numpy as np

def word2token(word: str):
    """Collapse an identifier into a short "shape" code, or return ``None``.

    ``None`` is returned when ``word`` contains any character other than
    ASCII letters, digits or ``_``. Otherwise the code is built from:

    1. ``A`` / ``a`` / ``_`` - first character is upper case, lower case, or
       neither;
    2. ``a`` if any later character is lower case;
    3. ``A`` if any later character is upper case;
    4. ``_`` if the word contains an underscore anywhere;
    5. ``2`` for more than 10 characters, ``0`` for exactly one, ``1``
       otherwise.
    """
    if re.search(r'[^a-zA-Z0-9_]', word) is not None:
        return None

    first, rest = word[0], word[1:]

    if first.isupper():
        shape = 'A'
    elif first.islower():
        shape = 'a'
    else:
        shape = '_'

    if any(ch.islower() for ch in rest):
        shape += 'a'
    if any(ch.isupper() for ch in rest):
        shape += 'A'
    if '_' in word:
        shape += '_'

    size = len(word)
    if size > 10:
        shape += '2'
    elif size == 1:
        shape += '0'
    else:
        shape += "1"

    return shape

def word2token_new(word: str):
    """Normalise a token for the TF-IDF vocabulary.

    * all-digit tokens become ``'0'``;
    * tokens found in ``keywords_set``, and tokens containing any character
      other than ASCII letters, digits or ``_``, are returned unchanged;
    * every other token is collapsed into a shape code: first-character class
      (``A`` / ``a`` / ``_``), then ``a`` / ``A`` if a later character is
      lower / upper case, ``_`` if an underscore is present, and a length
      class (``2`` for more than 10 characters, ``0`` for one, ``1``
      otherwise).
    """
    if word.isdigit():
        return '0'

    is_keyword = word in keywords_set
    if is_keyword or re.search(r'[^a-zA-Z0-9_]', word):
        return word

    first, rest = word[0], word[1:]

    if first.isupper():
        shape = 'A'
    elif first.islower():
        shape = 'a'
    else:
        shape = '_'

    if any(ch.islower() for ch in rest):
        shape += 'a'
    if any(ch.isupper() for ch in rest):
        shape += 'A'
    if '_' in word:
        shape += '_'

    size = len(word)
    if size > 10:
        shape += '2'
    elif size == 1:
        shape += '0'
    else:
        shape += "1"

    return shape

def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Used as the ``tokenizer`` in the default ``tf_idf_params`` of
    ``get_embeddings``, where each document handed to the vectorizer is
    already a list of tokens.
    """
    return text


# alphnum["total"] = alphnum["code"].str.len()
# alphnum["numbers"] = alphnum["code"].apply(lambda s: sum(c.isdigit() for c in s))
# alphnum["letters"] = alphnum["code"].apply(lambda s: sum(c.isalpha() for c in s))
# alphnum["spaces"] = alphnum["code"].apply(lambda s: sum(c.isspace() for c in s))
# alphnum["other"] = alphnum["code"].apply(lambda s: sum(not(c.isspace() or c.isalpha() or c.isdigit()) for c in s))


def feature_1(code_iterator):
    """Return, per snippet, the fraction of its characters that are digits.

    Empty snippets yield ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    return [
        sum(c.isdigit() for c in x) / len(x) if len(x) > 0 else 0.0
        for x in code_iterator
    ]

def feature_2(code_iterator):
    """Return, per snippet, the fraction of its characters that are letters.

    Empty snippets yield ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    return [
        sum(c.isalpha() for c in x) / len(x) if len(x) > 0 else 0.0
        for x in code_iterator
    ]

def feature_3(code_iterator):
    """Return, per snippet, the fraction of its characters that are whitespace.

    Empty snippets yield ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    return [
        sum(c.isspace() for c in x) / len(x) if len(x) > 0 else 0.0
        for x in code_iterator
    ]

def feature_4(code_iterator):
    """Return, per snippet, the fraction of characters that are neither
    whitespace, letters nor digits (punctuation and other symbols).

    Empty snippets yield ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    return [
        sum(not(c.isspace() or c.isalpha() or c.isdigit()) for c in x) / len(x) if len(x) > 0 else 0.0
        for x in code_iterator
    ]

def feature_5(code_iterator):
    """Return the ``get_embedding`` vector of every snippet, as a list."""
    return [get_embedding(snippet) for snippet in code_iterator]

def _tokenize_and_drop_empty(codes, targets):
    """Tokenize snippets and keep only the samples that yield at least one token.

    Returns three parallel lists: normalised token lists, targets and the
    original code strings of the kept samples.
    """
    kept_tokens, kept_targets, kept_codes = [], [], []
    for snippet, target in zip(codes, targets):
        tokens = [word2token_new(token) for token in my_tokenizer(snippet)]
        if len(tokens) > 0:
            kept_tokens.append(tokens)
            kept_targets.append(target)
            kept_codes.append(snippet)
    return kept_tokens, kept_targets, kept_codes


def get_embeddings(train_code, train_target, test_code, test_target, tf_idf_params=None, extra_features=()):
    """Turn raw code snippets into TF-IDF feature matrices.

    Each snippet is tokenized with ``my_tokenizer`` and every token is
    normalised with ``word2token_new``; samples without any token are dropped
    from both splits. A ``TfidfVectorizer`` is fitted on the training tokens
    only and then applied to both splits.

    Parameters
    ----------
    train_code, test_code : sized sequences of str
        Raw code snippets.
    train_target, test_target : sequences
        Labels aligned with the corresponding code sequence.
    tf_idf_params : dict or None
        Keyword arguments for ``TfidfVectorizer``. Defaults to pre-tokenized
        input, at most 1500 features and no lower-casing.
    extra_features : iterable of callables
        Optional functions mapping a list of snippets to one value or one
        vector per snippet (``feature_1`` .. ``feature_5`` fit); their output
        is appended as extra columns. Empty by default, which matches the
        previous behaviour where no extra feature was enabled.

    Returns
    -------
    (train_embeddings, train_target, test_embeddings, test_target, vectorizer)
        The embeddings are ``scipy.sparse.csr_matrix`` objects, the targets
        are lists filtered consistently with the embeddings.
    """
    if tf_idf_params is None:
        tf_idf_params = {
            "tokenizer": identity_tokenizer,
            "max_features": 1500,
            "lowercase": False
        }

    # Enrich train. The two prints show how many samples the empty-token
    # filter removed.
    print(len(train_code))
    train_tokens, train_target, train_code = _tokenize_and_drop_empty(train_code, train_target)
    print(len(train_tokens))

    # Enrich test.
    test_tokens, test_target, test_code = _tokenize_and_drop_empty(test_code, test_target)

    # Vectorize: the vocabulary and IDF weights come from the train split only.
    vectorizer = TfidfVectorizer(**tf_idf_params)
    vectorizer.fit(train_tokens)

    # The matrices stay sparse end to end; converting to dense and back would
    # allocate len(samples) x max_features floats for no benefit.
    train_embeddings = vectorizer.transform(train_tokens)
    test_embeddings  = vectorizer.transform(test_tokens)

    for feature_ in extra_features:
        # reshape(n, -1) turns scalar-per-sample features into one column and
        # leaves vector-per-sample features as they are.
        train_feature_ = np.asarray(feature_(train_code), dtype=float).reshape(len(train_code), -1)
        test_feature_  = np.asarray(feature_(test_code), dtype=float).reshape(len(test_code), -1)
        train_embeddings = sparse.hstack([train_embeddings, sparse.csr_matrix(train_feature_)], format="csr")
        test_embeddings  = sparse.hstack([test_embeddings, sparse.csr_matrix(test_feature_)], format="csr")

    train_embeddings = sparse.csr_matrix(train_embeddings)
    test_embeddings  = sparse.csr_matrix(test_embeddings)

    return train_embeddings, train_target, test_embeddings, test_target, vectorizer

In [18]:
# Keep only the (code, target) pairs whose code yields at least one token.
train_data = [
    (snippet, label)
    for snippet, label in zip(train["code"], train["target"])
    if len(my_tokenizer(snippet)) > 0
]
train_code   = [pair[0] for pair in train_data]
train_target = [pair[1] for pair in train_data]

eval_data = [
    (snippet, label)
    for snippet, label in zip(eval["code"], eval["target"])
    if len(my_tokenizer(snippet)) > 0
]
eval_code   = [pair[0] for pair in eval_data]
eval_target = [pair[1] for pair in eval_data]

# Character-frequency vectors for the SVM models, built from the filtered
# eval snippets.
X_test_svm = [get_embedding(snippet) for snippet in eval_code]

X_train, y_train, X_test, y_test, tfidf = get_embeddings(train_code, train_target, eval_code, eval_target)
X_train.shape, len(y_train), X_test.shape, len(y_test)

109722
109722




((109722, 1500), 109722, (11145, 1500), 11145)

In [26]:
# Train the random-forest baseline on the TF-IDF features and print its
# evaluation metrics; the fitted model is reused by later cells.
forest = get_baseline_score(X_train, y_train, X_test, y_test)

Accuracy: 0.9227456258411844
f1_score 0.42855124135530015
precision_score 0.3757848596523379
recall_score 0.5979489622573149
                             precision    recall  f1-score   support

      TGLANG_LANGUAGE_OTHER       1.00      0.95      0.97     10186
          TGLANG_LANGUAGE_C       0.51      0.70      0.59        30
  TGLANG_LANGUAGE_CPLUSPLUS       0.73      0.73      0.73        37
     TGLANG_LANGUAGE_CSHARP       0.62      0.84      0.71        19
        TGLANG_LANGUAGE_CSS       0.44      0.73      0.55        15
       TGLANG_LANGUAGE_DART       0.19      0.43      0.26         7
     TGLANG_LANGUAGE_DOCKER       0.08      0.18      0.11        11
       TGLANG_LANGUAGE_FUNC       0.00      0.00      0.00         1
         TGLANG_LANGUAGE_GO       0.41      0.47      0.44        19
       TGLANG_LANGUAGE_HTML       0.63      0.69      0.66        32
       TGLANG_LANGUAGE_JAVA       0.64      0.22      0.33       113
 TGLANG_LANGUAGE_JAVASCRIPT       0.04      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
def get_svm_score(X_train, y_train, X_test, y_test, kernel="sigmoid", max_iter=1000):
    """Fit a one-vs-rest SVC on the training data and print test-set metrics.

    Parameters
    ----------
    X_train, y_train : training features and integer targets.
    X_test, y_test : evaluation features and integer targets.
    kernel : str
        Kernel passed to ``SVC``. The default ``"sigmoid"`` is the value the
        original code selected by index from its list of kernel names.
    max_iter : int
        Hard limit on solver iterations for each binary SVC.
        NOTE(review): with a cap this low the solver may stop before
        converging - confirm whether that explains the weak recorded scores.

    Prints accuracy, macro F1 / precision / recall, the per-class report
    (labelled with the module-level ``tglang_names``) and balanced accuracy.

    Returns
    -------
    The fitted ``OneVsRestClassifier``.
    """
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import SVC

    manual_best_params = {
        "kernel": kernel,
        "max_iter": max_iter
    }

    clf = OneVsRestClassifier(SVC(**manual_best_params))
    clf.fit(X_train, y_train)

    # Predict once and derive every metric from the same predictions;
    # clf.score() would run a second full prediction for the same accuracy.
    y_pred = clf.predict(X_test)

    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

    print("f1_score", f1_score(y_test, y_pred, average="macro"))
    print("precision_score", precision_score(y_test, y_pred, average="macro"))
    print("recall_score", recall_score(y_test, y_pred, average="macro"))
    print(classification_report(y_test, y_pred, target_names=tglang_names))
    print("balanced_accuracy_score", balanced_accuracy_score(y_test, y_pred))

    return clf

In [40]:
# Train the SVM variant on the same TF-IDF features as the forest and print
# its evaluation metrics.
code_svm = get_svm_score(X_train, y_train, X_test, y_test)



Accuracy: 0.44549125168236875
f1_score 0.0664122763917314
precision_score 0.1305131075068466
recall_score 0.09915953996250028
                             precision    recall  f1-score   support

      TGLANG_LANGUAGE_OTHER       0.96      0.48      0.64     10186
          TGLANG_LANGUAGE_C       0.01      0.03      0.01        30
  TGLANG_LANGUAGE_CPLUSPLUS       0.00      0.00      0.00        37
     TGLANG_LANGUAGE_CSHARP       0.01      0.21      0.01        19
        TGLANG_LANGUAGE_CSS       0.00      0.00      0.00        15
       TGLANG_LANGUAGE_DART       0.00      0.00      0.00         7
     TGLANG_LANGUAGE_DOCKER       0.17      0.09      0.12        11
       TGLANG_LANGUAGE_FUNC       0.00      0.00      0.00         1
         TGLANG_LANGUAGE_GO       0.01      0.11      0.01        19
       TGLANG_LANGUAGE_HTML       0.00      0.00      0.00        32
       TGLANG_LANGUAGE_JAVA       0.08      0.05      0.06       113
 TGLANG_LANGUAGE_JAVASCRIPT       0.00      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Accuracy: 0.8186356947656086
# f1_score 0.7435253562514733
# precision_score 0.7553975846316408
# recall_score 0.7433261328760707
#                              precision    recall  f1-score   support

#       TGLANG_LANGUAGE_OTHER       0.96      0.94      0.95     11175
#           TGLANG_LANGUAGE_C       0.55      0.64      0.59      1024
#   TGLANG_LANGUAGE_CPLUSPLUS       0.73      0.59      0.65      1054
#      TGLANG_LANGUAGE_CSHARP       0.80      0.79      0.80       955
#         TGLANG_LANGUAGE_CSS       0.91      0.94      0.92      1028
#        TGLANG_LANGUAGE_DART       0.81      0.65      0.72      1008
#      TGLANG_LANGUAGE_DOCKER       0.91      0.89      0.90       930
#        TGLANG_LANGUAGE_FUNC       0.00      0.00      0.00         5
#          TGLANG_LANGUAGE_GO       0.75      0.85      0.80       990
#        TGLANG_LANGUAGE_HTML       0.77      0.79      0.78      1038
#        TGLANG_LANGUAGE_JAVA       0.61      0.69      0.65      1113
#  TGLANG_LANGUAGE_JAVASCRIPT       0.56      0.43      0.49      1000
#        TGLANG_LANGUAGE_JSON       0.81      0.94      0.87      1095
#      TGLANG_LANGUAGE_KOTLIN       0.70      0.81      0.75      1023
#         TGLANG_LANGUAGE_LUA       0.77      0.73      0.75      1033
#       TGLANG_LANGUAGE_NGINX       0.92      0.91      0.92       780
# TGLANG_LANGUAGE_OBJECTIVE_C       0.88      0.73      0.80       910
#         TGLANG_LANGUAGE_PHP       0.90      0.79      0.84      1079
#  TGLANG_LANGUAGE_POWERSHELL       0.75      0.85      0.80       996
#      TGLANG_LANGUAGE_PYTHON       0.62      0.77      0.68      1219
#        TGLANG_LANGUAGE_RUBY       0.76      0.80      0.78      1029
#        TGLANG_LANGUAGE_RUST       0.88      0.82      0.85      1013
#       TGLANG_LANGUAGE_SHELL       0.66      0.83      0.73      1170
#    TGLANG_LANGUAGE_SOLIDITY       0.86      0.81      0.83      1014
#         TGLANG_LANGUAGE_SQL       0.84      0.82      0.83      1006
#       TGLANG_LANGUAGE_SWIFT       0.71      0.82      0.76      1036
#          TGLANG_LANGUAGE_TL       0.99      0.53      0.69       365
#  TGLANG_LANGUAGE_TYPESCRIPT       0.68      0.49      0.57       980
#         TGLANG_LANGUAGE_XML       0.82      0.90      0.86       988

#                    accuracy                           0.82     38056
#                   macro avg       0.76      0.74      0.74     38056
#                weighted avg       0.82      0.82      0.82     38056

# balanced_accuracy_score 0.7433261328760707

In [27]:
# Evaluate the forest combined with each pre-trained SVM variant.
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load model files produced by this project.
svm_paths = [
    "../dataset_v1/data/model/SVM.pkl",
    "../dataset_v1/data/model/SVM_careful.pkl",
    "../dataset_v1/data/model/SVM_careless.pkl",
    "../dataset_v1/data/model/SVM_too_careful.pkl",
    "../dataset_v1/data/model/SVM_equal.pkl",
]

# The forest prediction does not depend on the SVM, so compute it once.
forest_predict = forest.predict(X_test)

for svm_path in svm_paths:
    svm = joblib.load(svm_path)
    # presumably each SVM is a binary gate (0 = not code, 1 = code), so the
    # product forces gated-out samples to target 0 - TODO confirm
    models_predict = forest_predict * svm.predict(X_test_svm)

    print(Path(svm_path).name)
    print("accuracy", accuracy_score(y_test, models_predict))
    print("f1_score", f1_score(y_test, models_predict, average="macro"))
    print("precision_score", precision_score(y_test, models_predict, average="macro"))
    print("recall_score", recall_score(y_test, models_predict, average="macro"))
    # print(classification_report(y_test, models_predict, target_names=tglang_names))
    print("balanced_accuracy_score", balanced_accuracy_score(y_test, models_predict))

SVM.pkl
accuracy 0.9483176312247644
f1_score 0.4734345983654528
precision_score 0.4423915969090338
recall_score 0.5833540716033445
balanced_accuracy_score 0.5833540716033445


  _warn_prf(average, modifier, msg_start, len(result))


SVM_careful.pkl
accuracy 0.9341408703454464
f1_score 0.4437925199236539
precision_score 0.3977162876574334
recall_score 0.5915369456876967
balanced_accuracy_score 0.5915369456876967


  _warn_prf(average, modifier, msg_start, len(result))


SVM_careless.pkl
accuracy 0.9509196949304621
f1_score 0.47844339271927566
precision_score 0.45671177580469213
recall_score 0.5647219287201618
balanced_accuracy_score 0.5647219287201618


  _warn_prf(average, modifier, msg_start, len(result))


SVM_too_careful.pkl
accuracy 0.9286675639300135
f1_score 0.43840430195226
precision_score 0.3889259887909439
recall_score 0.595418121244119
balanced_accuracy_score 0.595418121244119


  _warn_prf(average, modifier, msg_start, len(result))


SVM_equal.pkl
accuracy 0.9530731269627636
f1_score 0.4753050721007133
precision_score 0.48771612789727314
recall_score 0.5169294479218013
balanced_accuracy_score 0.5169294479218013


  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
import joblib

# Persist the fitted forest and the fitted TF-IDF vectorizer into the current
# working directory.
# NOTE(review): "tfidfi.pkl" looks like a typo for "tfidf.pkl" - confirm
# against whatever loads this file before renaming.
joblib.dump(forest, "forest.pkl")
joblib.dump(tfidf, "tfidfi.pkl")

['tfidfi.pkl']

In [23]:
# Accuracy: 0.8375551818372924
# f1_score 0.7643238669957512
# precision_score 0.7719059282201296
# recall_score 0.7642280574677937
#                              precision    recall  f1-score   support

#       TGLANG_LANGUAGE_OTHER       0.98      0.95      0.96     11175
#           TGLANG_LANGUAGE_C       0.55      0.67      0.61      1024
#   TGLANG_LANGUAGE_CPLUSPLUS       0.70      0.60      0.65      1054
#      TGLANG_LANGUAGE_CSHARP       0.82      0.82      0.82       955
#         TGLANG_LANGUAGE_CSS       0.92      0.95      0.93      1028
#        TGLANG_LANGUAGE_DART       0.82      0.71      0.76      1008
#      TGLANG_LANGUAGE_DOCKER       0.92      0.90      0.91       930
#        TGLANG_LANGUAGE_FUNC       0.00      0.00      0.00         5
#          TGLANG_LANGUAGE_GO       0.79      0.87      0.83       990
#        TGLANG_LANGUAGE_HTML       0.78      0.81      0.79      1038
#        TGLANG_LANGUAGE_JAVA       0.68      0.68      0.68      1113
#  TGLANG_LANGUAGE_JAVASCRIPT       0.56      0.47      0.51      1000
#        TGLANG_LANGUAGE_JSON       0.84      0.94      0.88      1095
#      TGLANG_LANGUAGE_KOTLIN       0.77      0.82      0.79      1023
#         TGLANG_LANGUAGE_LUA       0.76      0.78      0.77      1033
#       TGLANG_LANGUAGE_NGINX       0.94      0.92      0.93       780
# TGLANG_LANGUAGE_OBJECTIVE_C       0.89      0.75      0.82       910
#         TGLANG_LANGUAGE_PHP       0.90      0.80      0.85      1079
#  TGLANG_LANGUAGE_POWERSHELL       0.79      0.87      0.83       996
#      TGLANG_LANGUAGE_PYTHON       0.67      0.78      0.72      1219
#        TGLANG_LANGUAGE_RUBY       0.79      0.82      0.80      1029
#        TGLANG_LANGUAGE_RUST       0.88      0.84      0.86      1013
#       TGLANG_LANGUAGE_SHELL       0.70      0.86      0.77      1170
#    TGLANG_LANGUAGE_SOLIDITY       0.85      0.83      0.84      1014
#         TGLANG_LANGUAGE_SQL       0.84      0.85      0.84      1006
#       TGLANG_LANGUAGE_SWIFT       0.79      0.84      0.81      1036
#          TGLANG_LANGUAGE_TL       0.97      0.58      0.72       365
#  TGLANG_LANGUAGE_TYPESCRIPT       0.68      0.54      0.60       980
#         TGLANG_LANGUAGE_XML       0.85      0.90      0.88       988

#                    accuracy                           0.84     38056
#                   macro avg       0.77      0.76      0.76     38056
#                weighted avg       0.84      0.84      0.84     38056

# balanced_accuracy_score 0.7642280574677937

# SVM.pkl
# accuracy 0.7908608366617617
# f1_score 0.7295919733940959
# precision_score 0.7828005765033781
# recall_score 0.692997430579706
# balanced_accuracy_score 0.692997430579706
# /Users/platon.fedorov/opt/anaconda3/envs/tgml/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, msg_start, len(result))
# SVM_careful.pkl
# accuracy 0.8244166491486231
# f1_score 0.7547543831899822
# precision_score 0.7735080351001856
# recall_score 0.7443816122630108
# balanced_accuracy_score 0.7443816122630108
# /Users/platon.fedorov/opt/anaconda3/envs/tgml/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, msg_start, len(result))
# SVM_careless.pkl
# accuracy 0.7620086188774438
# f1_score 0.7022615903269126
# precision_score 0.7838848341302994
# recall_score 0.6501080387531972
# balanced_accuracy_score 0.6501080387531972
# /Users/platon.fedorov/opt/anaconda3/envs/tgml/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, msg_start, len(result))
# SVM_too_careful.pkl
# accuracy 0.8295932310279588
# f1_score 0.7584515377198469
# precision_score 0.7722386497009159
# recall_score 0.7525468119791242
# balanced_accuracy_score 0.7525468119791242
# /Users/platon.fedorov/opt/anaconda3/envs/tgml/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
#   _warn_prf(average, modifier, msg_start, len(result))
# SVM_equal.pkl
# accuracy 0.7044881227664495
# f1_score 0.6474534257622541
# precision_score 0.7859921528115783
# recall_score 0.5708234518178323
# balanced_accuracy_score 0.5708234518178323