# single label classifier

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import random
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix



In [None]:
# Read the annotated dataset and keep only the columns used later on:
# the cell source, both labels, and the 'cell number' / 'filename' columns.
df = pd.read_csv('./dataset.csv').loc[
    :, ['code', 'Primary-label', 'Secondary-label', 'cell number', 'filename']
]

# A row is unusable without both a primary label and some code.
df = df.dropna(subset=['Primary-label', 'code'])

# Rows lacking a secondary label get the placeholder string 'none'.
df['Secondary-label'] = df['Secondary-label'].fillna('none')

# Keep the surviving index value as a per-row identifier.
df['id'] = df.index


In [None]:
# Preview the first rows of the cleaned dataset (no train/test split has been made yet).
df.head()


In [None]:
# Report how many rows remain after cleaning. The printed label says "train",
# but at this point df is the whole dataset; the split happens in a later cell.
print('Total number of train data: ', len(df))


In [None]:
# Same row-count report as the preceding cell (df is not modified in between).
print('Total number of train data: ', len(df))

In [None]:
# Feature extraction: pull call/attribute-style names out of a code cell with
# a regex, e.g. model.fit() -> model, fit; df.dropna() -> df, dropna.

# Recognises a real import statement ("import x" / "from x import y"). The
# trailing \b keeps identifiers that merely start with those letters
# (important, from_date, importlib.reload(...)) from being treated as imports.
_IMPORT_LINE_RE = re.compile(r'\s*(?:import|from)\b')

# A name preceded by start-of-text, whitespace or '.', that is either directly
# followed by '(' (captured, stripped afterwards) or looked-ahead by '(', '[',
# '.' or the end of the text. No MULTILINE flag is used, so the end-of-text
# alternative only applies to the very end of the cell.
_NAME_RE = re.compile(r'(?:^|\s|\.)((?:\w+\()|(?:\w+)(?=\(|\[|\.|$))')


def extract_func_name(code):
    """Return the names the regex recognises in *code*, in order of appearance.

    Import statements are discarded and everything from a '#' to the end of
    its line is removed before matching. A name is reported when it follows
    the start of the text, whitespace or a dot and is followed by '(', '[',
    '.' or the end of the text, so receivers such as ``df`` in ``df.head()``
    are reported alongside the method name.

    Args:
        code: source text of one cell.

    Returns:
        A list of matched names (possibly empty); duplicates are kept.
    """
    kept_lines = [
        re.sub(r'#.*', '', line)
        for line in code.splitlines()
        if not _IMPORT_LINE_RE.match(line)
    ]
    non_import_code = '\n'.join(kept_lines)

    # finditer simply yields nothing when there is no match, so no separate
    # "no matches" branch is needed.
    return [
        match.group(1).replace("(", "")
        for match in _NAME_RE.finditer(non_import_code)
    ]


# Example
code = "empdata.loc[empdata['uu_id']; LinearRegression().fit; df.head"
print(extract_func_name(code))  # Output: ['empdata', 'loc', 'LinearRegression', 'df', 'head']





In [None]:
# Feature extraction: how many times the word "import" occurs in a cell.
def extract_num_import(code):
    """Count whole-word occurrences of ``import`` in *code*.

    Every standalone "import" token is counted wherever it appears in the
    text, so ``from a import b`` contributes one. Returns 0 when the word
    does not occur.
    """
    return len(re.findall(r'\bimport\b', code))


# # Example
# code = '''
# import pandas as pd
# import numpy as np
# from sklearn.linear_model import LinearRegression
# '''

# print(extract_num_import(code))  # Output: 3


In [None]:
# Feature extraction: number of '#' comment spans in a cell.
def extract_num_comments(code):
    """Count '#'-to-end-of-line spans in *code*.

    A line contributes at most one match because the pattern consumes the
    rest of the line after the first '#'. The regex does not parse Python,
    so a '#' inside a string literal is counted as well.
    """
    comment_re = re.compile(r'\s*#.*$', re.MULTILINE)
    return len(comment_re.findall(code))



In [None]:
def extract_num_lines(code):
    """Return how many lines ``str.splitlines`` finds in *code* (0 for '')."""
    return len(code.splitlines())


In [None]:
# ---- Per-cell features ------------------------------------------------------
# Names found by the regex extractor, plus simple size counts for each cell.
df['func_names'] = df['code'].apply(extract_func_name)
df['num_func'] = df['func_names'].apply(len)
df['num_import'] = df['code'].apply(extract_num_import)
df['num_comments'] = df['code'].apply(extract_num_comments)
df['num_lines'] = df['code'].apply(extract_num_lines)

# Order rows by position inside each file so that shifting within a file
# lines every cell up with its immediate neighbours.
df.sort_values(by=['filename', 'cell number'], inplace=True)

# ---- Context features: the same counts for the neighbouring cells -----------
# shift(-1) brings in the following cell's value, shift(1) the preceding one.
count_cols = ['num_func', 'num_import', 'num_comments', 'num_lines']
context_cols = []
for prefix, offset in (('next', -1), ('previous', 1)):
    for count_col in count_cols:
        context_col = f'{prefix}_{count_col}'
        df[context_col] = df.groupby('filename')[count_col].shift(offset)
        context_cols.append(context_col)

# The first cell of a file has no predecessor and the last has no successor,
# so the shifts leave NaN there; a missing neighbour is encoded as zero counts.
# NOTE: this fills NaN in every column of df, not only the shifted ones.
df.fillna(0, inplace=True)

# ---- Relative position of the cell inside its file --------------------------
# cell number / (largest cell number in the same file + 1), computed for all
# rows at once instead of one Python-level call per row.
max_cell_numbers = df.groupby('filename')['cell number'].max() + 1
df['normalized cell number'] = (
    df['cell number'] / df['filename'].map(max_cell_numbers))

# Single definition of the numeric feature columns. The order matters: it
# fixes the column order of X_num and of the scaled values written back below.
numeric_feature_cols = count_cols + ['normalized cell number'] + context_cols

# Extract numerical features
X_num = df[numeric_feature_cols].values

# drop unnecessary columns
df = df.drop(['cell number', 'filename'], axis=1)

# NOTE(review): the scaler is fitted on every row before the train/test split
# made in a later cell, so statistics of the held-out rows influence the
# scaling. Fitting on the training rows only would avoid that leakage.
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

df[numeric_feature_cols] = X_num_scaled


In [None]:
# Inspect the first rows after feature extraction and scaling.
df.head()

In [None]:


# Build the vocabulary of extracted names over all rows.
unique_func_names = set()

for func_names in df['func_names']:
    unique_func_names.update(func_names)

# Sort so the has_* columns come out in the same order on every run; the
# iteration order of a set of strings can differ between interpreter sessions,
# which would otherwise change the feature order seen by the models.
unique_func_names_list = sorted(unique_func_names)


def has_func_name(row, func_name):
    """Return 1 if *func_name* is among ``row['func_names']``, otherwise 0."""
    return int(func_name in row['func_names'])


# One 0/1 indicator column per vocabulary entry. All columns are built first
# and attached with a single concat, instead of running a row-wise apply and
# inserting a new column once per name.
_names_per_row = [set(names) for names in df['func_names']]
_indicators = pd.DataFrame(
    {
        f'has_{func_name}': [int(func_name in names) for names in _names_per_row]
        for func_name in unique_func_names_list
    },
    index=df.index,
)
df = pd.concat([df, _indicators], axis=1)


In [None]:
from sklearn.model_selection import train_test_split

# Integer-encode the primary label: each distinct label is replaced by the
# position at which it first appears in the column.
labels = df['Primary-label'].unique().tolist()

# The printed list doubles as the legend: list position == encoded value.
print(labels)

label_to_int = {label: position for position, label in enumerate(labels)}
df['Primary-label'] = df['Primary-label'].map(label_to_int)

# 80/20 split that keeps the label proportions the same in both parts.
train, test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['Primary-label'])

# Everything that is not a model input: the targets, the raw text, the
# extracted name lists and the row id.
non_feature_cols = ['Primary-label', 'Secondary-label', 'code', 'func_names', 'id']

x_train, y_train = train.drop(columns=non_feature_cols), train['Primary-label']
x_test, y_test = test.drop(columns=non_feature_cols), test['Primary-label']



In [None]:
x_test.head()

In [None]:
x_train.head()


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


# Candidate settings for an RBF-kernel SVM; every combination is evaluated.
param_grid = dict(
    C=[1, 10],
    gamma=[0.1, 1, 10],
    kernel=['rbf'],
    class_weight=[None, 'balanced'],
    shrinking=[True, False],
    tol=[1e-4, 1e-3, 1e-2],
    decision_function_shape=['ovr', 'ovo'],
    max_iter=[1000],
)

# Exhaustive search with 5-fold cross-validation on the training split.
clf = SVC()
clf_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)

print("training SVM")
clf_search.fit(x_train, y_train)

print('Best hyperparameters for SVM:', clf_search.best_params_)

# Score the selected model on the held-out split.
y_pred = clf_search.predict(x_test)

print('Accuracy of SVM: ', accuracy_score(y_test, y_pred))

# F1, precision and recall, each averaged over classes weighted by support.
for caption, weighted_metric in (
        ('F1 score of SVM: ', f1_score),
        ('Precision score of SVM: ', precision_score),
        ('Recall score of SVM: ', recall_score),
):
    print(caption, weighted_metric(y_test, y_pred, average='weighted'))

# Per-class breakdown.
print(classification_report(y_test, y_pred))





In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Candidate settings for the forest; every combination is evaluated.
# The grid's single random_state value (42) is applied on top of the
# constructor argument below, so candidates are fitted with seed 42.
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[10],
    min_samples_leaf=[1],
    bootstrap=[True],
    class_weight=[None, 'balanced'],
    criterion=['gini', 'entropy'],
    max_features=['sqrt', 'log2', 0.5],
    max_leaf_nodes=[10, 20, 50],
    min_impurity_decrease=[0.0, 0.1, 0.2],
    random_state=[42],
)

# Exhaustive search with 5-fold cross-validation on the training split.
rf_clf = RandomForestClassifier(random_state=5)
rf_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid_rf, cv=5)

print("Training Random Forest")
rf_search.fit(x_train, y_train)

print('Best hyperparameters for Random Forest:', rf_search.best_params_)

# Score the selected model on the held-out split.
y_pred = rf_search.predict(x_test)

print('Accuracy of Random Forest: ', accuracy_score(y_test, y_pred))

# Per-class breakdown.
print(classification_report(y_test, y_pred))



In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Settings explored for every penalty/solver pairing.
_lr_shared_grid = {
    'C': [0.1, 1.0, 10.0],
    'class_weight': [None, 'balanced'],
    'max_iter': [100, 200, 500],
}

# Not every solver supports every penalty: 'newton-cg', 'lbfgs' and 'sag'
# handle only L2, while 'liblinear' and 'saga' also handle L1. Listing the
# valid pairings as separate sub-grids keeps the search from spending fits on
# combinations that can only raise an error.
param_grid_lr = [
    {
        **_lr_shared_grid,
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
    {
        **_lr_shared_grid,
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
    },
]

# Create a Logistic Regression classifier
lr_clf = LogisticRegression(random_state=5)

# Exhaustive search with 5-fold cross-validation on the training split.
lr_search = GridSearchCV(lr_clf, param_grid_lr, cv=5)

print("Training Logistic Regression")
lr_search.fit(x_train, y_train)

print('Best hyperparameters for Logistic Regression:', lr_search.best_params_)

# Score the selected model on the held-out split.
y_pred = lr_search.predict(x_test)

print('Accuracy of Logistic Regression: ', accuracy_score(y_test, y_pred))

# Per-class breakdown.
print(classification_report(y_test, y_pred))


In [None]:
# K-Nearest Neighbors Classifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Candidate settings; every combination is evaluated.
# NOTE(review): 'algorithm', 'leaf_size' and 'n_jobs' look like speed-only
# settings, and 'metric'/'p' overlap (minkowski with p=1 / p=2 corresponds to
# manhattan / euclidean) -- confirm against the scikit-learn documentation and
# consider trimming the grid to shorten the search.
param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [30, 50, 100],
    'p': [1, 2],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'metric_params': [None],
    'n_jobs': [None, -1]
}

# Create a K-Nearest Neighbors classifier
knn_clf = KNeighborsClassifier()

# Exhaustive search with 5-fold cross-validation on the training split.
knn_search = GridSearchCV(knn_clf, param_grid_knn, cv=5)

print("Training K-Nearest Neighbors")
knn_search.fit(x_train, y_train)

print('Best hyperparameters for K-Nearest Neighbors:', knn_search.best_params_)

# Score the selected model on the held-out split.
y_pred = knn_search.predict(x_test)

print('Accuracy of K-Nearest Neighbors: ', accuracy_score(y_test, y_pred))

# Per-class breakdown.
print(classification_report(y_test, y_pred))

In [None]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Candidate settings for the tree; every combination is evaluated.
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
    class_weight=[None, 'balanced'],
    ccp_alpha=[0.0, 0.1, 0.2],
)

# Exhaustive search with 5-fold cross-validation on the training split.
dt_clf = DecisionTreeClassifier(random_state=5)
dt_search = GridSearchCV(estimator=dt_clf, param_grid=param_grid_dt, cv=5)

print("Training Decision Tree")
dt_search.fit(x_train, y_train)

print('Best hyperparameters for Decision Tree:', dt_search.best_params_)

# Score the selected model on the held-out split.
y_pred = dt_search.predict(x_test)

print('Accuracy of Decision Tree: ', accuracy_score(y_test, y_pred))

# Per-class breakdown.
print(classification_report(y_test, y_pred))


In [None]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# An explicit prior vector must have one entry per class. The two hand-written
# candidates have two entries, so they are only offered to the search when the
# training labels really contain two classes; otherwise only the default
# (priors estimated from the data) is searched and no fit is wasted on a
# setting that can only fail.
_n_classes = y_train.nunique()
_prior_candidates = [None] + [
    priors for priors in ([0.2, 0.8], [0.5, 0.5]) if len(priors) == _n_classes
]

# Define the parameter grid
param_grid_nb = {
    'var_smoothing': [1e-09, 1e-08, 1e-07, 1e-06],
    'priors': _prior_candidates,
}

# Create a Naive Bayes classifier
nb_clf = GaussianNB()

# Exhaustive search with 5-fold cross-validation on the training split.
nb_search = GridSearchCV(nb_clf, param_grid_nb, cv=5)

print("Training Naive Bayes")
nb_search.fit(x_train, y_train)

print('Best hyperparameters for Naive Bayes:', nb_search.best_params_)

# Score the selected model on the held-out split.
y_pred = nb_search.predict(x_test)

print('Accuracy of Naive Bayes: ', accuracy_score(y_test, y_pred))

# Per-class breakdown.
print(classification_report(y_test, y_pred))

In [None]:
# Gradient Boosting

import re

import sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# The log-loss objective was called 'deviance' in older scikit-learn releases
# and 'log_loss' from 1.1 on (the old spelling was later removed), so pick the
# name the installed version understands.
_sk_major_minor = tuple(
    int(part)
    for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups()
)
_loss_candidates = ['log_loss' if _sk_major_minor >= (1, 1) else 'deviance']

# The exponential loss is defined for two-class problems only; offer it to the
# search just when the training labels are binary.
if y_train.nunique() == 2:
    _loss_candidates.append('exponential')

# Define the parameter grid
# NOTE(review): this grid spans several thousand candidates, each fitted five
# times by the cross-validation; 'warm_start' presumably has no effect here
# because the search fits a fresh copy of the estimator per candidate --
# confirm and consider trimming.
param_grid_gb = {
    'loss': _loss_candidates,
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 500],
    'subsample': [1.0, 0.8, 0.5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_depth': [3, 5, None],
    'max_features': ['sqrt', 'log2', None],
    'warm_start': [False, True]
}

# Seeded like the other estimators in this notebook so that the row and
# feature subsampling explored by the grid is repeatable between runs.
gb_clf = GradientBoostingClassifier(random_state=5)

# Exhaustive search with 5-fold cross-validation on the training split.
gb_search = GridSearchCV(gb_clf, param_grid_gb, cv=5)

print("Training Gradient Boosting")
gb_search.fit(x_train, y_train)

print('Best hyperparameters for Gradient Boosting:', gb_search.best_params_)

# Score the selected model on the held-out split.
y_pred = gb_search.predict(x_test)

print('Accuracy of Gradient Boosting: ', accuracy_score(y_test, y_pred))

# Per-class breakdown.
print(classification_report(y_test, y_pred))