In [None]:
import nltk
#Download every NLTK resource the preprocessing cells rely on:
#'punkt' backs word_tokenize, 'stopwords' backs stopwords.words('english')
#and 'wordnet' backs WordNetLemmatizer. Without the last two a fresh
#environment fails with a LookupError as soon as the files are processed.
#NOTE(review): recent NLTK releases may additionally need 'punkt_tab' -
#confirm against the installed version.
#A list (not a generator) is used so that every download is attempted; the
#cell displays True only when all of them succeeded.
all([nltk.download(resource) for resource in ('punkt', 'stopwords', 'wordnet')])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.model_selection import train_test_split

In [None]:
#Root directory that is walked recursively to find the source files
dataset_path = 'Dataset'

In [None]:
#Accumulators for the tokens gathered from the "good" and the "bad" files
good_tokens, bad_tokens = [], []

In [None]:
#Walk the dataset and collect ONE token list per source file.
#Each element appended to good_tokens / bad_tokens is the complete, cleaned
#token sequence of a single file, so a sample (and its label) corresponds to
#a file rather than to an isolated word.

#Build the stopword set and the lemmatizer once instead of once per token
english_stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

#Start from empty lists so re-running this cell does not duplicate samples
good_tokens.clear()
bad_tokens.clear()

for root, folders, files in os.walk(dataset_path):
    for file in files:
        #Only C source files (*.c) are processed
        if not file.endswith(".c"):
            continue
        #Read the file; undecodable bytes are replaced instead of aborting the run
        with open(os.path.join(root, file), 'r', encoding='utf-8', errors='replace') as f:
            current_file = f.read()
        #Tokenize the file content
        current_tokens = word_tokenize(current_file)
        #Drop stopwords and tokens found in string.punctuation, then lemmatize
        current_words = [
            lemmatizer.lemmatize(word)
            for word in current_tokens
            if word not in english_stopwords and word not in string.punctuation
        ]
        #A file counts as "good" when its directory path contains 'good'
        #NOTE(review): this is a substring match on the whole path - confirm
        #that it matches the dataset's folder layout
        if 'good' in root:
            good_tokens.append(current_words)
        else:
            bad_tokens.append(current_words)

In [None]:
#Merge both groups into one sample list; label 0 marks "good", 1 marks "bad"
all_tokens = [*good_tokens, *bad_tokens]
labels = [0 for _sample in good_tokens] + [1 for _sample in bad_tokens]

In [None]:
#Hold out 20% of the samples for testing ...
X_rest, X_test, y_rest, y_test = train_test_split(
    all_tokens, labels, test_size=0.2, random_state=42
)
#... then take 25% of the remainder (20% of the total) for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=42
)

In [None]:
#Build the vocabulary from the training split only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=X_train)

In [None]:
#Map every split to sequences of vocabulary indices
X_train_seq, X_test_seq, X_val_seq = (
    tokenizer.texts_to_sequences(split)
    for split in (X_train, X_test, X_val)
)

In [None]:
#Pad (at the end) every split to the length of the longest entry in all_tokens
max_length = max(map(len, all_tokens))
X_train_pad, X_test_pad, X_val_pad = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (X_train_seq, X_test_seq, X_val_seq)
)

In [None]:
#Build the 1D-CNN binary classifier
embedding_dims = 50
#One extra row because index 0 is used for padding
vocab_size = len(tokenizer.word_index) + 1

inputs = Input(shape=(max_length,))
x = Embedding(input_dim=vocab_size, output_dim=embedding_dims)(inputs)
#Two convolution + pooling stages
for stage in range(2):
    x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
    x = MaxPooling1D(pool_size=3)(x)
#Final convolution, collapsed to one vector per sample
x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(units=128, activation='relu')(x)
#Single sigmoid unit for the binary output
predictions = Dense(units=1, activation='sigmoid')(x)

model = Model(inputs=inputs, outputs=predictions)
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

In [None]:
#Fit for 10 epochs in batches of 32, monitoring the validation split
model.fit(
    x=X_train_pad,
    y=np.array(y_train),
    validation_data=(X_val_pad, np.array(y_val)),
    epochs=10,
    batch_size=32,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5e4dfaaf40>

In [None]:
#Measure loss and accuracy on the held-out test split
y_test = np.asarray(y_test)
loss, acc = model.evaluate(x=X_test_pad, y=y_test, verbose=1)
print("Accuracy:", acc)

Accuracy: 1.0
