In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file in it
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/api-detection/sample_submission.csv
/kaggle/input/api-detection/test.msgpack
/kaggle/input/api-detection/train_info.csv
/kaggle/input/api-detection/info.csv
/kaggle/input/api-detection/train.msgpack


In [2]:
## Import libraries 
import msgpack
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

## Loading and exploring the data

In [3]:
# Read the label table (one `injection` flag per `id`) and preview its first two rows
info = pd.read_csv("/kaggle/input/api-detection/info.csv")
info.head(n=2)

Unnamed: 0,id,injection
0,59290,True
1,54992,True


In [4]:
# Read train_info.csv and preview its first two rows
train_info = pd.read_csv("/kaggle/input/api-detection/train_info.csv")
train_info.head(n=2)

Unnamed: 0,id,injection
0,14577,False
1,60186,True


In [5]:
# Read the submission template and preview its first five rows
sample_sub = pd.read_csv("/kaggle/input/api-detection/sample_submission.csv")
sample_sub.head(n=5)

Unnamed: 0,id,injection
0,43427,False
1,86082,False
2,19102,False
3,103200,False
4,95244,False


##### The train and test data are serialized with MessagePack, so they have to be deserialized before use.

In [6]:
# Deserialize the MessagePack-encoded training payloads from disk
with open('/kaggle/input/api-detection/train.msgpack', 'rb') as train_file:
    train = msgpack.unpack(train_file)

In [7]:
# Deserialize the MessagePack-encoded test payloads from disk
with open('/kaggle/input/api-detection/test.msgpack', 'rb') as test_file:
    test = msgpack.unpack(test_file)

In [8]:
# Wrap the deserialized train and test records in DataFrames,
# then give both frames the same two column names.
train, test = pd.DataFrame(train), pd.DataFrame(test)
train.columns = test.columns = ['id', 'vector']


In [9]:
# Preview the first two training rows
train.head(n=2)

Unnamed: 0,id,vector
0,59290,' and/**/38>( select\t(622)/**/from/*362 emhgp...
1,54992,"shqpkt"" union /*!426 all\t*/(select kwicwt(\t(..."


In [10]:
# Preview the first two test rows
test.head(n=2)

Unnamed: 0,id,vector
0,43427,/handler_sync_example.example?i=17S8Hq17qCMJVR...
1,86082,/web/registration?username=&_csrf=24283cb2-566...


In [11]:
# Show one complete training row (id plus the untruncated payload) to see what the raw text looks like
train.iloc[2].to_numpy()

array([64287,
       'nhnqag" uniondistinct--154 298 plhlre exaloq unyote \r(select (rpfeir#15 256 15 rttlat 66 gsjxjq 44 819 \r()), kuoopt(), (#\rvxhuha), (116), (--kfbvww 193 \r"mkhkjq"), 271, (omqjmj), gvdira--\r((--\r325), --mcrgew oqrhwp jhseux 55 xpayss 88 \rfctnjs (), (794), 3, #58 14 114 vdqoyq \r\'apxkvx\', (\tmhjblm())), #yxnynl \r"lllpue"/**/from#\rexample.example)'],
      dtype=object)

The data appears messy: it contains a mixture of symbols, unconventional word encodings, and links, among other elements. At this stage, it's uncertain whether preprocessing would be beneficial; in fact, it might even prove counterproductive. Therefore, let's start with a straightforward approach: character-level tokenization (TF-IDF over character n-grams) coupled with a gradient-boosted tree classifier (XGBoost).






## Data preparation

In [12]:
# Cast the 'vector' column of both datasets to str so every payload has a consistent type
for frame in (train, test):
    frame['vector'] = frame['vector'].astype(str)


In [13]:
# Character-level TF-IDF: analyzer='char' tokenizes into characters, and
# ngram_range=(1, 4) keeps every character n-gram of length 1 through 4.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 4))

# The vocabulary and IDF weights are learned from the train and test payloads together
full_text = [*train['vector'].values, *test['vector'].values]
vectorizer.fit(full_text)

# Encode each dataset with the fitted vectorizer (sparse matrix, one row per payload)
train_vectorized = vectorizer.transform(train['vector'])
test_vectorized = vectorizer.transform(test['vector'])

In [14]:
# Attach the injection label to every training row.
# - Only the 'id' and 'injection' columns of `info` are merged, so re-running this cell
#   (when `info` already holds the merged frame) does not create duplicated
#   'vector_x' / 'vector_y' columns.
# - how='left' keeps exactly the rows of `train`, in `train`'s order, which is also the
#   row order of `train_vectorized`.
# - validate='many_to_one' raises if an id appears more than once in the label table,
#   which would otherwise duplicate rows and shift every label after it.
info = pd.merge(train, info[['id', 'injection']], on='id', how='left', validate='many_to_one')

# Every training row must have a label; fail here rather than misalign or mislabel rows later.
assert info['injection'].notna().all(), "some train ids have no label in info.csv"
info.head()

Unnamed: 0,id,vector,injection
0,59290,' and/**/38>( select\t(622)/**/from/*362 emhgp...,True
1,54992,"shqpkt"" union /*!426 all\t*/(select kwicwt(\t(...",True
2,64287,"nhnqag"" uniondistinct--154 298 plhlre exaloq u...",True
3,28821,D8j+oNbylTIGw=,False
4,27825,"ened \"">\n <head>\n <script data-react-hel...",False


In [15]:
# Binary target vector: 1 where `injection` equals True, 0 for anything else
y = info['injection'].eq(True).astype(int).to_numpy()

## Building model to classify vector

In [16]:
# Hold out 20% of the labelled rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    train_vectorized,
    y,
    test_size=0.2,
    random_state=42,
)

# XGBoost configuration
params = dict(
    max_depth=6,                   # maximum tree depth
    eta=0.1,                       # learning rate
    objective='binary:logistic',   # binary classification
    eval_metric='auc',             # evaluation metric: AUC
)
num_rounds = 100  # number of boosting rounds

# XGBoost's native training API consumes DMatrix objects
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Fit on the training part, then score the held-out part
xgb_model = xgb.train(params, dtrain, num_boost_round=num_rounds)
preds = xgb_model.predict(dtest)

# Report the AUC of the held-out predictions
auc = roc_auc_score(y_test, preds)
print("AUC:", auc)


AUC: 0.9991689378665782


Wow! We have an AUC score above 0.99 (0.9992) on the held-out 20% of the training data.
An AUC (Area Under the ROC Curve) this close to 1.0 indicates a very high level of model performance.

The ROC curve is a graphical plot that illustrates the diagnostic ability of a binary classifier system as its discrimination threshold is varied. The AUC represents the probability that the classifier will rank a randomly chosen positive instance higher than a randomly chosen negative instance.

An AUC value of 0.99 suggests that the model has an excellent ability to distinguish between positive and negative classes. In practical terms, this means that the model is making very accurate predictions, with a high true positive rate and a low false positive rate.



## Training the final model and making predictions on the test data

In [17]:
# Refit on the full labelled training set with the same hyper-parameters (`params`) and
# number of boosting rounds (`num_rounds`) that were scored on the hold-out split.
# They are reused rather than redefined here, so the submitted model cannot drift
# from the configuration that was actually validated.
dtrain_all = xgb.DMatrix(train_vectorized, label=y)
final_model = xgb.train(params, dtrain_all, num_rounds)

# `test_vectorized` was already produced when the TF-IDF features were built;
# running the vectorizer over the test set again would only repeat that work.
dtest = xgb.DMatrix(test_vectorized)

# Predicted probability of the positive (injection) class for each test row
test_preds = final_model.predict(dtest)

# Threshold the probabilities at 0.5 to get hard 0/1 labels (kept as a plain list)
binary_predictions = (test_preds > 0.5).astype(int).tolist()


In [24]:
# Attach predictions to the submission template by id rather than by row position, so the
# file stays correct even if `sample_sub` and `test` are not in the same row order.
# (The lookup raises if `test` contains a duplicated id.)
pred_by_id = pd.Series(binary_predictions, index=test['id'].to_numpy())
sample_sub['injection'] = sample_sub['id'].map(pred_by_id)

# Every id in the template must have received a prediction.
assert sample_sub['injection'].notna().all(), "some submission ids are missing from test"
sample_sub['injection'] = sample_sub['injection'].astype(int)

# NOTE(review): sample_submission.csv ships True/False labels while this writes 0/1;
# confirm the scorer accepts 0/1, otherwise map {0: False, 1: True} before saving.
print(sample_sub.head())
sample_sub.to_csv('sample_sub.csv', index=False)

       id  injection
0   43427          0
1   86082          1
2   19102          0
3  103200          1
4   95244          1
