In [1]:
import pandas as pd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

In [2]:
import xgboost as xgb
from sentence_transformers import SentenceTransformer,util

In [3]:
import sys  
sys.path.insert(1, '/home/jovyan/work/core') 
import core_utils

In [5]:
# read labelled dataset 
# This consists of 3k annotated API methods 
# These are the methods that we will use to train the model

df_labels = pd.read_csv('../inputs/new_API.csv',index_col=None)
df_labels.drop_duplicates(subset=['keys'],keep='first',inplace=True)
true_labels = dict(zip(df_labels['keys'],df_labels['real']))

### Efficacy of representation  
### Section 4.2

### Observations

- The full table is in outputs/format_evaluation_precision.csv
- Run the cells below to read/reproduce the data for this plot

In [6]:
################ Experiment_1 using pre-trained model ################

# First we download the pre-trained model
# This line takes some time
sbert = SentenceTransformer('all-mpnet-base-v2')

# to save the model locally, run the following command
# sbert.save(path='../embeddings/sent_bert/',model_name='all-mpnet-base-v2')


.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [7]:
header = ['input','learning','accuracy','precision','recall','f1']


# Name for the input dataset. Formats are ready for the embedding layer
inputs = ['new_API.csv']
base = '../inputs/'

classifiers = []
output = []

for input in inputs:
    # read input documentation
    format = input.replace('.csv','')
    df = pd.read_csv(base + input,index_col=None)
    df.drop_duplicates(keep='first',inplace=True)
    # true label here are from block 16
    df['real'] = [true_labels[x] for x in df['keys']]
    
    # embed the method documentation
    X_original = sbert.encode(df['keys'].values)

    label_encoder = LabelEncoder()
    label_encoder = label_encoder.fit(df['real'].values)
    label_encoded_y = label_encoder.transform(df['real'].values)
    
    x_train, x_test, y_train, y_test = train_test_split(X_original,
                                    label_encoded_y, test_size=0.25, random_state=0) #

    
    # Run classifiers
    
    learning = 'logistic'
    logisticRegr = LogisticRegression()
    logisticRegr.fit(x_train, y_train)
    classifiers.append(logisticRegr)
    predicted = logisticRegr.predict(x_test)
    accuracy,precision,recall,f1 =  core_utils.get_metrics(y_test, predicted)
    output.append([format,learning,accuracy,precision,recall,f1])
    
    
    learning = 'svm'
    clf = svm.SVC(kernel='linear', C=1).fit(x_train, y_train)
    classifiers.append(clf)
    predicted = clf.predict(x_test)
    accuracy,precision,recall,f1 =  core_utils.get_metrics(y_test, predicted)
    output.append([format,learning,accuracy,precision,recall,f1])
    
    learning = 'xgboost'
    xgbc = xgb.XGBClassifier()
    xgbc.fit(x_train, y_train)
    classifiers.append(xgbc)
    predicted = xgbc.predict(x_test)
    accuracy,precision,recall,f1 =  core_utils.get_metrics(y_test, predicted)
    output.append([format,learning,accuracy,precision,recall,f1])
    
    learning = 'NN'
    nn = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(64), random_state=1,max_iter=500)
    nn.fit(x_train,y_train)
    classifiers.append(nn)
    predicted = nn.predict(x_test)
    accuracy,precision,recall,f1 =  core_utils.get_metrics(y_test, predicted)
    output.append([format,learning,accuracy,precision,recall,f1])    


df_results = pd.DataFrame(output,columns=header)

In [8]:
df_results.sort_values(by = ['learning','input'])

Unnamed: 0,input,learning,accuracy,precision,recall,f1
3,new_API,NN,0.962963,0.982456,0.958333,0.968769
0,new_API,logistic,0.925926,0.815789,0.916667,0.832261
1,new_API,svm,0.962963,0.833333,0.958333,0.866667
2,new_API,xgboost,0.962963,0.982456,0.958333,0.968769


### Platform libraries experiments

### Section 4.4

In [9]:
# Table 5 Experiments. Here we use the pre-trained models. 
# The approach here is to train the classifiers and then use the ones with best performance
# to predict the labels for Google Play Services methods. 

inputs = ['new_API.csv']
base = '../inputs/'
header = ['input','learning','accuracy','precision','recall','f1']
df_labels = pd.read_csv(f'{base}new_API.csv',index_col=None)
df_labels.drop_duplicates(subset=['keys'],keep='first',inplace=True)
true_labels = dict(zip(df_labels['keys'],df_labels['real']))
# store the models for latter predictions
classifiers = []
output = []

for input in inputs:
    df = pd.read_csv(base + input,index_col=None)
    df.drop_duplicates(keep='first',inplace=True)
    df['real'] = [true_labels[x] for x in df['keys']]

    # use the fine-tunned model if available. 
    sbert = SentenceTransformer('all-mpnet-base-v2')
    
    X_original = sbert.encode(df['docs'].values)
    embedding = input.replace('.csv','').replace('3k_','')


    label_encoder = LabelEncoder()
    label_encoder = label_encoder.fit(df['real'].values)
    label_encoded_y = label_encoder.transform(df['real'].values)

    x_train, x_test, y_train, y_test = train_test_split(X_original,
                                         label_encoded_y, test_size=0.15, random_state=0) 


    learning = 'xgboost'
    model = xgb.XGBClassifier()
    model.fit(x_train, y_train)
    classifiers.append(model)
    predicted = model.predict(x_test)
    accuracy,precision,recall,f1 =  core_utils.get_metrics(y_test, predicted)
    output.append([embedding,learning,accuracy,precision,recall,f1])
    
    learning = 'NN'
    nn = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(64), random_state=1,max_iter=500)
    nn.fit(x_train,y_train)
    classifiers.append(nn)
    predicted = nn.predict(x_test)
    accuracy,precision,recall,f1 =  core_utils.get_metrics(y_test, predicted)
    output.append([embedding,learning,accuracy,precision,recall,f1])

In [11]:
# Read the dataset of Google Play Services libraries with the ground truth
df_input = pd.read_csv('../inputs/test_experiment_libs_nodup_D.csv',index_col=False)
df_input.shape

(513, 4)

In [12]:
# We first need to embed the documentation from the test set
test_emb = sbert.encode(df_input['docs'].values)
# encode the labels
label_encoded_test_y = label_encoder.transform(df_input['classification'].values)
test_emb.shape,label_encoded_test_y.shape

((513, 768), (513,))

In [13]:
# Now, we make the predictions for the test set
predictions = []
output2 = []
for clf,learning,embedding in [(classifiers[0],'xg','d'),(classifiers[1],'nn','d')]:
    predicted = clf.predict(test_emb)
    predictions.append(predicted)
    accuracy,precision,recall,f1 =  core_utils.get_metrics(label_encoded_test_y, predicted)
    output2.append([embedding,learning,accuracy,precision,recall,f1])
df_results2 = pd.DataFrame(output2,columns=header)
df_input['xg_new'] = label_encoder.inverse_transform(predictions[0])
df_input['nn_new'] = label_encoder.inverse_transform(predictions[1])

In [14]:
# Here we calculate the percentage of detected sources/sinks per library 
# table 5 is generated from this output
totals = []
lib_wear = df_input.loc[df_input.library == 'wearable']
totals.append(['wear','sources',lib_wear.loc[(lib_wear.classification == 'sink')].shape[0], \
lib_wear.loc[(lib_wear.classification == 'sink') & (lib_wear.classification == lib_wear.xg_new) ].shape[0]])
totals.append(['wear','sinks',lib_wear.loc[(lib_wear.classification == 'source')].shape[0], \
lib_wear.loc[(lib_wear.classification == 'source') & (lib_wear.classification == lib_wear.xg_new) ].shape[0]])

lib_tv = df_input.loc[df_input.library == 'tv']
totals.append(['tv','sources',lib_tv.loc[(lib_tv.classification == 'sink')].shape[0], \
lib_tv.loc[(lib_tv.classification == 'sink') & (lib_tv.classification == lib_tv.xg_new) ].shape[0]])
totals.append(['tv','sources',lib_tv.loc[(lib_tv.classification == 'source')].shape[0], \
lib_tv.loc[(lib_tv.classification == 'source') & (lib_tv.classification == lib_tv.xg_new) ].shape[0]])

analytic = df_input.loc[df_input.library.isin(['analytics'])]
totals.append(['analytic','sources',analytic.loc[(analytic.classification == 'sink')].shape[0], \
analytic.loc[(analytic.classification == 'sink') & (analytic.classification == analytic.xg_new) ].shape[0]])
totals.append(['analytic','sinks',analytic.loc[(analytic.classification == 'source')].shape[0], \
analytic.loc[(analytic.classification == 'source') & (analytic.classification == analytic.xg_new) ].shape[0]])

ads = df_input.loc[df_input.library.isin(['ads'])]
totals.append(['ads','source',ads.loc[(ads.classification == 'source')].shape[0], \
ads.loc[(ads.classification == 'source') & (ads.classification == ads.xg_new) ].shape[0]])
totals.append(['ads','sink',ads.loc[(ads.classification == 'sink')].shape[0], \
ads.loc[(ads.classification == 'sink') & (ads.classification == ads.xg_new) ].shape[0]])

In [15]:
# results are slightly different than the paper due to the model used
totals_df = pd.DataFrame(totals,columns=['library','method','total','detected'])
totals_df['perc'] = (totals_df.detected / totals_df.total) * 100
totals_df

Unnamed: 0,library,method,total,detected,perc
0,wear,sources,32,13,40.625
1,wear,sinks,89,8,8.988764
2,tv,sources,3,0,0.0
3,tv,sources,14,0,0.0
4,analytic,sources,29,15,51.724138
5,analytic,sinks,9,3,33.333333
6,ads,source,63,0,0.0
7,ads,sink,14,13,92.857143
