In [278]:
# IMPORTS NEEDED FOR CODE TO RUN
import csv
import numpy as np
import pandas as pd
import regex as re

In [279]:
"""
    Function for stripping column names as well as data values of redundant white spaces.
"""
def strip_df_whitespaces(dataframe):
    # Strip column names of whitepaces
    column_names = [column.strip() for column in dataframe.columns]
    dataframe.columns = column_names

    # Strip datavalues of whitespaces
    dataframe = dataframe.map(lambda x: x.strip() if isinstance(x, str) else x)

    # Return result
    return dataframe

In [280]:
# Load in the uncleaned tabloid data
uncleaned_specific_data = open("../data/specific_data_uncleaned.txt").read()

print(uncleaned_specific_data)

| App Name | MASVS Category | Vulnerability | MASVS Counterpart |
| - | - | - | - |
| [Androgoat](https://github.com/satishpatnayak/AndroGoat) |  |  |  |
|  | Storage |  |  |
|  |  | Insecure Data Storage â€“ Shared Prefs - 1 | MASWE-0006: Sensitive Data Stored Unencrypted in Private Storage Locations |
|  |  | Insecure Data Storage - Shared Prefs - 2 | MASWE-0006: Sensitive Data Stored Unencrypted in Private Storage Locations |
|  |  | Insecure Data Storage - SQLite | MASWE-0006: Sensitive Data Stored Unencrypted in Private Storage Locations |
|  |  | Insecure Data Storage â€“ Temp Files | MASWE-0006: Sensitive Data Stored Unencrypted in Private Storage Locations |
|  |  | Insecure Data Storage â€“ SD Card | MASWE-0007: Sensitive Data Stored Unencrypted in Shared Storage Requiring No User Interaction |
|  |  | Insecure Logging | MASWE-0001: Insertion of Sensitive Data into Logs |
|  |  | Android allowBackup | MASWE-0003: Backup Unencrypted + MASWE-0004: Sensitive Data Not Excluded Fro

In [281]:
# Use regular expressions to reformat app names
# Regex pattern: replaces [AppName](https://github.com/anything) → AppName
uncleaned_specific_data = re.sub(r"\[([^\]]+)\]\(https://github\.com/[^\)]+\)", r"\1", uncleaned_specific_data)

In [282]:
# Split data into seperate rows
specific_data_rows = uncleaned_specific_data.split("\n")

# Store data in a csv file, using the "|" as seperators
with open("../data/specific_data_uncleaned.csv", "w", newline="") as file:
    writer = csv.writer(file)
    for row in specific_data_rows:
        # Split rows on "|" and write them to the csv file
        writer.writerow(row.split("|"))

In [283]:
# Convert it to a pandas dataframe
specific_data_df = pd.read_csv("../data/specific_data_uncleaned.csv")

# Drop first and last columns
specific_data_df.drop(["Unnamed: 0", "Unnamed: 5"], axis=1, inplace=True)
# Drop first row
specific_data_df.drop(0, inplace=True)

# Strip column names of whitespaces
specific_data_df = strip_df_whitespaces(specific_data_df)

# Add uniform missing values sign
for column in specific_data_df.columns:
    specific_data_df.loc[specific_data_df[column] == "", column] = "-"


# Reset the index to a meaningful one, considering the now removed values
specific_data_df.reset_index(drop=True, inplace=True)

display(specific_data_df)

Unnamed: 0,App Name,MASVS Category,Vulnerability,MASVS Counterpart
0,Androgoat,-,-,-
1,-,Storage,-,-
2,-,-,Insecure Data Storage – Shared Prefs - 1,MASWE-0006: Sensitive Data Stored Unencrypted ...
3,-,-,Insecure Data Storage - Shared Prefs - 2,MASWE-0006: Sensitive Data Stored Unencrypted ...
4,-,-,Insecure Data Storage - SQLite,MASWE-0006: Sensitive Data Stored Unencrypted ...
...,...,...,...,...
255,-,-,Passcode Bypass,MASWE-0008: Missing Device Secure Lock Verific...
256,-,-,Debuggable application,MASWE-0067: Debuggable Flag Not Disabled + MAS...
257,-,-,Login pin bypass via Frida/Objection,???MASWE-0097: Root/Jailbreak Detection Not Im...
258,-,Privacy,-,-


In [284]:
# Add correct "App Name" value to every row
app_names = specific_data_df.loc[specific_data_df["App Name"] != "-", "App Name"].values
app_names_indexes = []
for app_name in app_names:
    app_names_indexes.append(specific_data_df[specific_data_df["App Name"] == app_name].index[0])
# Add last index of dataframe rows as guardian value for next operation
app_names_indexes.append(len(specific_data_df.values))


# Go through all app name indexes, locate the rows which require the same app name and fill them in
for index in range(len(app_names_indexes)-1):
    specific_data_df.iloc[app_names_indexes[index]:app_names_indexes[index+1], 0] = app_names[index]

# Count amount of "-" missing values
print(np.sum(specific_data_df["Vulnerability"] == "-"))

# Reset the index to a meaningful one, considering the now removed values
specific_data_df.reset_index(drop=True, inplace=True)
display(specific_data_df)

97


Unnamed: 0,App Name,MASVS Category,Vulnerability,MASVS Counterpart
0,Androgoat,-,-,-
1,Androgoat,Storage,-,-
2,Androgoat,-,Insecure Data Storage – Shared Prefs - 1,MASWE-0006: Sensitive Data Stored Unencrypted ...
3,Androgoat,-,Insecure Data Storage - Shared Prefs - 2,MASWE-0006: Sensitive Data Stored Unencrypted ...
4,Androgoat,-,Insecure Data Storage - SQLite,MASWE-0006: Sensitive Data Stored Unencrypted ...
...,...,...,...,...
255,BugBazaar,-,Passcode Bypass,MASWE-0008: Missing Device Secure Lock Verific...
256,BugBazaar,-,Debuggable application,MASWE-0067: Debuggable Flag Not Disabled + MAS...
257,BugBazaar,-,Login pin bypass via Frida/Objection,???MASWE-0097: Root/Jailbreak Detection Not Im...
258,BugBazaar,Privacy,-,-


In [285]:
for app_name in app_names:
    # Repeat the transformations above, but for "MASVS Category" instead of "App Name"
    masvs_names = specific_data_df.loc[(specific_data_df["MASVS Category"] != "-") & 
                                       (specific_data_df["App Name"] == app_name), "MASVS Category"].values
    masvs_names_indexes = []
    for masvs_name in masvs_names:
        masvs_names_indexes.append(specific_data_df[(specific_data_df["MASVS Category"] == masvs_name) &
                                                    (specific_data_df["App Name"] == app_name)].index[0])
        
    # Add last index of dataframe rows as guardian value for next operation
    masvs_names_indexes.append(int(specific_data_df[specific_data_df["App Name"] == app_name].index[-1]))

    print(masvs_names_indexes)

    # Go through all app name indexes, locate the rows which require the same app name and fill them in
    for index in range(len(masvs_names_indexes)-1):
        specific_data_df.iloc[masvs_names_indexes[index]:masvs_names_indexes[index+1], 1] = masvs_names[index]


# Reset the index to a meaningful one, considering the now removed values
specific_data_df.reset_index(drop=True, inplace=True)

display(specific_data_df)

[1, 9, 11, 13, 18, 27, 29, 32, 33]
[35, 41, 43, 49, 51, 53, 57, 59, 60]
[62, 67, 70, 75, 77, 86, 88, 94, 95]
[97, 103, 105, 107, 109, 113, 117, 120, 121]
[123, 125, 127, 129, 131, 142, 149, 151, 152]
[154, 157, 159, 162, 164, 178, 180, 182, 183]
[185, 187, 189, 191, 193, 197, 200, 202, 203]
[205, 213, 215, 217, 219, 243, 254, 258, 259]


Unnamed: 0,App Name,MASVS Category,Vulnerability,MASVS Counterpart
0,Androgoat,-,-,-
1,Androgoat,Storage,-,-
2,Androgoat,Storage,Insecure Data Storage – Shared Prefs - 1,MASWE-0006: Sensitive Data Stored Unencrypted ...
3,Androgoat,Storage,Insecure Data Storage - Shared Prefs - 2,MASWE-0006: Sensitive Data Stored Unencrypted ...
4,Androgoat,Storage,Insecure Data Storage - SQLite,MASWE-0006: Sensitive Data Stored Unencrypted ...
...,...,...,...,...
255,BugBazaar,Resilience,Passcode Bypass,MASWE-0008: Missing Device Secure Lock Verific...
256,BugBazaar,Resilience,Debuggable application,MASWE-0067: Debuggable Flag Not Disabled + MAS...
257,BugBazaar,Resilience,Login pin bypass via Frida/Objection,???MASWE-0097: Root/Jailbreak Detection Not Im...
258,BugBazaar,Privacy,-,-


In [286]:
# Remove title rows for "App Name" and "MASVS Category", which hold no actual values
specific_data_df = specific_data_df[specific_data_df["Vulnerability"] != "-"]

# Reset the index to a meaningful one, considering the now removed values
specific_data_df.reset_index(drop=True, inplace=True)

# Strip leading and ending question marks from "MASVS Counterpart" column
specific_data_df["MASVS Counterpart"] = specific_data_df["MASVS Counterpart"].str.strip("?")

display(specific_data_df)

Unnamed: 0,App Name,MASVS Category,Vulnerability,MASVS Counterpart
0,Androgoat,Storage,Insecure Data Storage – Shared Prefs - 1,MASWE-0006: Sensitive Data Stored Unencrypted ...
1,Androgoat,Storage,Insecure Data Storage - Shared Prefs - 2,MASWE-0006: Sensitive Data Stored Unencrypted ...
2,Androgoat,Storage,Insecure Data Storage - SQLite,MASWE-0006: Sensitive Data Stored Unencrypted ...
3,Androgoat,Storage,Insecure Data Storage – Temp Files,MASWE-0006: Sensitive Data Stored Unencrypted ...
4,Androgoat,Storage,Insecure Data Storage – SD Card,MASWE-0007: Sensitive Data Stored Unencrypted ...
...,...,...,...,...
158,BugBazaar,Code,Runtime code modification,MASWE-0085: Unsafe Dynamic Code Loading + MASW...
159,BugBazaar,Code,Improper cache handling,MASWE-0082 (Unsafe Handling of Data from Local...
160,BugBazaar,Resilience,Passcode Bypass,MASWE-0008: Missing Device Secure Lock Verific...
161,BugBazaar,Resilience,Debuggable application,MASWE-0067: Debuggable Flag Not Disabled + MAS...


In [287]:
# Export cleaned CSV file
specific_data_df.to_csv("../data/specific_data_cleaned.csv", index=False)

In [288]:
# Now the same for the overview data
# Load in the uncleaned data
uncleaned_overview_data = open("../data/overview_data_uncleaned.txt").read()

# Split data into seperate rows
overview_data_rows = uncleaned_overview_data.split("\n")

# Store data in a csv file, using the "|" as seperators
with open("../data/overview_data_uncleaned.csv", "w", newline="") as file:
    writer = csv.writer(file)
    for row in overview_data_rows:
        # Split rows on "|" and write them to the csv file
        writer.writerow(row.split("|"))

# Convert it to a pandas dataframe
overview_data_df = pd.read_csv("../data/overview_data_uncleaned.csv")

# Drop first and last column and first (empty) row
overview_data_df.drop(["Unnamed: 0", " "], axis=1, inplace=True)
overview_data_df.drop(0, inplace=True)

# Remove "#" from Vulnerabilities column
overview_data_df.columns = overview_data_df.columns.str.replace("#", "")

# Reset index to a meaningful one
overview_data_df.reset_index(drop=True, inplace=True)

# Remove redundant string parts of data values
for column_name in overview_data_df.columns[2:]:
    overview_data_df[column_name] = overview_data_df[column_name].str.strip(column_name)

# Strip column names and data values of whitepaces
overview_data_df = strip_df_whitespaces(overview_data_df)

# Add uniform missing values sign
overview_data_df.replace("", np.nan, inplace=True)

display(overview_data_df)

# Export cleaned df to csv
overview_data_df.to_csv("../data/overview_data_cleaned.csv", index=False)

Unnamed: 0,App Name,Vulnerabilites,Storage,Crypto,Auth,Network,Platform,Code,Resilience,Privacy
0,Androgoat,24,7.0,1.0,1.0,4.0,7.0,1.0,2.0,1.0
1,DIVA,13,5.0,,5.0,,,3.0,,
2,InsecureBankv2,25,4.0,2.0,4.0,1.0,8.0,1.0,5.0,
3,Hacking Playground Android App,15,5.0,1.0,,1.0,3.0,3.0,2.0,
4,OVAA,18,1.0,1.0,,,10.0,6.0,,
5,InsecureShop,19,2.0,,2.0,1.0,13.0,,1.0,
6,Finstergram,5,,,,,3.0,2.0,,
7,BugBazaar,43,7.0,,1.0,,23.0,9.0,3.0,
