In [None]:
"""
This notebook processes a dataset stored in a CSV file (referred to here as 'dataset_2.csv'; the code below loads it from 'data/preprocessed_dataset.csv') and uses its `text`, `snapshot_id`, and `label` columns.

Manual inspection of a data sample showed that entries matching the patterns below are possibly mislabeled.

So, the lines of code below:
1. Iterate through each entry in the dataset.
2. Filter entries based on the criteria below. Keyword checks are case-insensitive substring matches, and the criteria are tested in order, so each entry is recorded with the first reason code that applies:
    a. If the text is shorter than 40 words (reason code 1).
    b. If the text contains the word 'queue', which implies temporarily blocked access (reason code 2).
    c. If the text contains the word 'captcha', which indicates that verification is needed to get access (reason code 3).
    d. If the text contains 'sign in' or 'sign up', which suggests credentials are needed to access the content (reason code 4).
    e. If the text contains 'porn': manual inspection found some such entries to be mislabeled porn websites or similar (reason code 5).
    f. If the text contains 'redirected', 'suspended', or 'bad gateway', which indicate various access issues (reason codes 6, 7, and 8, respectively).
3. Create a new DataFrame ('inspecting_dataset') with the filtered entries and their corresponding reason codes.

This inspecting_dataset undergoes manual inspection, resulting in the exclusion of the snapshots below from the final CSV file ('inspected_dataset_.csv').

"""

# Note: Uncommenting the two .to_csv lines will overwrite the original CSV files.

In [1]:
import pandas as pd

In [2]:
# Load the dataset produced by the label-aggregation and preprocessing steps.
df = pd.read_csv(filepath_or_buffer='data/preprocessed_dataset.csv')

In [None]:
# Flag entries of `df` that match a pattern associated with possibly mislabeled
# snapshots, and collect them (with a numeric reason code) for manual inspection.

# Texts with fewer whitespace-separated words than this get reason code 1.
MIN_WORD_COUNT = 40

# (keywords, reason code) rules, tested in this order against the lower-cased
# text; the first rule with any keyword present as a substring wins.
KEYWORD_REASONS = [
    (('queue',), 2),               # temporarily blocked access
    (('captcha',), 3),             # need to get access granted
    (('sign in', 'sign up'), 4),   # credentials needed to access
    (('porn',), 5),                # manual inspection found mislabeled porn websites or similar
    (('redirected',), 6),          # redirected
    (('suspended',), 7),           # suspended
    (('bad gateway',), 8),         # bad gateway
]


def inspection_reason(text):
    """Return the reason code for flagging `text`, or None if no rule matches.

    Parameters
    ----------
    text : object
        Value of the `text` column for one entry. Anything that is not a
        `str` (e.g. the float NaN pandas uses for a missing cell) is treated
        as an empty text instead of raising on `.split()`.

    Returns
    -------
    int or None
        1 if the text has fewer than MIN_WORD_COUNT words; otherwise the code
        of the first matching rule in KEYWORD_REASONS (2-8); None if the text
        matches nothing.
    """
    if not isinstance(text, str):
        text = ''
    if len(text.split()) < MIN_WORD_COUNT:
        return 1
    lowered = text.lower()  # lower-case once instead of once per rule
    for keywords, reason in KEYWORD_REASONS:
        if any(keyword in lowered for keyword in keywords):
            return reason
    return None


flagged_rows = []
# Zip the three needed columns rather than building a Series per row with iterrows().
for text, snapshot_id, label in zip(df["text"], df["snapshot_id"], df["label"]):
    reason = inspection_reason(text)
    if reason is not None:
        flagged_rows.append({"text": text, "snapshot_id": snapshot_id, "label": label, 'reason': reason})

# Passing `columns` keeps the frame well-formed (and the groupby below valid)
# even when no entry was flagged.
inspecting_dataset = pd.DataFrame(flagged_rows, columns=["text", "snapshot_id", "label", "reason"])

# Number of flagged entries per label. count() tallies non-null `text` values,
# so flagged rows whose text is missing stay in inspecting_dataset but are not
# included in this summary.
inspecting_dataset.groupby('label')['text'].count()

In [4]:
#inspecting_dataset.to_csv('data/ispecting_dataset.csv') 
# The exported inspecting dataset is reviewed manually; the snapshots identified as incorrect entries are listed in the next cell.

In [5]:
# snapshot_id values to drop from the dataset, as determined by manual
# inspection of the inspecting dataset. Fill this list in before filtering.
snapshot_to_exclude = list()

In [6]:
# Keep only the rows whose snapshot_id is not on the manual exclusion list;
# copy so that later changes to df_inspected do not touch df.
is_excluded = df['snapshot_id'].isin(snapshot_to_exclude)
df_inspected = df.loc[~is_excluded].copy()

In [7]:
# Display the (rows, columns) shape of the dataset that remains after the exclusions.
df_inspected.shape

(1932, 8)

In [8]:
#df_inspected.to_csv('data/inspected_dataset_.csv')