# Building dataset

In [10]:
import contextlib
import os
from itertools import chain, permutations

import numpy as np
import pandas as pd

## Load files

In [11]:
def get_max_index(path: str) -> int:
    """Return the largest integer file stem found in *path*.

    Each directory entry has its extension stripped and the remainder is
    parsed as an integer; entries that do not parse are ignored.  When no
    entry parses, ``-1`` is returned.
    """
    indices = [-1]  # sentinel so an empty/non-numeric directory yields -1
    for entry in os.listdir(path):
        stem, _extension = os.path.splitext(entry)
        try:
            indices.append(int(stem))
        except ValueError:
            # not a numerically named entry -- skip it
            continue
    return max(indices)


def get_df(path: str) -> pd.DataFrame:
    """Load the highest-numbered ``<index>.csv`` file from *path*.

    The index is taken from ``get_max_index(path)``.

    Raises:
        FileNotFoundError: if ``<index>.csv`` does not exist in *path*, e.g.
            because the directory holds no integer-named file (index ``-1``)
            or the highest-numbered entry is not a CSV.
    """
    idx = get_max_index(path)
    csv_path = os.path.join(path, f"{idx}.csv")
    # Fail with an explicit message instead of letting read_csv complain about
    # a file name ("-1.csv", ...) that the caller never asked for.
    if not os.path.isfile(csv_path):
        raise FileNotFoundError(
            f"No '{idx}.csv' found in {path!r}; "
            "expected the highest-numbered generated file to be a CSV"
        )
    return pd.read_csv(csv_path)

In [12]:
def generate_data(
    df: pd.DataFrame, positive_score: float
) -> list[tuple[str, str, float]]:
    """Build scored (target, candidate) pairs from the rows of *df*.

    Positive pairs are every ordered pair of two cells taken from the same
    row, each scored ``positive_score``.  Negative pairs combine a random cell
    of a row with a random cell of a *different* row and are scored ``0.0``.
    Rows are addressed by position, so the frame's index labels are irrelevant.

    Randomness comes from the global ``np.random`` state; seed it beforehand
    for reproducible output.

    Args:
        df: frame whose rows each hold the cells to be paired with each other.
        positive_score: score attached to every same-row pair.

    Returns:
        All positive pairs (in row order) followed by the negative pairs.
        An empty frame yields an empty list.

    Raises:
        ValueError: if negatives are required but *df* has a single row, so
            there is no other row to draw a negative from.
    """
    rows = df.values
    n_rows = len(rows)

    # positive samples: all ordered pairs within each row
    data: list[tuple[str, str, float]] = [
        (*pair, positive_score)
        for pair in chain.from_iterable(permutations(row, 2) for row in rows)
    ]

    if n_rows == 0:
        return data

    # Negatives per row = half the positives per row.  The original note says
    # this is because negatives are produced for both type 1 and type 2, and
    # positive type 1, positive type 2 and negative should end up in equal
    # proportions.
    negative_num = len(data) // n_rows // 2

    if negative_num and n_rows < 2:
        raise ValueError(
            "cannot draw negative samples: df needs at least two rows"
        )

    # negative samples
    for pos, row in enumerate(rows):
        # positions of every other row; invariant for all draws of this row
        other_rows = [j for j in range(n_rows) if j != pos]
        for _ in range(negative_num):
            target = np.random.choice(row)
            negative_row = rows[np.random.choice(other_rows)]
            negative = np.random.choice(negative_row)

            data.append((target, negative, 0.0))

    return data


def generate_dataset(path: str, shuffle: bool = False, seed: int = 42) -> pd.DataFrame:
    """Assemble the pair dataset from the two plagiarism directories.

    The directories are ``path`` with ``1`` and ``2`` appended.  Pairs from
    the first are scored 1.0 (type 1 plagiarism), pairs from the second 0.5
    (type 2 plagiarism); negatives are scored 0.0 by ``generate_data``.

    Args:
        path: common prefix of the two source directories.
        shuffle: when true, the combined samples are shuffled in place.
        seed: value used to seed the global ``np.random`` state first.

    Returns:
        A frame with the columns ``target``, ``candidate`` and ``score``.
    """
    np.random.seed(seed)

    # Order matters: both calls draw from the same global random stream.
    type1_pairs = generate_data(get_df(f"{path}1"), 1.0)  # type 1 plagiarism
    type2_pairs = generate_data(get_df(f"{path}2"), 0.5)  # type 2 plagiarism
    samples = [*type1_pairs, *type2_pairs]

    if shuffle:
        np.random.shuffle(samples)

    column_names = ["target", "candidate", "score"]
    return pd.DataFrame(samples, columns=column_names)

In [13]:
# Build the training pairs from "../generated/train1" and "../generated/train2"
# (generate_dataset appends the "1"/"2" suffix; default seed, no shuffling).
train_df = generate_dataset("../generated/train")
# Summarise the resulting frame: row count, columns, dtypes, memory.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   target     15120 non-null  object 
 1   candidate  15120 non-null  object 
 2   score      15120 non-null  float64
dtypes: float64(1), object(2)
memory usage: 354.5+ KB


In [14]:
# Build the test pairs from "../generated/test1" and "../generated/test2"
# (generate_dataset appends the "1"/"2" suffix; default seed, no shuffling).
test_df = generate_dataset("../generated/test")
# Summarise the resulting frame: row count, columns, dtypes, memory.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 882 entries, 0 to 881
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   target     882 non-null    object 
 1   candidate  882 non-null    object 
 2   score      882 non-null    float64
dtypes: float64(1), object(2)
memory usage: 20.8+ KB


In [15]:
def save_df(df: pd.DataFrame, path: str):
    """Write *df* to *path* as CSV without the index and without any column
    whose name starts with "Unnamed"."""
    # Boolean mask over the column labels; True marks a column to drop.
    is_unnamed = df.columns.str.contains("^Unnamed")
    kept_columns = df.loc[:, ~is_unnamed]
    kept_columns.to_csv(path, index=False)

In [21]:
def soft_mkdir(path: str):
    """Create the directory *path* (relative to ".") if it does not exist yet.

    An already existing directory is left untouched and missing parent
    directories are created as well.  Genuine failures -- e.g. missing
    permissions, or *path* existing as a regular file -- raise ``OSError``
    instead of being silently ignored.
    """
    os.makedirs(os.path.join(".", path), exist_ok=True)


# Create the output directory that the dataset CSVs below are written into.
soft_mkdir("../generated/datasets")

In [22]:
# Save the full train and test datasets.
save_df(train_df, "../generated/datasets/train.csv")
save_df(test_df, "../generated/datasets/test.csv")

In [23]:
# Medium ("md") variant: keep only the rows whose index is a multiple of k,
# i.e. every 3rd row of each frame.
k = 3
save_df(train_df[train_df.index % k == 0], "../generated/datasets/train_md.csv")
save_df(test_df[test_df.index % k == 0], "../generated/datasets/test_md.csv")

In [24]:
# Small ("sm") variant: keep only the rows whose index is a multiple of k,
# i.e. every 6th row of each frame.
k = 6
save_df(train_df[train_df.index % k == 0], "../generated/datasets/train_sm.csv")
save_df(test_df[test_df.index % k == 0], "../generated/datasets/test_sm.csv")