In [1]:
# Imports and settings
import random
import string

import numpy as np
import pandas as pd
import seaborn as sns
# When a long frame is truncated for display, show 20 rows of it
pd.options.display.min_rows = 20

from untidy import untidyfy

In [2]:
# Load the clean baseline: the 'titanic' example dataset provided through seaborn.
# NOTE(review): sns.load_dataset presumably fetches the file from seaborn's online
# data repository (and caches it), so a first run needs network access — confirm.
clean_df = sns.load_dataset('titanic')
# Preview the first rows as this cell's output
clean_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Corrupt the clean frame at level 8. Every corruption flag passed here is
# switched on except duplicate_columns.
# NOTE(review): no random seed is set before this call; if untidyfy draws from
# the global RNGs the output below changes on every run — confirm and seed.
messy_df = untidyfy(
    clean_df,
    corruption_level=8,
    nans=True,
    outliers=True,
    text_noise=True,
    mess_with_numbers=True,
    mess_with_string_encodings=True,
    duplicate_rows=True,
    duplicate_columns=False,
)

In [4]:
# Display the corrupted frame as this cell's output
messy_df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3000,b'male!',220000.0,1,0,725000.0,S&,Third,man!,True,,Southampton!,,?
1,1,1,b'female#',,1000,0,7128330.0,C,First,woman,False,C,,yes%,False
2,,,b'female&',,0,,,?,,woman&,False,,Southampton?,yes,True
3,1,,b'female',,1000,0,5310000.0,S!,,,False,C,Southampton%,,False
4,,,b'male!',,0,0,805000.0,S$,Third,man%,,,,no$,True
5,0,,?,,,0,,Q$,Third,man,,,Queenstown?,no&,True
6,,,?,,0,,5186250.0,,First,man$,True,E,Southampton$,no,True
7,0,3000,,2.0,,1,,,,,False,,Southampton!,no?,
8,,,b'female',,0,2000,,S$,,,,,Southampton&,,False
9,?,2000,?,140000.0,?,0,30.0708,C!,Second,child#,,,Cherbourg,,


In [9]:
def add_duplicate_columns(clean_data, corruption_level=4):
    """
    Add extra, renamed copies of randomly chosen columns to a dataset

    The number of copied columns is ``ceil(n_cols * 0.2 * corruption_level / 10)``,
    i.e. at most 20 % of the columns at ``corruption_level=10``. Each copy is
    labelled with the source label followed by one random lowercase letter
    (further letters are appended while the label is already taken), and the
    column order of the result is shuffled.

    Parameters
    ----------
    clean_data: pd.DataFrame
        dataset to be contaminated; it is copied, not modified
    corruption_level: int, optional
        level of corruption, should be between 0 and 10, where 0 leaves the dataset as is, 10
        is the highest level of contamination. The value is not range-checked here;
        it is only used to compute the sample size passed to ``DataFrame.sample``.

    Returns
    -------
    data: pd.DataFrame
        data with duplicated columns

    Notes
    -----
    Randomness comes from the global ``random`` module (suffix letters) and from
    NumPy's global random state (final column order); no seed is set here.
    """
    data = clean_data.copy()
    n_cols = data.shape[1]

    # One division of an exact product instead of 0.2 * level / 10 * n_cols, so
    # accumulated float rounding cannot push an exact result up by one
    # (50 columns at level 3 used to yield 4 duplicates instead of 3).
    n_cols_duplicated = int(np.ceil(n_cols * corruption_level / 50))
    if n_cols_duplicated == 0:
        # Nothing to duplicate: return the untouched copy, as documented for
        # level 0, rather than shuffling its columns.
        return data

    dupes = data.sample(n=n_cols_duplicated, axis=1)

    # Add new names to duplicate columns, never reusing a label that exists
    taken = set(data.columns)
    dup_col_names = []
    for col in dupes.columns:
        # f-string rather than `+` so non-string labels (e.g. ints) work too
        new_name = f"{col}{random.choice(string.ascii_lowercase)}"
        while new_name in taken:
            # Growing the name guarantees the loop ends
            new_name += random.choice(string.ascii_lowercase)
        taken.add(new_name)
        dup_col_names.append(new_name)

    data = pd.concat([data, dupes], axis=1, ignore_index=True)
    data.columns = clean_data.columns.tolist() + dup_col_names

    # Shuffle the columns of the data by position, so that repeated labels in
    # the input cannot multiply columns the way label-based selection would
    n_total = len(data.columns)
    new_col_order = np.random.choice(n_total, size=n_total, replace=False)
    data = data.iloc[:, new_col_order]

    return data