In [1]:
import json
import re
import pandas as pd
from pandas import json_normalize

In [5]:
# You can safely assume that `build_dataset` is correctly implemented
def build_dataset():
    """Load the item listings and split them into train and test sets.

    Reads one JSON object per line from ``MLA_100k_checked_v3.jsonlines``.
    The last 10,000 records form the test split and every record before them
    forms the training split. The label of a record is its ``"condition"``
    value; that key is removed from the test records.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. The ``X_*`` values are
        lists of dicts and the ``y_*`` values are lists of labels (``None``
        for a record that has no ``"condition"`` key).
    """
    # Use a context manager so the file handle is closed after parsing.
    with open("MLA_100k_checked_v3.jsonlines") as source:
        data = [json.loads(x) for x in source]
    target = lambda x: x.get("condition")
    N = -10000
    X_train = data[:N]
    X_test = data[N:]
    y_train = [target(x) for x in X_train]
    y_test = [target(x) for x in X_test]
    for x in X_test:
        # pop() with a default matches the tolerant .get() used for the labels:
        # a record without "condition" yields a None label instead of a KeyError.
        x.pop("condition", None)
    return X_train, y_train, X_test, y_test

def pre_process(text):
    '''Return `text` as a string with special characters replaced by spaces.

    The value is converted with str() first, so non-string inputs (lists,
    numbers, None) are accepted. Every character that is neither a word
    character nor whitespace is replaced by a single space.
    '''
    as_string = str(text)
    return re.sub(r'[^\w\s]', ' ', as_string)

def clean_flattern_json(df):
    """Flatten a list of item dicts into a DataFrame with one row per item.

    Steps:
      1. ``json_normalize`` expands nested dicts into dotted column names.
      2. For each list-of-records column, every sub-field is collected into a
         per-item list stored in a ``<column>.<sub-field>`` column. The
         original list columns are then dropped.
      3. Special characters are replaced by spaces (via ``pre_process``) in a
         fixed set of columns.

    Args:
        df: list of dicts, one per item. Despite the name this is not a
            DataFrame.

    Returns:
        pandas.DataFrame: one row per input item, in input order, so rows stay
        aligned with the label lists built alongside the input.
    """
    # 1st normalization: flatten nested dicts into dotted columns
    dfp = json_normalize(df)
    # 2nd normalization: flatten the columns holding lists of records
    nested_columns = ['non_mercado_pago_payment_methods',
                      'pictures']
    nested_frames = []
    for col in nested_columns:
        # record_path yields one row per nested record across all items, not
        # one row per item. Concatenating it directly would pair records with
        # unrelated items and could add extra rows.
        df_col = json_normalize(df, record_path=col)
        if df_col.empty:
            continue
        # Label each nested record with the position of the item that owns it.
        df_col.index = [pos for pos, item in enumerate(df)
                        for _ in (item.get(col) or [])]
        # Collapse back to one row per item; items with no records get NaN.
        per_item = (df_col.groupby(level=0).agg(list)
                          .reindex(dfp.index)
                          .add_prefix(col + '.'))
        nested_frames.append(per_item)
    dfp = pd.concat([dfp, *nested_frames], axis=1)
    # remove the original nested columns
    dfp = dfp.drop(columns=nested_columns)
    # 3rd normalization: remove special characters from columns
    columns_with_special_characters = ['sub_status',
                                       'deal_ids',
                                       'variations',
                                       'attributes',
                                       'tags',
                                       'coverage_areas',
                                       'descriptions']
    for col in columns_with_special_characters:
        dfp[col] = dfp[col].apply(pre_process)
    return dfp

In [6]:
if __name__ == "__main__":
    print("Loading dataset...")
    # build_dataset() returns sklearn-style splits:
    #   X_train / X_test -> lists of dicts, one dict of information per item
    #   y_train / y_test -> the labels to be predicted (new or used)
    # with X_train[i] labelled by y_train[i] and X_test[i] by y_test[i].
    X_train, y_train, X_test, y_test = build_dataset()
    # Flatten both splits (train first, then test) into DataFrames.
    X_train, X_test = clean_flattern_json(X_train), clean_flattern_json(X_test)

Loading dataset...


In [9]:
# Number of columns in the flattened training frame.
X_train.shape[1]

63

In [None]:
# Lift pandas' display limits so previews show every row and column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Wrap X_train in a DataFrame for inspection.
# NOTE(review): in notebook order the loading cell has already rebound X_train
# to the DataFrame returned by clean_flattern_json, so this may only re-wrap
# that frame. Confirm which form of X_train this cell was meant to see.
df_X_train = pd.DataFrame(X_train)

In [None]:
# Preview the first two rows of df_X_train.
df_X_train.head(2)

In [4]:
# Flatten the nested dicts of each item into dotted columns.
# NOTE(review): this cell's execution count (4) is lower than that of the cells
# defining build_dataset and loading the data, so it relied on an X_train left
# in the kernel from an earlier run. On a top-to-bottom run X_train is already
# the DataFrame returned by clean_flattern_json; json_normalize is presumably
# meant to receive the raw list of dicts here. Confirm.
X_train_df = json_normalize(X_train)

In [None]:
# Preview the first two rows of X_train_df.
X_train_df.head(2)

In [None]:
# Expand the list-of-records columns into '<column>.<sub-field>' columns that
# hold one list of values per item.
# NOTE(review): this needs X_train to be the raw list of dicts; the loading cell
# rebinds X_train to a DataFrame, so check the execution order before running.
for col in ['non_mercado_pago_payment_methods','pictures']:
    # record_path yields one row per nested record across all items, not one
    # row per item. Concatenating it directly would pair records with unrelated
    # items and could add extra rows.
    df1 = json_normalize(X_train, record_path=col)
    if df1.empty:
        continue
    # Label each nested record with the position of the item that owns it.
    df1.index = [pos for pos, item in enumerate(X_train)
                 for _ in (item.get(col) or [])]
    # Collapse back to one row per item; items with no records get NaN.
    df1 = (df1.groupby(level=0).agg(list)
              .reindex(X_train_df.index)
              .add_prefix(col + '.'))
    X_train_df = pd.concat([X_train_df, df1], axis=1)

In [None]:
# Preview the first two rows; in notebook order this follows the cell that
# concatenates the nested-record columns.
X_train_df.head(2)

In [None]:
# Replace special characters with spaces in the selected columns.
TEXT_COLUMNS = [
    'sub_status',
    'deal_ids',
    'variations',
    'attributes',
    'tags',
    'coverage_areas',
    'descriptions',
]
for col in TEXT_COLUMNS:
    X_train_df[col] = X_train_df[col].map(pre_process)

In [None]:
# Preview the first two rows; in notebook order this follows the cell that
# applies pre_process to the selected columns.
X_train_df.head(2)

In [None]:
# Distinct values present in the 'shipping.local_pick_up' column.
pd.unique(X_train_df['shipping.local_pick_up'])

In [None]:
# Drop the original list-of-records columns. Rebinding the name (rather than
# inplace=True) together with errors='ignore' keeps the cell re-runnable: a
# second run is a no-op instead of raising KeyError for columns already removed.
X_train_df = X_train_df.drop(
    columns=['non_mercado_pago_payment_methods', 'pictures'],
    errors='ignore',
)

In [None]:
# Preview the first two rows; in notebook order this follows the cell that
# drops the original nested columns.
X_train_df.head(2)