# Dataset Exploration


Here we are going to explore the dataset with thousands of Mercado Libre items to see which features are more suitable for our model.


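First we move to the project root (the directory that contains `setup.py`) so that the local `src` package can be imported. Then we import the packages we need and configure the plot palette and the pandas display options.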


In [None]:
import os

# Climb at most three directory levels, stopping as soon as the current
# working directory contains a setup.py file.
levels_left = 3
while levels_left > 0 and not os.path.exists(f'{os.getcwd()}/setup.py'):
    os.chdir('..')
    levels_left -= 1
print('Current working directory:', os.getcwd())

In [116]:
import json
import re
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

from src.utils.config import get_dataset_path
from src.utils.styling import apply_styling, make_palette

In [241]:
# Plot styling (helpers imported from src.utils.styling)
colors = make_palette()
palette = colors['palette']
apply_styling(colors)

# Pandas display options, applied one by one from a single table
pandas_display_options = {
    'display.max_columns': 100,
    'display.max_rows': 100,
    'display.float_format': '{:.3f}'.format,  # three decimals for floats
    'display.max_colwidth': 50,
}
for option_name, option_value in pandas_display_options.items():
    pd.set_option(option_name, option_value)

## Load and explore the data


Read the jsonl file


In [None]:
train_path = get_dataset_path('raw_items_train')

# Decode explicitly as UTF-8 (the encoding JSON text is specified to use)
# instead of relying on the platform's default locale encoding.
with open(train_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# One JSON document per line; blank lines (e.g. a trailing newline at the end
# of the file) are skipped because json.loads would raise on them.
dict_objects = [json.loads(line) for line in lines if line.strip()]
print('Number of lines:', len(lines))

#### Schema


In [None]:
# Pretty-print the first record to inspect the schema
first_record = dict_objects[0]
print(json.dumps(first_record, indent=2))

Print groups of dtypes


In [None]:
# Python type of every top-level field in the first record
dtypes = {key: type(value) for key, value in dict_objects[0].items()}

# Print the field names grouped by their Python type
for dtype in set(dtypes.values()):
    print(f'{dtype}:')
    matching_keys = (key for key, key_type in dtypes.items() if key_type == dtype)
    for key in matching_keys:
        print(f'  {key}')
    print()

### To Pandas


Load the data into a pandas dataframe


In [183]:
# Build the working dataframe from the parsed JSON records (one record per row).
df = pd.DataFrame.from_records(dict_objects)

#### Target variable


In [198]:
# Name of the label column; later cells refer to the target through this variable.
target_column = 'condition'

In [None]:
# Absolute counts (including missing values) followed by relative frequencies
target_values = df[target_column]
absolute_counts = target_values.value_counts(dropna=False)
relative_counts = target_values.value_counts(normalize=True)

print(absolute_counts)
print()
print(relative_counts)

In [185]:
# Encode the target as integers: 0 = new, 1 = used.
condition_codes = {'new': 0, 'used': 1}

# Only map while the column still holds the raw labels. Re-running this cell on
# an already-encoded column would otherwise replace every value with NaN,
# because 0 and 1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(df[target_column]):
    df[target_column] = df[target_column].map(condition_codes)

#### Dictionary columns

Preprocess dictionary columns


In [186]:
# Location features
location_features = ['country', 'state', 'city']


def _location_name(address, level):
    """Return the 'name' stored under ``level`` in a seller address dict (None if absent)."""
    return address.get(level, {}).get('name', None)


# One flat column per location level, extracted from the nested address dict
for feature in location_features:
    df[feature] = df['seller_address'].apply(_location_name, level=feature)

# The nested column is no longer needed once it has been flattened
df = df.drop(columns=['seller_address'])

In [187]:
# Shipping features
shipping_columns = [
    'local_pick_up',
    'shipping_methods',
    'shipping_tags',
    'free_shipping',
    'shipping_mode',
    'dimensions',
    'shipping_free_methods',
]

# Flatten the nested shipping dicts into their own dataframe.
# NOTE(review): the names are assigned by position, so this assumes
# json_normalize yields exactly these seven columns in this order - confirm.
shipping_df = pd.json_normalize(df['shipping'])
shipping_df.columns = shipping_columns

# Swap the nested column for the flattened ones
df_without_shipping = df.drop(columns='shipping')
df = pd.concat([df_without_shipping, shipping_df], axis=1)

#### List columns


In [188]:
# Columns treated as list-valued in this section
list_cols = [
    'sub_status',
    'deal_ids',
    'non_mercado_pago_payment_methods',
    'variations',
    'attributes',
    'tags',
    'coverage_areas',
    'descriptions',
    'pictures',
]

# Disabled: replace empty lists with NaN.
# NOTE(review): left commented out; cells further down call len() on the
# 'attributes' values and join the 'tags' values, which would fail on NaN.
# for col in list_cols:
#     df[col] = df[col].apply(lambda x: np.nan if len(x) == 0 else x)

In [None]:
# Report the number of distinct values per list column; columns whose values
# numpy cannot process just print the error message instead.
separator = '-' * 60
for col in list_cols:
    print(col)
    try:
        unique_values, occurrences = np.unique(df[col].values, return_counts=True)
        n_unique = len(unique_values)
        print('Unique values:', n_unique)
        if n_unique < 10:
            # Few enough distinct values to list each one with its count
            print('\n'.join(f'{value}: {count}' for value, count in zip(unique_values, occurrences)))
    except Exception as e:
        print(e)
    print(separator)

In [190]:
# sub_status
# Make it a string instead of a list
def _first_or_nan(value):
    """Unpack a sub_status cell.

    Returns the first element of a list/tuple, NaN for an empty one or for
    None, and any other value unchanged.
    """
    if isinstance(value, (list, tuple)):
        return value[0] if value else np.nan
    if value is None:
        return np.nan
    # Already unpacked (string or NaN): returning it untouched keeps this cell
    # safe to re-run. Indexing here would take the first character of a string
    # or raise TypeError on NaN.
    return value


df['sub_status'] = df['sub_status'].apply(_first_or_nan)

In [191]:
# attributes
# Normalize only the rows whose attribute list is non-empty
len_mask = df['attributes'].apply(len) > 0
non_empty_attributes = df.loc[len_mask, 'attributes']
attributes_df = pd.json_normalize(non_empty_attributes)

# Plain nested lists: one inner list per item
all_attributes = attributes_df.values.tolist()


# Clean attributes
def clean_name(name):
    """Normalize an attribute name or value into an ASCII snake_case token.

    Steps, in order: remove punctuation, collapse whitespace runs into single
    spaces, drop the connectors "de"/"del" when followed by whitespace, trim
    surrounding whitespace, join words with underscores, lowercase, and strip
    accents by decomposing to NFKD and discarding non-ASCII code points.

    Args:
        name: Text to normalize; must be a ``str``.

    Returns:
        The normalized token, e.g. ``' Tipo de Bateria '`` -> ``'tipo_bateria'``.
    """
    name = re.sub(r'[^\w\s]', '', name)
    name = re.sub(r'\s+', ' ', name)
    name = re.sub(r'\b(de|del)\s+', '', name, flags=re.IGNORECASE)
    # Trim BEFORE turning spaces into underscores. Stripping afterwards is a
    # no-op, so leading/trailing whitespace used to leak out as '_' characters.
    name = name.strip().replace(' ', '_').lower()
    name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8')
    return name


# Flatten and consolidate attributes: one {'attribute', 'value'} record per
# non-empty attribute cell, with both fields normalized by clean_name.
attributes = [
    {
        'attribute': clean_name(attribute['name']),
        'value': clean_name(attribute['value_name']),
    }
    for row in all_attributes
    for attribute in row
    if attribute  # skip empty cells (presumably padding for shorter rows - confirm)
]

attributes_df = pd.DataFrame(attributes)

# Keep only the attributes that occur at least MIN_ATTRIBUTE_COUNT times.
# The comparison must be used as a mask: taking `.index` of the boolean
# Series returns every attribute name regardless of its count.
MIN_ATTRIBUTE_COUNT = 100
attribute_counts = attributes_df['attribute'].value_counts()
common_attributes = attribute_counts[attribute_counts >= MIN_ATTRIBUTE_COUNT].index.tolist()

In [196]:
def is_float(element: object) -> bool:
    """Tell whether ``element`` can be converted to a non-missing float.

    Args:
        element: Any value (number, string, None, ...).

    Returns:
        True when ``float(element)`` succeeds and the value is not missing
        (None / NaN / pd.NA); False otherwise.
    """
    try:
        float(element)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: None, lists, arbitrary objects
        return False
    # float() accepts NaN, so missing values have to be rejected explicitly
    return not pd.isna(element)


def consolidate_attributes(attributes):
    """Build a flat ``{'attr_<name>': value}`` dict from one item's attribute list.

    Names and values are normalized with ``clean_name``. An attribute is kept
    only when its normalized name is in ``common_attributes`` and does not
    contain 'pieza'. When a name repeats, the last value wins.
    """
    consolidated = {}
    for entry in attributes:
        key = clean_name(entry['name'])
        cleaned_value = clean_name(entry['value_name'])
        if key not in common_attributes or 'pieza' in key:
            continue
        consolidated[f'attr_{key}'] = cleaned_value

    return consolidated


# One dict of consolidated attributes per item, expanded into one column per attribute
consolidated_attributes = df['attributes'].apply(consolidate_attributes)
attributes_normalized_df = pd.json_normalize(consolidated_attributes)

# Put the target next to the attribute columns. The target is referenced
# through `target_column`, matching the later cell that drops it by that name.
# Items lacking an attribute get the explicit 'missing' category.
attributes_dict_df = pd.concat(
    [df[[target_column]], attributes_normalized_df], axis=1
).fillna('missing')

In [206]:
def encode_categorical(df):
    """Return a copy of ``df`` with every object-dtype column label-encoded.

    Args:
        df: Input dataframe. It is left untouched.

    Returns:
        A new dataframe in which each object column has been cast to ``str``
        and replaced by the integer codes produced by ``LabelEncoder``;
        all other columns are unchanged.
    """
    # Work on a copy: the previous version overwrote the caller's columns in
    # place, so the "raw" frame and the "encoded" frame were the same object.
    encoded = df.copy()
    for column in encoded.columns:
        if encoded[column].dtype == 'object':
            encoded[column] = LabelEncoder().fit_transform(encoded[column].astype(str))
    return encoded


def select_features(X, y):
    """Score every column of ``X`` against ``y`` with two methods.

    Args:
        X: Feature dataframe; its column names label the returned scores.
        y: Target values.

    Returns:
        ``(f_scores, rf_scores)``: two Series indexed by feature name and
        sorted in descending order, holding the ANOVA F-values and the
        Random Forest feature importances respectively.
    """
    feature_names = X.columns

    # Random Forest feature importance
    print('Random Forest Feature Importance')
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X, y)
    rf_scores = pd.Series(forest.feature_importances_, index=feature_names).sort_values(
        ascending=False
    )

    # ANOVA F-value (the p-values returned alongside are not used)
    print('ANOVA F-value')
    f_statistics = f_classif(X, y)[0]
    f_scores = pd.Series(f_statistics, index=feature_names).sort_values(ascending=False)

    return f_scores, rf_scores

In [200]:
attributes_dict_encoded_df = encode_categorical(attributes_dict_df)

# Split target and features
y = attributes_dict_encoded_df[target_column]
X = attributes_dict_encoded_df.drop(target_column, axis=1)

In [None]:
# Mean 5-fold cross-validated accuracy of a logistic regression trained on
# each attribute column on its own.
n_columns = len(X.columns)
accuracy_by_column = {}
for position, column in enumerate(X.columns, start=1):
    classifier = LogisticRegression(random_state=42)
    fold_accuracies = cross_val_score(classifier, X[[column]], y, cv=5, scoring='accuracy')
    accuracy_by_column[column] = np.mean(fold_accuracies)
    print(f'{position}/{n_columns}: {column} - {accuracy_by_column[column]:.4f}')

lr_scores = pd.Series(accuracy_by_column).sort_values(ascending=False)

In [None]:
# Score every attribute column with the ANOVA F-test and Random Forest importances
f_scores, rf_scores = select_features(X, y)

In [None]:
# Print top 10 features for each method
rankings = (
    ('ANOVA F-value', f_scores),
    ('Random Forest Importance', rf_scores),
)
for method_name, ranking in rankings:
    print(f'\nTop 10 features by {method_name}:\n', ranking.head(10))

In [None]:
# One histogram per scoring method: (scores, title, colour)
score_histograms = [
    (f_scores, 'ANOVA F-value', palette[0]),
    (rf_scores, 'Random Forest Importance', palette[1]),
    (lr_scores, 'Logistic Regression', palette[2]),
]

for score_series, title, color in score_histograms:
    plt.figure(figsize=(12, 4))
    sns.histplot(score_series, bins=30, color=color)
    plt.title(title)
    plt.show()

In [235]:
# Attributes whose score exceeds the cut-off chosen for each method
anova_mask = f_scores > 500
rf_mask = rf_scores > 0.01

top_attributes_anova = f_scores[anova_mask].index.tolist()
top_attributes_rf = rf_scores[rf_mask].index.tolist()

In [80]:
# tags
# Turn tags into flag columns: join each list into one delimited string, let
# pandas split it back into one 0/1 column per distinct tag, then swap the
# list column for the flags.
tag_separator = ' / '
tags_joined = df['tags'].apply(tag_separator.join)
tags_dummies = tags_joined.str.get_dummies(sep=tag_separator)
df = pd.concat([df.drop(columns=['tags']), tags_dummies], axis=1)

In [None]:
# coverage_areas
# Drop this column since it is empty
df = df.drop(columns=['coverage_areas'])

In [None]:
# descriptions
# Show a random sample of three values, then the distribution of element lengths.
# NOTE(review): the earlier note said this column is dropped because all values
# are unique, but nothing is dropped in this cell - confirm whether a drop is intended.
print(df.descriptions.sample(3))
df[['descriptions']].apply(lambda x: x.str.len().value_counts(dropna=False))

In this notebook, we explored a dataset of Mercado Libre items. We loaded the raw JSONL file, encoded the `condition` target (new vs. used), flattened the dictionary and list columns, and ranked the item attributes by ANOVA F-value, Random Forest importance and single-feature logistic regression accuracy to see which ones are most suitable for our model.
