# Analyzing Merchant Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Read the merchants table from the unzipped data directory into a DataFrame.
merchants_csv = 'data/unzipped/merchants.csv'
merchants_df = pd.read_csv(merchants_csv)

In [None]:
# Preview the first rows to get a feel for the columns.
merchants_df.head()

## Merchant IDs

In [None]:
# Keep the merchant_id column as its own Series for the checks below.
merchant_ids = merchants_df['merchant_id']

In [None]:
# Total number of merchant_id entries (one per row of the table).
merchant_ids.size

In [None]:
# Number of distinct merchant IDs, to compare against the total entry count.
merchant_ids.nunique()

Apparently we have duplicate merchant IDs.

In [None]:
# Show every row whose merchant_id occurs more than once.
# keep=False flags all occurrences; the default keep='first' would hide the
# first row of each duplicate group, making the rows impossible to compare.
# Sorting by merchant_id places the rows of one merchant next to each other.
merchants_df[merchants_df.duplicated('merchant_id', keep=False)].sort_values('merchant_id')

Some entries share a merchant ID but differ in merchant_group_id and/or merchant_category_id. Other entries agree on all of these values and yet are still not completely identical, e.g.:

In [None]:
# Example pair of rows, selected by integer position.
# NOTE(review): the positions are hard-coded, so they only point at the
# intended pair as long as the row order of the CSV does not change.
merchants_df.iloc[[4182, 4183]]

In [None]:
# Pairwise correlation between the numeric columns.
# The table also holds string columns (IDs, 'Y'/'N' flags, 'A'-'E' ranges);
# on pandas >= 2.0 corr() no longer drops those silently and raises instead,
# so the numeric (and boolean) columns are selected explicitly.
merchants_df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Heatmap of the correlation matrix of the numeric columns.
# Non-numeric columns are excluded explicitly because corr() raises on string
# data with pandas >= 2.0. Correlations lie in [-1, 1], so the colour scale
# spans that full range and is centred on 0; a lower bound of 0 would render
# every negative correlation the same as "no correlation".
numeric_corr = merchants_df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr, vmin=-1, vmax=1, center=0)

In [None]:
# Number of rows in the merchants table.
merchants_df.shape[0]

## Merchant Group IDs

In [None]:
# Keep the merchant_group_id column as its own Series.
merchant_group_ids = merchants_df['merchant_group_id']

In [None]:
# Preview the first merchant group IDs.
merchant_group_ids.head()

In [None]:
# Number of distinct merchant group IDs.
merchant_group_ids.nunique()

## Merchant Category IDs

In [None]:
# Keep the merchant_category_id column as its own Series.
merchant_category_ids = merchants_df['merchant_category_id']

In [None]:
# Preview the first merchant category IDs.
merchant_category_ids.head()

In [None]:
# Number of distinct merchant category IDs.
merchant_category_ids.nunique()

## Subsector IDs

In [None]:
# Keep the subsector_id column as its own Series.
subsector_ids = merchants_df['subsector_id']

In [None]:
# Preview the first subsector IDs.
subsector_ids.head()

In [None]:
# List the distinct subsector IDs themselves.
subsector_ids.unique()

In [None]:
# Number of distinct subsector IDs.
subsector_ids.nunique()

## Numerical 1

In [None]:
# Keep the numerical_1 column as its own Series.
numerical_1s = merchants_df['numerical_1']

In [None]:
# Preview the first numerical_1 values.
numerical_1s.head()

In [None]:
# Number of distinct numerical_1 values.
numerical_1s.nunique()

In [None]:
# Plot numerical_1 on a log scale. The values are shifted so that the minimum
# maps to 0.1, which keeps the argument of the logarithm strictly positive.
# Series.min() replaces the builtin min(): it is vectorised and skips NaN,
# whereas builtin min() iterates element by element in Python and gives an
# order-dependent result if NaN is present.
np.log(numerical_1s - numerical_1s.min() + 0.1).plot()

## Numerical 2

In [None]:
# Keep the numerical_2 column as its own Series.
numerical_2s = merchants_df['numerical_2']

In [None]:
# Preview the first numerical_2 values.
numerical_2s.head()

In [None]:
# Number of distinct numerical_2 values.
numerical_2s.nunique()

In [None]:
# Plot numerical_2 on a log scale. The values are shifted so that the minimum
# maps to 0.1, which keeps the argument of the logarithm strictly positive.
# Series.min() replaces the builtin min(): it is vectorised and skips NaN,
# whereas builtin min() iterates element by element in Python and gives an
# order-dependent result if NaN is present.
np.log(numerical_2s - numerical_2s.min() + 0.1).plot()

In [None]:
# Plot the row-wise difference numerical_1 - numerical_2 to see how closely
# the two columns track each other.
numerical_1s.sub(numerical_2s).plot()

## Category 1

In [None]:
# Keep the category_1 column as its own Series.
category_1s = merchants_df['category_1']

In [None]:
# Preview the first category_1 values.
category_1s.head()

In [None]:
# Number of distinct category_1 values.
category_1s.nunique()

In [None]:
# Positions of the rows whose category_1 equals 'Y'.
# The result is a one-element tuple holding the array of matching positions.
is_yes = category_1s == 'Y'
category_1s_yes = np.asarray(is_yes).nonzero()

In [None]:
# Number of rows with category_1 == 'Y' (size of the positions array).
category_1s_yes[0].size

## Most Recent Sales Range

In [None]:
# Keep the most_recent_sales_range column as its own Series.
most_recent_sales_ranges = merchants_df['most_recent_sales_range']

In [None]:
# Preview the first sales-range labels.
most_recent_sales_ranges.head()

In [None]:
# List the distinct sales-range labels.
most_recent_sales_ranges.unique()

In [None]:
# Bar chart of how many merchants fall into each sales range, with the bars
# in alphabetical label order A..E.
sales_range_order = list('ABCDE')
sns.catplot(
    data=merchants_df,
    x='most_recent_sales_range',
    kind='count',
    order=sales_range_order,
)

## Most Recent Purchases Range

In [None]:
# Keep the most_recent_purchases_range column as its own Series.
most_recent_purchases_ranges = merchants_df['most_recent_purchases_range']

In [None]:
# Preview the first purchases-range labels.
most_recent_purchases_ranges.head()

In [None]:
# List the distinct purchases-range labels.
most_recent_purchases_ranges.unique()

In [None]:
# Bar chart of how many merchants fall into each purchases range, with the
# bars in alphabetical label order A..E.
purchases_range_order = list('ABCDE')
sns.catplot(
    data=merchants_df,
    x='most_recent_purchases_range',
    kind='count',
    order=purchases_range_order,
)