In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

- load the dataset
- look at the data `(head, tail, sample)`
    - get the information about the datatype `(dtypes)`
    - get the information about the columns `(info)`
    - check how many values are missing in each column `(isnull, isna)`
    - get statistical information about the dataset `(describe)`
- if required, `clean` the dataset
    - remove the rows with missing values
    - remove the columns that have missing values or are not required
- if required, transform the dataset
    - convert the categorical columns to numerical columns
    - convert the numerical columns to categorical columns
- if required, convert the datatypes of the columns
- if required, create new columns
- if required, rename the columns
- if required, remove the duplicates
- if required, remove the outliers
- if required, scale the dataset (standardization or normalization)
- sort the dataset
- visualize the data
    - univariate analysis
        - visualizing a single column
            - histogram
            - boxplot
            - line
            - bar
            - pie
            - area
    - bivariate analysis
        - visualizing two columns
            - scatter (bubble)
            - line
            - bar
            - pie
            - area
    - multivariate analysis
        - visualizing more than two columns
            - scatter (bubble)
            - surface
---
**Note: If the dataset is too large, then take a sample of the dataset and perform the above steps.**

In [None]:
# Load the product records from the JSON file into a DataFrame.
df = pd.read_json('flipkart_fashion_products_dataset.json')

In [None]:
# Preview the first rows to get a feel for the columns and their raw values.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Information about the columns and their datatypes.
df.info()

In [None]:
# Statistical summary of the dataset.
df.describe()

In [None]:
# missing values: number of null entries in each column
df.isnull().sum()

In [None]:
# removing unwanted columns
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the columns were already dropped in place.
# NOTE: it also means a misspelled column name is skipped silently.
cols_to_delete = ['_id', 'pid', 'url', 'images']
df.drop(columns=cols_to_delete, inplace=True, errors='ignore')
df.head()

In [None]:
def clean_price(price):
    """Convert a raw price value to a float.

    Strings have their commas (thousands separators) and surrounding
    whitespace removed before conversion. ``None`` and empty or
    whitespace-only strings become ``np.nan``. Any non-string value is
    returned unchanged.

    Raises:
        ValueError: if a non-blank string is still not a valid number after
            the commas are removed.
    """
    if price is None:
        return np.nan
    if not isinstance(price, str):
        # Not a string: pass the value through untouched.
        return price
    # strip() so that blank strings such as ' ' map to NaN instead of
    # reaching float() and raising ValueError.
    text = price.replace(',', '').strip()
    return float(text) if text else np.nan


# Run clean_price over every value of the actual_price column.
df['actual_price'] = df['actual_price'].apply(clean_price)

In [None]:
def clean_discount(value):
    """Convert a raw discount value such as ``'69% off'`` to a float.

    For strings, the ``'% off'`` suffix and surrounding whitespace are
    removed before conversion. ``None`` and empty or whitespace-only strings
    become ``np.nan``. Any non-string value is returned unchanged.

    Raises:
        ValueError: if a non-blank string is still not a valid number after
            ``'% off'`` is removed.
    """
    if value is None:
        return np.nan
    if not isinstance(value, str):
        # Not a string: pass the value through untouched.
        return value
    # strip() so that blank strings such as ' ' map to NaN instead of
    # reaching float() and raising ValueError.
    text = value.replace('% off', '').strip()
    return float(text) if text else np.nan

# Run clean_discount over every value of the discount column.
df['discount'] = df['discount'].apply(clean_discount)

In [None]:
def clean_rating(value):
    """Convert a raw rating value to a float.

    Strings are stripped of surrounding whitespace and converted with
    ``float``. ``None`` and empty or whitespace-only strings become
    ``np.nan``. Any non-string value is returned unchanged.

    Raises:
        ValueError: if a non-blank string is not a valid number.
    """
    if value is None:
        return np.nan
    if not isinstance(value, str):
        # Not a string: pass the value through untouched.
        return value
    # strip() so that blank strings such as ' ' map to NaN instead of
    # reaching float() and raising ValueError.
    text = value.strip()
    return float(text) if text else np.nan

# Run clean_rating over every value of the average_rating column.
df['average_rating'] = df['average_rating'].apply(clean_rating)

In [None]:
# selling_price is cleaned with the same clean_price function as actual_price.
df['selling_price'] = df['selling_price'].apply(clean_price)

In [None]:
# Inspect the column info again now that the cleaning functions have been applied.
df.info()

In [None]:
# Preview the first rows again to eyeball the cleaned values.
df.head()

In [None]:
# The de-duplicated frame is only displayed: the result is not assigned and
# inplace is not used, so df itself is left unchanged.
df.drop_duplicates(subset='title') # just a demo

In [None]:
# List the column names as a plain Python list.
print(df.columns.tolist())

In [None]:
# rename a column: 'product_details' becomes 'details' (df is modified in place)
df.rename(columns={'product_details':'details'}, inplace=True)

In [None]:
# Sort rows by actual_price, then discount, both in descending order (in place).
df.sort_values(by=['actual_price','discount'], ascending=False, inplace=True)

In [None]:

# Keep only the title plus the price, discount and rating columns for the plots below.
pricesdf = df[['title','actual_price','discount','average_rating','selling_price']]

In [None]:
# Univariate view: histogram of the discount column using 20 bins.
pricesdf['discount'].plot.hist(bins=20, figsize=(10,5))

In [None]:
# styling your dataframe: in-cell bars for the two price columns of the first 30 rows
pricesdf.head(30).style.bar(subset=['actual_price','selling_price'], color='#330033')

In [None]:
# styling your dataframe: gradient background colouring on the first 30 rows
pricesdf.head(30).style.background_gradient()