In [4]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
import json
import requests
from collections import Counter
from PIL import Image
from matplotlib.ticker import FuncFormatter
#------------------------------------- 
import re
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import joblib
from scipy import stats 
import folium
from folium.plugins import FastMarkerCluster, Fullscreen, MiniMap, HeatMap, HeatMapWithTime, LocateControl

In [6]:
# Directory (relative to the notebook) that holds the Olist CSV files.
path = '../data/'

# Read each raw CSV into its own DataFrame; nothing is joined or cleaned here.
df_item = pd.read_csv(f"{path}olist_order_items_dataset.csv")
df_reviews = pd.read_csv(f"{path}olist_order_reviews_dataset.csv")
df_orders = pd.read_csv(f"{path}olist_orders_dataset.csv")
df_products = pd.read_csv(f"{path}olist_products_dataset.csv")
df_geolocation = pd.read_csv(f"{path}olist_geolocation_dataset.csv")
df_sellers = pd.read_csv(f"{path}olist_sellers_dataset.csv")
df_order_pay = pd.read_csv(f"{path}olist_order_payments_dataset.csv")
df_customers = pd.read_csv(f"{path}olist_customers_dataset.csv")
df_category = pd.read_csv(f"{path}product_category_name_translation.csv")

In [7]:
# Build one wide frame by joining every table onto the orders table.
# The first join is a left join from orders; the remaining joins are outer
# joins, so rows without a match on either side are kept (with NaNs).
#
# NOTE: validate='m:m' is accepted by pandas but performs no check, so it does
# not guard against row duplication.
# NOTE(review): when an order has several item rows and several payment rows,
# joining on order_id pairs every item row with every payment row - confirm
# that downstream aggregations account for this.
df_train = (
    df_orders
    .merge(df_item, on='order_id', how='left')
    .merge(df_order_pay, on='order_id', how='outer', validate='m:m')
    .merge(df_reviews, on='order_id', how='outer')
    .merge(df_products, on='product_id', how='outer')
    .merge(df_customers, on='customer_id', how='outer')
    .merge(df_sellers, on='seller_id', how='outer')
)

In [6]:
# --- Purchase-time features -------------------------------------------------
# Parse the purchase timestamp once; every feature below is derived from it.
df_train['order_purchase_timestamp'] = pd.to_datetime(df_train['order_purchase_timestamp'])
purchase_ts = df_train['order_purchase_timestamp']

df_train['day_of_week_name'] = purchase_ts.dt.strftime('%A')
df_train['month_year'] = purchase_ts.dt.strftime('%Y %m')
# Purchase date truncated to midnight, kept as datetime64. normalize() replaces
# the former format-to-string / parse-back round trip and yields the same values.
df_train['day_month_year'] = purchase_ts.dt.normalize()
df_train['day_of_week_int'] = purchase_ts.dt.weekday + 1  # 1 = Monday ... 7 = Sunday
df_train['hour'] = purchase_ts.dt.hour
df_train['month'] = purchase_ts.dt.month
df_train['year'] = purchase_ts.dt.year
df_train['date'] = purchase_ts.dt.to_period('M')  # monthly Period
df_train['datetime'] = purchase_ts


# --- Delivery features ------------------------------------------------------
df_train['order_delivered_customer_date'] = pd.to_datetime(df_train['order_delivered_customer_date'])
delivered_ts = df_train['order_delivered_customer_date']

# Delivery date truncated to midnight; stays NaT where no delivery timestamp exists.
df_train['day_month_year_delivered'] = delivered_ts.dt.normalize()
# Whole days elapsed between purchase and delivery for each row
# (NaN where the delivery timestamp is missing). This is a per-row value,
# not an average.
df_train['delivery_time'] = (delivered_ts - purchase_ts).dt.days

In [7]:
def resumetable(df):
    """Print the shape of ``df`` and return a one-row-per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Any index is accepted; sample rows are taken by
        position, not by label.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with these columns:

        - ``Name``: column name
        - ``dtypes``: column dtype
        - ``Missing``: number of null values
        - ``Uniques``: number of distinct non-null values
        - ``First Value`` / ``Second Value`` / ``Third Value``: the values in
          the first three rows (NaN when ``df`` has fewer rows)
        - ``Entropy``: base-2 Shannon entropy of the column's normalised value
          counts, rounded to 2 decimals
    """
    print(f"Dataset Shape: {df.shape}")

    summary = pd.DataFrame({
        'Name': df.columns,
        'dtypes': df.dtypes.values,
    })
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Take sample rows by position (iloc) so the function also works when the
    # index is not 0..n-1 (e.g. after filtering) or when df has < 3 rows.
    n_rows = len(df)
    for pos, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        summary[label] = df.iloc[pos].values if pos < n_rows else np.nan

    # value_counts(normalize=True) gives the empirical distribution of the
    # non-null values; stats.entropy turns it into bits (base=2).
    summary['Entropy'] = [
        round(stats.entropy(df[col].value_counts(normalize=True), base=2), 2)
        for col in df.columns
    ]

    return summary

def cross_heatmap(df, cols, normalize=False, values=None, aggfunc=None):
    """Cross-tabulate two columns of ``df`` and shade the result in green.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    cols : sequence
        ``cols[0]`` names the column used for the table rows and ``cols[1]``
        the column used for the table columns.
    normalize, values, aggfunc
        Passed unchanged to ``pd.crosstab``.

    Returns
    -------
    pandas Styler
        The crosstab with a light-green background gradient applied.
    """
    row_key, col_key = cols[0], cols[1]
    green_cmap = sns.light_palette("green", as_cmap=True)
    table = pd.crosstab(
        df[row_key],
        df[col_key],
        normalize=normalize,
        values=values,
        aggfunc=aggfunc,
    )
    return table.style.background_gradient(cmap=green_cmap)

In [8]:
# Summarise every column of the merged frame (dtype, missing count, unique
# count, values of the first three rows, entropy); the returned table is shown
# as this cell's output.
resumetable(df_train)

Dataset Shape: (119143, 50)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value,Entropy
0,order_id,object,0,99441,d455a8cb295653b55abda06d434ab492,7f39ba4c9052be115350065d07583cac,9dc8d1a6f16f1b89874c29c9d8d30447,16.46
1,customer_id,object,0,99441,944b72539d7e1f7f7fc6e46639ef1fe3,d7fc82cbeafea77bd0a8fbbf6296e387,d9442164acf4b03109425633efaa0cfc,16.46
2,order_status,object,0,8,delivered,delivered,delivered,0.25
3,order_purchase_timestamp,datetime64[ns],0,98875,2017-09-26 22:17:05,2017-10-18 08:16:34,2017-10-12 13:33:22,16.44
4,order_approved_at,object,177,90733,2017-09-27 22:24:16,2017-10-18 23:56:20,2017-10-12 13:49:22,16.28
5,order_delivered_carrier_date,object,2086,81018,2017-09-29 15:53:03,2017-10-20 14:29:01,2017-10-17 15:42:42,16.03
6,order_delivered_customer_date,datetime64[ns],3421,95664,2017-10-07 16:12:47,2017-10-27 16:46:05,2017-10-24 20:17:44,16.4
7,order_estimated_delivery_date,object,0,459,2017-10-30 00:00:00,2017-11-09 00:00:00,2017-11-06 00:00:00,8.47
8,order_item_id,float64,833,21,1.0,1.0,1.0,0.72
9,product_id,object,833,32951,a2ff5a97bf95719e38ea2e3b4105bce8,a2ff5a97bf95719e38ea2e3b4105bce8,a2ff5a97bf95719e38ea2e3b4105bce8,13.63
