# Quick Data Exploration

## Import Libraries

In [None]:
from pandas_profiling import ProfileReport
import pandas as pd
from scripts.helper import reduce_mem_usage
import seaborn as sns
import matplotlib.pyplot as plt
# Raise pandas' display.max_columns option to 120 for DataFrame output.
pd.set_option("display.max_columns", 120)

## Import Datasets

In [None]:
# Read the train CSV into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='data/shrunk_train_v2.csv')

In [None]:
# Read the test CSV into a DataFrame.
dataset_test = pd.read_csv(filepath_or_buffer='data/shrunk_test_v2.csv')

In [None]:
# Reduce the train dataset's memory footprint by changing column datatypes.
# NOTE(review): reduce_mem_usage comes from scripts.helper; the second return
# value (NAlist) is presumably the list of columns that held missing values — confirm.
dataset, NAlist = reduce_mem_usage(dataset)

In [None]:
# Reduce the test dataset's memory footprint by changing column datatypes.
# NOTE(review): reduce_mem_usage comes from scripts.helper; the second return
# value (NAlist_test) is presumably the list of columns that held missing values — confirm.
dataset_test, NAlist_test = reduce_mem_usage(dataset_test)

In [None]:
# Count the visitor IDs that appear in both the train and the test set.
train_visitor_ids = set(dataset.fullVisitorId.unique())
test_visitor_ids = set(dataset_test.fullVisitorId.unique())
shared_visitor_ids = train_visitor_ids & test_visitor_ids
print("Number of common visitors in train and test set : ", len(shared_visitor_ids))

In [None]:
# Report the number of rows and columns of the train set.
n_train_rows, n_train_cols = dataset.shape
print("There are {} observations and {} features in this train dataset. \n".format(n_train_rows, n_train_cols))

In [None]:
# Report the number of rows and columns of the test set; both counts are
# read from dataset_test.
n_test_rows, n_test_cols = dataset_test.shape
print("There are {} observations and {} features in this test dataset. \n".format(n_test_rows, n_test_cols))

## Generate Profile Report

In [None]:
# Build the profiling report object for the train set.
profile = ProfileReport(
    dataset,
    title="Profile Report",
    explorative=True,
)

In [None]:
# Bare expression: shows the report as this cell's output.
profile

## Distribution of channelGrouping across all sessions

In [None]:
# Histogram of the channelGrouping column over every row of the train set.
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.histplot(x="channelGrouping", data=dataset)

## Distribution of channelGrouping for sessions with non-zero revenue

In [None]:
# Keep only the rows whose transaction revenue is strictly positive.
has_revenue = dataset['totals.transactionRevenue'].gt(0)
df_nonz = dataset[has_revenue]

In [None]:
# Histogram of the channelGrouping column over the rows in df_nonz
# (rows with positive transaction revenue).
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.histplot(x="channelGrouping", data=df_nonz)

## Converting Timestamps to extra Day, Month, Year Columns

In [None]:
# Convert the POSIX timestamps (seconds) in visitStartTime to 'YYYY-MM-DD'
# strings, store them in the date column, then drop the raw timestamp column.
visit_start = pd.to_datetime(dataset['visitStartTime'], unit='s')
dataset['date'] = visit_start.dt.strftime('%Y-%m-%d')
dataset = dataset.drop(columns='visitStartTime')

In [None]:
# Parse the date column once and derive the calendar columns from it.
parsed_dates = pd.to_datetime(dataset['date'])
dataset = dataset.assign(
    Date=parsed_dates.dt.date,
    Year=parsed_dates.dt.year,
    Month=parsed_dates.dt.month,
    Day=parsed_dates.dt.day,
)
# Earliest and latest year present in the data.
print(f'Start of year: {dataset.Year.min()}')
print(f'End of year: {dataset.Year.max()}')

## Plotting Total Transactions Revenue vs Country

In [None]:
# Total transaction revenue per country, largest first.
obj = (
    dataset.groupby('geoNetwork.country')["totals.transactionRevenue"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Bar chart of country vs. transaction revenue for the first 10 rows of obj.
top_countries = obj.head(10)
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.barplot(
    data=top_countries,
    x='geoNetwork.country',
    y='totals.transactionRevenue',
)

In [None]:
# All rows whose country is Canada; used as the denominator when computing
# the share of Canadian sessions with non-zero revenue.
is_canada = dataset['geoNetwork.country'] == 'Canada'
canada_total = dataset[is_canada]

In [None]:
# Canadian rows with strictly positive transaction revenue. `> 0` is used
# instead of `!= 0` so that missing revenue (NaN != 0 evaluates to True) is not
# counted, and so the filter matches the one used to build df_nonz.
# NOTE(review): assumes revenue is never negative — confirm.
is_canada = dataset['geoNetwork.country'] == 'Canada'
has_revenue = dataset['totals.transactionRevenue'] > 0
ntr_canada = dataset[is_canada & has_revenue]

In [None]:
# Share of Canadian rows that fall in ntr_canada, as a percentage.
canada_nonzero_pct = (len(ntr_canada) / len(canada_total)) * 100
print('Percentage of Non-Zero Revenue Sessions in Canada: %.2f%%' % canada_nonzero_pct)

## Plotting Total Transactions Revenue vs Browsers

In [None]:
# Total transaction revenue per device browser, largest first.
obj = (
    dataset.groupby('device.browser')["totals.transactionRevenue"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Bar chart of device browser vs. transaction revenue for the first 10 rows of obj.
top_browsers = obj.head(10)
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.barplot(
    data=top_browsers,
    x='device.browser',
    y='totals.transactionRevenue',
)

## Plotting Mean Transaction Revenue vs Visit Number

In [None]:
# Mean transaction revenue per visit number, largest first.
obj = (
    dataset.groupby('visitNumber')["totals.transactionRevenue"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Bar chart of visit number vs. transaction revenue for the first 20 rows of obj.
top_visit_numbers = obj.head(20)
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.barplot(
    data=top_visit_numbers,
    x='visitNumber',
    y='totals.transactionRevenue',
)

## Plotting Mean Transaction Revenue vs Page Views

In [None]:
# Mean transaction revenue per page-view count, largest first.
obj = (
    dataset.groupby('totals.pageviews')["totals.transactionRevenue"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
# Show the 20 rows with the highest mean revenue as the cell's output.
obj.head(20)

In [None]:
# Bar chart of page-view count vs. transaction revenue for the first 20 rows of obj.
top_pageviews = obj.head(20)
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.barplot(
    data=top_pageviews,
    x='totals.pageviews',
    y='totals.transactionRevenue',
)

## Plotting Mean Transaction Revenue vs Total Hits

In [None]:
# Mean transaction revenue per total-hits count, largest first.
obj = (
    dataset.groupby('totals.hits')["totals.transactionRevenue"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
# Show the 20 rows with the highest mean revenue as the cell's output.
obj.head(20)

In [None]:
# Bar chart of total hits vs. transaction revenue for the first 30 rows of obj.
top_hits = obj.head(30)
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.barplot(
    data=top_hits,
    x='totals.hits',
    y='totals.transactionRevenue',
)

## Plotting Total Transactions Revenue vs Months in 2017 and 2018

In [None]:
# Total transaction revenue per month of 2017, largest first.
rows_2017 = dataset[dataset['Year'] == 2017]
obj = (
    rows_2017.groupby('Month')["totals.transactionRevenue"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
# Show the monthly totals as the cell's output.
obj

In [None]:
# Line plot of month vs. total transaction revenue for 2017 (from obj).
# x and y are passed by keyword: positional x/y is deprecated in seaborn 0.11
# and rejected from 0.12 on, where only `data` may be positional.
plt.figure(figsize=(16, 6))
plt.title('Total Transaction Revenue per Month (2017)', fontdict={'fontsize': 15})
# A non-empty label gives plt.legend() an entry to display.
sns.lineplot(x=obj['Month'], y=obj['totals.transactionRevenue'], label='Total revenue')
# Set the axis labels after plotting so they replace the column names.
plt.xlabel('Month')
plt.ylabel('Revenue')
plt.legend()
plt.show()

In [None]:
# Total transaction revenue per month of 2018, largest first.
rows_2018 = dataset[dataset['Year'] == 2018]
obj = (
    rows_2018.groupby('Month')["totals.transactionRevenue"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
# Show the monthly totals as the cell's output.
obj

In [None]:
# Line plot of month vs. total transaction revenue for 2018 (from obj).
# x and y are passed by keyword: positional x/y is deprecated in seaborn 0.11
# and rejected from 0.12 on, where only `data` may be positional.
plt.figure(figsize=(16, 6))
plt.title('Total Transaction Revenue per Month (2018)', fontdict={'fontsize': 15})
# A non-empty label gives plt.legend() an entry to display.
sns.lineplot(x=obj['Month'], y=obj['totals.transactionRevenue'], label='Total revenue')
# Set the axis labels after plotting so they replace the column names.
plt.xlabel('Month')
plt.ylabel('Revenue')
plt.legend()
plt.show()