# Data Visualization

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
pd.set_option("display.max_columns", 120)
import plotly.offline as py
%matplotlib inline

## Import Datasets

In [None]:
# fullVisitorId is an identifier that is later used as a group key, not a
# quantity: read it as text so pandas does not coerce it to a numeric dtype
# (which would drop leading zeros).
dataset = pd.read_csv('data/cleaned_train_v2.csv', dtype={'fullVisitorId': str})

## Data Preparation

In [None]:
# Turn the epoch-second values in visitStartTime into hour-resolution strings
# (YYYY-MM-DD-HH) and store them in the date column.
dataset['date'] = (
    pd.to_datetime(dataset['visitStartTime'], unit='s')
    .dt.strftime('%Y-%m-%d-%H')
)

In [None]:
# Parse the hour-resolution date strings once, using the exact format they
# were written in, instead of re-parsing the whole column for every field.
parsed_date = pd.to_datetime(dataset['date'], format='%Y-%m-%d-%H')
dataset = dataset.assign(
    Date=parsed_date.dt.date,
    Year=parsed_date.dt.year,
    Month=parsed_date.dt.month,
    Day=parsed_date.dt.day,
    hour=parsed_date.dt.hour,
)
print(f'Start of year: {dataset.Year.min()}')
# The second line reports the latest year, so it must not be labelled "Start".
print(f'End of year: {dataset.Year.max()}')

dataset["year_month"] = pd.to_datetime(dataset['visitStartTime'], unit='s').dt.strftime('%Y-%m')

# Hypotheses:
1. Users who come from "Organic Search" make more transactions.
2. Users who visited the store more than 3 times also made a transaction.
3. Most of the sessions are from mobile users (change count to sum for amount of transactions).
4. Most visits are from the USA.
5. Most revenue is generated from the USA.
6. Users made fewer transactions in February.
7. Sessions through the operating system macOS tend to have more revenue.

## 1. Users who come from "Organic Search" make more transactions

In [None]:
# Split the sessions by whether they produced revenue; each frame is ordered
# by revenue, highest first.
df_nonz = (
    dataset.loc[dataset['totals.transactionRevenue'] > 0]
    .sort_values(by="totals.transactionRevenue", ascending=False)
)
df_z = (
    dataset.loc[dataset['totals.transactionRevenue'] == 0]
    .sort_values(by="totals.transactionRevenue", ascending=False)
)

In [None]:
# Number of sessions per channel grouping, all sessions.
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.histplot(x="channelGrouping", data=dataset, ax=ax)

In [None]:
# Number of sessions per channel grouping, revenue sessions only.
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.histplot(x="channelGrouping", data=df_nonz, ax=ax)

### Value of transactions per channel

In [None]:
# Total revenue per channel grouping, largest first.
obj = (
    df_nonz.groupby('channelGrouping')["totals.transactionRevenue"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Bar chart of the first ten rows of obj (channel grouping vs. revenue).
fig, ax = plt.subplots(figsize=(10, 6))
ax = sns.barplot(
    data=obj.head(10),
    x='channelGrouping',
    y='totals.transactionRevenue',
    ax=ax,
)
ax.set(xlabel="Channel Grouping", ylabel="Revenue in $")

## 2. Users who visited the store more than 3 times also made a transaction

In [None]:
# Mean revenue per visit number, largest first.
obj = (
    dataset.groupby('visitNumber')["totals.transactionRevenue"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Bar chart of the first ten rows of obj (visit number vs. mean revenue).
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.barplot(
    data=obj.head(10),
    x='visitNumber',
    y='totals.transactionRevenue',
    ax=ax,
)

## 3. Most of the Sessions are from mobile users 
(change count to sum for amount of transactions)

In [None]:
# Count of non-null revenue values per (channel grouping, isMobile) pair,
# largest first.
obj = (
    dataset.groupby(['channelGrouping', 'device.isMobile'])["totals.transactionRevenue"]
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Grouped bars per channel, split by device.isMobile; bar heights come from
# the totals.transactionRevenue column of obj.
ax = sns.catplot(
    data=obj,
    kind="bar",
    x="channelGrouping",
    y="totals.transactionRevenue",
    hue="device.isMobile",
    height=8.27,
    aspect=11.7/8.27,
    edgecolor=".6",
)
ax.set(xlabel="Channel Grouping", ylabel="No. of Sessions")

## 4. Most visits are from USA

In [None]:
# Sessions per country as a two-column frame (Country, Visits).
tmp = dataset["geoNetwork.country"].value_counts()
# Build the frame straight from the index and values. Going through
# reset_index() fails on pandas >= 2.0, where value_counts() names the index
# after the column and that name collides with an identically named column.
country_visits = pd.DataFrame({'Country': tmp.index, 'Visits': tmp.values})

In [None]:
def plot_country_map(data, location, z, legend, title, colormap='Viridis'):
    """Render a world choropleth of one column of ``data`` with ``iplot``.

    ``data[location]`` supplies the country names and ``data[z]`` the values
    used for both the colour and the hover text. ``legend`` titles the colour
    bar, ``title`` the figure, and ``colormap`` names the colour scale.
    """
    trace = {
        'type': 'choropleth',
        'colorscale': colormap,
        'autocolorscale': False,
        'reversescale': False,
        'locations': data[location],
        'locationmode': 'country names',
        'z': data[z],
        'text': data[z],
        'colorbar': {'title': legend},
    }
    figure_layout = {
        'title': title,
        'geo': {
            'showframe': False,
            'projection': {'type': 'natural earth'},
        },
    }
    iplot(go.Figure(data=[trace], layout=figure_layout))

In [None]:
# World map of session counts per country.
plot_country_map(
    data=country_visits,
    location='Country',
    z='Visits',
    legend='Visits',
    title='Visits per country',
)

## 5. Most revenue is generated from the USA

In [None]:
# Sum the revenue of the non-zero-revenue sessions per country and keep it on
# a log1p scale in a two-column frame (Country, Total).
tmp = df_nonz.groupby(['geoNetwork.country'])['totals.transactionRevenue'].sum()
country_total = pd.DataFrame({'Country': tmp.index, 'Total': np.log1p(tmp.values)})

In [None]:
# World map of log-scaled revenue per country.
plot_country_map(
    data=country_total,
    location='Country',
    z='Total',
    legend='Total(log)',
    title='Total revenues per country (log scale)',
)

## 6. Users made fewer transactions in February

In [None]:
# Number of sessions per calendar date, in date order.
tmp = (
    dataset.groupby('Date')['totals.transactionRevenue']
    .agg(['size'])
    .sort_index()
)
tmp.columns = ["Total"]

In [None]:

def plot_scatter_data(data, xtitle, ytitle, title, color='blue'):
    """Draw one series as a line chart with ``iplot``.

    ``data.index`` is plotted on x and ``data.values`` on y. ``xtitle`` and
    ``ytitle`` label the axes (``ytitle`` is also the trace name), ``title``
    is the figure title and ``color`` the marker colour.
    """
    line = go.Scatter(
        x=data.index,
        y=data.values,
        name=ytitle,
        marker={'color': color},
        mode='lines',
    )
    figure_spec = {
        'data': [line],
        'layout': {
            'title': title,
            'xaxis': {'title': xtitle},
            'yaxis': {'title': ytitle},
        },
    }
    iplot(figure_spec, filename='lines')

In [None]:
# Daily session counts, all sessions.
plot_scatter_data(
    data=tmp['Total'],
    xtitle='Date',
    ytitle='No. of sessions',
    title='Sessions including zero transactions',
    color='green',
)

In [None]:
# Number of revenue sessions per calendar date, in date order.
tmp_nonz = (
    df_nonz.groupby('Date')['totals.transactionRevenue']
    .agg(['size'])
    .sort_index()
)
tmp_nonz.columns = ["Total"]

In [None]:
# Daily session counts, revenue sessions only.
plot_scatter_data(
    data=tmp_nonz['Total'],
    xtitle='Date',
    ytitle='No. of sessions',
    title='Sessions with revenue only',
    color='red',
)

### Predicted Transactions

In [None]:
# Load the prediction table; its date, Label and Target columns are used below.
dataset_pred = pd.read_csv(filepath_or_buffer='models/dataset_pred.csv')

In [None]:
# Revenue summed per calendar date, in date order.
tmp0 = (
    dataset.groupby('Date')['totals.transactionRevenue']
    .agg(['sum'])
    .sort_index()
)
tmp0.columns = ["Total"]

In [None]:
# Daily revenue totals across all sessions. The y-axis label previously read
# 'Amountin $' (missing space).
plot_scatter_data(tmp0['Total'],'Date', 'Amount in $','Total Revenue for year 2016-2018','orange')

In [None]:
def plot_scatter_data_join(data, data2, xtitle, ytitle, title):
    """Plot two series as lines on one figure, each against its own y-axis.

    Args:
        data: Series drawn in maroon against the primary y-axis; its index
            supplies x and its values y. Its legend name is ``ytitle``.
        data2: Series drawn against the secondary y-axis; its legend name is
            always 'Actual Rev. in $'.
        xtitle: Title of the x-axis.
        ytitle: Title of the primary y-axis and legend name of ``data``.
        title: Figure title.

    The figure is rendered with ``iplot``; nothing is returned.
    """
    trace = go.Scatter(
        x=data.index,
        y=data.values,
        name=ytitle,
        marker=dict(color='maroon'),
        mode='lines',
    )
    trace2 = go.Scatter(
        x=data2.index,
        y=data2.values,
        name='Actual Rev. in $',
        marker=dict(color='rgb(94,163,192)'),
        mode='lines',
    )

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(trace)
    fig.add_trace(trace2, secondary_y=True)
    # The axis titles must be set on the figure itself: a separately built
    # layout dict is never attached to a make_subplots figure, which would
    # leave xtitle and the y-axis title undrawn.
    fig['layout'].update(
        height=400,
        width=1150,
        title=title,
        xaxis=dict(tickangle=0, title=xtitle),
        yaxis=dict(title=ytitle),
    )
    iplot(fig, filename='lines')


In [None]:
# Label summed per date, in date order.
tmp1 = (
    dataset_pred.groupby('date')['Label']
    .agg(['sum'])
    .sort_index()
)
tmp1.columns = ["Label"]

In [None]:
# Target summed per date, in date order.
tmp2 = (
    dataset_pred.groupby('date')['Target']
    .agg(['sum'])
    .sort_index()
)
tmp2.columns = ["Target"]

In [None]:
# Per-date Label totals against per-date Target totals.
plot_scatter_data_join(
    data=tmp1['Label'],
    data2=tmp2['Target'],
    xtitle='date',
    ytitle='Predicted Rev. in $',
    title='Result: Revenue generating sessions from May - Oct 2018',
)

In [None]:
# Apply expm1 to Label ("unlog"), then total it per date.
# NOTE: Label is overwritten in place, so running this cell a second time
# applies expm1 again.
dataset_pred['Label'] = np.expm1(dataset_pred['Label'])
tmp1 = (
    dataset_pred.groupby('date')['Label']
    .agg(['sum'])
    .sort_index()
)
tmp1.columns = ["Label"]

In [None]:
# Per-date Label totals against per-date revenue totals.
plot_scatter_data_join(
    data=tmp1['Label'],
    data2=tmp0['Total'],
    xtitle='date',
    ytitle='Predicted Rev. in $',
    title='Revenue generating sessions from Aug 2016 - Oct 2018',
)

### Visit number binned (uses visitNumber instead of total hits)

In [None]:
# Bucket visitNumber into eight 50-wide bins, (0, 50] up to (350, 400].
bins = list(range(0, 401, 50))
labels = ["0-50"] + [f"{low + 1}-{high}" for low, high in zip(bins[1:], bins[2:])]
dataset['binned'] = pd.cut(dataset['visitNumber'], bins=bins, labels=labels)

In [None]:
# Mean revenue per visit-number bin, largest first.
obj = (
    dataset.groupby('binned')["totals.transactionRevenue"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Bar chart of mean revenue per visit-number bin.
fig, ax = plt.subplots(figsize=(10, 6))
ax = sns.barplot(
    data=obj,
    x='binned',
    y='totals.transactionRevenue',
    ax=ax,
)
ax.set(xlabel="Visits grouped", ylabel="Revenue in $")

In [None]:
# Total revenue per visitor, then split visitors into zero-revenue (nrc) and
# revenue-generating (rc) groups and report their sizes.
gdf = (
    dataset.groupby("fullVisitorId")["totals.transactionRevenue"]
    .sum()
    .reset_index()
)
nrc = gdf.loc[gdf['totals.transactionRevenue'] == 0]
rc = gdf.loc[gdf['totals.transactionRevenue'] > 0]
print("The number of nonrevenue customers are ", len(nrc))
print("The number of revenue generating customers are ", len(rc))
print("the ratio of revenue generating customers are {0:0.4}%".format(len(rc)/len(gdf)*100))

In [None]:
# Share of visitors with and without revenue.
labels = ['Non revenue generating customers','revenue generating customers']
# Take the slice sizes from the nrc / rc frames rather than hard-coded counts,
# so the chart stays correct when the input data changes.
values = [len(nrc), len(rc)]
plt.axis("equal")
plt.pie(values, labels=labels, radius=1.5, autopct="%0.2f%%",shadow=True, explode=[0,0.8], colors=['lightskyblue','lightcoral'])
plt.show()

## Country Distribution

In [None]:
# Sessions per country: report how many distinct countries occur, then chart
# the 25 with the most sessions.
country_counts = dataset["geoNetwork.country"].value_counts()
# Count the countries BEFORE truncating; counting after head(25) can never
# report more than 25.
country_count = country_counts.shape[0]
print("Total No. Of Countries: ", country_count)
country_series = country_counts.head(25)

trace = go.Bar(
    x=country_series.index,
    y=country_series.values,
    marker=dict(
        color=country_series.values,
        showscale=True
    ),
)
layout = go.Layout(title="Countrywise Observation Count")
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="country")

- 50% of the observations are registered from the Americas.
- 360K observations are from the USA alone.
- Note that China is not there: Google is banned in China.
- India follows the USA. Is that because of its population?

## Plotly/Dash

In [None]:
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

### Revenue and non-revenue sessions by hour of the day

In [None]:
# Hour-of-day histograms for all sessions, revenue sessions and zero-revenue
# sessions. Only the first is visible initially; the dropdown switches between
# them.
trace = [
    go.Histogram(x=dataset['hour'],
                 opacity=0.7,
                 name="Total Sessions",
                 hoverinfo="y",
                 marker=dict(line=dict(width=1.6),
                             color='grey')
                 ),

    go.Histogram(x=df_nonz[df_nonz['totals.transactionRevenue'].notnull()]['hour'],
                 visible=False,
                 opacity=0.7,
                 name="Non-zero revenue Sessions",
                 hoverinfo="y",
                 marker=dict(line=dict(width=1.6),
                             color='green')
                 ),

    go.Histogram(x=df_z[df_z['totals.transactionRevenue'].notnull()]['hour'],
                 visible=False,
                 opacity=0.7,
                 name="Zero revenue Sessions",
                 hoverinfo="y",
                 marker=dict(line=dict(width=1.6),
                             color='orange')
                 )
]

layout = go.Layout(
    title='Sessioning hours',
    paper_bgcolor='rgb(240, 240, 240)',
    plot_bgcolor='rgb(240, 240, 240)',
    autosize=True,
    xaxis=dict(tickmode="linear", title="Hour of the Day for the year 2017-2018"),
    # The axis-level `titlefont` attribute is deprecated; the font belongs
    # under title.font.
    yaxis=dict(title=dict(text="No. of Sessions", font=dict(size=17))),
)

# One button per trace; each shows exactly one histogram.
updatemenus = [
    dict(
        buttons=[
            dict(
                args=[{'visible': [True, False, False]}],
                label="Total Sessions",
                method='update',
            ),
            dict(
                args=[{'visible': [False, True, False]}],
                label="Non-zero revenue Sessions",
                method='update',
            ),
            dict(
                args=[{'visible': [False, False, True]}],
                label="Zero revenue Sessions",
                method='update',
            ),
        ],
        direction="down",
        pad={'r': 10, "t": 10},
        x=0.1,
        y=1.25,
        yanchor='top',
    )
]
layout['updatemenus'] = updatemenus

fig = dict(data=trace, layout=layout)
# iplot draws as a side effect and returns None, so its result is not bound
# back to `fig`.
py.iplot(fig)

## 7. Sessions through Operating System MacOS tend to have more Revenue.

In [None]:

# Label every revenue session with the part of the day its hour falls into.
# The windows are half-open [start, stop) and together span hours 0-23; they
# are applied one after another, in the order listed.
df_nonz['day_frame'] = 0
for start, stop, label in (
    (0, 4, 'overnight'),
    (4, 8, 'dawn'),
    (8, 12, 'morning'),
    (12, 14, 'lunch'),
    (14, 18, 'afternoon'),
    (18, 21, 'evening'),
    (21, 24, 'night'),
):
    in_window = (df_nonz["hour"] >= start) & (df_nonz["hour"] < stop)
    df_nonz['day_frame'] = np.where(in_window, label, df_nonz['day_frame'])

In [None]:
# Revenue per operating system and part of the day, shown as a heatmap of the
# (up to) six operating systems with the highest morning revenue.
day_frames = ['morning', 'lunch', 'afternoon', 'evening', 'night', 'overnight', 'dawn']

fv = df_nonz.pivot_table(index="device.operatingSystem", columns="day_frame",
                         values="totals.transactionRevenue", aggfunc="sum")
# reindex instead of fv[day_frames]: a part of the day without any revenue
# becomes an empty (NaN) column rather than raising KeyError.
fv = fv.reindex(columns=day_frames)
fv = fv.sort_values(by='morning', ascending=False)[:6]

# Pass the whole value matrix; indexing rows 0..5 one by one raises IndexError
# when fewer than six operating systems have revenue.
trace = go.Heatmap(z=fv.values,
                   x=day_frames,
                   y=fv.index.values, colorscale='Purples', reversescale=False
                   )

data = [trace]
layout = go.Layout(
    title='Total Revenue by Device OS<br>(parts of the day)')

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

## Total sessions, non-zero revenue count and Revenue counts from operating systems 

In [None]:
# Slice colours for the first (total sessions) pie.
color = ['tomato',  'bisque','lightgreen', 'gold', 'tan', 'lightgrey', 'cyan']

def PieChart(column, title, limit):
    """Show three donut charts comparing the top values of ``column``.

    Reads the module-level ``dataset`` and ``df_nonz`` frames and renders, side
    by side: the number of sessions per value (all sessions), the number of
    non-zero-revenue sessions per value, and the summed revenue per value.

    Args:
        column: Column to group by.
        title: Figure title.
        limit: Number of largest groups kept in each chart.

    The figure is rendered with ``py.iplot``; nothing is returned.
    """
    revenue = "totals.transactionRevenue"
    count_trace = dataset.groupby(column)[revenue].size().nlargest(limit).reset_index()
    non_zero_trace = df_nonz.groupby(column)[revenue].count().nlargest(limit).reset_index()
    rev_trace = df_nonz.groupby(column)[revenue].sum().nlargest(limit).reset_index()

    trace1 = go.Pie(labels=count_trace[column],
                    values=count_trace[revenue],
                    name="Sessions",
                    hole=.5, textfont=dict(size=10),
                    domain={'x': [0, .32]},
                    marker=dict(colors=color))

    # This pie counts sessions, not revenue, so it gets its own name instead
    # of sharing "Revenue" with the third pie.
    trace2 = go.Pie(labels=non_zero_trace[column],
                    values=non_zero_trace[revenue],
                    name="Non-zero Sessions",
                    hole=.5, textfont=dict(size=10),
                    domain={'x': [.34, .66]})

    trace3 = go.Pie(labels=rev_trace[column],
                    values=rev_trace[revenue],
                    name="Revenue",
                    hole=.5, textfont=dict(size=10),
                    domain={'x': [.68, 1]})

    # One caption per donut, placed inside its hole.
    layout = dict(title=title, font=dict(size=15), legend=dict(orientation="h"),
                  annotations=[
                      dict(
                          x=.10, y=.5,
                          text='<b>Total <br>Sessions',
                          showarrow=False,
                          font=dict(size=12)
                      ),
                      dict(
                          x=.50, y=.5,
                          text='<b>Non-zero <br>Sessions<br>',
                          showarrow=False,
                          font=dict(size=12)
                      ),
                      dict(
                          x=.88, y=.5,
                          text='<b>Total<br>Revenue',
                          showarrow=False,
                          font=dict(size=12)
                      )
                  ])

    fig = dict(data=[trace1, trace2, trace3], layout=layout)
    py.iplot(fig)

In [None]:
# Compare the four largest operating systems.
PieChart(column="device.operatingSystem", title="Operating System", limit=4)