**PLEASE MAKE A COPY BEFORE CHANGING**

**Copyright** 2021 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


<b>Important</b>
This content is intended for educational and informational purposes only.

## Instructions

##### 1. Make a copy of this [Google Sheet](https://docs.google.com/spreadsheets/d/1B8jxst5t4cwYdfoycE28Jg-wYyCGEK4MfmLYDJNzH18/edit)
##### 2. Add your parameters and click "Get Google Analytics Data"
##### 3. Run this colab.

## Import Libs and configure Plotly

In [None]:
import IPython
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import math
import json
import numpy as np
import pandas as pd
from scipy import spatial
from scipy.spatial import distance
from sklearn.cluster import KMeans
from google.colab import drive
from google.colab import auth
from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MinMaxScaler
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
from IPython.display import display

# Switch Plotly's offline module into notebook mode so figures render in cell output.
py.init_notebook_mode(connected=False)
# IPython magic (not plain Python): draw matplotlib figures inline in the notebook.
%matplotlib inline
# NOTE(review): same call as two lines above; looks redundant - confirm before removing.
py.init_notebook_mode(connected=False)


## Mount Drive and read the Analytics report JSON

In [None]:
# Mount Google Drive at /gdrive so files under "My Drive" can be opened by path.
drive.mount('/gdrive')
# Read the whole report file as text; `data` holds the raw JSON string.
with open('/gdrive/My Drive/datapill_cmi_report.json', 'r') as f:
  data = f.read()
# Decode the JSON text; later cells read report['reports'][0].
report = json.loads(data)

## Define Plot Function

In [None]:
def _axis_marker(position, label, color):
  """Build the large labelled marker that sits at one axis end point.

  Args:
    position: (x, y, z) coordinates of the marker.
    label: text shown next to the marker.
    color: colour string used for both the marker fill and its outline.

  Returns:
    A `go.Scatter3d` trace containing the single marker.
  """
  x, y, z = position
  return go.Scatter3d(
      x=[x],
      y=[y],
      z=[z],
      text=label,
      mode='text+markers',
      marker=dict(
          size=120,
          line=dict(
              color=color,
              width=3
          ),
          color=color,
          opacity=.5,
      )
  )


def plot3d(df, item_name_col, value_name_cols):
  """Render items as points in a 3D scatter whose axes are three value columns.

  Each row of `df` becomes one marker positioned by the first three columns
  named in `value_name_cols`; the same three values, multiplied by 255, give
  the marker's red/green/blue colour. Three large markers at (1,0,0), (0,1,0)
  and (0,0,1) label the axes with the column names.

  Args:
    df: DataFrame holding the item names and the value columns.
      # assumes the value columns are already scaled to 0-1 - TODO confirm
    item_name_col: name of the column used as hover text for each point.
    value_name_cols: sequence of value column names. Only the first three are
      plotted. The sequence itself is not modified.

  Side effects:
    When exactly two value columns are given, a constant `no_audience` column
    (all zeros) is added to `df` in place and used as the third axis.
    The figure is displayed via `py.iplot`; nothing is returned.
  """
  # Work on a private copy so the caller's list is never appended to.
  cols = list(value_name_cols)

  #add additional column if only 2 audiences presented
  if len(cols) == 2:
    df['no_audience'] = 0
    cols.append('no_audience')

  x_col, y_col, z_col = cols[0], cols[1], cols[2]

  py.init_notebook_mode(connected=False)

  def point_color(row):
    # Position doubles as colour: each coordinate drives one RGB channel.
    return 'rgba({},{},{},1)'.format(
        int(row[x_col] * 255), int(row[y_col] * 255), int(row[z_col] * 255))

  trace_points = go.Scatter3d(
      x=df[x_col],
      y=df[y_col],
      z=df[z_col],
      text=df[item_name_col],
      mode='markers',
      marker=dict(
          size=12,
          line=dict(
              color='rgb(0, 0, 0, 1)',
              width=0.5
          ),
          color=df.apply(point_color, axis=1),
          opacity=1
      )
  )

  # NOTE(review): the four-component 'rgb(...)' strings are kept verbatim from
  # the original; confirm whether 'rgba(...)' was intended.
  axis_traces = [
      _axis_marker((1, 0, 0), x_col, 'rgb(255, 0, 0, 0.5)'),
      _axis_marker((0, 1, 0), y_col, 'rgb(0, 255, 0, 0.5)'),
      _axis_marker((0, 0, 1), z_col, 'rgb(0, 0, 255, 0.5)'),
  ]

  # The traces are plotted directly, i.e. with Plotly's default layout.
  py.iplot([trace_points] + axis_traces)

def configure_plotly_browser_state():
  """Inject the require.js bootstrap that Plotly needs into the cell output.

  Displays an HTML snippet that loads require.js from
  /static/components/requirejs and configures the `base` and `plotly`
  module paths. Returns nothing.
  """
  import IPython
  requirejs_bootstrap = '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-1.5.1.min.js?noext',
            },
          });
        </script>
        '''
  html_widget = IPython.core.display.HTML(requirejs_bootstrap)
  display(html_widget)

## Define TF-IDF Function

In [None]:
def scalarToSigmod(scalar):
  """Pass a scalar through a logistic curve centred on 0.5.

  The input (expected in the 0-1 range) is shifted by 0.5 and stretched by a
  factor of 8 before the logistic function is applied, so 0.5 maps to 0.5.
  """
  stretched = (scalar - .5) * 8
  denominator = 1 + math.exp(-stretched)
  return 1 / denominator

def scalarToTanh(scalar):
  """Pass a scalar through a tanh curve rescaled to the 0-1 range.

  The input is shifted by 0.5 and stretched by a factor of 6, then the tanh
  output (-1..1) is moved to 0..1, so 0.5 maps to 0.5.
  """
  stretched = 6 * (scalar - .5)
  return (1 + math.tanh(stretched)) / 2

def calc_tfidf(df, label_col_name, transformation='tanh'):
  """Score every feature column against every label with TF-IDF.

  The label column is split off, the remaining columns are run through an
  L1-normalised `TfidfTransformer`, and the result is transposed so that each
  original feature column becomes a row and each label value becomes a column.
  Every label column is then min-max scaled to 0-1 and optionally squashed.

  Args:
    df: DataFrame with one label column and, otherwise, count columns.
      # assumes the non-label columns are numeric (or numeric-like) - TODO confirm
    label_col_name: name of the column in `df` that holds the labels.
    transformation: 'tanh' applies `scalarToTanh`, 'sig' applies
      `scalarToSigmod`; any other value leaves the scaled scores untouched.

  Returns:
    A new DataFrame whose first column, "COMPARED_USERLIST_FULL_NAME", holds
    the original feature column names, followed by one score column per
    label. `df` itself is not modified.
  """
  labels = list(df[label_col_name])
  # drop() already returns a new frame, so no explicit copy of df is needed.
  counts = df.drop([label_col_name], axis=1)

  transformer = TfidfTransformer(smooth_idf=True, norm='l1')
  tfidf = transformer.fit_transform(counts)

  #create pd with results
  results = pd.DataFrame.from_records(tfidf.toarray(), columns=list(counts.columns.values))
  #transpose
  results_transposed = results.T.reset_index()
  results_transposed.columns = ["COMPARED_USERLIST_FULL_NAME"] + labels

  #scale to 0-1
  scaler = MinMaxScaler()
  results_transposed[labels] = scaler.fit_transform(results_transposed[labels])

  # Pick the squashing function once; unknown names mean "no transformation".
  squash = {'sig': scalarToSigmod, 'tanh': scalarToTanh}.get(transformation)
  if squash is not None:
    for col in labels:
      # Column-wise map gives the same values as a row-wise apply, without
      # materialising every row for every column.
      results_transposed[col] = results_transposed[col].map(squash)
  return results_transposed

## Define GA API reporting functions

In [None]:
def process_report(report):
  """Turn one Analytics report dict into a per-segment table.

  Reads the dimension and metric names from `report['columnHeader']`, builds
  one record per entry of `report['data']['rows']` (dimensions followed by the
  first metrics entry's values), and pivots the records so that each
  `ga:segment` value becomes a row and each value of the first dimension
  becomes a column holding the summed metric.

  Metric values are converted to numbers before pivoting. The Reporting API
  delivers them as strings, and summing strings would concatenate them
  ("1" + "2" -> "12") instead of adding them.

  Args:
    report: a single report dict with `columnHeader` and `data.rows`; the
      dimensions must include `ga:segment`.

  Returns:
    DataFrame whose first column is `ga:segment`, followed by one column per
    distinct value of the first dimension; missing combinations are 0.

  Raises:
    KeyError: if `report` has no `data`/`rows` entry.
    ValueError: if a metric value cannot be parsed as a number.
  """
  columnHeader = report.get('columnHeader', {})
  dimension_headers = columnHeader.get('dimensions', [])
  metric_entries = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
  metric_headers = [entry['name'] for entry in metric_entries]

  records = [
      row['dimensions'] + row['metrics'][0]['values']
      for row in report['data']['rows']
  ]
  df = pd.DataFrame(records, columns=dimension_headers + metric_headers)

  if metric_headers:
    # Coerce "123"-style strings to real numbers so 'sum' adds them.
    df[metric_headers] = df[metric_headers].apply(pd.to_numeric)

  pivot = pd.pivot_table(df,
                         index=[df.columns[0]],
                         columns=['ga:segment'],
                         aggfunc='sum').T
  flat = pd.DataFrame(pivot.fillna(0).to_records())
  # The first record field is the metric-name index level; drop it.
  return flat[flat.columns[1:]]

In [None]:
# Flatten the first report of the loaded JSON into a table.
df = process_report(report['reports'][0])
# Score the table with calc_tfidf, using 'ga:segment' as the label column
# and the function's default transformation.
cmi_df = calc_tfidf(df, 'ga:segment')
# Preview the first rows of the scored table.
cmi_df.head()

In [None]:
# Emit the require.js/Plotly script tags into this cell's output before plotting.
configure_plotly_browser_state()
# Every column except the item-name column is used as a plot axis.
y = list(cmi_df.drop(['COMPARED_USERLIST_FULL_NAME'],axis=1).columns)
# plot3d receives a fresh copy of the column list.
plot3d(cmi_df,'COMPARED_USERLIST_FULL_NAME',list(y))

In [None]:
# Segment columns are everything after the item-name column. The helper
# columns this cell adds ('vector', 'distance_*') are skipped so the cell can
# be re-run without treating its own output as segments.
segments = [
    name for name in cmi_df.columns[1:]
    if name != 'vector' and not str(name).startswith('distance_')
]
# One unit vector per segment (the "pure" point of that segment); built from
# the segment count instead of being hard-coded for exactly three segments.
vecs = np.eye(len(segments)).tolist()
# Each item's scores across all segments, stored as one list per row.
cmi_df['vector'] = cmi_df[segments].values.tolist()

distance_cols = []
for i, segment in enumerate(segments):
  col = 'distance_{}'.format(segment)
  # Euclidean distance from every item to this segment's unit vector.
  cmi_df[col] = [distance.euclidean(vector, vecs[i]) for vector in cmi_df['vector']]
  distance_cols.append(col)

# Show, per segment, the items ordered from closest to farthest.
for col in distance_cols:
  display(cmi_df[['COMPARED_USERLIST_FULL_NAME', col]].sort_values(by=col, ascending=True))