# User groups

In [40]:
request = "POST https://analyticsreporting.googleapis.com/v4/reports:batchGet?fields=reports(columnHeader%2Cdata(rows%2Ctotals))&key={YOUR_API_KEY}"
request = {
 "reportRequests": [
  {
   "viewId": "123303369",
   "dateRanges": [
    {
     "startDate": "2017-01-01",
     "endDate": "2017-04-30"
    }
   ],
   "metrics": [
    {
     "expression": "ga:sessions"
    },
    {
     "expression": "ga:sessionDuration"
    },
    {
     "expression": "ga:goal1Completions"
    },
    {
     "expression": "ga:bounceRate"
    }
   ],
   "dimensions": [
    {
     "name": "ga:city"
    },
    {
     "name": "ga:userAgeBracket"
    }
   ]
  }
 ]
}

In [41]:
import json

with open('data/TMRW_user_groups.json') as file:
    input_ugroups = json.load(file)
    
#input_ugroups

# Define dimensions list
input_ugroups_dimensions = input_ugroups['reports'][0]['columnHeader']['dimensions']

dimension_count = len(input_ugroups_dimensions)

# Define metrics list
input_ugroups_metrics = input_ugroups['reports'][0]['columnHeader']['metricHeader']['metricHeaderEntries']

def create_metric_list(raw_data):
    lst = []
    for item in raw_data:
        lst.append(item['name'])
    return lst

input_ugroups_metrics = create_metric_list(input_ugroups_metrics)


# Create input data

input_ugroups_data = input_ugroups['reports'][0]['data']['rows']

input_ugroups_data

[{'dimensions': ['Croydon', '18-24'],
  'metrics': [{'values': ['101',
     '41.584158415841586',
     '4',
     '3.9603960396039604']}]},
 {'dimensions': ['Croydon', '25-34'],
  'metrics': [{'values': ['334',
     '47.90419161676647',
     '17',
     '5.089820359281437']}]},
 {'dimensions': ['Croydon', '35-44'],
  'metrics': [{'values': ['223',
     '43.04932735426009',
     '7',
     '3.1390134529147984']}]},
 {'dimensions': ['Croydon', '45-54'],
  'metrics': [{'values': ['90',
     '45.55555555555556',
     '2',
     '2.2222222222222223']}]},
 {'dimensions': ['Croydon', '55-64'],
  'metrics': [{'values': ['32', '53.125', '1', '3.125']}]},
 {'dimensions': ['London', '18-24'],
  'metrics': [{'values': ['167',
     '49.700598802395206',
     '8',
     '4.790419161676647']}]},
 {'dimensions': ['London', '25-34'],
  'metrics': [{'values': ['842',
     '59.14489311163895',
     '24',
     '2.8503562945368173']}]},
 {'dimensions': ['London', '35-44'],
  'metrics': [{'values': ['482',
     

In [42]:
values_list = []
for group in input_ugroups_data:
    new_dim_name = group['dimensions'][0] + ", " + group['dimensions'][1]
    group[new_dim_name] = group['metrics'][0]
    del group['dimensions']
    del group['metrics']
    
    #conv_rate = round(float(int(group[new_dim_name]['values'][2])/int(group[new_dim_name]['values'][0])*100),2)
    values_list.append(group[new_dim_name]['values'])

    #group[new_dim_name]['values'].append(conv_rate)
    
    
#values_list    
input_ugroups_data

[{'Croydon, 18-24': {'values': ['101',
    '41.584158415841586',
    '4',
    '3.9603960396039604']}},
 {'Croydon, 25-34': {'values': ['334',
    '47.90419161676647',
    '17',
    '5.089820359281437']}},
 {'Croydon, 35-44': {'values': ['223',
    '43.04932735426009',
    '7',
    '3.1390134529147984']}},
 {'Croydon, 45-54': {'values': ['90',
    '45.55555555555556',
    '2',
    '2.2222222222222223']}},
 {'Croydon, 55-64': {'values': ['32', '53.125', '1', '3.125']}},
 {'London, 18-24': {'values': ['167',
    '49.700598802395206',
    '8',
    '4.790419161676647']}},
 {'London, 25-34': {'values': ['842',
    '59.14489311163895',
    '24',
    '2.8503562945368173']}},
 {'London, 35-44': {'values': ['482',
    '54.77178423236515',
    '14',
    '2.904564315352697']}},
 {'London, 45-54': {'values': ['205',
    '55.60975609756098',
    '3',
    '1.4634146341463417']}},
 {'London, 55-64': {'values': ['37', '48.64864864864865', '0', '0.0']}}]

In [43]:
# Define each metric dict

ugroups_data = {}

for ugroup in input_ugroups_data:
    #print (ugroup)
    
    for gr in ugroup:
        ugroups_data[gr] = {'sessions':0,
                       'bounce_rate':0,
                       'conversions':0,
                       'conversion_rate':0}
        
        ugroups_data[gr]['sessions'] = round(float(ugroup[gr]['values'][0]),2)
        ugroups_data[gr]['conversions'] = round(float(ugroup[gr]['values'][1]),2)
        ugroups_data[gr]['bounce_rate'] = round(float(ugroup[gr]['values'][2]),2)
        ugroups_data[gr]['conversion_rate'] = round(float(ugroup[gr]['values'][3]),2)
        
#ugroups_data

In [44]:
rows = list(ugroups_data.keys())
rows

['Croydon, 18-24',
 'Croydon, 25-34',
 'Croydon, 35-44',
 'Croydon, 45-54',
 'Croydon, 55-64',
 'London, 18-24',
 'London, 25-34',
 'London, 35-44',
 'London, 45-54',
 'London, 55-64']

In [45]:
import collections
from collections import OrderedDict

columns = []
for u in ugroups_data:
    #print (test[r])
    for metric in ugroups_data[u]:
        columns.append(metric)
    
columns = list(OrderedDict.fromkeys(columns))    
columns

['sessions', 'bounce_rate', 'conversions', 'conversion_rate']

In [46]:
import pandas as pd

In [47]:
df = pd.DataFrame(values_list,
                  index = rows,
                  columns = columns)

df.to_json(orient='split')
table = pd.read_json(df.to_json(orient='split'), orient='split')
table

Unnamed: 0,sessions,bounce_rate,conversions,conversion_rate
"Croydon, 18-24",101,41.584158,4,3.960396
"Croydon, 25-34",334,47.904192,17,5.08982
"Croydon, 35-44",223,43.049327,7,3.139013
"Croydon, 45-54",90,45.555556,2,2.222222
"Croydon, 55-64",32,53.125,1,3.125
"London, 18-24",167,49.700599,8,4.790419
"London, 25-34",842,59.144893,24,2.850356
"London, 35-44",482,54.771784,14,2.904564
"London, 45-54",205,55.609756,3,1.463415
"London, 55-64",37,48.648649,0,0.0


In [58]:
samples1=[]
for i in range(0,len(table)):
    a = [table.sessions[i],round(float(table.bounce_rate[i]),2), table.conversions[i],round(float(table.conversion_rate[i]),2)]
    #print(a)
    samples1.append(a)
samples1
#return samples1
#print(samples1)


[[101, 41.58, 4, 3.96],
 [334, 47.9, 17, 5.09],
 [223, 43.05, 7, 3.14],
 [90, 45.56, 2, 2.22],
 [32, 53.12, 1, 3.12],
 [167, 49.7, 8, 4.79],
 [842, 59.14, 24, 2.85],
 [482, 54.77, 14, 2.9],
 [205, 55.61, 3, 1.46],
 [37, 48.65, 0, 0.0]]

In [62]:
summ=[]
for i in range(0,len(samples1)):
    a = round(float(sum(samples1[i])),2)
    summ.append(a) 
summ

[150.54, 403.99, 276.19, 139.78, 89.24, 229.49, 927.99, 553.67, 265.07, 85.65]

In [63]:
import random
import math

NUM_CLUSTERS = 2
TOTAL_DATA = len(samples1)
LOWEST_SAMPLE_POINT = summ.index(min(summ)) #element 9 of SAMPLES.
HIGHEST_SAMPLE_POINT = summ.index(max(summ)) #element 6 of SAMPLES.
BIG_NUMBER = math.pow(10, 10)


SAMPLES = samples1
data1 = []
centroids = []

class DataPoint:
    def __init__(self, x, y, z, f):
        self.x = x
        self.y = y
        self.z = z
        self.f = f
    
    def set_x(self, x):
        self.x = x
    
    def get_x(self):
        return self.x
    
    def set_y(self, y):
        self.y = y
    
    def get_y(self):
        return self.y
    
    def set_z(self, z):
        self.z = z
    
    def get_z(self):
        return self.z
    
    def set_f(self, f):
        self.f = f
    
    def get_f(self):
        return self.f
    
    def set_cluster(self, clusterNumber):
        self.clusterNumber = clusterNumber
    
    def get_cluster(self):
        return self.clusterNumber

class Centroid:
    def __init__(self, x, y, z, f):
        self.x = x
        self.y = y
        self.z = z
        self.f = f
    
    def set_x(self, x):
        self.x = x
    
    def get_x(self):
        return self.x
    
    def set_y(self, y):
        self.y = y
    
    def get_y(self):
        return self.y
    
    def set_z(self, z):
        self.z = z
    
    def get_z(self):
        return self.z
    
    def set_f(self, f):
        self.f = f
    
    def get_f(self):
        return self.f



In [64]:
def initialize_centroids():
    # Set the centoid coordinates to match the data points furthest from each other.
    # In this example, [31, 51.613, 1, 3.2260000000000004] and [758, 59.234999999999999, 22, 2.9019999999999997]
    centroids.append(Centroid(SAMPLES[LOWEST_SAMPLE_POINT][0], SAMPLES[LOWEST_SAMPLE_POINT][1],SAMPLES[LOWEST_SAMPLE_POINT][2],SAMPLES[LOWEST_SAMPLE_POINT][3]))
    centroids.append(Centroid(SAMPLES[HIGHEST_SAMPLE_POINT][0], SAMPLES[HIGHEST_SAMPLE_POINT][1],SAMPLES[HIGHEST_SAMPLE_POINT][2],SAMPLES[HIGHEST_SAMPLE_POINT][3]))
    
    print("Centroids initialized at:")
    print("(", centroids[0].get_x(), ", ", centroids[0].get_y(), ", ", centroids[0].get_z(),  ", ", centroids[0].get_f(),")")
    print("(", centroids[1].get_x(), ", ", centroids[1].get_y(), ", ", centroids[1].get_z(),  ", ", centroids[1].get_f(),")")
    print()
    return
print(initialize_centroids())


Centroids initialized at:
( 37 ,  48.65 ,  0 ,  0.0 )
( 842 ,  59.14 ,  24 ,  2.85 )

None


In [65]:
def initialize_datapoints():
    # DataPoint objects' x and y values are taken from the SAMPLE array.
    # The DataPoints associated with LOWEST_SAMPLE_POINT and HIGHEST_SAMPLE_POINT are initially
    # assigned to the clusters matching the LOWEST_SAMPLE_POINT and HIGHEST_SAMPLE_POINT centroids.
    for i in range(TOTAL_DATA):
        newPoint = DataPoint(SAMPLES[i][0], SAMPLES[i][1],SAMPLES[i][2],SAMPLES[i][3])
        
        if(i == LOWEST_SAMPLE_POINT):
            newPoint.set_cluster(0)
        elif(i == HIGHEST_SAMPLE_POINT):
            newPoint.set_cluster(1)
        else:
            newPoint.set_cluster(None)
            
        data1.append(newPoint)
    
    return



In [66]:
def get_distance(dataPointX, dataPointY,dataPointZ, dataPointF,centroidX, centroidY,centroidZ, centroidF):
    # Calculate Euclidean distance.
    return math.sqrt(math.pow((centroidY - dataPointY), 2) + math.pow((centroidX - dataPointX), 2) + math.pow((centroidZ - dataPointZ), 2) + math.pow((centroidF - dataPointF), 2))


In [67]:
def recalculate_centroids():
    totalX = 0
    totalY = 0
    totalZ = 0
    totalF = 0
    totalInCluster = 0
    
    for j in range(NUM_CLUSTERS):
        for k in range(len(data1)):
            if(data1[k].get_cluster() == j):
                totalX += data1[k].get_x()
                totalY += data1[k].get_y()
                totalZ += data1[k].get_z()
                totalF += data1[k].get_f()
                totalInCluster += 1
        
        if(totalInCluster > 0):
            centroids[j].set_x(totalX / totalInCluster)
            centroids[j].set_y(totalY / totalInCluster)
            centroids[j].set_z(totalZ / totalInCluster)
            centroids[j].set_f(totalF / totalInCluster)
    
    return

print(recalculate_centroids())

None


In [68]:
def update_clusters():
    isStillMoving = 0
    
    for i in range(TOTAL_DATA):
        bestMinimum = BIG_NUMBER
        currentCluster = 0
        
        for j in range(NUM_CLUSTERS):
            distance = get_distance(data1[i].get_x(), data1[i].get_y(), data1[i].get_z(), data1[i].get_f(), centroids[j].get_x(), centroids[j].get_y(),centroids[j].get_z(), centroids[j].get_f())
            if(distance < bestMinimum):
                bestMinimum = distance
                currentCluster = j
        
        data1[i].set_cluster(currentCluster)
        
        if(data1[i].get_cluster() is None or data1[i].get_cluster() != currentCluster):
            data1[i].set_cluster(currentCluster)
            isStillMoving = 1
    
    return isStillMoving

In [85]:
def perform_kmeans():
    isStillMoving = 1
    
    initialize_centroids()
    
    initialize_datapoints()
    
    while(isStillMoving):
        recalculate_centroids()
        isStillMoving = update_clusters()
    
    return

def print_results():
    for i in range(NUM_CLUSTERS):
        print("Cluster ", i, " includes:")
        for j in range(TOTAL_DATA):
            if(data1[j].get_cluster() == i):
                print("(", data1[j].get_x(), ", ", data1[j].get_y(),", ", data1[j].get_z(), ", ", data1[j].get_f(), ")")
        #print(data1[j].get_x())
        print()
    
    return

perform_kmeans()
print_results()
#print(data1[j].get_x())


Centroids initialized at:
( 64.0909090909 ,  48.50090909090908 ,  1.13636363636 ,  0.8495454545454546 )
( 392.45 ,  52.89825000000003 ,  11.0 ,  1.807 )

Cluster  0  includes:
( 101 ,  41.58 ,  4 ,  3.96 )
( 223 ,  43.05 ,  7 ,  3.14 )
( 90 ,  45.56 ,  2 ,  2.22 )
( 32 ,  53.12 ,  1 ,  3.12 )
( 167 ,  49.7 ,  8 ,  4.79 )
( 205 ,  55.61 ,  3 ,  1.46 )
( 37 ,  48.65 ,  0 ,  0.0 )

Cluster  1  includes:
( 334 ,  47.9 ,  17 ,  5.09 )
( 842 ,  59.14 ,  24 ,  2.85 )
( 482 ,  54.77 ,  14 ,  2.9 )

