In [1]:
import ast
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from dotenv import load_dotenv
from google.cloud import bigquery
from plotly.offline import init_notebook_mode, iplot
from scipy import stats
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

init_notebook_mode(connected=True)
%matplotlib inline

In [2]:
# Load the variables declared in ".env2" into the process environment.
load_dotenv(".env2")

# Read every setting in one pass; a variable that is not set comes back as None.
host, port, database, user, password, project_id = (
    os.getenv(key)
    for key in ('DB_HOST', 'DB_PORT', 'DB_DATABASE', 'DB_USER', 'DB_PASSWORD', 'PROJECT_ID')
)

In [3]:
# Build the connection URL from its parts instead of interpolating an f-string:
# URL.create escapes special characters in the credentials (e.g. '@' or '/' in a
# password) and lets the configured DB_PORT take effect (it was read but unused).
conn_url = URL.create(
    "postgresql",
    username=user,
    password=password,
    host=host,
    port=int(port) if port else None,  # env values are strings; unset -> driver default
    database=database,
)

# Kept as a plain string for any code that expects `conn_string`.
# NOTE: this string contains the password - do not print or display it.
conn_string = conn_url.render_as_string(hide_password=False)
postgres_engine = create_engine(conn_url)

In [95]:
# Flatten each session's nested hits payload into its own frame, then stack
# all of them into one frame with a fresh 0..n-1 index.
dfs = []

for i, session_hits in df_original['hits'].items():
    df_hits = pd.json_normalize(session_hits)
    dfs.append(df_hits)

combined_df = pd.concat(dfs, ignore_index=True)

In [10]:
# Nested session-level columns to flatten; each one is written to its own
# Postgres table named after the column.
json_columns = ['totals', 'trafficSource', 'device', 'geoNetwork']

for column in json_columns:
    # Test for the column explicitly instead of wrapping the whole body in
    # `except KeyError`: that handler also swallowed unrelated KeyErrors (for
    # example a missing 'visitId') and reported them as this column being absent.
    if column not in df.columns:
        print(f"Error: Column '{column}' not found in the DataFrame.")
        continue

    unnested_df = pd.json_normalize(df[column])
    # Keep visitId beside the flattened fields so each table can be joined back
    # to its session.
    unnested_df = pd.concat([df['visitId'], unnested_df], axis=1)
    # Drop fields that are empty for every session.
    unnested_df = unnested_df.dropna(axis=1, how="all")
    # if_exists='replace' drops and recreates the table on every run.
    unnested_df.to_sql(column, postgres_engine, if_exists='replace', index=False)
    print(f"Completed unnesting the {column}")


Completed exporting the totals
Completed exporting the trafficSource
Completed exporting the device
Completed exporting the geoNetwork


In [5]:
# Project used for the BigQuery client; project_id comes from the PROJECT_ID
# environment variable read in the configuration cell.
bigquery_id = project_id

# Initialize the BigQuery client
client = bigquery.Client(project=bigquery_id)

# Select every column and every row of the public GA sample table for
# 2016-08-01 (there is no LIMIT or column list, so this is the full table).
query = """
select *
from `bigquery-public-data.google_analytics_sample.ga_sessions_20160801`
"""

# Submit the query and wait for it to finish
query_job = client.query(query)
results = query_job.result()

# Convert the results to a pandas DataFrame
df_original = results.to_dataframe()

In [6]:
# Display the frame returned by the BigQuery query (pandas truncates the rendering).
df_original

Unnamed: 0,visitorId,visitNumber,visitId,visitStartTime,date,totals,trafficSource,device,geoNetwork,customDimensions,hits,fullVisitorId,userId,channelGrouping,socialEngagementType
0,,1,1470046245,1470046245,20160801,"{'visits': 1, 'hits': 24, 'pageviews': 17, 'ti...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Firefox', 'browserVersion': 'not ...","{'continent': 'Europe', 'subContinent': 'Weste...","[{'index': 4, 'value': 'EMEA'}]","[{'hitNumber': 1, 'time': 0, 'hour': 3, 'minut...",895954260133011192,,Organic Search,Not Socially Engaged
1,,1,1470084717,1470084717,20160801,"{'visits': 1, 'hits': 24, 'pageviews': 18, 'ti...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Internet Explorer', 'browserVersi...","{'continent': 'Americas', 'subContinent': 'Nor...","[{'index': 4, 'value': 'North America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 13, 'minu...",0288478011259077136,,Direct,Not Socially Engaged
2,,3,1470078988,1470078988,20160801,"{'visits': 1, 'hits': 27, 'pageviews': 17, 'ti...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Safari', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...","[{'index': 4, 'value': 'North America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 12, 'minu...",6440789996634275026,,Organic Search,Not Socially Engaged
3,,4,1470075581,1470075581,20160801,"{'visits': 1, 'hits': 27, 'pageviews': 19, 'ti...","{'referralPath': '/', 'campaign': '(not set)',...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...","[{'index': 4, 'value': 'North America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 11, 'minu...",8520115029387302083,,Referral,Not Socially Engaged
4,,30,1470099026,1470099026,20160801,"{'visits': 1, 'hits': 27, 'pageviews': 17, 'ti...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...","[{'index': 4, 'value': 'North America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 17, 'minu...",6792260745822342947,,Organic Search,Not Socially Engaged
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1706,,2,1470041604,1470041604,20160801,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Oceania', 'subContinent': 'Aust...","[{'index': 4, 'value': 'APAC'}]","[{'hitNumber': 1, 'time': 0, 'hour': 1, 'minut...",5882258963347216049,,Organic Search,Not Socially Engaged
1707,,4,1470066323,1470066323,20160801,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': '/', 'campaign': '(not set)',...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...","[{'index': 4, 'value': 'North America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 8, 'minut...",6932408648900988870,,Referral,Not Socially Engaged
1708,,2,1470092775,1470092775,20160801,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...","[{'index': 4, 'value': 'North America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 16, 'minu...",1492217702561732960,,Direct,Not Socially Engaged
1709,,2,1470046044,1470046044,20160801,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Safari', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Cen...","[{'index': 4, 'value': 'Central America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 3, 'minut...",1276631666870427430,,Organic Search,Not Socially Engaged


In [10]:
# Flatten each session's hits and tag every hit row with that session's visitId.
#
# The previous version concatenated the *entire* visitId column against each
# session's hit rows (aligned on unrelated index labels) and overwrote the
# result on every iteration, so nothing usable survived the loop.
hit_frames = []
for i, session_hits in df_original['hits'].items():
    # assign() broadcasts the scalar visitId to every hit row of this session.
    hit_frames.append(
        pd.json_normalize(session_hits).assign(visitId=df_original.at[i, 'visitId'])
    )

# One row per hit across all sessions, with a fresh 0..n-1 index.
df_hits = pd.concat(hit_frames, ignore_index=True)
    

In [23]:
# One flattened hits frame per session, each carrying its session's visitId
normalized_data = []

for i, session_hits in df_original['hits'].items():
    # assign() appends visitId as a new trailing column holding the scalar on every row
    df_hits = pd.json_normalize(session_hits).assign(visitId=df_original.at[i, 'visitId'])
    normalized_data.append(df_hits)

# Stack the per-session frames; each keeps its own 0-based row labels
result_df = pd.concat(normalized_data)

result_df

Unnamed: 0,hitNumber,time,hour,minute,isSecure,isInteraction,isEntrance,isExit,referer,contentInfo,...,latencyTracking.serverConnectionTime,latencyTracking.serverResponseTime,latencyTracking.domLatencyMetricsSample,latencyTracking.domInteractiveTime,latencyTracking.domContentLoadedTime,latencyTracking.userTimingValue,latencyTracking.userTimingSample,latencyTracking.userTimingVariable,latencyTracking.userTimingCategory,latencyTracking.userTimingLabel
0,1,0,3,10,,True,True,,https://www.google.de,,...,,,,,,,,,,
1,2,40860,3,11,,True,,,,,...,,,,,,,,,,
2,3,51159,3,11,,True,,,,,...,,,,,,,,,,
3,4,76627,3,12,,True,,,,,...,,,,,,,,,,
4,5,112096,3,12,,True,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1,0,1,53,,True,True,True,,,...,,,,,,,,,,
0,1,0,8,45,,True,True,True,,,...,,,,,,,,,,
0,1,0,16,6,,True,True,True,,,...,,,,,,,,,,
0,1,0,3,7,,True,True,True,,,...,,,,,,,,,,


In [33]:
# Same flattening as before, but with visitId placed as the leading column
normalized_data = []

for i, session_hits in df_original['hits'].items():
    df_hits = pd.json_normalize(session_hits)
    # loc=0 puts the session identifier ahead of every hit field
    df_hits.insert(loc=0, column='visitId', value=df_original.at[i, 'visitId'])
    normalized_data.append(df_hits)

# Stack the per-session frames; each keeps its own 0-based row labels
result_df = pd.concat(normalized_data)
result_df

Unnamed: 0,visitId,hitNumber,time,hour,minute,isSecure,isInteraction,isEntrance,isExit,referer,...,latencyTracking.serverConnectionTime,latencyTracking.serverResponseTime,latencyTracking.domLatencyMetricsSample,latencyTracking.domInteractiveTime,latencyTracking.domContentLoadedTime,latencyTracking.userTimingValue,latencyTracking.userTimingSample,latencyTracking.userTimingVariable,latencyTracking.userTimingCategory,latencyTracking.userTimingLabel
0,1470046245,1,0,3,10,,True,True,,https://www.google.de,...,,,,,,,,,,
1,1470046245,2,40860,3,11,,True,,,,...,,,,,,,,,,
2,1470046245,3,51159,3,11,,True,,,,...,,,,,,,,,,
3,1470046245,4,76627,3,12,,True,,,,...,,,,,,,,,,
4,1470046245,5,112096,3,12,,True,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1470041604,1,0,1,53,,True,True,True,,...,,,,,,,,,,
0,1470066323,1,0,8,45,,True,True,True,,...,,,,,,,,,,
0,1470092775,1,0,16,6,,True,True,True,,...,,,,,,,,,,
0,1470046044,1,0,3,7,,True,True,True,,...,,,,,,,,,,


In [32]:
# Print index range, column count, dtype breakdown and memory use of the flattened hits frame.
result_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41064 entries, 0 to 23
Columns: 101 entries, visitId to eventInfo.eventValue
dtypes: bool(2), float64(22), int64(6), object(71)
memory usage: 31.4+ MB


In [24]:
# NOTE(review): `i` is not defined in this cell - it is whatever value the most
# recently executed `for i in ...` loop left in the kernel, so the result
# depends on execution order.
df_original['visitId'][i]

1470075236

In [9]:
# Columns stored as JSON text in the CSV; parsed while loading, then flattened.
json_columns = ['device', 'geoNetwork', 'totals', 'trafficSource']

# NOTE(review): absolute, machine-specific path - consider moving it to a
# configurable constant so the notebook runs on other machines.
# fullVisitorId is read as text so the long numeric identifiers are kept verbatim.
df = pd.read_csv('D:/data/google analytics/google_store_dataset.csv',
                 converters={column: json.loads for column in json_columns},
                 dtype={'fullVisitorId': 'str'})

# Flatten each JSON column on its own, then rebuild `df` once at the end rather
# than dropping and re-concatenating the whole frame on every pass.
unnested_columns = []
unnested_frames = []
for column in json_columns:
    # Explicit membership test instead of a broad `except KeyError`, which
    # could hide unrelated lookup failures behind a "not found" message.
    if column not in df.columns:
        print(f"Error: Column '{column}' not found in the DataFrame.")
        continue
    unnested_columns.append(column)
    unnested_frames.append(pd.json_normalize(df[column]))
    print(f"Completed unnesting {column}")

# Remaining original columns first, then the flattened fields in json_columns order.
df = pd.concat([df.drop(columns=unnested_columns), *unnested_frames], axis=1)

Completed to unnest device
Completed to unnest geoNetwork
Completed to unnest totals
Completed to unnest trafficSource


In [65]:
# Display the session frame after the JSON columns were flattened (pandas truncates the rendering).
df

Unnamed: 0,channelGrouping,customDimensions,date,fullVisitorId,hits,socialEngagementType,visitId,visitNumber,visitStartTime,browser,...,medium,keyword,adContent,isTrueDirect,adwordsClickInfo.criteriaParameters,adwordsClickInfo.page,adwordsClickInfo.slot,adwordsClickInfo.gclId,adwordsClickInfo.adNetworkType,adwordsClickInfo.isVideoAd
0,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20180511,7460955084541987166,"[{'hitNumber': '1', 'time': '0', 'hour': '21',...",Not Socially Engaged,1526099341,2,1526099341,Chrome,...,organic,(not provided),(not set),True,not available in demo dataset,,,,,
1,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,460252456180441002,"[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,1526064483,166,1526064483,Chrome,...,(none),(not set),(not set),True,not available in demo dataset,,,,,
2,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,3461808543879602873,"[{'hitNumber': '1', 'time': '0', 'hour': '12',...",Not Socially Engaged,1526067157,2,1526067157,Chrome,...,organic,(not provided),(not set),True,not available in demo dataset,,,,,
3,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,975129477712150630,"[{'hitNumber': '1', 'time': '0', 'hour': '23',...",Not Socially Engaged,1526107551,4,1526107551,Chrome,...,(none),(not set),(not set),True,not available in demo dataset,,,,,
4,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,8381672768065729990,"[{'hitNumber': '1', 'time': '0', 'hour': '10',...",Not Socially Engaged,1526060254,1,1526060254,Internet Explorer,...,organic,(not provided),(not set),,not available in demo dataset,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401584,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20180907,6701149525099562370,"[{'hitNumber': '1', 'time': '0', 'hour': '13',...",Not Socially Engaged,1536353803,1,1536353803,Chrome,...,organic,(not provided),(not set),,not available in demo dataset,,,,,
401585,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180907,6154541330147351453,"[{'hitNumber': '1', 'time': '0', 'hour': '23',...",Not Socially Engaged,1536388075,1,1536388075,Chrome,...,organic,(not provided),(not set),,not available in demo dataset,,,,,
401586,Organic Search,[],20180907,6013469762773705448,"[{'hitNumber': '1', 'time': '0', 'hour': '13',...",Not Socially Engaged,1536351791,3,1536351791,Firefox,...,organic,(not provided),(not set),True,not available in demo dataset,,,,,
401587,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180907,4565378823441900999,"[{'hitNumber': '1', 'time': '0', 'hour': '10',...",Not Socially Engaged,1536340217,1,1536340217,Chrome,...,organic,(not provided),(not set),,not available in demo dataset,,,,,


In [37]:
def df_summary(df):
    """Build a one-row-per-column overview of ``df``.

    Prints the frame's shape, then returns a DataFrame with these columns:

    - ``Name``: column name
    - ``dtypes``: column dtype
    - ``Missing``: number of null values
    - ``Uniques``: number of distinct non-null values
    - ``First Value`` / ``Second Value`` / ``Third Value``: the values found in
      the first three rows by position (NaN when the frame has fewer rows)
    - ``Entropy``: Shannon entropy in bits of the column's value distribution,
      rounded to two decimals (nulls are ignored by ``value_counts``)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Its index may hold any labels, including duplicates.

    Returns
    -------
    pandas.DataFrame
        The summary table, indexed 0..n_columns-1.
    """
    print(f"Dataset Shape: {df.shape}")

    summary = pd.DataFrame({
        'Name': df.columns,
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })

    # Sample rows by position, not by label: `df.loc[0]` raises when label 0 is
    # absent and returns several rows when the index has duplicate labels.
    for position, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        if position < len(df):
            summary[label] = df.iloc[position].values
        else:
            summary[label] = float('nan')

    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
        for name in summary['Name']
    ]

    return summary

# Show the per-column overview of the session frame.
df_summary(df)

In [None]:
# Flatten every hit of every session into one row tagged with its visitId.
#
# The previous loop threw away the normalised hits and instead re-concatenated
# the whole 'visitId' and 'hits' columns once per row, so it produced nothing
# new while doing a full-frame copy on every iteration.
hit_records = []
for visit_id, raw_hits in zip(df['visitId'], df['hits']):
    # 'hits' is not among the columns given a converter when the CSV is read,
    # so it arrives as text. The frame preview shows single-quoted keys
    # (Python-literal form, which json.loads rejects), hence ast.literal_eval.
    # NOTE(review): confirm against the raw CSV.
    session_hits = ast.literal_eval(raw_hits) if isinstance(raw_hits, str) else raw_hits
    hit_records.extend({'visitId': visit_id, **hit} for hit in session_hits)

# A single normalisation pass over all records avoids building and
# concatenating one small frame per session.
df_hits = pd.json_normalize(hit_records)
    

In [68]:
from flatten_json import flatten

def flatten_data(y):
    """Flatten nested dicts and lists into a single-level dict.

    Keys of the result are the path to each leaf, joined with ``_``: dict keys
    contribute their string form and list elements contribute their position.

    >>> flatten_data({'a': {'b': [10, 20]}})
    {'a_b_0': 10, 'a_b_1': 20}

    Parameters
    ----------
    y : Any
        Usually a dict or list (dict subclasses and non-string keys are
        accepted). A bare scalar is returned under the empty-string key.

    Returns
    -------
    dict
        Mapping of path string to leaf value. Empty dicts and empty lists
        contribute no entries.
    """
    out = {}

    # Named `_walk` so it does not shadow the `flatten` imported from flatten_json.
    def _walk(node, prefix=''):
        if isinstance(node, dict):
            for key, value in node.items():
                # f-string formatting also copes with non-string keys (e.g. ints).
                _walk(value, f"{prefix}{key}_")
        elif isinstance(node, list):
            for position, item in enumerate(node):
                _walk(item, f"{prefix}{position}_")
        else:
            # Strip the trailing separator added by the parent level.
            out[prefix[:-1]] = node

    _walk(y)
    return out