This notebook performs some basic data cleaning on the raw CSV files before uploading them to BigQuery.

In [1]:
import pandas as pd
import os
from google.cloud import bigquery
import plotly.express as px

In [2]:
# Expose the local service-account key file through the environment variable
# read by the Google client libraries.
credentials_path = './bq-service-account.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path

In [3]:
# BigQuery client bound to the assessment project.
project_name = 'eg-data-assessment'
client = bigquery.Client(project=project_name)

# Exercise 1

In [4]:
# Load the raw web-performance export; thousands=',' lets pandas read
# comma-grouped numbers as numeric values.
webpage_csv = './raw_files/Exercise_1_-_Webpage_Performance.csv'
df = pd.read_csv(webpage_csv, thousands=',')

In [5]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,date,variant,metric,value
0,1/1/2024,A,visits,2115
1,1/2/2024,A,visits,5406
2,1/3/2024,A,visits,2278
3,1/4/2024,A,visits,6725
4,1/5/2024,A,visits,8920


In [6]:
# Summary statistics for the numeric columns of the raw frame.
df.describe()

Unnamed: 0,value
count,84.0
mean,4660.928571
std,10821.613112
min,5.0
25%,51.5
50%,201.0
75%,2890.5
max,54944.0


In [7]:
# Inspect column names, non-null counts and dtypes before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     84 non-null     object
 1   variant  84 non-null     object
 2   metric   84 non-null     object
 3    value   84 non-null     int64 
dtypes: int64(1), object(3)
memory usage: 2.8+ KB


In [8]:
# Parse the textual date column into pandas datetime values.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [9]:
# Strip any unnecessary whitespace from the column headers
# (the raw file has a leading space in ' value').
df.columns = [label.strip() for label in df.columns]

In [10]:
# Reshape from long to wide so that downloads and visits are each a column,
# with one row per (date, variant); this makes additional analysis simpler.
# Duplicate (date, variant, metric) rows are summed. The aggregation is passed
# by name ('sum') rather than as the builtin `sum`, which newer pandas
# releases warn about.
df_pivot = df.pivot_table(
    index=['date', 'variant'],
    columns='metric',
    values='value',
    aggfunc='sum',
).reset_index()

In [None]:
# Upload the pivoted table, replacing any existing copy. The destination
# project comes from the BigQuery client created earlier, so the target does
# not depend on whichever default project the credentials resolve to.
# NOTE(review): DataFrame.to_gbq is deprecated in recent pandas in favour of
# pandas_gbq.to_gbq; consider migrating.
df_pivot.to_gbq(
    'raw_data.web_performance',
    project_id=client.project,
    if_exists='replace',
)

# Exercise 3

In [None]:
# Load the raw installs export; thousands=',' lets pandas read
# comma-grouped numbers as numeric values.
installs_csv = './raw_files/Exercise_3_-_Installs (1).csv'
df_installs = pd.read_csv(installs_csv, thousands=',')

In [None]:
# Preview the freshly loaded installs data. The cell previously displayed
# `df`, the Exercise 1 frame, which is unrelated to this section.
df_installs.head()

In [None]:
# Normalise the headers: trim surrounding whitespace, then lower-case.
df_installs.columns = df_installs.columns.str.strip().str.lower()

In [None]:
# Parse install_date into pandas datetime values.
install_dates = pd.to_datetime(df_installs['install_date'])
df_installs['install_date'] = install_dates

In [None]:
# Summary statistics for the cleaned installs frame.
df_installs.describe()

In [None]:
# Check column names, non-null counts and dtypes before uploading.
df_installs.info()

In [None]:
# Upload the installs table, replacing any existing copy. The destination
# project comes from the BigQuery client created earlier, so the target does
# not depend on whichever default project the credentials resolve to.
# NOTE(review): DataFrame.to_gbq is deprecated in recent pandas in favour of
# pandas_gbq.to_gbq; consider migrating.
df_installs.to_gbq(
    'raw_data.user_installs',
    project_id=client.project,
    if_exists='replace',
)

In [None]:
# Load the raw activity export.
activity_csv = './raw_files/Exercise_3_-_Activity (1).csv'
df_activity = pd.read_csv(activity_csv)

In [None]:
# Preview the first five rows of the raw activity data.
df_activity.head(n=5)

In [None]:
# Inspect column names, non-null counts and dtypes before cleaning.
df_activity.info()

In [None]:
# Normalise the headers: trim surrounding whitespace, then lower-case.
df_activity.columns = df_activity.columns.str.strip().str.lower()

In [None]:
# Parse play_date into pandas datetime values.
play_dates = pd.to_datetime(df_activity['play_date'])
df_activity['play_date'] = play_dates

In [None]:
# Preview the first five rows after the header and date clean-up.
df_activity.head(n=5)

In [None]:
# Summary statistics for the cleaned activity frame.
df_activity.describe()

In [None]:
# Upload the activity table, replacing any existing copy. The destination
# project comes from the BigQuery client created earlier, so the target does
# not depend on whichever default project the credentials resolve to.
# NOTE(review): DataFrame.to_gbq is deprecated in recent pandas in favour of
# pandas_gbq.to_gbq; consider migrating.
df_activity.to_gbq(
    'raw_data.user_activity',
    project_id=client.project,
    if_exists='replace',
)

In [None]:
# Display the activity frame as it was uploaded in the previous cell.
df_activity