In [None]:
import pandas as pd
from typing import Dict
from datetime import datetime
import pandas as pd
import datetime
from datetime import date, timedelta
import re
import numpy as np
import matplotlib.pyplot as plt
from google.oauth2 import service_account
from googleapiclient.discovery import build, Resource
from IPython.display import display, Markdown
from scipy.stats import linregress
import warnings

In [None]:
# Lift pandas' display limits so frames are rendered without truncating
# columns, rows, or cell contents.
for _display_option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(_display_option, None)

## Variablen anpassen

Welche Domain soll analysiert werden?

In [None]:
############################################################################################
# Variables

# Domain to analyse.
# DE (urlaubsguru.de), NLD (holidayguru.nl), ES (holidayguru.es), AT (urlaubsguru.at), CH (holidayguru.ch)
domain_name = "urlaubsguru.de"

# Country filter.
# Austria (aut), Switzerland (che), Netherlands (nld), Spain (esp), Germany (deu)
COUNTRY_FILTER = ["deu"]

# Brand term: search queries containing it are filtered out (notContains).
brand = "guru"

# How many days of data should be pulled?
delta_days = 30

############################################################################################

In [None]:
# Dimensions requested from the API; the "keys" list of every response row is
# later unpacked into columns with these names, in this order.
DIMENSIONS_BYURL = ["page", "query", "date", "country"]
# Value passed as `siteUrl` to the Search Analytics API.
DOMAIN = f"sc-domain:{domain_name}"
credential_filepath = ".secrets/creds.json"

# Reporting window: from `delta_days` days ago up to today.
end_date = date.today()
start_date = end_date - timedelta(days=delta_days)


## Data Pipeline GSC

In diesem Bereich werden die Daten der letzten `delta_days` Tage (Standard: 30) aus der Google Search Console gezogen.

In [None]:
############################################################################################
#Functions
#Für Timeframe Comparison:

def determine_timerange(row, comparison_start_date):
    """Label a result row as belonging to the current or the previous window.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) whose "date" entry is
            comparable to the module-level ``start_date``.
        comparison_start_date: Accepted for call compatibility but not used.
            The boundary is always the module-level ``start_date``; rows
            dated before it (the comparison window ends the day before
            ``start_date``) are therefore labelled "prior".

    Returns:
        "after" if the row's date is on or after ``start_date``, else "prior".
    """
    # `>=` rather than `>`: the main request sends start_date as its
    # startDate, so rows dated exactly start_date belong to the current
    # ("after") window, not to the comparison window.
    return "after" if row["date"] >= start_date else "prior"

## Authentication
def auth_using_key_file(key_filepath):
    """Build an API client authorised with a service-account key file.

    Args:
        key_filepath: Path to the service-account key file.

    Returns:
        The client returned by ``build`` for ``API_SERVICE_NAME`` /
        ``API_VERSION``, using credentials restricted to ``SCOPE``.
    """
    return build(
        API_SERVICE_NAME,
        API_VERSION,
        credentials=service_account.Credentials.from_service_account_file(
            key_filepath, scopes=SCOPE
        ),
    )

def query(client: Resource, payload: Dict[str, object]) -> Dict[str, object]:
    """Run a single Search Analytics query against ``DOMAIN``.

    Args:
        client: API client as returned by ``auth_using_key_file``.
        payload: Request body (dates, dimensions, filters, paging). Its
            values are strings, lists and ints, hence ``object``.

    Returns:
        The decoded API response; callers read its optional "rows" entry.
    """
    return client.searchanalytics().query(siteUrl=DOMAIN, body=payload).execute()

# Plot helpers (fed with per-URL data from the analysis loop)
def _plot_nonzero_lines(title, x_label, y_label, data_frame, invert_y=False):
    """Draw one line per column of ``data_frame`` and show the figure.

    Rows in which a column equals 0 are left out of that column's line.

    Args:
        title: Figure title.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        data_frame: Frame whose index supplies the x values and whose columns
            supply one line each.
        invert_y: If True, flip the y axis so that smaller values are on top.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    for column in data_frame.columns:
        # Skip rows where this column is 0 (callers fill missing pivot cells
        # with 0, so a 0 is not drawn as a data point).
        nonzero = data_frame[data_frame[column] != 0]
        ax.plot(nonzero.index, nonzero[column], marker='o', label=column)

    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)

    plt.xticks(rotation=45)
    if invert_y:
        ax.invert_yaxis()
    plt.tight_layout()
    plt.show()
    # Release the figure; this function is called once per URL in a loop.
    plt.close(fig)


def plot_data(title, x_label, y_label, data_frame, url):
    """Plot every column of ``data_frame`` as a line, skipping 0 values.

    Args:
        title: Figure title.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        data_frame: Frame indexed by the x values, one column per line.
        url: Not used; kept so existing callers keep working.
    """
    _plot_nonzero_lines(title, x_label, y_label, data_frame)


def plot_data_inverted_y_axis(title, x_label, y_label, data_frame, url):
    """Same as ``plot_data`` but with the y axis inverted.

    Args:
        title: Figure title.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        data_frame: Frame indexed by the x values, one column per line.
        url: Not used; kept so existing callers keep working.
    """
    _plot_nonzero_lines(title, x_label, y_label, data_frame, invert_y=True)



# Plot the data columns together with a linear trend line
def add_trend_line(ax, data_frame, x_label, y_label, title):
    """Plot each column of ``data_frame`` on ``ax`` plus a linear trend line.

    The x axis uses row positions (0, 1, 2, ...) rather than the frame's
    index labels. The trend line is a degree-1 least-squares fit through the
    row-wise sum of all columns.

    Args:
        ax: Matplotlib axes to draw on.
        data_frame: Frame with one row per x position and one column per line.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        title: Axes title.
    """
    positions = np.arange(len(data_frame))

    for column in data_frame.columns:
        ax.plot(positions, data_frame[column], marker='o', label=column)

    # A straight-line fit needs at least two points: np.polyfit raises on an
    # empty input and is rank-deficient for a single point, so the trend line
    # is simply omitted in those cases.
    if len(data_frame) >= 2:
        row_totals = data_frame.sum(axis=1)  # 1-D series: sum across columns
        trend_line = np.poly1d(np.polyfit(positions, row_totals, 1))
        ax.plot(positions, trend_line(positions), linestyle="--", label="Trend Line")

    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)

    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Metric difference calculation
def calculate_difference(df, url, query):
    """Compare clicks and average position of one query on one page.

    Args:
        df: Frame with "page", "query", "timerange", "clicks" and "position"
            columns, where "timerange" holds "prior" or "after".
        url: Page to select.
        query: Search query to select.

    Returns:
        Tuple ``(clicks_difference, avg_position_difference, prior_clicks,
        after_clicks, prior_avg_position, after_avg_position)``; differences
        are "after" minus "prior". Clicks are summed, positions averaged.
    """
    matching_rows = df[(df["page"] == url) & (df["query"] == query)]
    prior_rows = matching_rows[matching_rows["timerange"] == "prior"]
    after_rows = matching_rows[matching_rows["timerange"] == "after"]

    prior_clicks = prior_rows["clicks"].sum()
    after_clicks = after_rows["clicks"].sum()
    prior_avg_position = prior_rows["position"].mean()
    after_avg_position = after_rows["position"].mean()

    return (
        after_clicks - prior_clicks,
        after_avg_position - prior_avg_position,
        prior_clicks,
        after_clicks,
        prior_avg_position,
        after_avg_position,
    )


def plot_dual_bar_chart(data_df, x_label, y1_label, y2_label, y1_color='skyblue', y2_color='salmon', rotation=45):
    """Show two columns of ``data_df`` as side-by-side bars on twin y axes.

    Args:
        data_df: Frame whose index supplies the x tick labels.
        x_label: Label of the x axis.
        y1_label: Column drawn on the left axis; values are annotated with
            two decimals.
        y2_label: Column drawn on the right axis; values are annotated as
            integers.
        y1_color: Colour of the left-axis bars and axis labelling.
        y2_color: Colour of the right-axis bars and axis labelling.
        rotation: Rotation of the x tick labels in degrees.
    """
    def annotate_bars(axis, bars, to_text):
        # Write each bar's value just above the bar, centred horizontally.
        for bar in bars:
            bar_height = bar.get_height()
            axis.annotate(
                to_text(bar_height),
                xy=(bar.get_x() + bar.get_width() / 2, bar_height),
                xytext=(0, 3),  # 3 points vertical offset
                textcoords="offset points",
                ha='center', va='bottom',
            )

    x = np.arange(len(data_df.index))
    bar_width = 0.35

    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Left axis: first metric, shifted half a bar to the left.
    left_bars = ax1.bar(x - bar_width/2, data_df[y1_label], width=bar_width, color=y1_color, label=y1_label)
    ax1.set_xlabel(x_label)
    ax1.set_ylabel(y1_label, color=y1_color)
    ax1.tick_params(axis='y', labelcolor=y1_color)
    annotate_bars(ax1, left_bars, lambda value: f'{value:.2f}')

    # Right axis: second metric, shifted half a bar to the right.
    ax2 = ax1.twinx()
    right_bars = ax2.bar(x + bar_width/2, data_df[y2_label], width=bar_width, color=y2_color, label=y2_label)
    ax2.set_ylabel(y2_label, color=y2_color)
    ax2.tick_params(axis='y', labelcolor=y2_color)
    annotate_bars(ax2, right_bars, lambda value: f'{int(value)}')

    ax1.set_xticks(x)
    ax1.set_xticklabels(data_df.index, rotation=rotation)

    plt.title(f'{y1_label} and {y2_label} by {x_label}')
    plt.tight_layout()
    plt.show()




In [None]:
############################################################################################
# GSC API
API_SERVICE_NAME = "webmasters"
API_VERSION = "v3"
SCOPE = ["https://www.googleapis.com/auth/webmasters.readonly"]
MAX_ROWS = 25_000  # rowLimit per request; also the startRow step when paging

KEY_FILE = credential_filepath
service = auth_using_key_file(key_filepath=KEY_FILE)

# by URL

# The filters do not change between result pages, so they are built once.
# NOTE(review): COUNTRY_FILTER is a list but is sent as a single filter
# expression - confirm the API accepts this for the configured value.
site_filters = [
    {
        "dimension": "country",
        "operator": "contains",
        "expression": COUNTRY_FILTER
    },
    {
        # Exclude brand queries.
        "dimension": "query",
        "operator": "notContains",
        "expression": brand
    },
    {
        # Exclude URLs that contain a "#".
        "dimension": "page",
        "operator": "notContains",
        "expression": "#"
    },
]

i = 0  # index of the result page currently being requested
response_by_url = []
while True:
    payload_main_range = {
        "startDate": start_date.strftime("%Y-%m-%d"),
        "endDate": end_date.strftime("%Y-%m-%d"),
        "dimensions": DIMENSIONS_BYURL,
        "dimensionFilterGroups": [{"filters": site_filters}],
        "rowLimit": MAX_ROWS,
        "startRow": i * MAX_ROWS
    }

    # make request to API
    response_main_range = query(service, payload_main_range)

    # A response without rows marks the end of the result set.
    page_rows = response_main_range.get("rows")
    if not page_rows:
        break

    response_by_url.extend(page_rows)
    i += 1
    print(f"Collected {len(response_by_url):,} rows.")

# Without any rows there is no "keys" column to unpack; fail with a message
# that names the request instead of a bare KeyError.
if not response_by_url:
    raise ValueError(
        f"Search Console returned no rows for {DOMAIN} between {start_date} and {end_date}; "
        "check the property, the credentials and the filters."
    )

# Create a DataFrame from the collected rows and unpack each row's "keys"
# list into one column per requested dimension.
by_url_data = pd.DataFrame(response_by_url)
by_url_data[DIMENSIONS_BYURL] = pd.DataFrame(by_url_data["keys"].tolist(), index=by_url_data.index)
df = by_url_data.drop(columns="keys")

## Data Manipulation

In [None]:
### Adjust data types and derive calendar columns
df["position"] = df["position"].round(3)
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["Month"] = df["date"].dt.month
df["Day"] = df["date"].dt.day
df["Year"] = df["date"].dt.year
df["Day of Year"] = df["date"].dt.dayofyear
df["Year Month"] = df["date"].dt.strftime("%Y-%m")

### Pivot: one row per page, one column per day (labelled by day of year)
table = pd.pivot_table(df, values="clicks", index=["page"], columns=["Day of Year"], aggfunc="sum")

# pivot_table sorts the day-of-year columns numerically, which would put
# January before December when the window spans a year change. Bring the
# columns into calendar order so the regression below sees the true sequence.
chronological_days = list(dict.fromkeys(
    df[["date", "Day of Year"]].drop_duplicates().sort_values("date")["Day of Year"]
))
table = table.reindex(columns=chronological_days)
table = table.fillna(0)
table = table.astype(int)
table = table.reset_index()

# Day columns are the ones whose label is a day-of-year number.
day_columns = [col for col in table.columns if str(col).isdigit() and 1 <= int(col) <= 400]
table["Sum of Clicks"] = table.loc[:, day_columns].sum(axis=1)

### Linear click trend per page
# x is the position of the day in the window, y the clicks on that day, so
# the slope is "clicks per day". All day columns are used.
x_values = list(range(1, len(day_columns) + 1))

rows_list = []
for daily_clicks in table[day_columns].itertuples(index=False, name=None):
    if len(set(daily_clicks)) > 1:
        rows_list.append(linregress(x_values, daily_clicks).slope)
    else:
        # Constant series (or a single day): no trend to report.
        rows_list.append(None)
    


In [None]:
# Per-page slopes, scaled by 10 and rounded to three decimals.
slope_df = pd.DataFrame(rows_list, columns=["Slope"])
slope_df["Slope"] = slope_df["Slope"].mul(10).round(3)

# Attach the slopes to the pivot table; both frames are aligned on the index.
df_w_slope = table.merge(slope_df, left_index=True, right_index=True)

# Despite its name, this list holds the per-day columns that are removed
# below, leaving only the page, its click total and its slope.
columns_to_keep = [
    col for col in df_w_slope.columns
    if str(col).isdigit() and 1 <= int(col) <= 400
]
df_w_slope = df_w_slope.drop(columns=columns_to_keep)

# Binning: edges are the 0th, 10th, ..., 100th percentile of the click totals.
num_bins = 10
percentiles = [step * 100 / num_bins for step in range(num_bins + 1)]
clicks_per_page = df_w_slope["Sum of Clicks"]
bin_edges = [clicks_per_page.quantile(p / 100) for p in percentiles]

# Assign each page the integer number of its bin; identical edges are merged.
df_w_slope["Clicks Binned"] = pd.cut(
    clicks_per_page,
    bins=bin_edges,
    labels=False,
    duplicates="drop",
    include_lowest=True,
)


## Bins Visualization

In [None]:
############################################################################################
############################################################################################
### VISUALIZE BINS

# Per bin: mean slope and total clicks, separately for falling and rising pages.
slope_bin_aggregation = {'Slope': 'mean', 'Sum of Clicks': 'sum'}

df_w_slope_positive = df_w_slope.loc[df_w_slope["Slope"] > 0]
df_w_slope_negative = df_w_slope.loc[df_w_slope["Slope"] < 0]
pivot_df_negative = df_w_slope_negative.pivot_table(
    index='Clicks Binned', values=['Slope', 'Sum of Clicks'], aggfunc=slope_bin_aggregation
)
pivot_df_positive = df_w_slope_positive.pivot_table(
    index='Clicks Binned', values=['Slope', 'Sum of Clicks'], aggfunc=slope_bin_aggregation
)

# Plotting: one heading and one dual bar chart per slope direction.
for slope_direction, bin_pivot in (("Negative", pivot_df_negative), ("Positive", pivot_df_positive)):
    display(Markdown(f"## {slope_direction} Slope for last {delta_days} Days\n\n"))
    plot_dual_bar_chart(bin_pivot, 'Bins', 'Slope', 'Sum of Clicks')



In [None]:
############################################################################################
# CHOOSE BINS TO ANALYZE
# Slope direction to look at: either "positive" or "negative".
positive_or_negative = "negative"
# Value of the "Clicks Binned" column whose pages are analysed.
bins_category_wanted = 9
# Maximum number of pages taken from that bin.
top_n_results_wanted = 100
############################################################################################

In [None]:
# A typo in the setting would otherwise silently select the positive branch.
if positive_or_negative not in ("positive", "negative"):
    raise ValueError(
        f"positive_or_negative must be 'positive' or 'negative', got {positive_or_negative!r}"
    )

# Pages of the chosen bin, ordered by click volume first and then by how
# strongly they fall (negative) or rise (positive).
in_wanted_bin = df_w_slope["Clicks Binned"] == bins_category_wanted
if positive_or_negative == "negative":
    df_w_slope_negative = df_w_slope[(df_w_slope["Slope"] < 0) & in_wanted_bin]
    top_n_negative_slope = df_w_slope_negative.sort_values(by=["Sum of Clicks", "Slope"], ascending=[False, True]).head(top_n_results_wanted)
    slope_list = top_n_negative_slope["page"].tolist()
else:
    df_w_slope_positive = df_w_slope[(df_w_slope["Slope"] > 0) & in_wanted_bin]
    top_n_positive_slope = df_w_slope_positive.sort_values(by=["Sum of Clicks", "Slope"], ascending=[False, False]).head(top_n_results_wanted)
    slope_list = top_n_positive_slope["page"].tolist()

# Current window and the comparison window directly before it.
today = date.today()
end_date = today
start_date = end_date - timedelta(days=delta_days)
end_date_comp = start_date - timedelta(days=1)
# Same length as the current window (was hard-coded to 30 days, which only
# matched while delta_days == 30).
start_date_comp = end_date_comp - timedelta(days=delta_days)

## Data Pipeline GSC

Hier werden nun für jede URL im gewählten Bin aktuelle Daten sowie Vergleichsdaten des vorherigen Zeitraums gezogen. Dies dient dazu, etwas "Kontext" zu erhalten.

In [None]:
############################################################################################
# GSC API: per-URL data for the current window and the comparison window
API_SERVICE_NAME = "webmasters"
API_VERSION = "v3"
SCOPE = ["https://www.googleapis.com/auth/webmasters.readonly"]
MAX_ROWS = 25_000  # rowLimit per request; also the startRow step when paging

KEY_FILE = credential_filepath
service = auth_using_key_file(key_filepath=KEY_FILE)


def query(client: Resource, payload: Dict[str, object]) -> Dict[str, object]:
    """Run a single Search Analytics query against ``DOMAIN``.

    Args:
        client: API client as returned by ``auth_using_key_file``.
        payload: Request body (dates, dimensions, filters, paging).

    Returns:
        The decoded API response; callers read its optional "rows" entry.
    """
    response = client.searchanalytics().query(siteUrl=DOMAIN, body=payload).execute()
    return response


def fetch_rows_for_page(url, range_start, range_end):
    """Collect all result rows for one URL and one date range.

    Requests result pages of ``MAX_ROWS`` rows until a response contains no
    rows.

    Args:
        url: Page URL used as the "page" filter expression.
        range_start: First day of the range (sent as startDate).
        range_end: Last day of the range (sent as endDate).

    Returns:
        List of raw response rows; empty if the API returned none.
    """
    collected_rows = []
    page_index = 0
    while True:
        payload = {
            "startDate": range_start.strftime("%Y-%m-%d"),
            "endDate": range_end.strftime("%Y-%m-%d"),
            "dimensions": DIMENSIONS_BYURL,
            "dimensionFilterGroups": [{
                "filters": [{
                    # No operator is given, so the API's default match applies.
                    "dimension": "page",
                    "expression": url
                },
                    {
                    "dimension": "country",
                    "operator": "contains",
                    "expression": COUNTRY_FILTER
                    }]
            }],
            "rowLimit": MAX_ROWS,
            "startRow": page_index * MAX_ROWS
        }
        page_rows = query(service, payload).get("rows")
        if not page_rows:
            return collected_rows
        collected_rows.extend(page_rows)
        page_index += 1


def rows_to_labelled_frame(rows, window_start):
    """Turn raw response rows into a frame with dimension columns and a label.

    Args:
        rows: Raw rows as returned by ``fetch_rows_for_page``.
        window_start: Start date of the requested range; forwarded to
            ``determine_timerange``.

    Returns:
        A DataFrame with one column per entry of ``DIMENSIONS_BYURL``, "date"
        converted to ``datetime.date`` and a "timerange" column, or ``None``
        when ``rows`` is empty (there is no "keys" column to unpack then).
    """
    frame = pd.DataFrame(rows)
    if "keys" not in frame.columns:
        return None
    frame[DIMENSIONS_BYURL] = pd.DataFrame(frame["keys"].tolist(), index=frame.index)
    frame = frame.drop(columns="keys")
    frame["date"] = pd.to_datetime(frame["date"]).dt.date
    frame["timerange"] = frame.apply(lambda row: determine_timerange(row, window_start), axis=1)
    return frame


# by URL
# start date calculation
start_date = end_date - timedelta(days=delta_days)

# Frames collected from the API, one per URL and window.
data_frames = []

for c, url in enumerate(slope_list, start=1):
    # Report progress once per URL, whether or not it returns rows.
    print(f"Collecting data for page {c}/{len(slope_list)}...")

    # Current window first, then the comparison window.
    for window_start, window_end in ((start_date, end_date), (start_date_comp, end_date_comp)):
        window_frame = rows_to_labelled_frame(
            fetch_rows_for_page(url, window_start, window_end), window_start
        )
        if window_frame is None:
            print(f"Skipping processing for URL {url} as 'keys' column is not present.")
        else:
            data_frames.append(window_frame)

# pd.concat cannot combine an empty list; say why nothing was collected.
if not data_frames:
    raise ValueError(
        "No data was collected: slope_list is empty or none of its URLs returned rows. "
        "Check bins_category_wanted and positive_or_negative."
    )

# Concatenate all DataFrames in the list into a single DataFrame
all_data = pd.concat(data_frames, ignore_index=True)
df = all_data

## Data Manipulation

In [None]:
# Manipulation
df = df[~df["query"].str.contains(brand)]  # drop brand queries
# Drop rows in which every cell compares equal to False. The element-wise
# `== False` is intentional here; `not`/`is` would not work on a DataFrame.
# `.copy()` makes the result independent so the column assignments below do
# not write into a slice of the previous frame.
df = df[~(df == False).all(axis=1)].copy()

# Convert "date" column to datetime format
df["date"] = pd.to_datetime(df["date"])
# Create a new column "time_group" based on the year and week
df['time_group'] = df['date'].dt.strftime('%Y-%U')
# Assign a time label
time_label = "Year-Week"

QUERY_COMPARISON_COLUMNS = [
    "Query", "Clicks Prior", "Clicks After", "Clicks Difference",
    "Position Prior", "Position After", "Avg Position Difference",
]


def build_query_comparison(page_df, url, top_queries):
    """Build the prior/after comparison table for a set of queries.

    Args:
        page_df: Rows of a single page, with "timerange" labels.
        url: The page the rows belong to.
        top_queries: Iterable of search queries to compare.

    Returns:
        DataFrame with the columns in ``QUERY_COMPARISON_COLUMNS``, one row
        per query and position values rounded to two decimals. The columns
        are present even when ``top_queries`` is empty.
    """
    records = []
    # Named `search_query` so the module-level `query()` function is not
    # overwritten by a loop variable.
    for search_query in top_queries:
        (clicks_diff, avg_position_diff, prior_clicks, after_clicks,
         prior_avg_position, after_avg_position) = calculate_difference(page_df, url, search_query)
        records.append({
            "Query": search_query,
            "Clicks Prior": prior_clicks,
            "Clicks After": after_clicks,
            "Clicks Difference": clicks_diff,
            "Position Prior": prior_avg_position,
            "Position After": after_avg_position,
            "Avg Position Difference": avg_position_diff,
        })
    # Passing the columns explicitly keeps them available for an empty table.
    comparison = pd.DataFrame(records, columns=QUERY_COMPARISON_COLUMNS)
    for position_column in ("Position Prior", "Position After", "Avg Position Difference"):
        comparison[position_column] = comparison[position_column].apply(
            lambda value: value if pd.isna(value) else round(value, 2)
        )
    return comparison


for url in slope_list:
    filtered_df = df[df["page"] == url]
    title_overall = f"## Performance Overview für {url}\n\n"
    display(Markdown(title_overall))

    # Nothing to pivot or plot for a URL without rows.
    if filtered_df.empty:
        display(Markdown("Für diese URL liegen keine Daten vor.\n\n"))
        continue

    # Clicks per day, one column per time range
    filtered_df_pivot_clicks = pd.pivot_table(
        filtered_df,
        values="clicks",
        index=["date"],
        columns=["timerange"],
        aggfunc="sum").fillna(0).astype(int)

    # Clicks per day for the current ("after") window only
    filtered_df_only_after_timerange = filtered_df[filtered_df["timerange"] == "after"]
    has_after_data = not filtered_df_only_after_timerange.empty
    if has_after_data:
        filtered_df_pivot_clicks_only_after_timerange = pd.pivot_table(
            filtered_df_only_after_timerange,
            values="clicks",
            index=["date"],
            columns=["timerange"],
            aggfunc="sum").fillna(0).astype(int)

    # Queries with the most clicks in the previous window
    top_queries_prior = filtered_df[filtered_df["timerange"] == "prior"].groupby("query")["clicks"].sum().nlargest(10)
    difference_df = build_query_comparison(filtered_df, url, top_queries_prior.index)
    title_markdown = "### Suchbegriffe, welche im Monat zuvor die meisten Klicks brachten, haben sich wie folgt verändert.\n\n"
    display(Markdown(title_markdown))
    display(difference_df)

    # Queries with the most clicks in the current window. This table used to
    # be computed without being shown; it is displayed here.
    top_queries_after = filtered_df[filtered_df["timerange"] == "after"].groupby("query")["clicks"].sum().nlargest(10)
    difference_df_after = build_query_comparison(filtered_df, url, top_queries_after.index)
    title_markdown_after = "### Suchbegriffe, welche im aktuellen Zeitraum die meisten Klicks brachten, im Vergleich zum Monat zuvor.\n\n"
    display(Markdown(title_markdown_after))
    display(difference_df_after)

    # Average position per day of the query with the most clicks
    top_kw = filtered_df.groupby(by="query")["clicks"].sum().sort_values(ascending=False).head(1).index.tolist()
    top_kw_as_string = " ".join(top_kw)
    filtered_df_for_query = filtered_df[filtered_df["query"].isin(top_kw)]
    filtered_df_pivot_position = pd.pivot_table(
        filtered_df_for_query,
        values="position",
        index=["date"],
        columns=["timerange"],
        aggfunc="mean").fillna(0).astype(float)

    # Plotting
    plot_click_title_month_over_month = "### Visualization of clicks over time vs. previous month"
    display(Markdown(plot_click_title_month_over_month))
    plot_data(
        title=f"Clicks for \"{url}\"",
        x_label="Date",
        y_label="Sum of Clicks",
        data_frame=filtered_df_pivot_clicks,
        url=url
    )

    plot_click_title = "### Visualization of click trend"
    display(Markdown(plot_click_title))
    if has_after_data:
        fig, ax = plt.subplots(figsize=(10, 6))
        add_trend_line(ax, filtered_df_pivot_clicks_only_after_timerange, "Date", "Sum of Clicks", f"Clicks inkl. Trend for \"{url}\"")
        # Release the figure; one is created per URL.
        plt.close(fig)
    else:
        # A trend cannot be fitted without rows from the current window.
        display(Markdown("Keine Klickdaten im aktuellen Zeitraum vorhanden.\n\n"))

    plot_avgposition_title = "### Visualization of average ranking over time of top keyword by clicks"
    display(Markdown(plot_avgposition_title))
    plot_data_inverted_y_axis(
        title=f"Average Ranking for \"{top_kw_as_string}\" | URL: \"{url}\"",
        x_label="Date",
        y_label="Average Ranking",
        data_frame=filtered_df_pivot_position,
        url=url
    )

    
