Capeet Giglist
===

## Load latest

Fetches the current gig list and keeps only the entries whose text contains ", Wien".

In [17]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import os
pd.set_option("display.max_rows", 500)


# Substring a giglist line must contain to be kept.
FILTER = ", Wien"
# Snapshot written by a previous run. "~" is expanded here because
# os.path.isfile() and os.path.getmtime() do not expand it themselves.
FILEPATH = os.path.expanduser("~/Dropbox/capeet_prev.csv")
URL = "http://www.capeet.com/gigs_list.html"
NOW = datetime.now()

# Fail fast on a hung connection or an HTTP error instead of parsing an error page.
response = requests.get(URL, timeout=30)
response.raise_for_status()
html_string = response.text

soup = BeautifulSoup(html_string, "html.parser")

# Take the segment that follows the first "\n\n\n\nKW" marker, then keep the
# non-empty lines that contain FILTER.
text_content = soup.get_text().split("""\n\n\n\nKW""")[1]
text_content = [i for i in text_content.splitlines() if i != "" and FILTER in i]

df = pd.DataFrame({"Text": text_content})
# Split each line once on ".: " into a date part and the remaining text; lines
# without that separator end up with a missing second column and are dropped.
df = df.Text.str.split(r"\.:\s", n=1, regex=True, expand=True)
df = df.dropna(axis=0, how="any")
df.columns = ["Date", "Text"]
df["Date"] = df["Date"].str.replace("\t", "", regex=False)
# The date part is "<day>.<month>"; the year is not part of the listing.
df[["Day", "Month"]] = df["Date"].str.split(".", n=1, expand=True)
df["Day"] = df["Day"].astype(int)
df["Month"] = df["Month"].astype(int)
df = df.drop("Date", axis=1)
# Everything before the first "@" is the event, everything after it the location.
df[["Event", "Location"]] = df["Text"].str.split("@", n=1, expand=True)
# Strip a trailing "[fb]" / "[web]" tag and the ", Wien" suffix from the location.
df["Location"] = df["Location"].str.replace(r"\[fb\]$", "", regex=True)
df["Location"] = df["Location"].str.replace(r"\[web\]$", "", regex=True)
df["Location"] = df["Location"].str.replace(", Wien", "", regex=False)
df = df.drop("Text", axis=1)

# The capeet giglist carries no year, so it is reconstructed here.
# Both values start at the current date and are advanced by add_year()
# as it is called for each month in listing order.
previous_month = NOW.month
year_to_add = NOW.year


def add_year(month):
    """Return the year to attach to ``month``.

    Stateful: every call compares ``month`` with the month seen on the previous
    call (initially the current month). When the new month is smaller, the
    running year is incremented by one before it is returned.
    """
    global year_to_add, previous_month

    rolled_over = month < previous_month
    previous_month = month
    if rolled_over:
        year_to_add = year_to_add + 1
    return year_to_add


# add_year() is stateful, so it is applied to the months in listing order.
df["Year"] = df["Month"].apply(add_year)
# Assemble the date column-wise instead of building one Timestamp per row;
# this also yields a proper (empty) datetime column when there are no rows.
df["Date"] = pd.to_datetime(df[["Year", "Month", "Day"]].rename(columns=str.lower))
# Selecting the three output columns also discards the Day/Month/Year helpers.
df = df.reindex(columns=["Date", "Event", "Location"])

## Load previous

In [18]:
# os.path.isfile() and os.path.getmtime() do not expand "~", so resolve the
# path up front (a no-op when FILEPATH is already absolute).
prev_path = os.path.expanduser(FILEPATH)

if os.path.isfile(prev_path):
    df_prev = pd.read_csv(prev_path, parse_dates=["Date"])
    # Modification time of the snapshot file, i.e. when it was last written.
    creation_time = datetime.fromtimestamp(os.path.getmtime(prev_path))
    print(
        "Previous giglist from", creation_time.strftime("%Y-%m-%d %H:%M"), "available"
    )
else:
    # No snapshot yet: compare the current giglist with itself.
    print("No previous giglist available")
    creation_time = NOW
    df_prev = df

Previous giglist from 2023-11-09 23:13 available


## Show diff

In [19]:
# Outer-join the current and the previous giglist on all three columns; the
# indicator column "_merge" records which side(s) each row was found on.
df_diff = df.merge(
    df_prev, how="outer", on=["Date", "Event", "Location"], indicator=True
)

window_start = creation_time.strftime("%Y-%m-%d %H:%M")
window_end = NOW.strftime("%Y-%m-%d %H:%M")
print("Changes between", window_start, "and", window_end, "\n")

# Rows only in the current list are "new", rows only in the previous list are
# "removed"; rows present in both get an empty label and are filtered out.
merge_labels = {"left_only": "new", "right_only": "removed", "both": ""}
df_diff["_merge"] = df_diff["_merge"].apply(lambda side: merge_labels[side])
changed = df_diff["_merge"] != ""
df_diff = (
    df_diff[changed]
    .sort_values(["Date", "Event", "Location"])
    .reset_index(drop=True)
)

df_diff

Changes between 2023-11-09 23:13 and 2023-11-10 10:53 



Unnamed: 0,Date,Event,Location,_merge
0,2023-11-09,BLACKWATER HOLYLIGHT (usa) / IRON JINN (ned),Arena (kl. Halle),removed
1,2023-11-09,CULT OF YOUTH (usa),Viper Room,removed
2,2023-11-09,DISTORTED PONY (usa) / ANNE PEDERSDOTTER,rhiz,removed
3,2023-11-09,GOV'T MULE (usa),Globe,removed
4,2023-11-09,HOLDING ABSENCE (uk),Flex,removed
5,2023-11-09,LB Marszalek (bel) / RYVERS,Einbaumöbel,removed
6,2023-11-09,The HEROINE WHORES (d) / TV MOMS (usa) / HIDDE...,Venster,removed
7,2024-02-06,BUZZ KULL (aus),Venster,removed
8,2024-02-06,BUZZ KULL (aus) / SUNDL,Venster,new
9,2024-03-16,JAYA THE CAT (ned) / Kemo The Blaxican (usa),Arena (gr. Halle),new


## Update previous

In [20]:
# Write the snapshot to the same path the "Load previous" step reads from,
# rather than to a file relative to the current working directory.
df.to_csv(os.path.expanduser(FILEPATH), index=False)