In [2]:
import pandas as pd
from pprint import pprint
import sqlite3
import plotly.express as px

In [3]:
# NOTE(review): "unicode_escape" also interprets backslash escape sequences found in the
# file - confirm this is the intended encoding for this export.
df = pd.read_csv("../../Data/Umsatzdaten/Gastronomieumsaetze_flat.csv", encoding="unicode_escape",  sep=';')

# Preprocessing steps:
# reshape the flat export, drop redundant columns and move code/label pairs into lookup tables.

# Numeric prefixes of the encoded attribute columns ("<n>_Merkmal_Label", "<n>_Auspraegung_Code", ...).
# The whole prefix before the first underscore is used, so prefixes with more than one digit
# stay distinct instead of being collapsed onto their first character.
columns = {
    prefix
    for prefix, separator, _rest in (str(column).partition('_') for column in df.columns)
    if separator and prefix.isdecimal()
}

# One lookup per attribute: {Auspraegung code: Auspraegung label}, keyed by the attribute's label
# (taken from the first row of the "<n>_Merkmal_Label" column).
table_list = {}
for prefix in columns:
    code_label_pairs = df[[f"{prefix}_Auspraegung_Code", f"{prefix}_Auspraegung_Label"]].drop_duplicates()
    table_list[df[f"{prefix}_Merkmal_Label"].iloc[0]] = dict(
        zip(code_label_pairs.iloc[:, 0], code_label_pairs.iloc[:, 1])
    )

# Sort numerically so that e.g. "10" comes after "2".
sorted_prefixes = sorted(columns, key=int)
indexed_columns = [f"{prefix}_Auspraegung_Code" for prefix in sorted_prefixes]

# Keep only the code columns and name each of them after its attribute label.
df_reduced = df[['Statistik_Code', 'Zeit'] + indexed_columns + ['UMS002__Umsatz__2015=100']]
df_reduced = df_reduced.rename(
    columns={f"{prefix}_Auspraegung_Code": df[f"{prefix}_Merkmal_Label"].iloc[0] for prefix in sorted_prefixes}
)

# Columns expected to hold exactly one distinct value; they carry no information and are dropped.
superfluous_columns = ["Statistik_Code", "Deutschland insgesamt", "Preisarten", "Original- und bereinigte Daten"]

# Guard: stop if any of these columns no longer holds exactly one distinct value,
# because dropping it would then discard information. The message names the offenders.
varying_columns = [sc for sc in superfluous_columns if df_reduced[sc].nunique() != 1]
assert not varying_columns, (
    f"Expected exactly one distinct value in each of {superfluous_columns}, "
    f"but these columns differ: {varying_columns}."
)

# Drop exactly the columns that were checked above (single source of truth for the list).
df_reduced = df_reduced.drop(columns=superfluous_columns)

# Open question: store the period as a partial date (year and month only),
# or as a full date pinned to the first day of the month?
# https://stackoverflow.com/questions/6882788/store-incomplete-date-in-mysql-date-field

# Shorter column names make the following steps easier to read
short_names = {
    "UMS002__Umsatz__2015=100": "Umsatz",
    "WZ2008 (2- bis 3-Steller): Gastgewerbe": "Gastgewerbe",
}
df_reduced = df_reduced.rename(columns=short_names)

# Sales are handled as text here: swap the comma for a point, then parse to numbers.
# Entries that still cannot be parsed become NaN (errors='coerce').
umsatz_text = df_reduced['Umsatz'].str.replace(",", ".")
df_reduced['Umsatz'] = pd.to_numeric(umsatz_text, errors='coerce')

# Build a proper datetime column (for SQL) from the year (Zeit) and the month (Monate).
# The day is fixed to 1 because the data carries no day component.
month_numbers = pd.to_numeric(df_reduced['Monate'].str.replace("MONAT", ""), errors='coerce')
df_reduced['Monate'] = month_numbers

date_parts = {
    "year": df_reduced['Zeit'],
    "month": df_reduced['Monate'],
    "day": [1] * len(df_reduced),
}
df_reduced['Datum'] = pd.to_datetime(date_parts)

# Year and month now live in Datum, so the source columns are no longer needed
df_reduced = df_reduced.drop(columns=["Zeit", "Monate"])

# Yearly mean of Umsatz for the rows whose Gastgewerbe code is "WZ08-553", shown as a line chart.
df_line = (
    df_reduced.loc[df_reduced['Gastgewerbe'] == "WZ08-553"]
    .assign(Year=lambda frame: frame['Datum'].dt.year)
    .groupby('Year')[['Umsatz']]
    .mean()
    .reset_index()
)
px.line(df_line, x='Year', y='Umsatz')

In [5]:
# Export the Gastgewerbe code -> label lookup collected in table_list as its own CSV file.
gastgewerbe = table_list['WZ2008 (2- bis 3-Steller): Gastgewerbe']
gw_code = gastgewerbe.keys()
gw_bezeichner = gastgewerbe.values()

gastgewerbe_frame = pd.DataFrame(
    {
        "GastgewerbeCode": gw_code,
        "GastgewerbeBezeichner": gw_bezeichner,
    }
)
gastgewerbe_frame.to_csv("../../Data/Umsatzdaten/Gastgewerbe.csv", index=False)

In [57]:
# Switch for the optional persistence step below. It stays off so that running the
# notebook does not write the CSV or touch the database unless this is changed on purpose.
PERSIST_RESULTS = False

# Lookup table: one row per Gastgewerbe code with its name.
CREATE_BEZEICHNER_GASTGEWERBE_SQL = '''
create table Bezeichner_Gastgewerbe(
Gastgewerbe varchar(16),
Name varchar(64),
primary key(Gastgewerbe)
);
'''

# Value table: one row per (Datum, Gastgewerbe).
# The foreign key is declared on the Gastgewerbe code column, which is the column that
# matches the primary key of Bezeichner_Gastgewerbe (not on the float value column).
CREATE_GASTRO_UMSATZ_SQL = '''
create table Gastro_Umsatz(
Datum date,
Gastgewerbe varchar(16),
Angepasster_Umsatz float,
primary key (Datum, Gastgewerbe),
foreign key(Gastgewerbe) references Bezeichner_Gastgewerbe(Gastgewerbe)
)
'''

if PERSIST_RESULTS:
    # Save the result of the computation
    df_reduced.to_csv("../../Data/Umsatzdaten/processed_umsatz.csv", index=False)

    # Prototype SQL statements for creating the tables in the data lake if necessary.
    # Only the schema is created here; no rows are inserted.
    connection = sqlite3.connect('test.db')
    try:
        cursor = connection.cursor()

        # Drop the referencing table before the table it references, then recreate both.
        cursor.execute('''drop table if exists Gastro_Umsatz;''')
        cursor.execute('''drop table if exists Bezeichner_Gastgewerbe;''')
        cursor.execute(CREATE_BEZEICHNER_GASTGEWERBE_SQL)
        cursor.execute(CREATE_GASTRO_UMSATZ_SQL)
        connection.commit()

        # Read back the lookup table (kept in a variable so it can be inspected afterwards).
        bezeichner_rows = cursor.execute('''select * from Bezeichner_Gastgewerbe''').fetchall()
    finally:
        # Always release the database handle, even if one of the statements fails.
        connection.close()

[]