In [None]:
from pathlib import Path

import pandas as pd
from ydata_profiling import ProfileReport

In [None]:
# Paths are derived from the current working directory: ROOT_DIR sits two
# levels above it and DATA_DIR is the "data" folder directly inside ROOT_DIR.
working_dir = Path.cwd()
ROOT_DIR = working_dir.parent.parent
DATA_DIR = ROOT_DIR.joinpath("data")

In [None]:
# Print the name of every entry in the data directory, leaving out any
# name that ends with "#" (presumably editor lock/temp files - confirm).
for file in DATA_DIR.iterdir():
    if file.name.endswith("#"):
        continue
    print(file.name)


##### Getting the encoding of a file
On Linux, you can determine a file's encoding with the `file` command:

```bash
file -bi Export\ _\ Ausstiegsseiten\ _\ 3.\ August\ 2022\ –\ 20.\ März\ 2023.csv
text/csv; charset=utf-16le
```

In [None]:
# Load the exit-pages export into a DataFrame, decoding it as UTF-16 LE.
exit_pages_csv = DATA_DIR / "Export _ Ausstiegsseiten _ 3. August 2022 – 20. März 2023.csv"
df = pd.read_csv(exit_pages_csv, encoding="utf-16le")

In [None]:
def _load_export(filename, label):
    """Read one UTF-16 LE CSV export from DATA_DIR and tag it with a display name.

    Args:
        filename: File name of the CSV export inside ``DATA_DIR``.
        label: Human-readable table name, stored on the frame as ``.name``.

    Returns:
        The loaded DataFrame with its ``name`` attribute set to ``label``.
    """
    frame = pd.read_csv(DATA_DIR / filename, encoding="utf-16le")
    # NOTE: ``name`` is a plain Python attribute, not pandas metadata, so it is
    # not expected to carry over to frames derived from this one.
    frame.name = label
    return frame


# One frame per export; the label is reused later as the report title/file name.
df_exit_pages = _load_export(
    "Export _ Ausstiegsseiten _ 3. August 2022 – 20. März 2023.csv",
    "Exit Pages",
)
df_entry_pages = _load_export(
    "Export _ Einstiegsseiten _ 3. August 2022 – 20. März 2023.csv",
    "Entry Pages",
)
df_search_engines = _load_export(
    "Export _ Suchmaschinen _ 3. August 2022 – 20. März 2023.csv",
    "Search Engines",
)
df_channel_type = _load_export(
    "Export _ Kanaltyp _ 3. August 2022 – 20. März 2023 (1).csv",
    "Channel Type",
)

In [None]:
# Load the main export table, decoding it as UTF-16 LE.
main_csv = DATA_DIR / "Export _  _ 3. August 2022 – 8. März 2023.csv"
df_main = pd.read_csv(main_csv, encoding="utf-16le")

In [None]:
# Build a minimal-mode profiling report for the main table and save it as HTML.
output_dir = Path("output")
# The report is written straight to this path; create the folder first so a
# fresh checkout without an "output" directory does not fail on write.
output_dir.mkdir(parents=True, exist_ok=True)
profile = ProfileReport(df_main, minimal=True, title="Main Table")
profile.to_file(output_dir / "Main Table.html")

In [None]:
# Collect the four named export tables so they can be handled in one loop.
dfs = [
    df_exit_pages,
    df_entry_pages,
    df_search_engines,
    df_channel_type,
]

In [None]:
# Write one minimal-mode profiling report per table, using each frame's
# ``name`` attribute as both the report title and the HTML file name.
output_dir = Path("output")
# Create the target folder up front so the export does not fail when the
# "output" directory is missing (e.g. on a fresh checkout).
output_dir.mkdir(parents=True, exist_ok=True)
for df in dfs:
    profile = ProfileReport(df, minimal=True, title=df.name)
    profile.to_file(output_dir / f"{df.name}.html")

#### Column names

In [None]:
def _read_translation(filename):
    """Read a tab-separated column-translation table from the working directory."""
    return pd.read_csv(filename, sep="\t")


# One translation table per export.
df_exit_pages_column_translations = _read_translation(
    "exit_pages_columns_translation.csv"
)
df_entry_pages_column_translations = _read_translation(
    "entry_pages_columns_translation.csv"
)
df_search_engines_column_translations = _read_translation(
    "search_engines_columns_translation.csv"
)
df_channel_type_column_translations = _read_translation(
    "channel_type_columns_translation.csv"
)
# df_entry_pages_column_translations