# Beeradvocate pre-processing
Because ratings.txt and reviews.txt are hard to parse and to handle, we convert them to an easier-to-use format (Parquet). We also check how much information overlaps between the two files, to understand whether we can drop one of them. <br>
Finally we will do some processing on the reviews to make them compatible with our pipeline. <br><br><br>
How to run this notebook:
- Unpack all the files in the corresponding folder in the data folder
- Run the notebook

A converted reviews.pq and a converted ratings.pq will be saved in the data folder

##### Definition of some global variables and imports

In [1]:
import polars as pl
import polars as pl
import tqdm
from datetime import datetime
import os

# Absolute path of the data folder, resolved from the notebook's working directory
DATA_FOLDER = os.path.abspath('../../data')

##### Identification of unique labels

In [2]:
# Scan each raw text file once and collect the distinct field labels it contains
# (a label is whatever precedes the first ":" on a line).
for file_name in ["ratings", "reviews"]:
    unique_labels = set()
    # Explicit encoding so decoding does not depend on the platform's default locale
    with open(f"{DATA_FOLDER}/{file_name}.txt", encoding="utf-8") as file:
        for line in tqdm.tqdm(file):
            # Blank or whitespace-only lines carry no label; skipping them also
            # avoids a ValueError from the unpacking below.
            if not line.strip():
                continue
            label, _ = line.split(":", 1)
            unique_labels.add(label.strip())

    print(f"Unique labels in {file_name}.txt:")
    print(unique_labels)
    print()

151074576it [01:11, 2127343.81it/s]


Unique labels in ratings.txt:
{'brewery_name', 'rating', 'aroma', 'brewery_id', 'taste', 'user_id', 'palate', 'beer_name', 'date', 'text', 'style', 'beer_id', 'overall', 'appearance', 'user_name', 'abv', 'review'}



44022962it [00:22, 1991584.24it/s]

Unique labels in reviews.txt:
{'brewery_name', 'rating', 'aroma', 'brewery_id', 'taste', 'user_id', 'palate', 'beer_name', 'date', 'text', 'style', 'beer_id', 'overall', 'appearance', 'user_name', 'abv'}






##### Conversion from txt to parquet

In [3]:
# Mapping between column names and polars types, one schema per input file.
# Fields present in both files, listed in output column order:
_shared_fields = {
    "user_id": pl.Utf8,
    "rating": pl.Float64,
    "abv": pl.Float64,
    "brewery_name": pl.Utf8,
    "user_name": pl.Utf8,
    "beer_id": pl.Int64,
    "appearance": pl.Float64,
    "palate": pl.Float64,
    "text": pl.Utf8,
    "aroma": pl.Float64,
    "overall": pl.Float64,
    "taste": pl.Float64,
    "style": pl.Utf8,
    "beer_name": pl.Utf8,
    "brewery_id": pl.Int64,
    "date": pl.Datetime,
}

# ratings.txt carries one extra boolean field, "review", which sits right
# after "rating" in the column order.
_ratings_fields = {}
for _name, _dtype in _shared_fields.items():
    _ratings_fields[_name] = _dtype
    if _name == "rating":
        _ratings_fields["review"] = pl.Boolean

mapping_pl = {
    "ratings": _ratings_fields,
    "reviews": dict(_shared_fields),
}

files_names = ["reviews", "ratings"]


def _converter_for(dtype):
    """Return the callable that turns a raw string field into a Python value for `dtype`."""
    if dtype == pl.Int64:
        return int
    if dtype == pl.Float64:
        return float
    if dtype == pl.Utf8:
        return str
    if dtype == pl.Datetime:
        # NOTE(review): fromtimestamp() without a tz returns naive *local* time, so the
        # stored datetimes depend on the timezone of the machine running the notebook.
        return lambda raw: datetime.fromtimestamp(int(raw))
    if dtype == pl.Boolean:
        return lambda raw: raw == "True"
    # Any other dtype: keep the raw string unchanged
    return lambda raw: raw


for file_name in files_names:
    schema = mapping_pl[file_name]

    # Resolve the converter of every column once per file instead of once per line
    converters = {label: _converter_for(dtype) for label, dtype in schema.items()}

    # Rows are accumulated as dictionaries and turned into a DataFrame at the end
    rows = []

    # Explicit encoding so decoding does not depend on the platform's default locale
    with open(f'{DATA_FOLDER}/{file_name}.txt', 'r', encoding='utf-8') as f:
        # Each iteration of the outer loop consumes one whole record: the first line
        # comes from the iterator, the remaining ones from f.readline() below.
        for line in tqdm.tqdm(f):
            line = line.strip()

            # A stray blank line is not a record; without this guard it would
            # produce a row made only of None values.
            if not line:
                continue

            # Start from a row where every known column is missing
            content = dict.fromkeys(schema)

            # Consume "label: value" lines until the blank line (or EOF) ending the record
            while line:
                label, value = line.split(":", 1)
                label = label.strip()
                value = value.strip()

                # 'nan' marks missing data: leave the column as None
                if value != 'nan':
                    content[label] = converters[label](value)

                # readline() returns '' at EOF, which also terminates the record
                line = f.readline().strip()

            rows.append(content)

    # Build the DataFrame with the declared schema rather than relying on dtype
    # inference from the first rows (which may be mostly None for some columns)
    df = pl.DataFrame(rows, schema=schema)

    # Save it as parquet
    df.write_parquet(f'{DATA_FOLDER}/{file_name}.pq')

    # Release both the dataframe and the raw rows before processing the next file
    del df, rows

2589586it [00:46, 55484.04it/s]
8393032it [02:18, 60574.16it/s]


##### Overlap between ratings and reviews

In [4]:
# Read back the two parquet files written by the conversion step
reviews = pl.read_parquet(f'{DATA_FOLDER}/reviews.pq')
ratings = pl.read_parquet(f'{DATA_FOLDER}/ratings.pq')

# Keep only the ratings whose "review" flag is True, then discard the flag
# column so that both frames have the same columns
ratings_filtered = ratings.filter(pl.col("review").eq(True)).drop("review")

# The reviews file must contain exactly those rows
assert reviews.equals(ratings_filtered)

From this we see that the reviews dataframe consists exactly of the rows of the ratings dataframe whose `review` column is True, as expected. As a consequence we can use the ratings.pq dataframe and drop the reviews.pq dataframe without loss of information.