In [1]:
import pandas as pd
from transformers import pipeline

# Location of the raw wine catalogue export
file_path = 'WineDataset.csv'

# Load the catalogue into a DataFrame so it can be cleaned below
df = pd.read_csv(filepath_or_buffer=file_path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Discard every wine that is missing a value in any of the listed columns
df = df.dropna(
    subset=[
        'Title', 'Description', 'Price', 'Capacity', 'Grape',
        'Closure', 'Country', 'Unit', 'Characteristics',
        'Per bottle / case / each', 'Type', 'ABV', 'Region',
        'Style', 'Vintage',
    ]
)

In [3]:
def clean_title(title):
    """Return the part of ``title`` that precedes its first comma.

    A title without a comma is returned unchanged.
    """
    head, _separator, _rest = title.partition(',')
    return head

def clean_price(price):
    """Convert a raw price string such as '£15.99 per bottle' into a float.

    The pound sign and any thousands separators are removed, and whatever
    unit text follows the amount ('per bottle', 'per case', 'each', ...) is
    ignored, so only the leading numeric amount is parsed.

    Args:
        price: raw price text, e.g. '£1,299.00 per bottle'.

    Returns:
        The amount as a float.

    Raises:
        ValueError: if the string contains no parseable amount.
    """
    tokens = price.replace('£', '').split()
    if not tokens:
        raise ValueError(f"no price amount found in {price!r}")
    # The amount is expected to be the first token; drop ',' so that
    # values like '1,299.00' parse instead of raising.
    return float(tokens[0].replace(',', ''))

def clean_ABV(abv):
    """Turn an ABV label such as 'ABV 13.50%' into its numeric value.

    The 'ABV ' prefix and the percent sign are removed and the remaining
    text is parsed as a float.
    """
    numeric_text = abv
    for unwanted in ('ABV ', '%'):
        numeric_text = numeric_text.replace(unwanted, '')
    return float(numeric_text.strip())

In [4]:
# Apply each cleaning function to its column of the DataFrame
df['Title'] = df['Title'].map(clean_title)
df['Price'] = df['Price'].map(clean_price)
df['ABV'] = df['ABV'].map(clean_ABV)

# Display the first 50 cleaned rows as the cell output
df.head(50)

Unnamed: 0,Title,Description,Price,Capacity,Grape,Secondary Grape Varieties,Closure,Country,Unit,Characteristics,Per bottle / case / each,Type,ABV,Region,Style,Vintage,Appellation
1,Bread & Butter 'Winemaker's Selection' Chardon...,This really does what it says on the tin. It’s...,15.99,75CL,Chardonnay,,Natural Cork,USA,10.1,"Vanilla, Almond, Coconut, Green Apple, Peach, ...",per bottle,White,13.5,California,Rich & Toasty,2021,Napa Valley
2,Oyster Bay Sauvignon Blanc 2022,Oyster Bay has been an award-winning gold-stan...,12.49,75CL,Sauvignon Blanc,,Screwcap,New Zealand,9.8,"Tropical Fruit, Gooseberry, Grapefruit, Grass,...",per bottle,White,13.0,Marlborough,Crisp & Zesty,2022,
3,Louis Latour Mâcon-Lugny 2021/22,We’ve sold this wine for thirty years – and fo...,17.99,75CL,Chardonnay,,Natural Cork,France,10.1,"Peach, Apricot, Floral, Lemon",per bottle,White,13.5,Burgundy,Ripe & Rounded,2022,Macon
4,Bread & Butter 'Winemaker's Selection' Pinot N...,Bread & Butter is that thing that you can coun...,15.99,75CL,Pinot Noir,,Natural Cork,USA,10.1,"Smoke, Black Cherry, Cedar, Raspberry, Red Fruit",per bottle,Red,13.5,California,Smooth & Mellow,2021,Napa Valley
6,La Gioiosa Prosecco DOC,"In Treviso, Prosecco’s heartland, the locals h...",12.99,75CL,Glera,,Natural Cork,Italy,8.3,"Green Apple, Citrus Fruit, Floral",per bottle,White,11.0,Prosecco Doc,Light & Refreshing,NV,
8,Bouvet Ladubay Saumur Brut,The Loire Valley’s Saumur Brut is the go-to ap...,13.99,75CL,Chenin Blanc,,Natural Cork,France,9.4,"Honeysuckle, Citrus Fruit",per bottle,White,12.5,Loire,Rich & Toasty,NV,Saumur
9,LB7 Red 2020/21,Portuguese red blends are a Majestic specialit...,8.99,75CL,Castelão,"Touriga Nacional, Tinta Roriz",Natural Cork,Portugal,10.1,"Spice, Black Fruit, Black Plum, Blackberry",per bottle,Red,13.5,Lisboa,Rich & Juicy,2021,
10,The Ned 'Waihopai River' Sauvignon Blanc 2023,Our bestselling white wine. Winemaker Brent Ma...,11.99,75CL,Sauvignon Blanc,,Screwcap,New Zealand,9.8,"Tropical Fruit, Gooseberry, Grapefruit, Grass,...",per bottle,White,13.0,Marlborough,Crisp & Zesty,2023,
12,Chosen By Majestic Greek White 2022,We sent our award-winning buying team all arou...,11.29,75CL,Malagousia,"Assyrtiko, Roditis",Screwcap,Greece,9.8,"Blossom, Lemon, Lime, Peach, Pear",per bottle,White,13.0,Greece,Fresh & Elegant,2022,
13,Miraval Rosé 2021/22,Miraval shot to fame when Angelina Jolie and B...,22.99,75CL,Cinsault,"Syrah, Grenache, Rolle",Natural Cork,France,9.8,"Strawberry, Herbaceous, Peach, Raspberry",per bottle,Rosé,13.0,Provence,Delicate & Dry,2022,Côtes De Provence


In [None]:
# Build the zero-shot classification pipeline on the Hugging Face
# facebook/bart-large-mnli model. The task is named explicitly so the
# pipeline type does not have to be inferred from the model alone.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate dish categories that each wine description is scored against
candidate_labels = ["meat", "poultry", "fish", "cheese", "vegetables"]

In [7]:
def classify_wine(description):
    """Score one wine description against every candidate dish category.

    Returns a dict mapping each label to its score. A description that is
    not a string (for example a missing value) gets 0.0 for every label
    instead of being sent to the classifier.
    """
    if isinstance(description, str):
        prediction = classifier(description, candidate_labels=candidate_labels)
        # 'labels' and 'scores' are parallel lists; pair them up
        return dict(zip(prediction['labels'], prediction['scores']))
    # Fallback for wines that have no usable description
    return {label: 0.0 for label in candidate_labels}

In [None]:
# Score every description against the candidate dish categories.
# The result is a Series of {label: score} dicts that shares df's index.
classification_results = df['Description'].apply(classify_wine)

# Expand the dicts into one column per dish category. df's index is reused
# on purpose: rows were removed by the earlier dropna, so df's index has
# gaps, and a default 0..n-1 index here would make the index-aligned concat
# below attach scores to the wrong wines and introduce all-NaN rows.
classification_df = pd.DataFrame(classification_results.tolist(), index=df.index)

# Append the score columns to the wine data, aligned row-for-row by index
df = pd.concat([df, classification_df], axis=1)

In [5]:
# Write the dataset, including the classification score columns, to a JSON
# file as a list of records (one object per wine)
df.to_json(path_or_buf='WineDataset.json', orient='records')