In [27]:
import pandas as pd
import sklearn

file_path = 'WineDataset.csv'
# Read the CSV file into `df`, the DataFrame that the later cells read and
# reassign.
df = pd.read_csv(file_path)

In [28]:
# Columns that must be populated; a row missing any one of them is dropped.
required_columns = [
    'Title', 'Description', 'Price', 'Capacity', 'Grape', 'Closure',
    'Country', 'Unit', 'Characteristics', 'Per bottle / case / each',
    'Type', 'ABV', 'Region', 'Style', 'Vintage',
]
df = df.dropna(subset=required_columns)

In [29]:
def clean_title(title):
    """Return the part of ``title`` that precedes its first comma.

    A title containing no comma is returned unchanged.
    """
    head, _separator, _rest = title.partition(',')
    return head

def clean_price(price, gbp_to_usd=1.4):
    """Convert a GBP price string such as ``'£15.99 per bottle'`` to USD.

    Parameters
    ----------
    price : str
        Price text: an optional pound sign, the amount (thousands separators
        allowed), and an optional trailing unit such as "per bottle",
        "per case" or "each".
    gbp_to_usd : float, default 1.4
        Multiplier applied to the GBP amount. The default is the fixed rate
        this notebook has always used; pass a different value to override it.

    Returns
    -------
    float
        The USD amount rounded to two decimal places.

    Raises
    ------
    ValueError
        If no numeric amount can be parsed from ``price``.
    """
    # Drop the currency symbol and thousands separators, then keep only the
    # leading token so any unit suffix after the amount is ignored.
    tokens = price.replace('£', '').replace(',', '').split()
    if not tokens:
        raise ValueError(f"no numeric amount found in price {price!r}")
    clean_price_gbp = float(tokens[0])
    return round(clean_price_gbp * gbp_to_usd, 2)

def clean_capacity(capacity):
    """Convert a capacity string such as ``'75CL'`` to centilitres.

    Recognised unit markers are ``CL``, ``ML`` and ``LTR``, matched
    case-insensitively and checked in that order. Surrounding whitespace is
    ignored.

    Parameters
    ----------
    capacity : str
        Capacity text, e.g. ``'750ML'`` or ``'1.5LTR'``.

    Returns
    -------
    float or None
        The capacity in centilitres, or ``None`` when no recognised unit
        marker is present.

    Raises
    ------
    ValueError
        If a unit marker is found but the remaining text is not a number.
    """
    # Normalise once so 'cl', 'Cl' and ' 75CL ' behave like '75CL'.
    normalised = capacity.strip().upper()
    if 'CL' in normalised:
        return float(normalised.replace('CL', ''))
    if 'ML' in normalised:
        # 10 millilitres per centilitre
        return float(normalised.replace('ML', '')) / 10
    if 'LTR' in normalised:
        # 100 centilitres per litre
        return float(normalised.replace('LTR', '')) * 100
    # Unknown unit: return None explicitly rather than by falling off the end.
    return None

def clean_ABV(abv):
    """Parse an ABV string such as ``'ABV 13.50%'`` into a float.

    The ``'ABV '`` prefix and any ``'%'`` characters are removed and the
    remaining text is stripped of whitespace before conversion.
    """
    numeric_text = abv
    for unwanted in ('ABV ', '%'):
        numeric_text = numeric_text.replace(unwanted, '')
    return float(numeric_text.strip())


In [30]:
# Apply every cleaning function to its column of the DataFrame, in the order
# listed below.
column_cleaners = {
    'Title': clean_title,
    'Price': clean_price,
    'ABV': clean_ABV,
    'Capacity': clean_capacity,
}
for column_name, cleaner in column_cleaners.items():
    df[column_name] = df[column_name].apply(cleaner)

In [31]:
# Preview the first five rows of the DataFrame.
df.head(5)

Unnamed: 0,Title,Description,Price,Capacity,Grape,Secondary Grape Varieties,Closure,Country,Unit,Characteristics,Per bottle / case / each,Type,ABV,Region,Style,Vintage,Appellation
1,Bread & Butter 'Winemaker's Selection' Chardon...,This really does what it says on the tin. It’s...,22.39,75.0,Chardonnay,,Natural Cork,USA,10.1,"Vanilla, Almond, Coconut, Green Apple, Peach, ...",per bottle,White,13.5,California,Rich & Toasty,2021,Napa Valley
2,Oyster Bay Sauvignon Blanc 2022,Oyster Bay has been an award-winning gold-stan...,17.49,75.0,Sauvignon Blanc,,Screwcap,New Zealand,9.8,"Tropical Fruit, Gooseberry, Grapefruit, Grass,...",per bottle,White,13.0,Marlborough,Crisp & Zesty,2022,
3,Louis Latour Mâcon-Lugny 2021/22,We’ve sold this wine for thirty years – and fo...,25.19,75.0,Chardonnay,,Natural Cork,France,10.1,"Peach, Apricot, Floral, Lemon",per bottle,White,13.5,Burgundy,Ripe & Rounded,2022,Macon
4,Bread & Butter 'Winemaker's Selection' Pinot N...,Bread & Butter is that thing that you can coun...,22.39,75.0,Pinot Noir,,Natural Cork,USA,10.1,"Smoke, Black Cherry, Cedar, Raspberry, Red Fruit",per bottle,Red,13.5,California,Smooth & Mellow,2021,Napa Valley
6,La Gioiosa Prosecco DOC,"In Treviso, Prosecco’s heartland, the locals h...",18.19,75.0,Glera,,Natural Cork,Italy,8.3,"Green Apple, Citrus Fruit, Floral",per bottle,White,11.0,Prosecco Doc,Light & Refreshing,NV,


In [34]:
# perform topic word extraction on the description, and add the topic words to the DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Turn every description into a token-count vector, ignoring English stop words.
count_vectorizer = CountVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(df['Description'])

# Fit a five-topic LDA model on those counts with a fixed random_state.
# fit() returns the estimator itself, so `lda` is the fitted model.
lda = LatentDirichletAllocation(n_components=5, random_state=0).fit(count_data)

def get_topics(model, count_vectorizer, n_top_words):
    """Return the highest-weighted words for each topic of a fitted model.

    Parameters
    ----------
    model
        Object exposing ``components_``, an iterable of per-topic weight
        arrays (one weight per vocabulary entry).
    count_vectorizer
        Object exposing ``get_feature_names_out()``, whose result is indexed
        by the same positions as the weights.
    n_top_words : int
        Number of words to keep per topic.

    Returns
    -------
    list of list
        One list per topic, ordered from highest to lowest weight.
    """
    vocabulary = count_vectorizer.get_feature_names_out()
    return [
        # argsort is ascending, so reverse it and keep the leading positions.
        [vocabulary[position] for position in weights.argsort()[::-1][:n_top_words]]
        for weights in model.components_
    ]

# Top ten words for each of the fitted topics.
topics = get_topics(lda, count_vectorizer, 10)

# For every row, pick the topic with the highest weight, then store that
# topic's word list in the 'Topics' column.
dominant_topic = pd.Series(lda.transform(count_data).argmax(axis=1), index=df.index)
df['Topics'] = dominant_topic.apply(lambda topic_id: topics[topic_id])

# display the DataFrame
df.head(5)

Unnamed: 0,Title,Description,Price,Capacity,Grape,Secondary Grape Varieties,Closure,Country,Unit,Characteristics,Per bottle / case / each,Type,ABV,Region,Style,Vintage,Appellation,Topics
1,Bread & Butter 'Winemaker's Selection' Chardon...,This really does what it says on the tin. It’s...,22.39,75.0,Chardonnay,,Natural Cork,USA,10.1,"Vanilla, Almond, Coconut, Green Apple, Peach, ...",per bottle,White,13.5,California,Rich & Toasty,2021,Napa Valley,"[wine, notes, pinot, fresh, grapes, best, flav..."
2,Oyster Bay Sauvignon Blanc 2022,Oyster Bay has been an award-winning gold-stan...,17.49,75.0,Sauvignon Blanc,,Screwcap,New Zealand,9.8,"Tropical Fruit, Gooseberry, Grapefruit, Grass,...",per bottle,White,13.0,Marlborough,Crisp & Zesty,2022,,"[wine, sauvignon, flavours, wines, notes, frui..."
3,Louis Latour Mâcon-Lugny 2021/22,We’ve sold this wine for thirty years – and fo...,25.19,75.0,Chardonnay,,Natural Cork,France,10.1,"Peach, Apricot, Floral, Lemon",per bottle,White,13.5,Burgundy,Ripe & Rounded,2022,Macon,"[wine, white, fresh, flavours, vines, grapes, ..."
4,Bread & Butter 'Winemaker's Selection' Pinot N...,Bread & Butter is that thing that you can coun...,22.39,75.0,Pinot Noir,,Natural Cork,USA,10.1,"Smoke, Black Cherry, Cedar, Raspberry, Red Fruit",per bottle,Red,13.5,California,Smooth & Mellow,2021,Napa Valley,"[wine, white, vines, region, notes, fresh, del..."
6,La Gioiosa Prosecco DOC,"In Treviso, Prosecco’s heartland, the locals h...",18.19,75.0,Glera,,Natural Cork,Italy,8.3,"Green Apple, Citrus Fruit, Floral",per bottle,White,11.0,Prosecco Doc,Light & Refreshing,NV,,"[wine, notes, pinot, fresh, grapes, best, flav..."


In [38]:
# Collect the per-row topic word lists from the 'Topics' column into a plain list.
# NOTE(review): this rebinds `topics`, which the LDA cell uses for the per-topic
# word lists; re-running only the column assignment afterwards would index the
# wrong list. Consider a distinct name.
topics = list(df['Topics'])
topics

[['wine',
  'notes',
  'pinot',
  'fresh',
  'grapes',
  'best',
  'flavours',
  'new',
  'wines',
  'white',
  'region',
  'perfect',
  'fruit',
  'noir',
  'crisp',
  'grape',
  'rosé',
  'great',
  'refreshing',
  'expect'],
 ['wine',
  'sauvignon',
  'flavours',
  'wines',
  'notes',
  'fruit',
  'best',
  'rich',
  'winemaker',
  'ripe',
  'vintage',
  'world',
  'winery',
  'blanc',
  'won',
  'south',
  'grapes',
  'winemaking',
  '2022',
  'expect'],
 ['wine',
  'white',
  'fresh',
  'flavours',
  'vines',
  'grapes',
  'citrus',
  'notes',
  'burgundy',
  'wines',
  'chardonnay',
  'expect',
  'peach',
  'apple',
  'finish',
  'crisp',
  'family',
  'acidity',
  'minerality',
  'old'],
 ['wine',
  'white',
  'vines',
  'region',
  'notes',
  'fresh',
  'delicious',
  'rich',
  'flavours',
  'fruit',
  'soils',
  'wines',
  'perfect',
  'years',
  'dishes',
  'old',
  'grapes',
  'expect',
  'elegant',
  'champagne'],
 ['wine',
  'notes',
  'pinot',
  'fresh',
  'grapes',
  'be

In [29]:
# Show the distinct values in the Capacity column.
print(df['Capacity'].unique())

# Raw values observed before cleaning; clean_capacity converts these to centilitres:
# ['75CL' '750ML' '1.5LTR' '37.5CL' '150CL' '300CL']


['75CL' '750ML' '1.5LTR' '37.5CL' '150CL' '300CL']


1.37

In [None]:
# Initialize the zero-shot classification pipeline using the Hugging Face model.
# The task is named explicitly so it does not have to be inferred from the
# model id.
# NOTE(review): `pipeline` is not imported in the visible cells; presumably
# `from transformers import pipeline` runs elsewhere. Confirm it is executed
# before this cell on a fresh kernel.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define the candidate dish categories for pairing
candidate_labels = ["meat", "poultry", "fish", "cheese", "vegetables"]

In [7]:
def classify_wine(description):
    """Score a wine description against every label in ``candidate_labels``.

    Parameters
    ----------
    description
        The description text. Any non-string value (for example a missing
        description) is not sent to the classifier.

    Returns
    -------
    dict
        Mapping of label to score. For a non-string description every
        candidate label maps to 0.0.
    """
    if isinstance(description, str):
        outcome = classifier(description, candidate_labels=candidate_labels)
        return dict(zip(outcome['labels'], outcome['scores']))
    # No text to classify: give every dish category a score of 0.0.
    return dict.fromkeys(candidate_labels, 0.0)

In [None]:
# Apply the classification function to the 'Description' column.
# This returns a Series of dictionaries with dish scores for each row,
# carrying the same index as df.
classification_results = df['Description'].apply(classify_wine)

# Convert the Series of dictionaries into a DataFrame where each dish category
# becomes a column. df's index is reused on purpose: pd.concat(axis=1) aligns
# on index labels, and df's labels are no longer 0..n-1 once rows have been
# dropped, so a fresh default index would attach scores to the wrong rows and
# add all-NaN rows.
classification_df = pd.DataFrame(classification_results.tolist(), index=df.index)

# Concatenate the original DataFrame with the new DataFrame containing the classification scores
df = pd.concat([df, classification_df], axis=1)

In [5]:
# Save the modified dataset with classification scores to a JSON file.
# orient='records' writes a list with one JSON object per row.
df.to_json('WineDataset.json', orient='records')