In [None]:
import pandas as pd
import os

import ast

from tqdm.notebook import tqdm

In [None]:
# Path to folder with JSON files
folder_path = '/content/drive/MyDrive/datasets/wines'

# List the JSON files once, so the progress bar total matches the files
# that are actually loaded (non-JSON entries are excluded up front).
json_files = [name for name in os.listdir(folder_path) if name.endswith('.json')]

progress_bar = tqdm(json_files, desc="Processing files", leave=True)

# Initialize an empty list to store DataFrames
dataframes = []

# Iterate over the progress bar itself: tqdm only advances when its wrapper
# is consumed, so looping over a separate os.listdir() call leaves it at 0%.
for file in progress_bar:
    file_path = os.path.join(folder_path, file)
    try:
        df = pd.read_json(file_path)
        dataframes.append(df)
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"File loading error {file}: {e}")

# pd.concat fails with an uninformative message on an empty list; raise the
# same exception type with the folder name so the cause is obvious.
if not dataframes:
    raise ValueError(f"No JSON files could be loaded from {folder_path}")

# Combine all DataFrames into one
merged_dataframe = pd.concat(dataframes, ignore_index=True)

# Save to CSV
csv_filename = '/content/drive/MyDrive/datasets/merged_wine_data.csv'
merged_dataframe.to_csv(csv_filename, index=False)

print(f"Successfully merged and saved in {csv_filename}")

Processing files:   0%|          | 0/1073 [00:00<?, ?it/s]

Successfully merged and saved in /content/drive/MyDrive/datasets/merged_wine_data.csv


In [None]:
# Reload the merged table from disk so the cells below can run without
# repeating the JSON merge.
dataframe = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/datasets/merged_wine_data.csv',
)

In [None]:
# Show the first rows of the merged table. `head` must be called: the bare
# attribute only displays the bound method's repr, not a preview.
dataframe.head()

In [None]:
# Inspect the raw (string) content of the 'wines' column at position 11.
dataframe['wines'].iloc[11]

"{'name': 'Merlot', 'price': {'id': 28440603, 'merchant_id': 31839, 'amount': 29.99, 'discounted_from': None, 'discount_percent': None, 'type': 'vc', 'sku': '63574', 'url': 'https://www.classycorksdc.com/wines/Starmont-Winery-Merlot-2018-w4314924v2?utm_source=Vivino&utm_medium=Feed', 'visibility': 1, 'bottle_type_id': 1, 'currency': {'code': 'USD', 'name': 'US Dollars', 'prefix': '$', 'suffix': None}, 'bottle_type': {'id': 1, 'name': 'Bottle (0.75l)', 'short_name': 'bottle', 'short_name_plural': 'bottles', 'volume_ml': 750}}, 'taste': {'structure': {'acidity': 1.857961, 'fizziness': None, 'intensity': 3.9818263, 'sweetness': 1.9654433, 'tannin': 2.1349912, 'user_structure_count': 16, 'calculated_structure_count': 121}, 'flavor': [{'group': 'oak', 'stats': {'count': 79, 'score': 12550, 'mentions_count': 139}, 'primary_keywords': [{'id': 292, 'name': 'oak', 'count': 39}, {'id': 434, 'name': 'vanilla', 'count': 38}, {'id': 101, 'name': 'chocolate', 'count': 16}, {'id': 422, 'name': 'tobac

In [None]:
# The 'wines' column holds Python-literal text; turn the first entry back
# into a dict and pull out its list of reviews.
first_wine_raw = dataframe['wines'][0]
data_dict = ast.literal_eval(first_wine_raw)
reviews = data_dict['reviews']

In [None]:
# Number of entries currently held in `reviews`.
len(reviews)

3

In [None]:
# Display the 'note' field of the first review.
reviews[0]['note']

'Wow I was incredibly taken back by this moscato. Light yet has notes of white peach & lychee with the mid pallet some  minerals truly lovely and a perfect sweet balance not too sweet not to light. '

In [None]:
# Taste-structure fields copied into each record, in output column order.
_STRUCTURE_KEYS = (
    'acidity',
    'fizziness',
    'sweetness',
    'tannin',
    'user_structure_count',
    'calculated_structure_count',
)


def _extract_wine_record(data_dict):
    """Flatten one parsed 'wines' entry into a single-level record.

    Parameters
    ----------
    data_dict : dict
        A parsed 'wines' entry. It must contain the keys 'name', 'price',
        'taste' and 'reviews'; 'taste' (or its 'structure') may be None.

    Returns
    -------
    dict
        Keys, in insertion order: style_name, price, currency, volume,
        the taste-structure fields (only when a structure is present),
        wine_name, rating, region_name, country, winery, image,
        combined_reviews, flavours.

    Raises
    ------
    KeyError
        If one of the required keys is missing.
    IndexError
        If the entry has no reviews (the first review supplies the wine
        name, rating, region, winery and image).
    """
    price = data_dict['price']
    record = {
        'style_name': data_dict['name'],
        'price': price['amount'],
        'currency': price['currency']['code'],
        'volume': price['bottle_type']['volume_ml'],
    }

    # Taste structure is optional; when absent the keys are simply not set
    # and the DataFrame constructor fills the gaps with NaN.
    taste = data_dict['taste']
    structure = taste['structure'] if taste is not None else None
    if structure is not None:
        for key in _STRUCTURE_KEYS:
            record[key] = structure.get(key)

    reviews = data_dict['reviews']
    # Wine-level metadata is taken from the first review's vintage only.
    first_review = reviews[0]
    first_vintage = first_review['vintage']
    wine = first_vintage['wine']
    region = wine['region']

    record['wine_name'] = first_vintage['name']
    record['rating'] = first_review['rating']
    record['region_name'] = region['name']
    record['country'] = region['country']['name']
    record['winery'] = wine['winery']['name']
    record['image'] = first_vintage['image']['location']

    # Every review note is prefixed with the literal 'SEP ' marker and the
    # pieces are joined with single spaces.
    record['combined_reviews'] = ' '.join(
        'SEP ' + str(review['note']) for review in reviews
    )

    # Collect the matched flavour words across all reviews; reviews without
    # 'flavor_word_matches' (missing or None) contribute nothing.
    flavours = []
    for review in reviews:
        flavor_matches = review.get('flavor_word_matches')
        if flavor_matches is not None:
            flavours.extend(match.get('match') for match in flavor_matches)
    record['flavours'] = flavours

    return record


# Iterate over the column values directly instead of looking rows up by a
# positional counter, which only works while the index is 0..n-1.
data_list = [
    _extract_wine_record(ast.literal_eval(raw_wine))
    for raw_wine in dataframe['wines']
]

df = pd.DataFrame(data_list)


In [None]:
# Preview the first five rows of the flattened table.
df.head(n=5)

Unnamed: 0,style_name,price,currency,volume,acidity,fizziness,sweetness,tannin,user_structure_count,calculated_structure_count,wine_name,rating,region_name,country,winery,image,combined_reviews,flavours
0,Moscato,12.99,USD,750,3.067054,,3.271635,,18.0,192.0,Seven Daughters Moscato U.V.,4.1,Veneto,Italy,Seven Daughters,//images.vivino.com/thumbs/mE6S9dUrSUOFI8qTodT...,SEP Wow I was incredibly taken back by this mo...,"[lychee, minerals, white peach, apple, pear]"
1,Indisciplinato,27.99,USD,750,3.442514,,1.623086,,9.0,1.0,Tenuta San Marcello Indisciplinato U.V.,2.5,Marche,Italy,Tenuta San Marcello,//images.vivino.com/labels/kJVZn2JsRiiUc7Xfz7x...,"SEP Anfora verdicchio, not satisfied, for the ...","[dried mango, orange]"
2,Reserve Collection Pinot Noir,24.99,USD,750,3.870318,,1.416349,1.883968,1.0,3.0,Erath Reserve Collection Pinot Noir 2019,4.2,Willamette Valley,United States,Erath,//images.vivino.com/labels/DU6Kr8eeTGSC9Cl-5MV...,SEP Excellent expression of Willamette valley!...,[]
3,Stokes' Ghost Petite Sirah,29.916667,USD,750,3.413706,,1.921906,4.333656,22.0,145.0,Scheid Vineyards Stokes' Ghost Petite Sirah U.V.,4.1,Monterey,United States,Scheid Vineyards,//images.vivino.com/thumbs/VhNssH95RtKFPe73dxK...,"SEP Wowee- great story, great wine. A two-fer!...",[]
4,Défier Coquelicot Vineyard Merlot,29.99,USD,750,1.827581,,1.946129,2.162903,2.0,11.0,J.Brix Défier Coquelicot Vineyard Merlot U.V.,4.1,Santa Ynez Valley,United States,J.Brix,//images.vivino.com/thumbs/DWwwq3AtSdWLB_SrbxY...,SEP Surprisingly green despite being carbonic....,"[black fruit, black olive, plum]"


In [None]:
# Display the full 'combined_reviews' text stored under index label 1.
df['combined_reviews'][1]

'SEP Anfora verdicchio, not satisfied, for the price especially, a lot of vinegard smell and not much more, after a while peach and flower but I have to say, looks like a farmer wine, good farmer wine but very rude. I had the feeling that natural here is an excuse. Totally unbalanced, not interesting and very rough. Try it again for me is a no a big no especially for 24.5 euro. Absolutely not.  SEP Dried mango, balanced and clean, organic and biodynamic wine producer - tasty\n SEP Very good orange wine, unfiltered and dry on the tongue with a strong aftertaste'

In [None]:
# Print the column dtypes and non-null counts of `df`.
df.info()

In [None]:
# Show summary statistics for `df`.
df.describe()

In [None]:
# Persist the flattened table. The target folder is created first because
# to_csv does not create missing directories and would fail otherwise.
output_path = '/content/drive/MyDrive/datasets/wines_processed/wines.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)