In [2]:
#Load Dependencies
import unidecode
import csv
import pandas as pd
import re
import numpy as np

## Removing Diacritics From Excel

In [2]:
# Sanity Check
wine_scores = pd.read_csv('data\intermediate\kaggle_wine_scores.csv')
wine_scores.head()

Unnamed: 0,id,country,designation,points,price,province,region_1,region_2,title,variety,winery
0,0,Italy,Vulkà Bianco,87,Null,Sicily & Sardinia,Etna,Null,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,Avidagos,87,15,Douro,Null,Null,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,Null,87,14,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,Reserve Late Harvest,87,13,Michigan,Lake Michigan Shore,Null,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,Vintner's Reserve Wild Child Block,87,65,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
wine_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           129971 non-null  int64 
 1   country      129971 non-null  object
 2   designation  129971 non-null  object
 3   points       129971 non-null  int64 
 4   price        129971 non-null  object
 5   province     129971 non-null  object
 6   region_1     129971 non-null  object
 7   region_2     129971 non-null  object
 8   title        129971 non-null  object
 9   variety      129971 non-null  object
 10  winery       129971 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.9+ MB


In [4]:
# Define file paths
input_csv = 'data\intermediate\kaggle_wine_scores.csv' 
output_csv = 'data\intermediate\wine_scores_with_diacritics_removed.csv'

# Function to clean the strings by replacing diacritics with ASCII equivalents
def clean_string(s):
    return unidecode.unidecode(s)

# Open the input CSV file and create a new output CSV file
with open(input_csv, mode='r', encoding='utf-8') as infile, open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:
    # Create a CSV reader and writer
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Read and write the header
    header = next(reader)
    writer.writerow(header)

    # Process each row in the CSV
    for row in reader:
        # Clean each cell in the row
        cleaned_row = [clean_string(cell) for cell in row]
        # Write the cleaned row to the output CSV
        writer.writerow(cleaned_row)

print('Diacritics converted to plain English. Converted file saved as:', output_csv)

Diacritics converted to plain English. Converted file saved as: data\intermediate\wine_scores_with_diacritics_removed.csv


In [10]:
# Sanity Check
df = pd.read_csv('data\intermediate\wine_scores_with_diacritics_removed.csv')
df.head()

Unnamed: 0,id,country,designation,points,price,province,region_1,region_2,title,variety,winery
0,0,Italy,Vulka Bianco,87,Null,Sicily & Sardinia,Etna,Null,Nicosia 2013 Vulka Bianco (Etna),White Blend,Nicosia
1,1,Portugal,Avidagos,87,15,Douro,Null,Null,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,Null,87,14,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,Reserve Late Harvest,87,13,Michigan,Lake Michigan Shore,Null,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,Vintner's Reserve Wild Child Block,87,65,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           129971 non-null  int64 
 1   country      129971 non-null  object
 2   designation  129971 non-null  object
 3   points       129971 non-null  int64 
 4   price        129971 non-null  object
 5   province     129971 non-null  object
 6   region_1     129971 non-null  object
 7   region_2     129971 non-null  object
 8   title        129971 non-null  object
 9   variety      129971 non-null  object
 10  winery       129971 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.9+ MB


## Extract Vintage information from title column

In [12]:
# Function to extract vintage from the title
def extract_vintage(title):
    # Regular expression to find four consecutive digits
    match = re.search(r'(\b\d{4}\b)', title)
    # If a match is found, return the match, otherwise return None
    return int(match.group(1)) if match else pd.NA

# Apply the function to the 'title' column and create a new 'vintage' column
df['vintage'] = df['title'].apply(extract_vintage).astype('Int64')

# Save the modified DataFrame to a new CSV file
df.to_csv('data\intermediate\wine_score_with_vintage.csv', index=False)
print('New CSV with vintage column saved as: wine_score_with_vintage.csv')

New CSV with vintage column saved as: wine_score_with_vintage.csv


In [13]:
#Sanity Check
df.head()

Unnamed: 0,id,country,designation,points,price,province,region_1,region_2,title,variety,winery,vintage
0,0,Italy,Vulka Bianco,87,Null,Sicily & Sardinia,Etna,Null,Nicosia 2013 Vulka Bianco (Etna),White Blend,Nicosia,2013
1,1,Portugal,Avidagos,87,15,Douro,Null,Null,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011
2,2,US,Null,87,14,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013
3,3,US,Reserve Late Harvest,87,13,Michigan,Lake Michigan Shore,Null,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013
4,4,US,Vintner's Reserve Wild Child Block,87,65,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           129971 non-null  int64 
 1   country      129971 non-null  object
 2   designation  129971 non-null  object
 3   points       129971 non-null  int64 
 4   price        129971 non-null  object
 5   province     129971 non-null  object
 6   region_1     129971 non-null  object
 7   region_2     129971 non-null  object
 8   title        129971 non-null  object
 9   variety      129971 non-null  object
 10  winery       129971 non-null  object
 11  vintage      125362 non-null  Int64 
dtypes: Int64(1), int64(2), object(9)
memory usage: 12.0+ MB


## Change "Null" string value to PANDAS compatible null value

In [16]:
# Replace 'Null' strings with numpy.nan across the entire DataFrame
df.replace('Null', np.nan, inplace=True)

In [17]:
# Sanity Check
df.head()

Unnamed: 0,id,country,designation,points,price,province,region_1,region_2,title,variety,winery,vintage
0,0,Italy,Vulka Bianco,87,,Sicily & Sardinia,Etna,,Nicosia 2013 Vulka Bianco (Etna),White Blend,Nicosia,2013
1,1,Portugal,Avidagos,87,15.0,Douro,,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011
2,2,US,,87,14.0,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013
3,3,US,Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013
4,4,US,Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           129971 non-null  int64 
 1   country      129908 non-null  object
 2   designation  92506 non-null   object
 3   points       129971 non-null  int64 
 4   price        120975 non-null  object
 5   province     129908 non-null  object
 6   region_1     108724 non-null  object
 7   region_2     50511 non-null   object
 8   title        129971 non-null  object
 9   variety      129970 non-null  object
 10  winery       129971 non-null  object
 11  vintage      125362 non-null  Int64 
dtypes: Int64(1), int64(2), object(9)
memory usage: 12.0+ MB


In [19]:
# Convert price to Int64
df['price'] = df['price'].astype('Int64')
# Check Dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           129971 non-null  int64 
 1   country      129908 non-null  object
 2   designation  92506 non-null   object
 3   points       129971 non-null  int64 
 4   price        120975 non-null  Int64 
 5   province     129908 non-null  object
 6   region_1     108724 non-null  object
 7   region_2     50511 non-null   object
 8   title        129971 non-null  object
 9   variety      129970 non-null  object
 10  winery       129971 non-null  object
 11  vintage      125362 non-null  Int64 
dtypes: Int64(2), int64(2), object(8)
memory usage: 12.1+ MB


In [20]:
#Recheck Dataframe
df.head()

Unnamed: 0,id,country,designation,points,price,province,region_1,region_2,title,variety,winery,vintage
0,0,Italy,Vulka Bianco,87,,Sicily & Sardinia,Etna,,Nicosia 2013 Vulka Bianco (Etna),White Blend,Nicosia,2013
1,1,Portugal,Avidagos,87,15.0,Douro,,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011
2,2,US,,87,14.0,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013
3,3,US,Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013
4,4,US,Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012


In [21]:
# Export to CSV
df.to_csv('data\clean\cleaned_wine_score.csv', index=False)
print('New CSV with cleaned data saved as: cleaned_wine_score.csv')

New CSV with cleaned data saved as: cleaned_wine_score.csv


In [12]:
# Load Dataframe
df = pd.read_csv('data\clean\cleaned_wine_score.csv')

# Check null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           129971 non-null  int64  
 1   country      129908 non-null  object 
 2   designation  92506 non-null   object 
 3   points       129971 non-null  int64  
 4   price        120975 non-null  float64
 5   province     129908 non-null  object 
 6   region_1     108724 non-null  object 
 7   region_2     50511 non-null   object 
 8   title        129971 non-null  object 
 9   variety      129970 non-null  object 
 10  winery       129971 non-null  object 
 11  vintage      125362 non-null  float64
dtypes: float64(2), int64(2), object(8)
memory usage: 11.9+ MB


In [29]:
# Load Scrapped Data
scraped = pd.read_csv('webscrapper\output\wine_info.csv')
scraped.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Wine Type
0,Benovia 2021 Chardonnay (Russian River Valley),Sonoma,California,,US,93,$48,Benovia,Chardonnay,White
1,Benovia 2021 Pinot Noir (Russian River Valley),Sonoma,California,,US,92,$55,Benovia,Pinot Noir,Red
2,Benovia 2020 Cohn Vineyard Pinot Noir (Sonoma ...,Sonoma,California,,US,95,$100,Benovia,Pinot Noir,Red
3,Ross Knoll Vineyard 2021 Pinot Noir (Russian R...,Sonoma,California,,US,92,$60,Ross Knoll Vineyard,Pinot Noir,Red
4,Belle Glos 2022 Clark & Telephone Pinot Noir (...,Central Coast,California,,US,92,$55,Belle Glos,Pinot Noir,Red


In [6]:
scraped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42479 entries, 0 to 42478
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Wine Name  42479 non-null  object
 1   Region 1   34635 non-null  object
 2   Region 2   23504 non-null  object
 3   Region 3   16229 non-null  object
 4   Country    42479 non-null  object
 5   Score      42479 non-null  int64 
 6   Price      38849 non-null  object
 7   Winery     42479 non-null  object
 8   Variety    40626 non-null  object
 9   Wine Type  42479 non-null  object
dtypes: int64(1), object(9)
memory usage: 3.2+ MB


In [30]:
# Function to extract vintage from the title
def extract_vintage(title):
    # Regular expression to find four consecutive digits
    match = re.search(r'(\b\d{4}\b)', title)
    # If a match is found, return the match, otherwise return None
    return int(match.group(1)) if match else pd.NA

# Apply the function to the 'title' column and create a new 'vintage' column
scraped['vintage'] = scraped['Wine Name'].apply(extract_vintage).astype('Int64')

# Save the modified DataFrame to a new CSV file
scraped.to_csv('data\intermediate\scraped_info_with_vintage.csv', index=False)
print('New CSV with vintage column saved as: scraped_info_with_vintage.csv')

New CSV with vintage column saved as: scraped_info_with_vintage.csv


In [8]:
scraped.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Wine Type,vintage
0,Benovia 2021 Chardonnay (Russian River Valley),Sonoma,California,,US,93,$48,Benovia,Chardonnay,White,2021
1,Benovia 2021 Pinot Noir (Russian River Valley),Sonoma,California,,US,92,$55,Benovia,Pinot Noir,Red,2021
2,Benovia 2020 Cohn Vineyard Pinot Noir (Sonoma ...,Sonoma,California,,US,95,$100,Benovia,Pinot Noir,Red,2020
3,Ross Knoll Vineyard 2021 Pinot Noir (Russian R...,Sonoma,California,,US,92,$60,Ross Knoll Vineyard,Pinot Noir,Red,2021
4,Belle Glos 2022 Clark & Telephone Pinot Noir (...,Central Coast,California,,US,92,$55,Belle Glos,Pinot Noir,Red,2022


In [13]:
# Reorganizing downloaded data

# First, drop the 'id' and 'designation' columns
df.drop(columns=['id', 'designation'], inplace=True)

# Then, rename the columns
df.rename(columns={
    'title': 'Wine Name',
    'region_1': 'Region 1',
    'province': 'Region 2',
    'region_2': 'Region 3',
    'country': 'Country',
    'points': 'Score',
    'price': 'Price',
    'winery': 'Winery',
    'variety': 'Variety',
    'vintage': 'Vintage'
}, inplace=True)

# Finally, reorder the columns
df = df[['Wine Name', 'Region 1', 'Region 2', 'Region 3', 'Country', 'Score', 'Price', 'Winery', 'Variety', 'Vintage']]

# Check results
df.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
0,Nicosia 2013 Vulka Bianco (Etna),Etna,Sicily & Sardinia,,Italy,87,,Nicosia,White Blend,2013.0
1,Quinta dos Avidagos 2011 Avidagos Red (Douro),,Douro,,Portugal,87,15.0,Quinta dos Avidagos,Portuguese Red,2011.0
2,Rainstorm 2013 Pinot Gris (Willamette Valley),Willamette Valley,Oregon,Willamette Valley,US,87,14.0,Rainstorm,Pinot Gris,2013.0
3,St. Julian 2013 Reserve Late Harvest Riesling ...,Lake Michigan Shore,Michigan,,US,87,13.0,St. Julian,Riesling,2013.0
4,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Willamette Valley,Oregon,Willamette Valley,US,87,65.0,Sweet Cheeks,Pinot Noir,2012.0


In [25]:
# Fix issues with how Vintage is being displayed
df['Vintage'].fillna(pd.NA, inplace=True)
df['Vintage'] = df['Vintage'].astype(int)

# Check results
df.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
0,Nicosia 2013 Vulka Bianco (Etna),Etna,Sicily & Sardinia,,Italy,87,,Nicosia,White Blend,2013
1,Quinta dos Avidagos 2011 Avidagos Red (Douro),,Douro,,Portugal,87,15.0,Quinta dos Avidagos,Portuguese Red,2011
2,Rainstorm 2013 Pinot Gris (Willamette Valley),Willamette Valley,Oregon,Willamette Valley,US,87,14.0,Rainstorm,Pinot Gris,2013
3,St. Julian 2013 Reserve Late Harvest Riesling ...,Lake Michigan Shore,Michigan,,US,87,13.0,St. Julian,Riesling,2013
4,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Willamette Valley,Oregon,Willamette Valley,US,87,65.0,Sweet Cheeks,Pinot Noir,2012


In [26]:
# Save the modified DataFrame to a new CSV file
df.to_csv('data\intermediate\standardized_downloaded_wine_data.csv', index=False)
print('New CSV with vintage column saved as: standardized_downloaded_wine_data.csv')

New CSV with vintage column saved as: standardized_downloaded_wine_data.csv


In [31]:
scraped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42479 entries, 0 to 42478
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Wine Name  42479 non-null  object
 1   Region 1   34635 non-null  object
 2   Region 2   23504 non-null  object
 3   Region 3   16229 non-null  object
 4   Country    42479 non-null  object
 5   Score      42479 non-null  int64 
 6   Price      38849 non-null  object
 7   Winery     42479 non-null  object
 8   Variety    40626 non-null  object
 9   Wine Type  42479 non-null  object
 10  vintage    37354 non-null  Int64 
dtypes: Int64(1), int64(1), object(9)
memory usage: 3.6+ MB


In [32]:
# Now we standardize the scraped data
# Drop the 'Wine Type' column
scraped.drop(columns=['Wine Type'], inplace=True)

# Rename 'vintage' to 'Vintage'
scraped.rename(columns={'vintage': 'Vintage'}, inplace=True)

# Check Results
scraped.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
0,Benovia 2021 Chardonnay (Russian River Valley),Sonoma,California,,US,93,$48,Benovia,Chardonnay,2021
1,Benovia 2021 Pinot Noir (Russian River Valley),Sonoma,California,,US,92,$55,Benovia,Pinot Noir,2021
2,Benovia 2020 Cohn Vineyard Pinot Noir (Sonoma ...,Sonoma,California,,US,95,$100,Benovia,Pinot Noir,2020
3,Ross Knoll Vineyard 2021 Pinot Noir (Russian R...,Sonoma,California,,US,92,$60,Ross Knoll Vineyard,Pinot Noir,2021
4,Belle Glos 2022 Clark & Telephone Pinot Noir (...,Central Coast,California,,US,92,$55,Belle Glos,Pinot Noir,2022


In [20]:
# Save the modified DataFrame to a new CSV file
df.to_csv('data\intermediate\standardized_scraped_wine_data.csv', index=False)
print('New CSV with vintage column saved as: standardized_scraped_wine_data.csv')

New CSV with vintage column saved as: standardized_scraped_wine_data.csv


In [33]:
# Combine 'df' and 'scraped' DataFrames
combined_df = pd.concat([df, scraped])

# Sort the combined DataFrame by 'Vintage' in increasing order
combined_df.sort_values(by='Vintage', inplace=True)

# Reset the index of the sorted DataFrame
combined_df.reset_index(drop=True, inplace=True)

# Check Results
combined_df.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
0,Cleto Chiarli NV Vigneto Enrico Cialdini (Lam...,Lambrusco Grasparossa di Castelvetro,Central Italy,,Italy,89,,Cleto Chiarli,Lambrusco Grasparossa,-1
1,Chateau de Montgueret NV Brut Sparkling (Crema...,Cremant de Loire,Loire Valley,,France,88,20.0,Chateau de Montgueret,Sparkling Blend,-1
2,Adami NV Vigneto Giardino Dry (Prosecco di Va...,Prosecco di Valdobbiadene,Veneto,,Italy,87,22.0,Adami,Prosecco,-1
3,Feudi di San Gregorio NV Dubl Rosato Metodo Cl...,Vino Spumante,Italy Other,,Italy,92,43.0,Feudi di San Gregorio,Aglianico,-1
4,Wolfberger NV Brut Pinot Gris (Cremant d'Alsace),Cremant d'Alsace,Alsace,,France,88,20.0,Wolfberger,Pinot Gris,-1


In [34]:
# Changing all -1 values in the 'Vintage' column to null (NaN) values
combined_df['Vintage'].replace(-1, pd.NA, inplace=True)

# Check Results
combined_df.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
0,Cleto Chiarli NV Vigneto Enrico Cialdini (Lam...,Lambrusco Grasparossa di Castelvetro,Central Italy,,Italy,89,,Cleto Chiarli,Lambrusco Grasparossa,
1,Chateau de Montgueret NV Brut Sparkling (Crema...,Cremant de Loire,Loire Valley,,France,88,20.0,Chateau de Montgueret,Sparkling Blend,
2,Adami NV Vigneto Giardino Dry (Prosecco di Va...,Prosecco di Valdobbiadene,Veneto,,Italy,87,22.0,Adami,Prosecco,
3,Feudi di San Gregorio NV Dubl Rosato Metodo Cl...,Vino Spumante,Italy Other,,Italy,92,43.0,Feudi di San Gregorio,Aglianico,
4,Wolfberger NV Brut Pinot Gris (Cremant d'Alsace),Cremant d'Alsace,Alsace,,France,88,20.0,Wolfberger,Pinot Gris,


In [35]:
# Save the modified DataFrame to a new CSV file
combined_df.to_csv('data\intermediate\combined_wine_data.csv', index=False)
print('New CSV with vintage column saved as: combined_wine_data.csv')

New CSV with vintage column saved as: combined_wine_data.csv


In [36]:
# Now that the scrapped data is combined with the downloaded data, there may still be diacritics that need to be cleaned.
# Define file paths
input_csv = 'data\intermediate\combined_wine_data.csv' 
output_csv = 'data\intermediate\combined_wine_data_with_diacritics_removed.csv'

# Function to clean the strings by replacing diacritics with ASCII equivalents
def clean_string(s):
    return unidecode.unidecode(s)

# Open the input CSV file and create a new output CSV file
with open(input_csv, mode='r', encoding='utf-8') as infile, open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:
    # Create a CSV reader and writer
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Read and write the header
    header = next(reader)
    writer.writerow(header)

    # Process each row in the CSV
    for row in reader:
        # Clean each cell in the row
        cleaned_row = [clean_string(cell) for cell in row]
        # Write the cleaned row to the output CSV
        writer.writerow(cleaned_row)

print('Diacritics converted to plain English. Converted file saved as:', output_csv)

Diacritics converted to plain English. Converted file saved as: data\intermediate\combined_wine_data_with_diacritics_removed.csv


In [71]:
combined_df = pd.read_csv('data\intermediate\combined_wine_data_with_diacritics_removed.csv')
combined_df.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
0,Cleto Chiarli NV Vigneto Enrico Cialdini (Lam...,Lambrusco Grasparossa di Castelvetro,Central Italy,,Italy,89,,Cleto Chiarli,Lambrusco Grasparossa,
1,Chateau de Montgueret NV Brut Sparkling (Crema...,Cremant de Loire,Loire Valley,,France,88,20.0,Chateau de Montgueret,Sparkling Blend,
2,Adami NV Vigneto Giardino Dry (Prosecco di Va...,Prosecco di Valdobbiadene,Veneto,,Italy,87,22.0,Adami,Prosecco,
3,Feudi di San Gregorio NV Dubl Rosato Metodo Cl...,Vino Spumante,Italy Other,,Italy,92,43.0,Feudi di San Gregorio,Aglianico,
4,Wolfberger NV Brut Pinot Gris (Cremant d'Alsace),Cremant d'Alsace,Alsace,,France,88,20.0,Wolfberger,Pinot Gris,


In [73]:
# Since we can only use data that have a vintage and we only have weather data as early as 1991, we need to run the following
# script to make sure there isn't any incorrect vintages and drop all vintages that is showing up earlier than 1991.

def re_extract_vintage_adjusted_v2(row):
    # Regular expression to find four-digit numbers
    matches = re.findall(r'\b\d{4}\b', row['Wine Name'])
    # Filter out numbers less than 1991
    valid_matches = [int(match) for match in matches if int(match) >= 1991]
    
    if valid_matches:
        # Return the first valid match if found
        return valid_matches[0]
    # If no valid match is found, return pd.NA to indicate dropping the row later
    return pd.NA

# Apply the function to all rows in the DataFrame
combined_df['Vintage'] = combined_df.apply(re_extract_vintage_adjusted_v2, axis=1)

# Drop rows where 'Vintage' is null or less than 1991
combined_df = combined_df.dropna(subset=['Vintage'])
combined_df = combined_df[combined_df['Vintage'] >= 1991]

# Sort the combined DataFrame by 'Vintage' in increasing order
combined_df.sort_values(by='Vintage', inplace=True)

# Reset the index of the DataFrame
combined_df.reset_index(drop=True, inplace=True)

# Check results
combined_df.head()

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
0,Iron Horse 1991 Brut L.D. (Sonoma County),Sonoma County,California,Sonoma,US,93,60.0,Iron Horse,Champagne Blend,1991
1,Bellavista 1991 Vittorio Moretti Reserve Cuvee...,Franciacorta,Lombardy,,Italy,83,100.0,Bellavista,Champagne Blend,1991
2,Sebastiani 1991 Cherryblock Vineyard Cabernet ...,Sonoma Valley,California,Sonoma,US,88,160.0,Sebastiani,Cabernet Sauvignon,1991
3,Moulin Touchais 1991 Chenin Blanc (Coteaux du ...,Loire Valley,,Coteaux du Layon,France,92,$72,Moulin Touchais,Chenin Blanc,1991
4,Argyle 1991 Extended Tirage - Disgorged on Dem...,Willamette Valley,Oregon,,US,88,$22,Argyle,Sparkling,1991


In [75]:
# Price data also need to be standardized
def clean_price(price):
    if pd.isna(price):
        return price
    # Remove $ sign if present and convert to float
    return float(price.replace('$', ''))

# Apply the function to the 'Price' column
combined_df['Price'] = combined_df['Price'].apply(clean_price)

# Format the 'Price' column to show value up to the second decimal place
combined_df['Price'] = combined_df['Price'].map('{:.2f}'.format)

# Sort the combined DataFrame by 'Vintage' in increasing order
combined_df.sort_values(by='Vintage', inplace=True)

# Reset the index of the DataFrame
combined_df.reset_index(drop=True, inplace=True)

# Check results
combined_df.head(10)

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
0,Iron Horse 1991 Brut L.D. (Sonoma County),Sonoma County,California,Sonoma,US,93,60.0,Iron Horse,Champagne Blend,1991
1,Montecillo 1991 Seleccion Especial Gran Reserv...,Northern Spain,,Rioja,Spain,92,65.0,Montecillo,Tempranillo,1991
2,Gloria Ferrer 1991 Carneros Cuvee Brut LD (Ca...,Carneros,California,Napa-Sonoma,US,90,32.0,Gloria Ferrer,Champagne Blend,1991
3,Cantine Florio 1991 Targa Riserva White (Marsala),Sicily & Sardinia,,Marsala,Italy,90,,Cantine Florio,White Blend,1991
4,Blandy's 1991 Malmsey Malmsey (Madeira),,Madeira,,Portugal,96,362.0,Blandy's,Madeira,1991
5,Fortino 1991 Montonico Reserve Montonico Bianc...,Central Coast,California,,US,92,35.0,Fortino,Italian White,1991
6,Argyle 1991 Extended Tirage - Disgorged on Dem...,Willamette Valley,Oregon,,US,88,22.0,Argyle,Sparkling,1991
7,Moulin Touchais 1991 Chenin Blanc (Coteaux du ...,Loire Valley,,Coteaux du Layon,France,92,72.0,Moulin Touchais,Chenin Blanc,1991
8,Sebastiani 1991 Cherryblock Vineyard Cabernet ...,Sonoma Valley,California,Sonoma,US,88,160.0,Sebastiani,Cabernet Sauvignon,1991
9,Bellavista 1991 Vittorio Moretti Reserve Cuvee...,Franciacorta,Lombardy,,Italy,83,100.0,Bellavista,Champagne Blend,1991


In [76]:
# Dropping duplicate rows based on the 'Wine Name' column
combined_df = combined_df.drop_duplicates(subset=['Wine Name'])

# Sort the combined DataFrame by 'Vintage' in increasing order
combined_df.sort_values(by='Vintage', inplace=True)

# Reset the index of the DataFrame
combined_df.reset_index(drop=True, inplace=True)

# Check results
combined_df.head(10)

Unnamed: 0,Wine Name,Region 1,Region 2,Region 3,Country,Score,Price,Winery,Variety,Vintage
0,Iron Horse 1991 Brut L.D. (Sonoma County),Sonoma County,California,Sonoma,US,93,60.0,Iron Horse,Champagne Blend,1991
1,Howard's Folly 1991 Casa Manoel Boullush White...,,Carcavelos,,Portugal,93,70.0,Howard's Folly,Portuguese White,1991
2,Bellavista 1991 Vittorio Moretti Reserve Cuvee...,Franciacorta,Lombardy,,Italy,83,100.0,Bellavista,Champagne Blend,1991
3,Moulin Touchais 1991 Chenin Blanc (Coteaux du ...,Loire Valley,,Coteaux du Layon,France,92,72.0,Moulin Touchais,Chenin Blanc,1991
4,Argyle 1991 Extended Tirage - Disgorged on Dem...,Willamette Valley,Oregon,,US,88,22.0,Argyle,Sparkling,1991
5,Sebastiani 1991 Cherryblock Vineyard Cabernet ...,Sonoma Valley,California,Sonoma,US,88,160.0,Sebastiani,Cabernet Sauvignon,1991
6,Blandy's 1991 Malmsey Malmsey (Madeira),,Madeira,,Portugal,96,362.0,Blandy's,Madeira,1991
7,Cantine Florio 1991 Targa Riserva White (Marsala),Sicily & Sardinia,,Marsala,Italy,90,,Cantine Florio,White Blend,1991
8,Gloria Ferrer 1991 Carneros Cuvee Brut LD (Ca...,Carneros,California,Napa-Sonoma,US,90,32.0,Gloria Ferrer,Champagne Blend,1991
9,Montecillo 1991 Seleccion Especial Gran Reserv...,Northern Spain,,Rioja,Spain,92,65.0,Montecillo,Tempranillo,1991


In [77]:
# Check remaining rows of data
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143092 entries, 0 to 143091
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Wine Name  143092 non-null  object
 1   Region 1   119073 non-null  object
 2   Region 2   129417 non-null  object
 3   Region 3   56490 non-null   object
 4   Country    143035 non-null  object
 5   Score      143092 non-null  int64 
 6   Price      143092 non-null  object
 7   Winery     143092 non-null  object
 8   Variety    141580 non-null  object
 9   Vintage    143092 non-null  int64 
dtypes: int64(2), object(8)
memory usage: 10.9+ MB


In [78]:
# Save the modified DataFrame to a new CSV file
combined_df.to_csv('data\clean\cleaned_combined_wine_data.csv', index=False)
print('New CSV with vintage column saved as: cleaned_combined_wine_data.csv')

New CSV with vintage column saved as: cleaned_combined_wine_data.csv
