In [15]:
#Load Dependencies
import unidecode
import csv
import pandas as pd
import re
import numpy as np

## Removing Diacritics From the CSV Data

In [2]:
# Sanity check: load the Kaggle wine scores CSV and preview the first rows.
# The path uses forward slashes: they work on every platform (Windows
# included) and avoid the invalid escape sequences ('\i', '\k') that a
# non-raw backslash path contains.
wine_scores = pd.read_csv('data/intermediate/kaggle_wine_scores.csv')
wine_scores.head()

Unnamed: 0,id,country,designation,points,price,province,region_1,region_2,title,variety,winery
0,0,Italy,Vulkà Bianco,87,Null,Sicily & Sardinia,Etna,Null,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,Avidagos,87,15,Douro,Null,Null,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,Null,87,14,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,Reserve Late Harvest,87,13,Michigan,Lake Michigan Shore,Null,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,Vintner's Reserve Wild Child Block,87,65,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
# Print column dtypes and non-null counts for the loaded frame
wine_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           129971 non-null  int64 
 1   country      129971 non-null  object
 2   designation  129971 non-null  object
 3   points       129971 non-null  int64 
 4   price        129971 non-null  object
 5   province     129971 non-null  object
 6   region_1     129971 non-null  object
 7   region_2     129971 non-null  object
 8   title        129971 non-null  object
 9   variety      129971 non-null  object
 10  winery       129971 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.9+ MB


In [4]:
# Define file paths.
# Forward slashes are used instead of backslashes: they are accepted on all
# platforms (Windows included) and avoid invalid escape sequences such as
# '\i', '\k' and '\w' inside a regular string literal.
input_csv = 'data/intermediate/kaggle_wine_scores.csv'
output_csv = 'data/intermediate/wine_scores_with_diacritics_removed.csv'

# Helper that swaps accented / non-ASCII characters for ASCII equivalents
def clean_string(s):
    """Return ``s`` transliterated to ASCII via ``unidecode.unidecode``."""
    ascii_text = unidecode.unidecode(s)
    return ascii_text

# Stream the input CSV into a new output CSV, transliterating every data cell.
# Both files are opened with newline='' as the csv module documentation
# requires; without it, line breaks embedded in quoted fields of the input
# can be altered by universal-newline translation before the reader sees them.
with open(input_csv, mode='r', newline='', encoding='utf-8') as infile, \
        open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # The header row is copied through unchanged (it is not passed to clean_string)
    header = next(reader)
    writer.writerow(header)

    # Clean each cell of every remaining row and write the rows out lazily
    writer.writerows([clean_string(cell) for cell in row] for row in reader)

print('Diacritics converted to plain English. Converted file saved as:', output_csv)

Diacritics converted to plain English. Converted file saved as: data\intermediate\wine_scores_with_diacritics_removed.csv


In [10]:
# Sanity check: reload the transliterated CSV and preview the first rows.
# Forward slashes keep the path portable and avoid the invalid escape
# sequences ('\i', '\w') present in the non-raw backslash form.
df = pd.read_csv('data/intermediate/wine_scores_with_diacritics_removed.csv')
df.head()

Unnamed: 0,id,country,designation,points,price,province,region_1,region_2,title,variety,winery
0,0,Italy,Vulka Bianco,87,Null,Sicily & Sardinia,Etna,Null,Nicosia 2013 Vulka Bianco (Etna),White Blend,Nicosia
1,1,Portugal,Avidagos,87,15,Douro,Null,Null,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,Null,87,14,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,Reserve Late Harvest,87,13,Michigan,Lake Michigan Shore,Null,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,Vintner's Reserve Wild Child Block,87,65,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [11]:
# Print column dtypes and non-null counts for the reloaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           129971 non-null  int64 
 1   country      129971 non-null  object
 2   designation  129971 non-null  object
 3   points       129971 non-null  int64 
 4   price        129971 non-null  object
 5   province     129971 non-null  object
 6   region_1     129971 non-null  object
 7   region_2     129971 non-null  object
 8   title        129971 non-null  object
 9   variety      129971 non-null  object
 10  winery       129971 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.9+ MB


## Extract Vintage information from title column

In [12]:
# Function to extract the vintage (year) from a wine title
def extract_vintage(title, min_year=1900, max_year=2099):
    """Return the vintage year found in ``title``, or ``pd.NA`` if there is none.

    The title is scanned left to right for standalone four-digit numbers and
    the first one inside ``[min_year, max_year]`` is returned. The range check
    stops four-digit numbers that are part of a producer or cuvee name
    (e.g. "1000 Stories 2013 Zinfandel") from being mistaken for the vintage.

    Parameters
    ----------
    title : str
        Wine title text. Non-string values (such as NaN) yield ``pd.NA``.
    min_year, max_year : int, optional
        Inclusive bounds of what counts as a plausible vintage.

    Returns
    -------
    int or pd.NA
        The extracted year, or ``pd.NA`` when no plausible year is present.
    """
    # Missing titles arrive as float NaN; re would raise TypeError on them
    if not isinstance(title, str):
        return pd.NA

    # \b on both sides ensures exactly four digits, not a slice of a longer number
    for match in re.finditer(r'\b\d{4}\b', title):
        year = int(match.group())
        if min_year <= year <= max_year:
            return year
    return pd.NA

# Run extract_vintage over every title, then store the result as a
# nullable-integer 'vintage' column
vintages = df['title'].apply(extract_vintage)
df['vintage'] = vintages.astype('Int64')

# Save the DataFrame (now including 'vintage') to a new CSV file, without the index.
# Forward slashes keep the path portable and avoid the invalid escape
# sequences ('\i', '\w') present in the non-raw backslash form.
df.to_csv('data/intermediate/wine_score_with_vintage.csv', index=False)
print('New CSV with vintage column saved as: wine_score_with_vintage.csv')

New CSV with vintage column saved as: wine_score_with_vintage.csv


In [13]:
# Sanity check: preview the first rows, including the new 'vintage' column
df.head()

Unnamed: 0,id,country,designation,points,price,province,region_1,region_2,title,variety,winery,vintage
0,0,Italy,Vulka Bianco,87,Null,Sicily & Sardinia,Etna,Null,Nicosia 2013 Vulka Bianco (Etna),White Blend,Nicosia,2013
1,1,Portugal,Avidagos,87,15,Douro,Null,Null,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011
2,2,US,Null,87,14,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013
3,3,US,Reserve Late Harvest,87,13,Michigan,Lake Michigan Shore,Null,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013
4,4,US,Vintner's Reserve Wild Child Block,87,65,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012


In [14]:
# Print dtypes and non-null counts; titles without a year lower the 'vintage' non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           129971 non-null  int64 
 1   country      129971 non-null  object
 2   designation  129971 non-null  object
 3   points       129971 non-null  int64 
 4   price        129971 non-null  object
 5   province     129971 non-null  object
 6   region_1     129971 non-null  object
 7   region_2     129971 non-null  object
 8   title        129971 non-null  object
 9   variety      129971 non-null  object
 10  winery       129971 non-null  object
 11  vintage      125362 non-null  Int64 
dtypes: Int64(1), int64(2), object(9)
memory usage: 12.0+ MB


## Replace "Null" string values with pandas-compatible missing values

In [16]:
# Swap every literal 'Null' string in the DataFrame for a real missing value (NaN)
df.replace(to_replace='Null', value=np.nan, inplace=True)

In [17]:
# Sanity check: preview the first rows after the 'Null' strings were replaced
df.head()

Unnamed: 0,id,country,designation,points,price,province,region_1,region_2,title,variety,winery,vintage
0,0,Italy,Vulka Bianco,87,,Sicily & Sardinia,Etna,,Nicosia 2013 Vulka Bianco (Etna),White Blend,Nicosia,2013
1,1,Portugal,Avidagos,87,15.0,Douro,,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011
2,2,US,,87,14.0,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013
3,3,US,Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013
4,4,US,Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012


In [18]:
# Print dtypes and non-null counts now that 'Null' strings count as missing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           129971 non-null  int64 
 1   country      129908 non-null  object
 2   designation  92506 non-null   object
 3   points       129971 non-null  int64 
 4   price        120975 non-null  object
 5   province     129908 non-null  object
 6   region_1     108724 non-null  object
 7   region_2     50511 non-null   object
 8   title        129971 non-null  object
 9   variety      129970 non-null  object
 10  winery       129971 non-null  object
 11  vintage      125362 non-null  Int64 
dtypes: Int64(1), int64(2), object(9)
memory usage: 12.0+ MB


In [19]:
# Cast 'price' to pandas' nullable integer dtype so missing prices remain <NA>
price_as_int = df['price'].astype('Int64')
df['price'] = price_as_int
# Print the frame summary to verify the new dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           129971 non-null  int64 
 1   country      129908 non-null  object
 2   designation  92506 non-null   object
 3   points       129971 non-null  int64 
 4   price        120975 non-null  Int64 
 5   province     129908 non-null  object
 6   region_1     108724 non-null  object
 7   region_2     50511 non-null   object
 8   title        129971 non-null  object
 9   variety      129970 non-null  object
 10  winery       129971 non-null  object
 11  vintage      125362 non-null  Int64 
dtypes: Int64(2), int64(2), object(8)
memory usage: 12.1+ MB


In [20]:
# Re-check the DataFrame: preview the first rows after the price dtype conversion
df.head()

Unnamed: 0,id,country,designation,points,price,province,region_1,region_2,title,variety,winery,vintage
0,0,Italy,Vulka Bianco,87,,Sicily & Sardinia,Etna,,Nicosia 2013 Vulka Bianco (Etna),White Blend,Nicosia,2013
1,1,Portugal,Avidagos,87,15.0,Douro,,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011
2,2,US,,87,14.0,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013
3,3,US,Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013
4,4,US,Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012


In [21]:
# Export the cleaned DataFrame to CSV, without the index.
# Forward slashes keep the path portable and avoid the invalid escape
# sequence ('\c') present in the non-raw backslash form.
df.to_csv('data/clean/cleaned_wine_score.csv', index=False)
print('New CSV with cleaned data saved as: cleaned_wine_score.csv')

New CSV with cleaned data saved as: cleaned_wine_score.csv
