# Exploratory Data Analysis of Wine Dataset

## Load and Inspect the Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the dataset CSV; a relative path, so it is resolved against
# the notebook's working directory.
file_path = 'EDA.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,id,country,description,designation,points,price,province,title,variety,winery
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
1,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
2,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
3,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
4,6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo


## Data Description

In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,id,points,price
count,86148.0,86148.0,86148.0
mean,65008.434729,88.729907,37.556403
std,37531.085008,3.05198,36.390439
min,1.0,80.0,4.0
25%,32448.5,87.0,18.0
50%,65192.5,89.0,28.0
75%,97470.5,91.0,45.0
max,129970.0,100.0,2013.0


In [3]:
# Number of distinct values in each column.
df.nunique()

id             86148
country           41
description    79477
designation    35750
points            21
price            330
province         402
title          78719
variety          630
winery         11762
dtype: int64

In [16]:
import pycountry_convert as pc

def country_to_continent(country_name):
    """Map a country label to the name of its continent.

    The lookup is delegated to ``pycountry_convert`` (imported as ``pc``):
    country name -> alpha-2 code -> continent code -> continent name.
    A small table of manual overrides is consulted only after that lookup
    has raised.

    Args:
        country_name: Country label as stored in the ``country`` column.

    Returns:
        The continent name as a string, or ``'Not found'`` when neither the
        library lookup nor the manual overrides recognise the label.
    """
    # Labels resolved by hand when the library lookup fails.
    manual_overrides = {
        'US': 'North America',
        'England': 'Europe',
    }
    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # `except Exception` instead of a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate while any lookup
        # failure keeps falling back as before.
        # Original author's note: every country in the dataset was checked
        # and resolves, so 'Not found' is not expected in practice.
        return manual_overrides.get(country_name, 'Not found')

# Derive a continent label for every row from its country label.
df['continent'] = df['country'].map(country_to_continent)

In [62]:
# NOTE(review): in the visible notebook `df_treemap` is first assigned in a later
# cell, so a fresh top-to-bottom run has nothing to display here — confirm and reorder.
df_treemap

Unnamed: 0,continent,country,province,count
0,Africa,Morocco,Guerrouane,3
1,Africa,Morocco,Morocco,5
2,Africa,Morocco,Zenata,8
3,Africa,South Africa,Bot River,2
4,Africa,South Africa,Breedekloof,2
...,...,...,...,...
397,South America,Uruguay,Juanico,8
398,South America,Uruguay,Montevideo,8
399,South America,Uruguay,Progreso,11
400,South America,Uruguay,San Jose,3


In [106]:
# Column dtypes of the treemap table (index levels are not part of .dtypes).
# NOTE(review): this cell also sits above the visible assignment of `df_treemap` — confirm ordering.
df_treemap.dtypes

count    int64
dtype: object

In [119]:
# Count rows per (continent, country, province, winery) combination.
# groupby() does not modify its input, so no defensive copy of `df` is needed;
# to_frame('count') turns the resulting Series into a one-column DataFrame
# whose column is named 'count', keeping the grouped keys as the index.
df_treemap = (
    df.groupby(['continent', 'country', 'province', 'winery'])
    .size()
    .to_frame('count')
)

def df_to_nested_dict(df):
    """Convert an indexed count table into a nested dictionary.

    Every index level except the last becomes one layer of nesting; the last
    level is the leaf key and maps to the row's ``count`` value as a plain
    ``int`` (so the result is JSON-serialisable).

    Args:
        df: DataFrame with a ``count`` column. The index is normally a
            MultiIndex; a single-level index yields a flat ``{label: count}``
            dictionary.

    Returns:
        A dict of dicts keyed by the index levels, with integer leaves.
        If an index entry occurs more than once, the last row wins.
    """
    result = {}
    # Iterate the single column directly instead of iterrows(), which would
    # build a full Series object for every row just to read one value.
    for key, count in df['count'].items():
        # A MultiIndex yields tuple keys; a single-level index yields the bare
        # label, which must not be sliced as if it were a path.
        path = key if isinstance(key, tuple) else (key,)
        *parents, leaf = path
        node = result
        for level in parents:
            # Descend, creating intermediate dictionaries on first sight.
            node = node.setdefault(level, {})
        node[leaf] = int(count)
    return result

# Convert the grouped counts into a nested dictionary (nothing is printed here).
nested_dict = df_to_nested_dict(df_treemap)


In [120]:
import json
# Write the nested counts to treemap.json as indented JSON.
with open('treemap.json', 'w') as out_file:
    json.dump(nested_dict, fp=out_file, indent=2)

In [46]:
# NOTE(review): the saved output for this cell lists continent/country/province only,
# whereas the visible definition of `df_treemap` also groups by winery — the output
# looks stale; re-run to confirm.
df_treemap

Unnamed: 0,continent,country,province,count
0,Africa,Morocco,Guerrouane,3
1,Africa,Morocco,Morocco,5
2,Africa,Morocco,Zenata,8
3,Africa,South Africa,Bot River,2
4,Africa,South Africa,Breedekloof,2
...,...,...,...,...
397,South America,Uruguay,Juanico,8
398,South America,Uruguay,Montevideo,8
399,South America,Uruguay,Progreso,11
400,South America,Uruguay,San Jose,3
