# Coffee dataset EDA

## Imports

In [1]:
import pandas as pd

In [67]:
# Load the cleaned coffee dataset, using the 'idx' column as the row index.
# A forward slash is used as the path separator: it works on every OS and avoids
# the invalid '\c' escape sequence that the backslash form produced.
coffee_df = pd.read_csv('data/coffee_desk_dataset_clean.csv', index_col='idx')
coffee_df

Unnamed: 0_level_0,process,destination,roast,blend,arabica_robusta,origin,price_per_kg
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Macerated Natural,drip (alternative brewing methods),light,beans,100% Arabica,Laos,52.22
2,Natural,drip (alternative brewing methods),medium,beans,100% Arabica,Brazylia,31.92
3,Natural,drip (alternative brewing methods),light,beans,100% Arabica,Etiopia,39.20
4,Washed,drip (alternative brewing methods),light,beans,100% Arabica,Etiopia,39.20
5,Natural,drip (alternative brewing methods),dark,beans,100% Arabica,Indonezja,35.20
...,...,...,...,...,...,...,...
862,Washed,"drip, espresso",light,beans,100% Arabica,Gwatemala,73.33
863,Natural,espresso,light,beans,40/60,Panama,50.00
864,Macerated Natural,drip (alternative brewing methods),light,beans,100% Arabica,Nikaragua,36.00
865,Washed,drip (alternative brewing methods),light,beans,100% Arabica,Gwatemala,25.00


In [75]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Histogram of the price column; the 'box' marginal adds a box plot above it
# to summarise the same distribution.
px.histogram(
    data_frame=coffee_df,
    x="price_per_kg",
    marginal='box',
    title="Histogram of price (per kg) for coffee",
)

## Notes:
1. There is an observable outlier in the dataset, which might need to be dropped later on.
2. There are two coffees with a missing target variable (`price_per_kg`).

## Plotting categorical data

* All of the variables are categorical except for the target one.

In [7]:
# List the column labels of the loaded frame to see which features are available.
coffee_df.columns

Index(['process', 'destination', 'roast', 'blend', 'arabica_robusta', 'origin',
       'price_per_kg'],
      dtype='object')

In [58]:
# Names of the categorical feature columns (everything except the price).
categorical_data = [
    'process',
    'destination',
    'roast',
    'blend',
    'arabica_robusta',
    'origin',
]

# Violin plot (with inner box plot and all individual points) of price per
# processing method.
process_fig = px.violin(
    data_frame=coffee_df,
    x='process',
    y='price_per_kg',
    box=True,
    points="all",
    title="Coffee processing method vs coffee price",
)
# Force a categorical x axis and apply plotly's 'mean ascending' category order.
process_fig.update_layout(xaxis=dict(type="category", categoryorder="mean ascending"))

In [59]:
# Violin plot (with inner box plot and all individual points) of price per
# value of the 'destination' column.
destination_fig = px.violin(
    data_frame=coffee_df,
    x='destination',
    y='price_per_kg',
    box=True,
    points="all",
    title="Coffee brewing method vs coffee price",
)
# Force a categorical x axis and apply plotly's 'mean ascending' category order.
destination_fig.update_layout(xaxis=dict(type="category", categoryorder="mean ascending"))

In [60]:
# Violin plot (with inner box plot and all individual points) of price per
# roast level.
roast_fig = px.violin(
    data_frame=coffee_df,
    x='roast',
    y='price_per_kg',
    box=True,
    points="all",
    title="Coffee roast degree vs coffee price",
)
# Force a categorical x axis and apply plotly's 'mean ascending' category order.
roast_fig.update_layout(xaxis=dict(type="category", categoryorder="mean ascending"))

## Notes:

* seems like roast and brewing method are related

In [61]:
# Violin plot (with inner box plot; individual points use plotly's default
# setting) of price per value of the 'blend' column.
blend_fig = px.violin(
    data_frame=coffee_df,
    x='blend',
    y='price_per_kg',
    box=True,
    title="Coffee grind degree vs coffee price",
)
# Force a categorical x axis and apply plotly's 'mean ascending' category order.
blend_fig.update_layout(xaxis=dict(type="category", categoryorder="mean ascending"))

In [68]:
# Violin plot (with inner box plot and all individual points) of price per
# arabica/robusta ratio label.
arabica_robusta_fig = px.violin(
    data_frame=coffee_df,
    x='arabica_robusta',
    y='price_per_kg',
    box=True,
    points="all",
    title="Arabica to robusta proportion vs coffee price",
)
# Force a categorical x axis and apply plotly's 'mean ascending' category order.
arabica_robusta_fig.update_layout(xaxis=dict(type="category", categoryorder="mean ascending"))

## Notes

It would be worth converting the arabica-to-robusta proportion labels (e.g. `80/20`) into a numerical variable.

In [69]:
# Count how often each arabica/robusta ratio label occurs.
ratio_labels = coffee_df['arabica_robusta']
ratio_labels.value_counts()

100% Arabica    747
80/20            22
60/40            14
40/60            11
100% Robusta     11
75/25            10
70/30             9
90/10             9
50/50             9
85/15             6
30/70             4
92/8              3
55/45             2
95/5              2
78/22             2
84/16             1
10/90             1
35/65             1
65/35             1
Name: arabica_robusta, dtype: int64

In [70]:
# Split each ratio label at the first '/' into two new columns.
# `n` is passed by keyword: recent pandas versions no longer accept it positionally.
# Labels without a '/' (e.g. '100% Arabica') stay whole in the first column.
coffee_df[['arabica (%)', 'robusta (%)']] = coffee_df['arabica_robusta'].str.split('/', n=1, expand=True)
coffee_df['arabica (%)'].value_counts()

100% Arabica    747
80               22
60               14
40               11
100% Robusta     11
75               10
50                9
70                9
90                9
85                6
30                4
92                3
95                2
55                2
78                2
10                1
35                1
65                1
84                1
Name: arabica (%), dtype: int64

In [71]:
# Map the two single-species labels to their arabica share.
# The result is assigned back to the column instead of calling replace(inplace=True)
# on the selected column: the chained in-place form does not update the frame
# when pandas Copy-on-Write is active.
coffee_df['arabica (%)'] = coffee_df['arabica (%)'].replace({'100% Arabica': 100, '100% Robusta': 0})
coffee_df['arabica (%)'].value_counts()

100    747
80      22
60      14
40      11
0       11
75      10
70       9
50       9
90       9
85       6
30       4
92       3
55       2
78       2
95       2
35       1
65       1
10       1
84       1
Name: arabica (%), dtype: int64

In [54]:
# Cast the cleaned arabica share to a 64-bit integer column, then show the
# dtype of every column to confirm the conversion.
arabica_share = coffee_df['arabica (%)']
coffee_df['arabica (%)'] = arabica_share.astype('int64')
coffee_df.dtypes

process             object
destination         object
roast               object
blend               object
arabica_robusta     object
origin              object
price_per_kg       float64
arabica (%)          int64
robusta (%)         object
dtype: object

We no longer need the robusta column or the original `arabica_robusta` column, so we drop both from the DataFrame to keep them out of the analysis.
The robusta share is complementary to the arabica share, so keeping it would only add noise to the data.

In [72]:
# Remove the original ratio label and the robusta share; only 'arabica (%)' is kept.
redundant_columns = ['arabica_robusta', 'robusta (%)']
coffee_df = coffee_df.drop(columns=redundant_columns)
coffee_df.head(10)

Unnamed: 0_level_0,process,destination,roast,blend,origin,price_per_kg,arabica (%)
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Macerated Natural,drip (alternative brewing methods),light,beans,Laos,52.22,100
2,Natural,drip (alternative brewing methods),medium,beans,Brazylia,31.92,100
3,Natural,drip (alternative brewing methods),light,beans,Etiopia,39.2,100
4,Washed,drip (alternative brewing methods),light,beans,Etiopia,39.2,100
5,Natural,drip (alternative brewing methods),dark,beans,Indonezja,35.2,100
6,Natural,drip (alternative brewing methods),medium,beans,Brazylia,20.76,100
7,Washed,drip (alternative brewing methods),light,beans,Kenia,45.0,100
8,Carbonic Maceration,drip (alternative brewing methods),light,beans,Panama,280.0,100
9,Red Honey,drip (alternative brewing methods),light,ground,Kolumbia,100.0,100
10,Washed,drip (alternative brewing methods),light,beans,Salwador,45.6,100


In [73]:
# Violin plot (with inner box plot; individual points use plotly's default
# setting) of price per country of origin.
origin_fig = px.violin(
    data_frame=coffee_df,
    x='origin',
    y='price_per_kg',
    box=True,
    title="Coffee origin vs coffee price",
)
# Force a categorical x axis and apply plotly's 'mean ascending' category order.
origin_fig.update_layout(xaxis=dict(type="category", categoryorder="mean ascending"))

In [90]:
# Features to compare against price; each one gets its own violin subplot.
categorical_data = ['destination', 'roast', 'blend', 'origin', 'arabica (%)']
cols = 2
# Derive the row count from the number of features (ceiling division) so the
# grid always has enough cells if the feature list above changes.
rows = (len(categorical_data) + cols - 1) // cols
subplot_titles = tuple(f"{cat} vs coffee price (per kg)" for cat in categorical_data)
fig = make_subplots(rows=rows, cols=cols, subplot_titles=subplot_titles)

for i, cat in enumerate(categorical_data):
    # Fill the grid row by row; subplot coordinates passed to plotly start at 1.
    grid_row, grid_col = divmod(i, cols)
    row, col = grid_row + 1, grid_col + 1

    fig.add_trace(
        go.Violin(x=coffee_df[cat], y=coffee_df['price_per_kg'], name=cat, box_visible=True),
        row=row,
        col=col,
    )

    # Force a categorical x axis on this subplot and apply plotly's
    # 'mean ascending' category order.
    fig.update_xaxes(type='category', categoryorder='mean ascending', row=row, col=col)

fig.update_layout(height=1000, width=1300, title='Violin plots of feature vs coffee price', showlegend=True)
fig.show()

## Observations:

* there is one outlier to be removed
* plots for roast and brewing method are very similar, which highlights the risk of a potential dependency between the two

## Numeric variable

In [None]:

# NOTE(review): this repeats the processing-method violin plot from earlier in
# the notebook, although it sits under the "Numeric variable" heading.
# TODO confirm whether a plot of a numeric feature was intended here.
process_fig = px.violin(
    data_frame=coffee_df,
    x='process',
    y='price_per_kg',
    box=True,
    points="all",
    title="Coffee processing method vs coffee price",
)
# Force a categorical x axis and apply plotly's 'mean ascending' category order.
process_fig.update_layout(xaxis=dict(type="category", categoryorder="mean ascending"))