# Coffee dataset EDA

## Imports

In [3]:
import pandas as pd

In [6]:
coffee_df = pd.read_csv('data\coffee_desk_dataset_clean.csv', index_col='idx')
coffee_df

Unnamed: 0_level_0,process,brewing method,roast,grind,arabica_robusta,origin,price_per_kg
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Monsooning,drip (alternative brewing methods),light,beans,100% Arabica,Laos,52.22
2,Natural,drip (alternative brewing methods),medium,beans,100% Arabica,Brazylia,31.92
3,Natural,drip (alternative brewing methods),light,beans,100% Arabica,Etiopia,39.20
4,Washed,drip (alternative brewing methods),light,beans,100% Arabica,Etiopia,39.20
5,Natural,drip (alternative brewing methods),dark,beans,100% Arabica,Indonezja,35.20
...,...,...,...,...,...,...,...
862,Rum Aged,drip (alternative brewing methods),light,beans,100% Arabica,Gwatemala,73.33
863,Natural,espresso,light,beans,30/70,Panama,50.00
864,Pulped natural,drip (alternative brewing methods),light,beans,100% Arabica,Nikaragua,36.00
865,Washed,drip (alternative brewing methods),light,beans,100% Arabica,Gwatemala,25.00


In [7]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

px.histogram(coffee_df, "price_per_kg", marginal='box', title="Histogram of price (per kg) for coffee") #marginal shows distribution

## Notes:
1. there is obsevable outlier in the dataset, which might need to be dropped later on
2. there are two coffees with missing target variable
3. there is observable right tailed distribution

In [8]:
coffee_df = coffee_df[coffee_df["price_per_kg"] != 0] # removing the missing target value rows
coffee_df

Unnamed: 0_level_0,process,brewing method,roast,grind,arabica_robusta,origin,price_per_kg
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Monsooning,drip (alternative brewing methods),light,beans,100% Arabica,Laos,52.22
2,Natural,drip (alternative brewing methods),medium,beans,100% Arabica,Brazylia,31.92
3,Natural,drip (alternative brewing methods),light,beans,100% Arabica,Etiopia,39.20
4,Washed,drip (alternative brewing methods),light,beans,100% Arabica,Etiopia,39.20
5,Natural,drip (alternative brewing methods),dark,beans,100% Arabica,Indonezja,35.20
...,...,...,...,...,...,...,...
862,Rum Aged,drip (alternative brewing methods),light,beans,100% Arabica,Gwatemala,73.33
863,Natural,espresso,light,beans,30/70,Panama,50.00
864,Pulped natural,drip (alternative brewing methods),light,beans,100% Arabica,Nikaragua,36.00
865,Washed,drip (alternative brewing methods),light,beans,100% Arabica,Gwatemala,25.00


## Plotting categorical data

* All of the variables are categorical except for the target one.

In [9]:
coffee_df.columns

Index(['process', 'brewing method', 'roast', 'grind', 'arabica_robusta',
       'origin', 'price_per_kg'],
      dtype='object')

In [10]:
categorical_data = ['process', 'brewing method', 'roast', 'blend', 'arabica_robusta', 'origin']

process_fig = px.violin(coffee_df, x='process', y='price_per_kg', box=True, points="all", title="Coffee processing method vs coffee price")
process_fig.update_layout(xaxis_type="category", xaxis={'categoryorder':'mean ascending'})

In [11]:
destination_fig = px.violin(coffee_df, x='brewing method', y='price_per_kg', box=True, points="all", title="Coffee brewing method vs coffee price")
destination_fig.update_layout(xaxis_type="category", xaxis={'categoryorder':'mean ascending'})

In [12]:
roast_fig = px.violin(coffee_df, x='roast', y='price_per_kg', box=True, points="all", title="Coffee roast degree vs coffee price")
roast_fig.update_layout(xaxis_type="category", xaxis={'categoryorder':'mean ascending'})

## Notes:

* seems like roast and brewing method are related

In [13]:
blend_fig = px.violin(coffee_df, x='grind', y='price_per_kg', box=True, title="Coffee grind degree vs coffee price")
blend_fig.update_layout(xaxis_type="category", xaxis={'categoryorder':'mean ascending'})

In [14]:
arabica_robusta_fig = px.violin(coffee_df, x='arabica_robusta', y='price_per_kg', box=True, points="all", title="Arabica to robusta proportion vs coffee price")
arabica_robusta_fig.update_layout(xaxis_type="category", xaxis={'categoryorder':'mean ascending'})

## Notes

It would be worth changing ranges in proportion for arabica to robusta and to change them to numerical variable

In [15]:
coffee_df['arabica_robusta'].value_counts()

100% Arabica    742
80/20            22
50/50            14
60/40            13
75/25            10
30/70            10
70/30             9
40/60             9
90/10             9
100% Robusta      6
85/15             6
92/8              3
95/5              2
78/22             2
55/45             2
84/16             1
65/35             1
10/90             1
35/65             1
Name: arabica_robusta, dtype: int64

In [18]:
coffee_df[['arabica (%)', 'robusta (%)']] = coffee_df['arabica_robusta'].str.split('/', 1, expand=True) # spliting column into two
coffee_df['arabica (%)'].replace({'100% Arabica' : 100, '100% Robusta' : 0}, inplace=True) # cleaning variable
coffee_df['arabica (%)'].value_counts()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



100    742
80      22
50      14
60      13
30      10
75      10
90       9
40       9
70       9
0        6
85       6
92       3
78       2
95       2
55       2
84       1
35       1
10       1
65       1
Name: arabica (%), dtype: int64

In [19]:
coffee_df['arabica (%)'] = coffee_df['arabica (%)'].astype('int64')
coffee_df.dtypes



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



process             object
brewing method      object
roast               object
grind               object
arabica_robusta     object
origin              object
price_per_kg       float64
arabica (%)          int64
robusta (%)         object
dtype: object

We no longer need robusta nor the original column so we will drop it from df not to analyze it.
Robusta is complimentary to Arabica and would add noise to data.

In [20]:
coffee_df = coffee_df.drop(['arabica_robusta', 'robusta (%)'], axis=1)
coffee_df.head(10)

Unnamed: 0_level_0,process,brewing method,roast,grind,origin,price_per_kg,arabica (%)
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Monsooning,drip (alternative brewing methods),light,beans,Laos,52.22,100
2,Natural,drip (alternative brewing methods),medium,beans,Brazylia,31.92,100
3,Natural,drip (alternative brewing methods),light,beans,Etiopia,39.2,100
4,Washed,drip (alternative brewing methods),light,beans,Etiopia,39.2,100
5,Natural,drip (alternative brewing methods),dark,beans,Indonezja,35.2,100
6,Natural,drip (alternative brewing methods),medium,beans,Brazylia,20.76,100
7,Washed,drip (alternative brewing methods),light,beans,Kenia,45.0,100
8,Carbonic Maceration,drip (alternative brewing methods),light,beans,Panama,280.0,100
9,Semi-washed,drip (alternative brewing methods),light,ground,Kolumbia,100.0,100
10,Washed,drip (alternative brewing methods),light,beans,Salwador,45.6,100


In [21]:
origin_fig = px.violin(coffee_df, x='origin', y='price_per_kg', box=True, title="Coffee origin vs coffee price")
origin_fig.update_layout(xaxis_type="category", xaxis={'categoryorder':'mean ascending'})

## Observations:

* there is one outlier to be removed
* plots for roast and brweing method are very similar, which highligths the risk of potential dependency between the two

In [22]:
coffee_df = coffee_df[coffee_df["price_per_kg"] < 280] # removing the outlier
coffee_df

Unnamed: 0_level_0,process,brewing method,roast,grind,origin,price_per_kg,arabica (%)
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Monsooning,drip (alternative brewing methods),light,beans,Laos,52.22,100
2,Natural,drip (alternative brewing methods),medium,beans,Brazylia,31.92,100
3,Natural,drip (alternative brewing methods),light,beans,Etiopia,39.20,100
4,Washed,drip (alternative brewing methods),light,beans,Etiopia,39.20,100
5,Natural,drip (alternative brewing methods),dark,beans,Indonezja,35.20,100
...,...,...,...,...,...,...,...
862,Rum Aged,drip (alternative brewing methods),light,beans,Gwatemala,73.33,100
863,Natural,espresso,light,beans,Panama,50.00,30
864,Pulped natural,drip (alternative brewing methods),light,beans,Nikaragua,36.00,100
865,Washed,drip (alternative brewing methods),light,beans,Gwatemala,25.00,100


In [23]:
categorical_data = ['brewing method', 'roast', 'grind', 'origin', 'process']
rows = 3
cols = 2
subplot_titles = tuple(cat + " vs coffee price (per kg)" for cat in categorical_data)
fig = make_subplots(rows=rows, cols=cols, subplot_titles=subplot_titles)

for i, cat in enumerate(categorical_data):
    row = (i // cols) + 1
    col = (i % cols) + 1
    
    fig.add_trace(go.Violin(
        x=coffee_df[cat], y=coffee_df['price_per_kg'], name=cat, box_visible=True
    ), row=row, col=col)
    
    fig.update_xaxes(patch=dict(type='category', categoryorder='mean ascending'), row=row, col=col)
    
    
fig.update_layout(height=1000, width=1300, title='Violin plots of feature vs coffee price', showlegend=True)
fig.show()

## Numeric variable

In [24]:
arabica_fig = px.scatter(coffee_df, x='arabica (%)', y='price_per_kg')
arabica_fig.show()

## Save newest data

In [25]:
coffee_df.to_csv('data\coffee_desk_dataset_clean_with_numerical_var.csv')

## Notes

* variables overlapping:
    - brewing method
    - roast
* variables with visible general trend:
    - arabica (%)
* variables without general trend:
    - grind -> to be dropped maybe
* variables to investigate further:
    - origin -> seems to scattered

In [26]:
roast_to_brewing_fig = px.violin(coffee_df, x='brewing method', y='price_per_kg', color='roast', title="Coffee roast degree and brewing vs coffee price")
roast_to_brewing_fig.update_layout(xaxis_type="category", xaxis={'categoryorder':'mean ascending'})

There is observable trend: drip and alternative methods require light roast, hybrid methods use medium roast, and dark roast is best used for espresso.

## Notes

* For target variable use log1 or log2
* Concatenate roast and brewing and once hot encode them
* Add new flag variable -> if 100% of arabica or not
* positioning regarding type of processing -> change into higher level of positioning, create groups for patterns of process types
* use random state/seed to have same train-test split in case I need to redo EAD after data modelling
* I can do: 
    1. train-test split;
    2. backward selection and forward selection (compare the outcomes of the two)
    3. hyper param opt
    4. in the end cross variable

**Presentation**: think about addresing not technical audience; add reasoning for steps I took

## Data cleaning after first round for EAD

In [27]:
import numpy as np

In [28]:
# new flag for 100% of arabica
coffee_df['Pure arabica'] = np.where(coffee_df['arabica (%)'] == 100, True, False)
coffee_df.head()

Unnamed: 0_level_0,process,brewing method,roast,grind,origin,price_per_kg,arabica (%),Pure arabica
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Monsooning,drip (alternative brewing methods),light,beans,Laos,52.22,100,True
2,Natural,drip (alternative brewing methods),medium,beans,Brazylia,31.92,100,True
3,Natural,drip (alternative brewing methods),light,beans,Etiopia,39.2,100,True
4,Washed,drip (alternative brewing methods),light,beans,Etiopia,39.2,100,True
5,Natural,drip (alternative brewing methods),dark,beans,Indonezja,35.2,100,True


In [29]:
# concatenating roast and brew to check if joined they can predict the target variable better
coffee_df['roast_brew'] = coffee_df['roast'] + '_' + coffee_df['brewing method']
coffee_df.head()

Unnamed: 0_level_0,process,brewing method,roast,grind,origin,price_per_kg,arabica (%),Pure arabica,roast_brew
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Monsooning,drip (alternative brewing methods),light,beans,Laos,52.22,100,True,light_drip (alternative brewing methods)
2,Natural,drip (alternative brewing methods),medium,beans,Brazylia,31.92,100,True,medium_drip (alternative brewing methods)
3,Natural,drip (alternative brewing methods),light,beans,Etiopia,39.2,100,True,light_drip (alternative brewing methods)
4,Washed,drip (alternative brewing methods),light,beans,Etiopia,39.2,100,True,light_drip (alternative brewing methods)
5,Natural,drip (alternative brewing methods),dark,beans,Indonezja,35.2,100,True,dark_drip (alternative brewing methods)


In [30]:
coffee_df['roast_brew'].value_counts()

light_drip (alternative brewing methods)     373
medium_espresso                              206
dark_espresso                                196
medium_drip, espresso                         62
light_drip, espresso                          11
medium_drip (alternative brewing methods)      9
dark_drip, espresso                            2
light_espresso                                 2
dark_drip (alternative brewing methods)        1
Name: roast_brew, dtype: int64

In [73]:
coffee_df.to_csv('data\coffee_desk_dataset_ead_tmp.csv') # saving data, but TODO remove it later and save only the final

In [2]:
coffee_df = pd.read_csv('data\coffee_desk_dataset_ead_tmp.csv', index_col='idx')
coffee_df

Unnamed: 0_level_0,process,brewing method,roast,grind,origin,price_per_kg,arabica (%),Pure arabica,roast_brew
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Macerated Natural,drip (alternative brewing methods),light,beans,Laos,52.22,100,True,light_drip (alternative brewing methods)
2,Natural,drip (alternative brewing methods),medium,beans,Brazylia,31.92,100,True,medium_drip (alternative brewing methods)
3,Natural,drip (alternative brewing methods),light,beans,Etiopia,39.20,100,True,light_drip (alternative brewing methods)
4,Washed,drip (alternative brewing methods),light,beans,Etiopia,39.20,100,True,light_drip (alternative brewing methods)
5,Natural,drip (alternative brewing methods),dark,beans,Indonezja,35.20,100,True,dark_drip (alternative brewing methods)
...,...,...,...,...,...,...,...,...,...
862,Washed,"drip, espresso",light,beans,Gwatemala,73.33,100,True,"light_drip, espresso"
863,Natural,espresso,light,beans,Panama,50.00,40,False,light_espresso
864,Macerated Natural,drip (alternative brewing methods),light,beans,Nikaragua,36.00,100,True,light_drip (alternative brewing methods)
865,Washed,drip (alternative brewing methods),light,beans,Gwatemala,25.00,100,True,light_drip (alternative brewing methods)


### Process higher positioning

In [31]:
coffee_df['process'].value_counts()

Washed                             329
Natural                            273
Semi-Washed Kombucha Experiment     53
Pulped natural                      49
Honey                               35
Rum Aged                            26
Semi-Carbonic Maceration            23
Semi-washed                         19
Anaerobic                           15
Macerated Natural                    6
Monsooning                           6
Carbonic Maceration                  5
Washed and natural                   5
Anaerobic natural                    3
Washed Anaerobic                     2
Aerobic Fermentation in Piles        2
Yeast Process                        2
Washed Double Anaerobic              1
Washed, Barrel Aged                  1
Experimental Anaerobic               1
Fermentacja kontrolowana             1
Honey + Anaerobic                    1
Closed Tank Fermentation             1
CRYO                                 1
Yellow Bourbon                       1
Experimental             

In [39]:
# as the processes include various elements and some overlap additional flag for washed stage is added
coffee_df['Washed'] = np.where((coffee_df['process'] == 'Washed') | (coffee_df['process'] == 'Honey') | (coffee_df['process'] == 'Pulped natural') | (coffee_df['process'] == 'Washed Anaerobic') | (coffee_df['process'] == 'Washed and natural') | (coffee_df['process'] == 'Closed Tank Fermentation') | (coffee_df['process'] == 'Semi-washed') | (coffee_df['process'] == 'Washed Barrel Aged') | (coffee_df['process'] == 'Honey + Anaerobic') | (coffee_df['process'] == 'Washed Double Anaerobic') | (coffee_df['process'] == 'Semi-Washed Kombucha Experiment'), True, False)
coffee_df['Washed'].value_counts()

True     495
False    367
Name: Washed, dtype: int64

In [40]:
# as the processes include various elements and some overlap additional flag for natural stage is added
coffee_df['Natural'] = np.where((coffee_df['process'] == 'Natural') | (coffee_df['process'] == 'Honey')  | (coffee_df['process'] == 'Pulped natural') | (coffee_df['process'] == 'Red Honey') | (coffee_df['process'] == 'Macerated Natural') | (coffee_df['process'] == 'Yellow Honey') | (coffee_df['process'] == 'Black Honey') | (coffee_df['process'] == 'Washed and natural') | (coffee_df['process'] == 'Anaerobic natural') | (coffee_df['process'] == 'Honey + Anaerobic'), True, False)
coffee_df['Natural'].value_counts()

False    490
True     372
Name: Natural, dtype: int64

In [42]:
# as the processes include various elements and some overlap additional flag for Fermented/macerated (traditional) stage is added
coffee_df['Fermented/macerated (traditional)'] = np.where((coffee_df['process'] == 'Red Honey') | (coffee_df['process'] == 'Macerated Natural') | (coffee_df['process'] == 'Yellow Honey') | (coffee_df['process'] == 'Black Honey') | (coffee_df['process'] == 'Aerobic Fermentation in Piles') | (coffee_df['process'] == 'Semi-washed') | (coffee_df['process'] == 'Honey') | (coffee_df['process'] == 'Fermentacja kontrolowana'), True, False)
coffee_df['Fermented/macerated (traditional)'].value_counts()

False    799
True      63
Name: Fermented/macerated (traditional), dtype: int64