# Contents
1. Imports
2. Checks
3. Wrangling
4. Exports

# 1. Imports

In [None]:
# Libraries
import pandas as pd
import numpy as np
import os 

In [2]:
# Path
# Root data folder. Set the COFFEE_QUALITY_DATA_DIR environment variable to run the
# notebook on another machine; when it is not set, the original local path is used.
path = os.environ.get(
    'COFFEE_QUALITY_DATA_DIR',
    r'/Users/davidgriesel/Documents/GitHub/06_coffee_quality_modelling/data'
)

In [3]:
# Dataset
# Read the original CSV; index_col=False tells pandas not to use the first column as the index
df_quality = pd.read_csv(
    os.path.join(path, 'original', 'coffee_quality.csv'),
    index_col=False,
)

##### Source: [Kaggle](https://www.kaggle.com/datasets/adampq/coffee-quality-with-locations-of-origin/data)

# 2. Checks

In [4]:
# Dimensions of the raw dataset as (rows, columns)
df_quality.shape

(1339, 22)

In [5]:
# Configuration
# Lift the column display limit so the wide preview is not truncated
pd.options.display.max_columns = None

In [6]:
# Preview the first five rows (the default for head())
df_quality.head()

Unnamed: 0,REC_ID,Species,Continent.of.Origin,Country.of.Origin,Harvest.Year,Expiration,Variety,Color,Processing.Method,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean.Cup,Sweetness,Moisture,Quakers,Category.One.Defects,Category.Two.Defects
0,0,Arabica,Africa,Ethiopia,2014.0,04/03/16,,Green,Washed / Wet,8.67,8.83,8.67,8.75,8.5,8.42,10.0,10.0,10.0,0.12,0,0,0
1,1,Arabica,Africa,Ethiopia,2014.0,04/03/16,Other,Green,Washed / Wet,8.75,8.67,8.5,8.58,8.42,8.42,10.0,10.0,10.0,0.12,0,0,1
2,2,Arabica,North America,Guatemala,,05/31/11,Bourbon,,,8.42,8.5,8.42,8.42,8.33,8.42,10.0,10.0,10.0,0.0,0,0,0
3,3,Arabica,Africa,Ethiopia,2014.0,03/25/16,,Green,Natural / Dry,8.17,8.58,8.42,8.42,8.5,8.25,10.0,10.0,10.0,0.11,0,0,2
4,4,Arabica,Africa,Ethiopia,2014.0,04/03/16,Other,Green,Washed / Wet,8.25,8.5,8.25,8.5,8.42,8.33,10.0,10.0,10.0,0.12,0,0,2


In [7]:
# Reset configuration: restore pandas' default column display limit
pd.reset_option('display.max_columns')

In [8]:
# Descriptive statistics - Raw dataset
# describe() summarises the numeric columns only by default
df_quality.describe()

Unnamed: 0,REC_ID,Harvest.Year,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean.Cup,Sweetness,Moisture,Quakers,Category.One.Defects,Category.Two.Defects
count,1339.0,1279.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0
mean,669.0,2013.567631,7.566706,7.520426,7.401083,7.535706,7.517498,7.518013,9.834877,9.835108,9.856692,0.088379,0.173264,0.479462,3.556385
std,386.680316,1.808496,0.37756,0.398442,0.404463,0.379827,0.370064,0.408943,0.554591,0.763946,0.616102,0.048287,0.831824,2.549683,5.312541
min,0.0,2009.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,334.5,2012.0,7.42,7.33,7.25,7.33,7.33,7.33,10.0,10.0,10.0,0.09,0.0,0.0,0.0
50%,669.0,2013.0,7.58,7.58,7.42,7.58,7.5,7.5,10.0,10.0,10.0,0.11,0.0,0.0,2.0
75%,1003.5,2015.0,7.75,7.75,7.58,7.75,7.67,7.75,10.0,10.0,10.0,0.12,0.0,0.0,4.0
max,1338.0,2018.0,8.75,8.83,8.67,8.75,8.58,8.75,10.0,10.0,10.0,0.28,11.0,63.0,55.0


# 3. Wrangling

## 3.1. Dropping columns

In [9]:
# Missing values per column (raw dataset)
df_quality.isna().sum()

REC_ID                    0
Species                   0
Continent.of.Origin       1
Country.of.Origin         1
Harvest.Year             60
Expiration                0
Variety                 226
Color                   270
Processing.Method       170
Aroma                     0
Flavor                    0
Aftertaste                0
Acidity                   0
Body                      0
Balance                   0
Uniformity                0
Clean.Cup                 0
Sweetness                 0
Moisture                  0
Quakers                   0
Category.One.Defects      0
Category.Two.Defects      0
dtype: int64

In [10]:
# Remove the record identifier column (it mirrors the default row index in the preview)
df_quality_v1 = df_quality.drop('REC_ID', axis='columns')

In [11]:
# Dimensions as (rows, columns) - one column fewer than the raw dataset is expected
df_quality_v1.shape

(1339, 21)

## 3.2. Renaming columns

In [12]:
# Column names before renaming - several contain '.' separators
df_quality_v1.columns

Index(['Species', 'Continent.of.Origin', 'Country.of.Origin', 'Harvest.Year',
       'Expiration', 'Variety', 'Color', 'Processing.Method', 'Aroma',
       'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Uniformity',
       'Clean.Cup', 'Sweetness', 'Moisture', 'Quakers', 'Category.One.Defects',
       'Category.Two.Defects'],
      dtype='object')

In [13]:
# Change '.' to '_' in every column name
# rename() returns a new dataframe, so df_quality_v1 is left untouched and neither a
# separate copy nor an in-place mutation is needed. Applying the rule to all column
# names also covers any dotted column that a hand-written mapping could miss.
df_quality_v2 = df_quality_v1.rename(columns = lambda name: name.replace('.', '_'))

In [14]:
# Column names after renaming - confirm that no '.' separators remain
df_quality_v2.columns

Index(['Species', 'Continent_of_Origin', 'Country_of_Origin', 'Harvest_Year',
       'Expiration', 'Variety', 'Color', 'Processing_Method', 'Aroma',
       'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Uniformity',
       'Clean_Cup', 'Sweetness', 'Moisture', 'Quakers', 'Category_One_Defects',
       'Category_Two_Defects'],
      dtype='object')

## 3.3. Data types

In [15]:
# Data types before conversion
df_quality_v2.dtypes

Species                  object
Continent_of_Origin      object
Country_of_Origin        object
Harvest_Year            float64
Expiration               object
Variety                  object
Color                    object
Processing_Method        object
Aroma                   float64
Flavor                  float64
Aftertaste              float64
Acidity                 float64
Body                    float64
Balance                 float64
Uniformity              float64
Clean_Cup               float64
Sweetness               float64
Moisture                float64
Quakers                   int64
Category_One_Defects      int64
Category_Two_Defects      int64
dtype: object

In [16]:
# Copy dataframe
df_quality_v3 = df_quality_v2.copy()

# Convert 'Harvest_Year' to datetime
# Missing years are not imputed here: they stay missing and appear as NaT after conversion
df_quality_v3['Harvest_Year'] = pd.to_datetime(
    df_quality_v3['Harvest_Year'],  # Year values stored as float64 (e.g. 2014.0)
    format = '%Y')  # Specify format as year; each value becomes 1 January of that year

# Convert 'Expiration' to datetime
df_quality_v3['Expiration'] = pd.to_datetime(
    df_quality_v3['Expiration'], 
    format = '%m/%d/%y')  # Month/day/two-digit year, e.g. 04/03/16

In [17]:
# Data types after conversion - 'Harvest_Year' and 'Expiration' should now be datetime64[ns]
df_quality_v3.dtypes

Species                         object
Continent_of_Origin             object
Country_of_Origin               object
Harvest_Year            datetime64[ns]
Expiration              datetime64[ns]
Variety                         object
Color                           object
Processing_Method               object
Aroma                          float64
Flavor                         float64
Aftertaste                     float64
Acidity                        float64
Body                           float64
Balance                        float64
Uniformity                     float64
Clean_Cup                      float64
Sweetness                      float64
Moisture                       float64
Quakers                          int64
Category_One_Defects             int64
Category_Two_Defects             int64
dtype: object

## 3.4. Accuracy

In [18]:
# Basic statistics - scan the min/max of each column for implausible values
df_quality_v3.describe()

Unnamed: 0,Harvest_Year,Expiration,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean_Cup,Sweetness,Moisture,Quakers,Category_One_Defects,Category_Two_Defects
count,1279,1339,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0,1339.0
mean,2013-07-26 21:53:54.089132032,2015-03-22 19:15:00.672143360,7.566706,7.520426,7.401083,7.535706,7.517498,7.518013,9.834877,9.835108,9.856692,0.088379,0.173264,0.479462,3.556385
min,2009-01-01 00:00:00,2011-04-09 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2012-01-01 00:00:00,2013-08-17 00:00:00,7.42,7.33,7.25,7.33,7.33,7.33,10.0,10.0,10.0,0.09,0.0,0.0,0.0
50%,2013-01-01 00:00:00,2015-04-03 00:00:00,7.58,7.58,7.42,7.58,7.5,7.5,10.0,10.0,10.0,0.11,0.0,0.0,2.0
75%,2015-01-01 00:00:00,2016-07-16 00:00:00,7.75,7.75,7.58,7.75,7.67,7.75,10.0,10.0,10.0,0.12,0.0,0.0,4.0
max,2018-01-01 00:00:00,2019-01-19 00:00:00,8.75,8.83,8.67,8.75,8.58,8.75,10.0,10.0,10.0,0.28,11.0,63.0,55.0
std,,,0.37756,0.398442,0.404463,0.379827,0.370064,0.408943,0.554591,0.763946,0.616102,0.048287,0.831824,2.549683,5.312541


In [19]:
# Configuration
# Lift the row display limit so the frequency tables below are shown in full
pd.options.display.max_rows = None

In [20]:
# Frequency tables

In [21]:
# Frequency table - Species (missing values excluded)
df_quality_v3['Species'].value_counts(dropna=True)

Species
Arabica    1311
Robusta      28
Name: count, dtype: int64

In [22]:
# Frequency table - Continent_of_Origin (missing values excluded)
df_quality_v3['Continent_of_Origin'].value_counts(dropna=True)

Continent_of_Origin
North America    665
South America    328
Asia             182
Africa           162
Oceania            1
Name: count, dtype: int64

In [23]:
# Frequency table - Country_of_Origin (missing values excluded)
df_quality_v3['Country_of_Origin'].value_counts(dropna=True)

Country_of_Origin
Mexico              236
Colombia            183
Guatemala           181
Brazil              132
United States        87
Taiwan               75
Honduras             53
Costa Rica           51
Ethiopia             44
Tanzania             40
Uganda               36
Thailand             32
Nicaragua            26
Kenya                25
El Salvador          21
Indonesia            20
China                16
India                14
Malawi               11
Peru                 10
Vietnam               8
Myanmar               8
Haiti                 6
Philippines           5
Panama                4
Ecuador               3
Laos                  3
Burundi               2
Papua New Guinea      1
Japan                 1
Rwanda                1
Zambia                1
Mauritius             1
Cote dIvoire          1
Name: count, dtype: int64

In [24]:
# Frequency table - Harvest_Year (missing values excluded)
df_quality_v3['Harvest_Year'].value_counts(dropna=True)

Harvest_Year
2012-01-01    354
2014-01-01    252
2013-01-01    210
2015-01-01    157
2016-01-01    131
2017-01-01     89
2011-01-01     35
2010-01-01     30
2009-01-01     20
2018-01-01      1
Name: count, dtype: int64

In [25]:
# Frequency table - Expiration (missing values excluded)
df_quality_v3['Expiration'].value_counts(dropna=True)

Expiration
2014-12-26    25
2013-07-11    25
2013-06-06    19
2013-08-30    18
2013-07-26    15
2014-03-29    13
2016-10-07    13
2013-09-27    13
2011-06-17    12
2018-10-20    11
2013-09-17    11
2015-12-12    10
2013-09-10    10
2015-12-18    10
2018-06-01     9
2013-09-11     9
2017-08-16     9
2017-04-06     9
2013-02-22     8
2015-11-23     8
2015-11-04     8
2013-07-03     8
2013-09-04     8
2016-05-22     7
2015-04-26     7
2011-05-31     7
2013-08-31     7
2015-06-27     7
2018-08-22     7
2016-07-16     7
2015-06-26     7
2012-12-02     7
2018-06-22     6
2015-08-25     6
2015-05-15     6
2014-02-26     6
2013-02-12     6
2013-08-01     6
2016-11-15     6
2017-05-18     6
2013-05-24     6
2016-03-12     6
2013-04-06     6
2015-01-17     6
2016-06-08     6
2016-03-20     5
2012-10-27     5
2015-06-16     5
2013-02-28     5
2014-09-03     5
2013-07-02     5
2013-03-05     5
2018-02-13     5
2012-06-13     5
2018-08-23     5
2013-04-02     5
2015-06-30     5
2013-09-13     5
201

In [26]:
# Frequency table - Variety (missing values excluded)
df_quality_v3['Variety'].value_counts(dropna=True)

Variety
Caturra                  256
Bourbon                  226
Typica                   211
Other                    110
Catuai                    74
Hawaiian Kona             44
Yellow Bourbon            35
Mundo Novo                33
Catimor                   20
SL14                      17
SL28                      15
Pacas                     13
Gesha                     12
SL34                       8
Pacamara                   8
Arusha                     6
Peaberry                   5
Sumatra                    3
Mandheling                 3
Ruiru 11                   2
Blue Mountain              2
Ethiopian Yirgacheffe      2
Java                       2
Ethiopian Heirlooms        1
Moka Peaberry              1
Sulawesi                   1
Sumatra Lintong            1
Marigojipe                 1
Pache Comun                1
Name: count, dtype: int64

In [27]:
# Frequency table - Color (missing values excluded)
df_quality_v3['Color'].value_counts(dropna=True)

Color
Green           870
Bluish-Green    114
Blue-Green       85
Name: count, dtype: int64

In [28]:
# Frequency table - Processing_Method (missing values excluded)
df_quality_v3['Processing_Method'].value_counts(dropna=True)

Processing_Method
Washed / Wet                 815
Natural / Dry                258
Semi-washed / Semi-pulped     56
Other                         26
Pulped natural / honey        14
Name: count, dtype: int64

In [29]:
# Frequency table - Aroma (missing values excluded)
df_quality_v3['Aroma'].value_counts(dropna=True)

Aroma
7.67    179
7.50    165
7.58    152
7.75    125
7.42    122
7.83    103
7.33     98
7.25     78
7.92     59
8.00     48
7.17     45
7.08     28
7.00     23
8.08     20
8.17     20
6.92     14
8.42      9
8.25      9
6.83      9
6.75      7
8.33      7
6.67      3
8.50      3
6.50      2
8.67      2
7.81      2
5.08      1
8.75      1
6.42      1
6.17      1
8.58      1
6.33      1
0.00      1
Name: count, dtype: int64

In [30]:
# Frequency table - Flavor (missing values excluded)
df_quality_v3['Flavor'].value_counts(dropna=True)

Flavor
7.50    166
7.58    166
7.67    148
7.75    126
7.42    116
7.33    111
7.83     89
7.25     64
7.17     56
7.92     45
7.08     42
8.00     41
7.00     36
8.17     18
6.83     17
6.92     15
8.08     14
6.75     10
6.50      9
8.25      7
8.33      5
8.42      5
6.58      5
6.67      5
8.50      5
8.67      4
6.33      3
7.88      2
6.17      2
8.58      2
6.42      1
8.83      1
6.08      1
7.81      1
0.00      1
Name: count, dtype: int64

In [31]:
# Frequency table - Aftertaste (missing values excluded)
df_quality_v3['Aftertaste'].value_counts(dropna=True)

Aftertaste
7.50    164
7.33    153
7.42    129
7.58    126
7.25    104
7.67    102
7.17     91
7.75     87
7.83     65
7.00     62
7.08     45
6.83     36
6.92     36
8.00     27
7.92     22
6.67     14
6.75     10
6.17      8
8.08      7
8.17      7
6.50      7
6.33      6
6.58      6
8.25      4
8.50      4
6.42      4
8.42      3
8.58      2
8.33      2
6.25      1
8.67      1
7.38      1
7.56      1
7.88      1
0.00      1
Name: count, dtype: int64

In [32]:
# Frequency table - Acidity (missing values excluded)
df_quality_v3['Acidity'].value_counts(dropna=True)

Acidity
7.50    162
7.58    154
7.67    146
7.42    129
7.75    126
7.33    111
7.25     86
7.83     78
7.17     74
8.00     50
7.92     47
7.08     36
7.00     32
8.08     25
8.17     14
6.83     12
6.92     10
8.33      9
8.50      7
8.42      6
8.25      6
6.75      6
6.67      5
6.25      1
6.50      1
6.08      1
8.75      1
5.25      1
7.63      1
8.58      1
0.00      1
Name: count, dtype: int64

In [33]:
# Frequency table - Body (missing values excluded)
df_quality_v3['Body'].value_counts(dropna=True)

Body
7.50    201
7.67    154
7.58    138
7.33    131
7.42    127
7.75    111
7.25     87
7.83     85
7.17     68
7.92     52
7.08     37
7.00     34
8.00     34
8.08     21
6.92     12
8.17      7
8.25      7
8.33      6
6.83      4
6.75      4
8.50      3
8.42      3
6.67      2
6.33      2
6.42      1
0.00      1
6.50      1
5.08      1
7.63      1
5.25      1
7.38      1
8.58      1
5.17      1
Name: count, dtype: int64

In [34]:
# Frequency table - Balance (missing values excluded)
df_quality_v3['Balance'].value_counts(dropna=True)

Balance
7.50    176
7.67    148
7.58    131
7.42    121
7.75    107
7.83    101
7.33     99
7.17     72
7.25     64
7.00     47
8.00     46
7.92     42
7.08     41
6.92     26
6.83     23
8.17     17
8.08     16
8.25      8
6.75      7
8.42      7
8.58      7
8.50      7
8.33      7
6.67      4
6.58      3
6.17      3
8.75      2
6.50      2
6.33      1
6.42      1
6.08      1
0.00      1
5.25      1
Name: count, dtype: int64

In [35]:
# Frequency table - Uniformity (missing values excluded)
df_quality_v3['Uniformity'].value_counts(dropna=True)

Uniformity
10.00    1152
9.33      116
8.67       31
8.00       25
6.67        7
6.00        3
7.33        2
9.50        1
9.00        1
0.00        1
Name: count, dtype: int64

In [36]:
# Frequency table - Clean_Cup (missing values excluded)
df_quality_v3['Clean_Cup'].value_counts(dropna=True)

Clean_Cup
10.00    1219
9.33       61
8.67       16
6.67       13
8.00       13
6.00        6
5.33        3
7.33        3
2.67        2
0.00        2
1.33        1
Name: count, dtype: int64

In [37]:
# Frequency table - Sweetness (missing values excluded)
df_quality_v3['Sweetness'].value_counts(dropna=True)

Sweetness
10.00    1218
9.33       61
8.67       12
8.00       11
6.67        8
7.75        7
7.58        5
6.00        3
7.42        3
7.92        2
7.67        2
7.83        2
0.00        1
7.50        1
1.33        1
8.42        1
7.08        1
Name: count, dtype: int64

In [38]:
# Frequency table - Moisture (missing values excluded)
df_quality_v3['Moisture'].value_counts(dropna=True)

Moisture
0.11    383
0.12    294
0.00    264
0.10    182
0.13     76
0.09     27
0.14     23
0.08     16
0.01     15
0.15      8
0.05      8
0.02      7
0.06      7
0.07      5
0.16      5
0.04      4
0.03      4
0.20      3
0.17      3
0.18      2
0.28      1
0.21      1
0.22      1
Name: count, dtype: int64

In [39]:
# Frequency table - Quakers (missing values excluded)
df_quality_v3['Quakers'].value_counts(dropna=True)

Quakers
0     1245
1       39
2       30
4        5
5        5
3        5
6        4
7        3
11       1
9        1
8        1
Name: count, dtype: int64

In [40]:
# Frequency table - Category_One_Defects (missing values excluded)
df_quality_v3['Category_One_Defects'].value_counts(dropna=True)

Category_One_Defects
0     1137
1      101
2       38
3       18
4       16
5        9
10       4
6        3
7        3
31       2
9        1
20       1
8        1
11       1
15       1
23       1
12       1
63       1
Name: count, dtype: int64

In [41]:
# Frequency table - Category_Two_Defects (missing values excluded)
df_quality_v3['Category_Two_Defects'].value_counts(dropna=True)

Category_Two_Defects
0     373
1     207
2     182
3     134
4     118
5      73
6      44
7      41
8      29
9      23
10     21
12     13
13      9
14      9
20      8
11      8
17      7
16      6
15      6
19      3
26      3
21      2
29      2
18      2
23      2
30      2
47      1
24      1
27      1
28      1
31      1
40      1
38      1
45      1
34      1
22      1
32      1
55      1
Name: count, dtype: int64

In [42]:
# Reset configuration: restore pandas' default row display limit
pd.reset_option('display.max_rows') 

In [43]:
# Independent (deep) copy to carry into the missing-values step
df_quality_v4 = df_quality_v3.copy(deep=True)

## 3.5. Missing values

In [44]:
# Missing values per column
df_quality_v4.isna().sum()

Species                   0
Continent_of_Origin       1
Country_of_Origin         1
Harvest_Year             60
Expiration                0
Variety                 226
Color                   270
Processing_Method       170
Aroma                     0
Flavor                    0
Aftertaste                0
Acidity                   0
Body                      0
Balance                   0
Uniformity                0
Clean_Cup                 0
Sweetness                 0
Moisture                  0
Quakers                   0
Category_One_Defects      0
Category_Two_Defects      0
dtype: int64

In [45]:
# Extract records that have a null value in at least one column
df_null_values = df_quality_v4.loc[df_quality_v4.isna().any(axis='columns')]

# View subset
df_null_values

Unnamed: 0,Species,Continent_of_Origin,Country_of_Origin,Harvest_Year,Expiration,Variety,Color,Processing_Method,Aroma,Flavor,...,Acidity,Body,Balance,Uniformity,Clean_Cup,Sweetness,Moisture,Quakers,Category_One_Defects,Category_Two_Defects
0,Arabica,Africa,Ethiopia,2014-01-01,2016-04-03,,Green,Washed / Wet,8.67,8.83,...,8.75,8.50,8.42,10.00,10.00,10.00,0.12,0,0,0
2,Arabica,North America,Guatemala,NaT,2011-05-31,Bourbon,,,8.42,8.50,...,8.42,8.33,8.42,10.00,10.00,10.00,0.00,0,0,0
3,Arabica,Africa,Ethiopia,2014-01-01,2016-03-25,,Green,Natural / Dry,8.17,8.58,...,8.42,8.50,8.25,10.00,10.00,10.00,0.11,0,0,2
5,Arabica,South America,Brazil,2013-01-01,2014-09-03,,Bluish-Green,Natural / Dry,8.58,8.42,...,8.50,8.25,8.33,10.00,10.00,10.00,0.11,0,0,1
7,Arabica,Africa,Ethiopia,2010-01-01,2011-09-02,,,,8.25,8.33,...,8.42,8.33,8.50,10.00,10.00,9.33,0.03,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1334,Robusta,South America,Ecuador,2016-01-01,2017-01-18,,Blue-Green,,7.75,7.58,...,7.58,5.08,7.83,10.00,10.00,7.75,0.00,0,0,1
1335,Robusta,South America,Ecuador,2016-01-01,2017-01-18,,Blue-Green,,7.50,7.67,...,7.75,5.17,5.25,10.00,10.00,8.42,0.00,0,0,0
1336,Robusta,North America,United States,2014-01-01,2015-12-23,,,Natural / Dry,7.33,7.33,...,7.42,7.50,7.17,9.33,9.33,7.42,0.00,0,0,6
1337,Robusta,Asia,India,2013-01-01,2015-08-25,,Green,Natural / Dry,7.42,6.83,...,7.17,7.25,7.00,9.33,9.33,7.08,0.10,0,20,1


In [46]:
# Remove records with null values (keep only rows that are complete in every column)
df_quality_v5 = df_quality_v4.dropna(axis='index', how='any')

# Dimensions
df_quality_v5.shape

(946, 21)

## 3.6. Mixed-type data

In [47]:
# Find mixed type data
# A column name is printed when any value's Python type differs from the type of
# the value in that column's first row; no output means no mixed-type columns.
for column_name in df_quality_v5.columns:
    value_types = df_quality_v5[column_name].map(type)
    if (value_types != value_types.iloc[0]).any():
        print(column_name)

In [48]:
# Independent (deep) copy to carry into the duplicates step
df_quality_v6 = df_quality_v5.copy(deep=True)

## 3.7. Duplicates

In [49]:
# Find duplicates
# duplicated() flags every repeat of an earlier, fully identical row; the first occurrence is not flagged
df_duplicates = df_quality_v6.loc[df_quality_v6.duplicated(keep='first')]

# View subset
df_duplicates

Unnamed: 0,Species,Continent_of_Origin,Country_of_Origin,Harvest_Year,Expiration,Variety,Color,Processing_Method,Aroma,Flavor,...,Acidity,Body,Balance,Uniformity,Clean_Cup,Sweetness,Moisture,Quakers,Category_One_Defects,Category_Two_Defects
742,Arabica,Asia,Taiwan,2013-01-01,2014-12-26,Typica,Green,Washed / Wet,7.5,7.5,...,7.5,7.5,7.5,10.0,10.0,10.0,0.11,0,0,0


In [50]:
# Drop duplicates (the first occurrence of each repeated row is kept)
df_quality_v7 = df_quality_v6.drop_duplicates(keep='first')

# Dimensions
df_quality_v7.shape

(945, 21)

# 4. Exports

In [51]:
# Descriptive statistics - Cleaned dataset (final check before export)
df_quality_v7.describe()

Unnamed: 0,Harvest_Year,Expiration,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean_Cup,Sweetness,Moisture,Quakers,Category_One_Defects,Category_Two_Defects
count,945,945,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0
mean,2013-10-20 22:43:48.571428608,2015-07-11 04:43:25.714285568,7.564984,7.513735,7.392815,7.530582,7.508296,7.50946,9.861556,9.848265,9.920085,0.095249,0.164021,0.415873,3.606349
min,2011-01-01 00:00:00,2012-11-10 00:00:00,5.08,6.08,6.17,5.25,6.33,6.08,6.0,1.33,1.33,0.0,0.0,0.0,0.0
25%,2012-01-01 00:00:00,2013-09-19 00:00:00,7.42,7.33,7.25,7.33,7.33,7.33,10.0,10.0,10.0,0.1,0.0,0.0,0.0
50%,2014-01-01 00:00:00,2015-05-26 00:00:00,7.58,7.5,7.42,7.5,7.5,7.5,10.0,10.0,10.0,0.11,0.0,0.0,2.0
75%,2015-01-01 00:00:00,2016-10-07 00:00:00,7.75,7.67,7.58,7.67,7.67,7.67,10.0,10.0,10.0,0.12,0.0,0.0,4.0
max,2018-01-01 00:00:00,2019-01-19 00:00:00,8.75,8.67,8.5,8.58,8.42,8.58,10.0,10.0,10.0,0.17,11.0,31.0,47.0
std,,,0.291573,0.311277,0.319015,0.294613,0.261814,0.32606,0.458965,0.70307,0.448902,0.041779,0.782202,2.004646,5.355336


In [52]:
# Export cleaned dataset
# Create the 'prepared' folder first if it is missing, so the export does not fail
# on a fresh checkout; an existing folder is left untouched.
export_dir = os.path.join(path, 'prepared')
os.makedirs(export_dir, exist_ok = True)
df_quality_v7.to_pickle(os.path.join(export_dir, 'coffee_quality_cleaned.pkl'))