# Summary of Analysis and Cleaning strategy 
- Review all Columns and null values 
- Change column names or lower case 
- Change column names from having '.' to '_' for easier read 
- 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Reading in coffee datasets 

In [2]:
# Read the Arabica csv into a Pandas DataFrame
arabica_coffee_df = pd.read_csv('../Resources/arabica_data_cleaned.csv')
arabica_coffee_df.head()

Unnamed: 0.1,Unnamed: 0,Species,Owner,Country.of.Origin,Farm.Name,Lot.Number,Mill,ICO.Number,Company,Altitude,...,Color,Category.Two.Defects,Expiration,Certification.Body,Certification.Address,Certification.Contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
0,1,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,0,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
1,2,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,1,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
2,3,Arabica,grounds for health admin,Guatemala,"san marcos barrancas ""san cristobal cuch",,,,,1600 - 1800 m,...,,0,"May 31st, 2011",Specialty Coffee Association,36d0d00a3724338ba7937c52a378d085f2172daa,0878a7d4b9d35ddbf0fe2ce69a2062cceb45a660,m,1600.0,1800.0,1700.0
3,4,Arabica,yidnekachew dabessa,Ethiopia,yidnekachew dabessa coffee plantation,,wolensu,,yidnekachew debessa coffee plantation,1800-2200,...,Green,2,"March 25th, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1800.0,2200.0,2000.0
4,5,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,2,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0


In [3]:
# Read the Robusta csv into a Pandas DataFrame
robusta_coffee_df = pd.read_csv('../Resources/robusta_data_cleaned.csv')
robusta_coffee_df.head()

Unnamed: 0.1,Unnamed: 0,Species,Owner,Country.of.Origin,Farm.Name,Lot.Number,Mill,ICO.Number,Company,Altitude,...,Color,Category.Two.Defects,Expiration,Certification.Body,Certification.Address,Certification.Contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
0,1,Robusta,ankole coffee producers coop,Uganda,kyangundu cooperative society,,ankole coffee producers,0,ankole coffee producers coop,1488,...,Green,2,"June 26th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1488.0,1488.0,1488.0
1,2,Robusta,nishant gurjer,India,sethuraman estate kaapi royale,25.0,sethuraman estate,14/1148/2017/21,kaapi royale,3170,...,,2,"October 31st, 2018",Specialty Coffee Association,ff7c18ad303d4b603ac3f8cff7e611ffc735e720,352d0cf7f3e9be14dad7df644ad65efc27605ae2,m,3170.0,3170.0,3170.0
2,3,Robusta,andrew hetzel,India,sethuraman estate,,,0000,sethuraman estate,1000m,...,Green,0,"April 29th, 2016",Specialty Coffee Association,ff7c18ad303d4b603ac3f8cff7e611ffc735e720,352d0cf7f3e9be14dad7df644ad65efc27605ae2,m,1000.0,1000.0,1000.0
3,4,Robusta,ugacof,Uganda,ugacof project area,,ugacof,0,ugacof ltd,1212,...,Green,7,"July 14th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1212.0,1212.0,1212.0
4,5,Robusta,katuka development trust ltd,Uganda,katikamu capca farmers association,,katuka development trust,0,katuka development trust ltd,1200-1300,...,Green,3,"June 26th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1200.0,1300.0,1250.0


# Comparing the arabica and robusta coffee dataframes

In [4]:
# Get a brief summary of Arabica dataFrame
arabica_coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             1311 non-null   int64  
 1   Species                1311 non-null   object 
 2   Owner                  1304 non-null   object 
 3   Country.of.Origin      1310 non-null   object 
 4   Farm.Name              955 non-null    object 
 5   Lot.Number             270 non-null    object 
 6   Mill                   1001 non-null   object 
 7   ICO.Number             1165 non-null   object 
 8   Company                1102 non-null   object 
 9   Altitude               1088 non-null   object 
 10  Region                 1254 non-null   object 
 11  Producer               1081 non-null   object 
 12  Number.of.Bags         1311 non-null   int64  
 13  Bag.Weight             1311 non-null   object 
 14  In.Country.Partner     1311 non-null   object 
 15  Harv

In [5]:
# Get a brief summary of Robusta dataFrame

robusta_coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             28 non-null     int64  
 1   Species                28 non-null     object 
 2   Owner                  28 non-null     object 
 3   Country.of.Origin      28 non-null     object 
 4   Farm.Name              25 non-null     object 
 5   Lot.Number             6 non-null      object 
 6   Mill                   20 non-null     object 
 7   ICO.Number             17 non-null     object 
 8   Company                28 non-null     object 
 9   Altitude               25 non-null     object 
 10  Region                 26 non-null     object 
 11  Producer               26 non-null     object 
 12  Number.of.Bags         28 non-null     int64  
 13  Bag.Weight             28 non-null     object 
 14  In.Country.Partner     28 non-null     object 
 15  Harvest.

In [6]:
# Compare column names to identify differences between datasets
# ------------------------------------------------

# Get column names of each DataFrame
columns_dfa = set(arabica_coffee_df.columns)
columns_dfr = set(robusta_coffee_df.columns)

# Compare column names
common_columns = columns_dfa.intersection(columns_dfr)
unique_columns_dfa = columns_dfa - columns_dfr
unique_columns_dfr = columns_dfr - columns_dfa

# Print the results
print("Common columns:", common_columns)
print("Columns unique to dfa:", unique_columns_dfa)
print("Columns unique to dfr:", unique_columns_dfr)

Common columns: {'Variety', 'Expiration', 'Owner', 'Certification.Contact', 'Company', 'Altitude', 'Species', 'Category.One.Defects', 'Color', 'unit_of_measurement', 'Aftertaste', 'Certification.Body', 'Clean.Cup', 'Certification.Address', 'Cupper.Points', 'Quakers', 'Country.of.Origin', 'Category.Two.Defects', 'Unnamed: 0', 'Processing.Method', 'Mill', 'Farm.Name', 'Owner.1', 'Balance', 'altitude_high_meters', 'Harvest.Year', 'altitude_low_meters', 'In.Country.Partner', 'altitude_mean_meters', 'Grading.Date', 'Region', 'Number.of.Bags', 'Total.Cup.Points', 'ICO.Number', 'Lot.Number', 'Producer', 'Bag.Weight', 'Moisture', 'Flavor'}
Columns unique to dfa: {'Acidity', 'Sweetness', 'Uniformity', 'Aroma', 'Body'}
Columns unique to dfr: {'Fragrance...Aroma', 'Mouthfeel', 'Uniform.Cup', 'Salt...Acid', 'Bitter...Sweet'}


In [7]:
# Change the names of columns in Robusta dataframe to match relevent columns in Arabica dataframe
# ------------------------------------------------

# Change names
robusta_coffee_df = robusta_coffee_df.rename(columns={'Fragrance...Aroma': 'Aroma',
                                                      'Salt...Acid':'Acidity',
                                                      'Uniform.Cup':'Uniformity',
                                                      'Mouthfeel':'Body',
                                                     'Bitter...Sweet':'Sweetness'})

# Check dataframe
robusta_coffee_df.head()

Unnamed: 0.1,Unnamed: 0,Species,Owner,Country.of.Origin,Farm.Name,Lot.Number,Mill,ICO.Number,Company,Altitude,...,Color,Category.Two.Defects,Expiration,Certification.Body,Certification.Address,Certification.Contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
0,1,Robusta,ankole coffee producers coop,Uganda,kyangundu cooperative society,,ankole coffee producers,0,ankole coffee producers coop,1488,...,Green,2,"June 26th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1488.0,1488.0,1488.0
1,2,Robusta,nishant gurjer,India,sethuraman estate kaapi royale,25.0,sethuraman estate,14/1148/2017/21,kaapi royale,3170,...,,2,"October 31st, 2018",Specialty Coffee Association,ff7c18ad303d4b603ac3f8cff7e611ffc735e720,352d0cf7f3e9be14dad7df644ad65efc27605ae2,m,3170.0,3170.0,3170.0
2,3,Robusta,andrew hetzel,India,sethuraman estate,,,0000,sethuraman estate,1000m,...,Green,0,"April 29th, 2016",Specialty Coffee Association,ff7c18ad303d4b603ac3f8cff7e611ffc735e720,352d0cf7f3e9be14dad7df644ad65efc27605ae2,m,1000.0,1000.0,1000.0
3,4,Robusta,ugacof,Uganda,ugacof project area,,ugacof,0,ugacof ltd,1212,...,Green,7,"July 14th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1212.0,1212.0,1212.0
4,5,Robusta,katuka development trust ltd,Uganda,katikamu capca farmers association,,katuka development trust,0,katuka development trust ltd,1200-1300,...,Green,3,"June 26th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1200.0,1300.0,1250.0


In [8]:
# Check differences once more
# ------------------------------------------------

# Get column names of each DataFrame
columns_dfa = set(arabica_coffee_df.columns)
columns_dfr = set(robusta_coffee_df.columns)

# Compare column names
common_columns = columns_dfa.intersection(columns_dfr)
unique_columns_dfa = columns_dfa - columns_dfr
unique_columns_dfr = columns_dfr - columns_dfa

# Print the results
print("Common columns:", common_columns)
print("Columns unique to dfa:", unique_columns_dfa)
print("Columns unique to dfr:", unique_columns_dfr)

Common columns: {'Acidity', 'Variety', 'Expiration', 'Owner', 'Certification.Contact', 'Company', 'Altitude', 'Species', 'Aroma', 'Category.One.Defects', 'Color', 'unit_of_measurement', 'Aftertaste', 'Certification.Body', 'Clean.Cup', 'Certification.Address', 'Cupper.Points', 'Quakers', 'Country.of.Origin', 'Category.Two.Defects', 'Unnamed: 0', 'Processing.Method', 'Mill', 'Farm.Name', 'Sweetness', 'Owner.1', 'Balance', 'altitude_high_meters', 'Harvest.Year', 'Uniformity', 'altitude_low_meters', 'In.Country.Partner', 'altitude_mean_meters', 'Grading.Date', 'Region', 'Number.of.Bags', 'Total.Cup.Points', 'ICO.Number', 'Lot.Number', 'Producer', 'Bag.Weight', 'Moisture', 'Flavor', 'Body'}
Columns unique to dfa: set()
Columns unique to dfr: set()


In [9]:
# Append both dataframes to each other

coffee_df = pd.concat([arabica_coffee_df, robusta_coffee_df], ignore_index=True)

coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             1339 non-null   int64  
 1   Species                1339 non-null   object 
 2   Owner                  1332 non-null   object 
 3   Country.of.Origin      1338 non-null   object 
 4   Farm.Name              980 non-null    object 
 5   Lot.Number             276 non-null    object 
 6   Mill                   1021 non-null   object 
 7   ICO.Number             1182 non-null   object 
 8   Company                1130 non-null   object 
 9   Altitude               1113 non-null   object 
 10  Region                 1280 non-null   object 
 11  Producer               1107 non-null   object 
 12  Number.of.Bags         1339 non-null   int64  
 13  Bag.Weight             1339 non-null   object 
 14  In.Country.Partner     1339 non-null   object 
 15  Harv

# Refining column names 
- changing all column names to lower case 
- changing all '.' to '_' for easier read 

In [10]:
# lower case all column names for ease of access 
coffee_df.columns = coffee_df.columns.str.lower()

# Replace all '.' to '_' for easier read 
coffee_df.columns = coffee_df.columns.str.replace('.','_')

# confirm all columns renamed
coffee_df.columns

  coffee_df.columns = coffee_df.columns.str.replace('.','_')


Index(['unnamed: 0', 'species', 'owner', 'country_of_origin', 'farm_name',
       'lot_number', 'mill', 'ico_number', 'company', 'altitude', 'region',
       'producer', 'number_of_bags', 'bag_weight', 'in_country_partner',
       'harvest_year', 'grading_date', 'owner_1', 'variety',
       'processing_method', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body',
       'balance', 'uniformity', 'clean_cup', 'sweetness', 'cupper_points',
       'total_cup_points', 'moisture', 'category_one_defects', 'quakers',
       'color', 'category_two_defects', 'expiration', 'certification_body',
       'certification_address', 'certification_contact', 'unit_of_measurement',
       'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters'],
      dtype='object')

In [11]:
# drop unnecessary columns 
coffee_df = coffee_df.drop(columns=['unnamed: 0', 'owner', 'lot_number', 'mill', 'uniformity', 'farm_name', 'producer', 'total_cup_points',  'cupper_points', 'ico_number', 'cupper_points', 'company', 'altitude', 'number_of_bags', 'bag_weight', 'in_country_partner', 'harvest_year', 'grading_date', 'owner_1', 'moisture', 'category_one_defects', 'quakers', 'color', 'category_two_defects', 'expiration', 'certification_body', 'certification_address', 'certification_contact', 'unit_of_measurement', 'altitude_mean_meters'])

In [12]:
# check columns
coffee_df.columns

Index(['species', 'country_of_origin', 'region', 'variety',
       'processing_method', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body',
       'balance', 'clean_cup', 'sweetness', 'altitude_low_meters',
       'altitude_high_meters'],
      dtype='object')

In [13]:
# Change 'null values' from 'variety' column to 'unknown'
coffee_df['variety'] = coffee_df['variety'].fillna('Unknown')

In [14]:
# Delete null-values from altitude column(s)
coffee_df = coffee_df.dropna()

In [15]:
coffee_df

Unnamed: 0,species,country_of_origin,region,variety,processing_method,aroma,flavor,aftertaste,acidity,body,balance,clean_cup,sweetness,altitude_low_meters,altitude_high_meters
0,Arabica,Ethiopia,guji-hambela,Unknown,Washed / Wet,8.67,8.83,8.67,8.75,8.50,8.42,10.00,10.00,1950.0,2200.0
1,Arabica,Ethiopia,guji-hambela,Other,Washed / Wet,8.75,8.67,8.50,8.58,8.42,8.42,10.00,10.00,1950.0,2200.0
3,Arabica,Ethiopia,oromia,Unknown,Natural / Dry,8.17,8.58,8.42,8.42,8.50,8.25,10.00,10.00,1800.0,2200.0
4,Arabica,Ethiopia,guji-hambela,Other,Washed / Wet,8.25,8.50,8.25,8.50,8.42,8.33,10.00,10.00,1950.0,2200.0
9,Arabica,Ethiopia,"snnp/kaffa zone,gimbowereda",Other,Natural / Dry,8.08,8.58,8.50,8.50,7.67,8.42,10.00,10.00,1795.0,1850.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1322,Robusta,India,chikmagalur karnataka,Other,Natural / Dry,7.67,7.83,7.75,7.50,7.75,7.58,10.00,7.75,1000.0,1000.0
1327,Robusta,India,chikmagalur,Unknown,Natural / Dry,7.50,7.50,7.25,7.83,7.67,7.83,10.00,7.83,750.0,750.0
1332,Robusta,India,chikmagalur,Unknown,Natural / Dry,7.58,7.42,7.42,7.83,7.42,7.50,10.00,7.42,750.0,750.0
1333,Robusta,United States,chikmagalur,Arusha,Natural / Dry,7.92,7.50,7.42,7.42,7.42,7.42,10.00,7.58,3000.0,3000.0


In [16]:
# Summary for numerical columns
print("Numerical Summary:\n")
print(coffee_df.describe())

Numerical Summary:

             aroma       flavor   aftertaste      acidity         body  \
count  1010.000000  1010.000000  1010.000000  1010.000000  1010.000000   
mean      7.567604     7.514594     7.386921     7.527366     7.505911   
std       0.300388     0.323070     0.331549     0.307352     0.272840   
min       5.080000     6.170000     6.170000     5.250000     6.330000   
25%       7.420000     7.330000     7.170000     7.330000     7.330000   
50%       7.580000     7.500000     7.420000     7.500000     7.500000   
75%       7.750000     7.670000     7.580000     7.670000     7.670000   
max       8.750000     8.830000     8.670000     8.750000     8.500000   

           balance    clean_cup    sweetness  altitude_low_meters  \
count  1010.000000  1010.000000  1010.000000          1010.000000   
mean      7.500960     9.857337     9.913950          1798.432975   
std       0.340169     0.736605     0.465518          9082.111184   
min       6.080000     0.000000     1

# Data Cleaning

In [17]:
# remove text after / in 'processing_method' column
coffee_df['processing_method'] = coffee_df['processing_method'].str.split('/').str[0]

In [18]:
# change altitude_low_meters to int datatype
coffee_df['altitude_low_meters'] = coffee_df['altitude_low_meters'].astype(int)

In [19]:
# change altitude_high_meters to int datatype
coffee_df['altitude_high_meters'] = coffee_df['altitude_high_meters'].astype(int)

In [20]:
# Identify unique values
# ------------------------------------------------

# Create an empty dictionary to store unique values for each column
unique_values = {}

# Loop through the columns and find unique values
for column in coffee_df.columns:
    unique_values[column] = coffee_df[column].unique()

# Display unique values for each column
for column, values in unique_values.items():
    print(f'Column: {column}')
    print(values)
    print('\n')

Column: species
['Arabica' 'Robusta']


Column: country_of_origin
['Ethiopia' 'United States' 'China' 'Costa Rica' 'Mexico' 'Brazil'
 'Uganda' 'Taiwan' 'Kenya' 'Thailand' 'Colombia' 'Panama' 'Guatemala'
 'Papua New Guinea' 'El Salvador' 'Indonesia'
 'Tanzania, United Republic Of' 'Honduras' 'Japan' 'Nicaragua' 'Ecuador'
 'United States (Puerto Rico)' 'Haiti' 'Burundi' 'Vietnam' 'Philippines'
 'Rwanda' 'Malawi' 'Laos' 'Zambia' 'Myanmar' 'Cote d?Ivoire' 'Peru'
 'India']


Column: region
['guji-hambela' 'oromia' 'snnp/kaffa zone,gimbowereda' 'antioquia'
 'yunnan' 'gedio' 'san ramon' 'xalapa' 'south of minas'
 'kapchorwa eastern' 'leye, alishan township, chiayi county'
 'vale da grama' 'west and central valley' 'muranga' 'chiang rai'
 'natou county' 'nyeri' 'tolima' 'kiambu' 'sipi, mt elgon' 'eastern'
 'huila' 'boquete' 'acatenango' 'nuevo oriente'
 'eastern highlands province' 'apaneca'
 'ataco, apaneca - ilamatepec mountain range' 'kirinyaga'
 'bulambuli eastern region' 'huehuetenango' '

In [None]:
replacement_dict = {'Ethiopian Yirgacheffe': 'Yirgacheffe',
                    'Ethiopian Heirlooms': 'Heirloom',
                    'Sumatra Lintong': 'Lintong',
                    'ada okinawa japan': 'okinawa',
                    'lintong': 'sumatra',
                    'sumatra brastagi': 'sumatra',
                    'snnprg; kafa; telo woreda; shada kebele': 'kaffa zone'}

# Iterate through the dictionary and replace values in the ' column
for old_value, new_value in replacement_dict.items():
    df['City'] = df['City'].replace(old_value, new_value)

In [33]:
# Define regex patterns
patterns_for_column = {'region': [',.*', '([^,]*,[^,]*),.*']}

# Iterate through columns
for column in coffee_df.columns:
    if column in patterns_for_column:
        for pattern in patterns_for_column['region']:
            # Check if the current regex pattern is present in any cell of the column
            if coffee_df['region'].str.contains(pattern).any():
                # If found, update the values using the regex pattern
                coffee_df['region'] = coffee_df['region'].replace(pattern, '', regex=True)
                #break  # Stop checking with other patterns for this column

# Display the result DataFrame
print("Result DataFrame after applying regex patterns:")
coffee_df.head()

Result DataFrame after applying regex patterns:


  if coffee_df['region'].str.contains(pattern).any():


Unnamed: 0,species,country_of_origin,region,variety,processing_method,aroma,flavor,aftertaste,acidity,body,balance,clean_cup,sweetness,altitude_low_meters,altitude_high_meters
0,Arabica,Ethiopia,guji-hambela,Unknown,Washed,8.67,8.83,8.67,8.75,8.5,8.42,10.0,10.0,1950,2200
1,Arabica,Ethiopia,guji-hambela,Other,Washed,8.75,8.67,8.5,8.58,8.42,8.42,10.0,10.0,1950,2200
3,Arabica,Ethiopia,oromia,Unknown,Natural,8.17,8.58,8.42,8.42,8.5,8.25,10.0,10.0,1800,2200
4,Arabica,Ethiopia,guji-hambela,Other,Washed,8.25,8.5,8.25,8.5,8.42,8.33,10.0,10.0,1950,2200
9,Arabica,Ethiopia,kaffa zone,Other,Natural,8.08,8.58,8.5,8.5,7.67,8.42,10.0,10.0,1795,1850


In [32]:
coffee_df.to_csv('../Resources/coffee_final.csv', index=False) 