# Summary of Analysis and Cleaning Strategy
- Review all columns and null values
- Change column names to lower case
- Change '.' in column names to '_' for easier reading
- Combine the Arabica and Robusta datasets, drop unused columns, and standardise text values

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Reading in coffee datasets 

In [2]:
# Load the Arabica CSV from the Resources folder and preview its first rows
arabica_csv_path = '../Resources/arabica_data_cleaned.csv'
arabica_coffee_df = pd.read_csv(arabica_csv_path)
arabica_coffee_df.head()

Unnamed: 0.1,Unnamed: 0,Species,Owner,Country.of.Origin,Farm.Name,Lot.Number,Mill,ICO.Number,Company,Altitude,...,Color,Category.Two.Defects,Expiration,Certification.Body,Certification.Address,Certification.Contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
0,1,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,0,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
1,2,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,1,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
2,3,Arabica,grounds for health admin,Guatemala,"san marcos barrancas ""san cristobal cuch",,,,,1600 - 1800 m,...,,0,"May 31st, 2011",Specialty Coffee Association,36d0d00a3724338ba7937c52a378d085f2172daa,0878a7d4b9d35ddbf0fe2ce69a2062cceb45a660,m,1600.0,1800.0,1700.0
3,4,Arabica,yidnekachew dabessa,Ethiopia,yidnekachew dabessa coffee plantation,,wolensu,,yidnekachew debessa coffee plantation,1800-2200,...,Green,2,"March 25th, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1800.0,2200.0,2000.0
4,5,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,2,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0


In [3]:
# Load the Robusta CSV from the Resources folder and preview its first rows
robusta_csv_path = '../Resources/robusta_data_cleaned.csv'
robusta_coffee_df = pd.read_csv(robusta_csv_path)
robusta_coffee_df.head()

Unnamed: 0.1,Unnamed: 0,Species,Owner,Country.of.Origin,Farm.Name,Lot.Number,Mill,ICO.Number,Company,Altitude,...,Color,Category.Two.Defects,Expiration,Certification.Body,Certification.Address,Certification.Contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
0,1,Robusta,ankole coffee producers coop,Uganda,kyangundu cooperative society,,ankole coffee producers,0,ankole coffee producers coop,1488,...,Green,2,"June 26th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1488.0,1488.0,1488.0
1,2,Robusta,nishant gurjer,India,sethuraman estate kaapi royale,25.0,sethuraman estate,14/1148/2017/21,kaapi royale,3170,...,,2,"October 31st, 2018",Specialty Coffee Association,ff7c18ad303d4b603ac3f8cff7e611ffc735e720,352d0cf7f3e9be14dad7df644ad65efc27605ae2,m,3170.0,3170.0,3170.0
2,3,Robusta,andrew hetzel,India,sethuraman estate,,,0000,sethuraman estate,1000m,...,Green,0,"April 29th, 2016",Specialty Coffee Association,ff7c18ad303d4b603ac3f8cff7e611ffc735e720,352d0cf7f3e9be14dad7df644ad65efc27605ae2,m,1000.0,1000.0,1000.0
3,4,Robusta,ugacof,Uganda,ugacof project area,,ugacof,0,ugacof ltd,1212,...,Green,7,"July 14th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1212.0,1212.0,1212.0
4,5,Robusta,katuka development trust ltd,Uganda,katikamu capca farmers association,,katuka development trust,0,katuka development trust ltd,1200-1300,...,Green,3,"June 26th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1200.0,1300.0,1250.0


# Comparing the arabica and robusta coffee dataframes

In [4]:
# Get a brief summary of the Arabica DataFrame: row count, column names,
# per-column non-null counts and dtypes
arabica_coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             1311 non-null   int64  
 1   Species                1311 non-null   object 
 2   Owner                  1304 non-null   object 
 3   Country.of.Origin      1310 non-null   object 
 4   Farm.Name              955 non-null    object 
 5   Lot.Number             270 non-null    object 
 6   Mill                   1001 non-null   object 
 7   ICO.Number             1165 non-null   object 
 8   Company                1102 non-null   object 
 9   Altitude               1088 non-null   object 
 10  Region                 1254 non-null   object 
 11  Producer               1081 non-null   object 
 12  Number.of.Bags         1311 non-null   int64  
 13  Bag.Weight             1311 non-null   object 
 14  In.Country.Partner     1311 non-null   object 
 15  Harv

In [5]:
# Get a brief summary of the Robusta DataFrame (row count, column names,
# per-column non-null counts and dtypes) to compare with the Arabica summary

robusta_coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             28 non-null     int64  
 1   Species                28 non-null     object 
 2   Owner                  28 non-null     object 
 3   Country.of.Origin      28 non-null     object 
 4   Farm.Name              25 non-null     object 
 5   Lot.Number             6 non-null      object 
 6   Mill                   20 non-null     object 
 7   ICO.Number             17 non-null     object 
 8   Company                28 non-null     object 
 9   Altitude               25 non-null     object 
 10  Region                 26 non-null     object 
 11  Producer               26 non-null     object 
 12  Number.of.Bags         28 non-null     int64  
 13  Bag.Weight             28 non-null     object 
 14  In.Country.Partner     28 non-null     object 
 15  Harvest.

In [6]:
# Find which columns the two datasets share and which exist in only one of them
# ------------------------------------------------

# Collect the column labels of each DataFrame as sets
columns_dfa = set(arabica_coffee_df.columns)
columns_dfr = set(robusta_coffee_df.columns)

# Set algebra: shared labels first, then the labels exclusive to each side
common_columns = columns_dfa & columns_dfr
unique_columns_dfa = columns_dfa.difference(columns_dfr)
unique_columns_dfr = columns_dfr.difference(columns_dfa)

# Report the three groups
for heading, labels in (
    ("Common columns:", common_columns),
    ("Columns unique to dfa:", unique_columns_dfa),
    ("Columns unique to dfr:", unique_columns_dfr),
):
    print(heading, labels)

Common columns: {'Category.Two.Defects', 'Producer', 'Bag.Weight', 'Certification.Address', 'Country.of.Origin', 'Number.of.Bags', 'Expiration', 'Grading.Date', 'Unnamed: 0', 'Mill', 'Balance', 'Clean.Cup', 'Region', 'Variety', 'Aftertaste', 'Quakers', 'altitude_low_meters', 'Certification.Body', 'Total.Cup.Points', 'Processing.Method', 'Company', 'Altitude', 'In.Country.Partner', 'ICO.Number', 'Owner', 'Certification.Contact', 'unit_of_measurement', 'Color', 'Species', 'Lot.Number', 'Category.One.Defects', 'altitude_mean_meters', 'Flavor', 'Harvest.Year', 'Moisture', 'Owner.1', 'Cupper.Points', 'altitude_high_meters', 'Farm.Name'}
Columns unique to dfa: {'Aroma', 'Body', 'Acidity', 'Sweetness', 'Uniformity'}
Columns unique to dfr: {'Fragrance...Aroma', 'Bitter...Sweet', 'Salt...Acid', 'Mouthfeel', 'Uniform.Cup'}


In [7]:
# Rename the Robusta-only columns so they line up with the relevant Arabica columns
# ------------------------------------------------

# Robusta label -> Arabica label
robusta_to_arabica_names = {
    'Fragrance...Aroma': 'Aroma',
    'Salt...Acid': 'Acidity',
    'Uniform.Cup': 'Uniformity',
    'Mouthfeel': 'Body',
    'Bitter...Sweet': 'Sweetness',
}

# Apply the mapping; labels not present in the mapping are left as they are
robusta_coffee_df = robusta_coffee_df.rename(columns=robusta_to_arabica_names)

# Check dataframe
robusta_coffee_df.head()

Unnamed: 0.1,Unnamed: 0,Species,Owner,Country.of.Origin,Farm.Name,Lot.Number,Mill,ICO.Number,Company,Altitude,...,Color,Category.Two.Defects,Expiration,Certification.Body,Certification.Address,Certification.Contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
0,1,Robusta,ankole coffee producers coop,Uganda,kyangundu cooperative society,,ankole coffee producers,0,ankole coffee producers coop,1488,...,Green,2,"June 26th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1488.0,1488.0,1488.0
1,2,Robusta,nishant gurjer,India,sethuraman estate kaapi royale,25.0,sethuraman estate,14/1148/2017/21,kaapi royale,3170,...,,2,"October 31st, 2018",Specialty Coffee Association,ff7c18ad303d4b603ac3f8cff7e611ffc735e720,352d0cf7f3e9be14dad7df644ad65efc27605ae2,m,3170.0,3170.0,3170.0
2,3,Robusta,andrew hetzel,India,sethuraman estate,,,0000,sethuraman estate,1000m,...,Green,0,"April 29th, 2016",Specialty Coffee Association,ff7c18ad303d4b603ac3f8cff7e611ffc735e720,352d0cf7f3e9be14dad7df644ad65efc27605ae2,m,1000.0,1000.0,1000.0
3,4,Robusta,ugacof,Uganda,ugacof project area,,ugacof,0,ugacof ltd,1212,...,Green,7,"July 14th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1212.0,1212.0,1212.0
4,5,Robusta,katuka development trust ltd,Uganda,katikamu capca farmers association,,katuka development trust,0,katuka development trust ltd,1200-1300,...,Green,3,"June 26th, 2015",Uganda Coffee Development Authority,e36d0270932c3b657e96b7b0278dfd85dc0fe743,03077a1c6bac60e6f514691634a7f6eb5c85aae8,m,1200.0,1300.0,1250.0


In [8]:
# Check differences once more, and fail fast if the two datasets still disagree
# ------------------------------------------------

# Get column names of each DataFrame
columns_dfa = set(arabica_coffee_df.columns)
columns_dfr = set(robusta_coffee_df.columns)

# Compare column names
common_columns = columns_dfa.intersection(columns_dfr)
unique_columns_dfa = columns_dfa - columns_dfr
unique_columns_dfr = columns_dfr - columns_dfa

# Print the results
print("Common columns:", common_columns)
print("Columns unique to dfa:", unique_columns_dfa)
print("Columns unique to dfr:", unique_columns_dfr)

# Both "unique" sets must be empty before the frames are concatenated: any
# leftover column would exist for only one species and be filled with NaN for
# the other. Stop here instead of relying on someone reading the printout.
assert not unique_columns_dfa and not unique_columns_dfr, (
    "Arabica and Robusta columns still differ; fix the rename mapping before concatenating"
)

Common columns: {'Category.Two.Defects', 'Aroma', 'Producer', 'Bag.Weight', 'Certification.Address', 'Country.of.Origin', 'Number.of.Bags', 'Body', 'Acidity', 'Sweetness', 'Expiration', 'Grading.Date', 'Unnamed: 0', 'Mill', 'Balance', 'Clean.Cup', 'Region', 'Variety', 'Aftertaste', 'Uniformity', 'Quakers', 'altitude_low_meters', 'Certification.Body', 'Total.Cup.Points', 'Processing.Method', 'Company', 'Altitude', 'In.Country.Partner', 'ICO.Number', 'Owner', 'Certification.Contact', 'unit_of_measurement', 'Color', 'Species', 'Lot.Number', 'Category.One.Defects', 'altitude_mean_meters', 'Flavor', 'Harvest.Year', 'Moisture', 'Owner.1', 'Cupper.Points', 'altitude_high_meters', 'Farm.Name'}
Columns unique to dfa: set()
Columns unique to dfr: set()


In [9]:
# Stack the Arabica rows and the Robusta rows into one DataFrame with a fresh 0..n-1 index
frames_to_stack = [arabica_coffee_df, robusta_coffee_df]
coffee_df = pd.concat(frames_to_stack, ignore_index=True)

# Summarise the combined frame
coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             1339 non-null   int64  
 1   Species                1339 non-null   object 
 2   Owner                  1332 non-null   object 
 3   Country.of.Origin      1338 non-null   object 
 4   Farm.Name              980 non-null    object 
 5   Lot.Number             276 non-null    object 
 6   Mill                   1021 non-null   object 
 7   ICO.Number             1182 non-null   object 
 8   Company                1130 non-null   object 
 9   Altitude               1113 non-null   object 
 10  Region                 1280 non-null   object 
 11  Producer               1107 non-null   object 
 12  Number.of.Bags         1339 non-null   int64  
 13  Bag.Weight             1339 non-null   object 
 14  In.Country.Partner     1339 non-null   object 
 15  Harv

# Refining column names
- Change all column names to lower case
- Replace every '.' in the column names with '_' for easier reading

In [10]:
# Normalise the column labels in one pass:
#   1. lower-case every label for ease of access
#   2. replace every literal '.' with '_' for easier reading
# regex=False is passed explicitly so the '.' is always a literal dot. The default
# value of `regex` differs between pandas releases, and older releases emit a
# FutureWarning when a single-character pattern is passed without it.
coffee_df.columns = coffee_df.columns.str.lower().str.replace('.', '_', regex=False)

# confirm all columns renamed
coffee_df.columns

  coffee_df.columns = coffee_df.columns.str.replace('.','_')


Index(['unnamed: 0', 'species', 'owner', 'country_of_origin', 'farm_name',
       'lot_number', 'mill', 'ico_number', 'company', 'altitude', 'region',
       'producer', 'number_of_bags', 'bag_weight', 'in_country_partner',
       'harvest_year', 'grading_date', 'owner_1', 'variety',
       'processing_method', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body',
       'balance', 'uniformity', 'clean_cup', 'sweetness', 'cupper_points',
       'total_cup_points', 'moisture', 'category_one_defects', 'quakers',
       'color', 'category_two_defects', 'expiration', 'certification_body',
       'certification_address', 'certification_contact', 'unit_of_measurement',
       'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters'],
      dtype='object')

In [11]:
# Drop the columns that are not used in the rest of the notebook.
# Each label is listed exactly once ('cupper_points' used to appear twice).
# Note: this cell reassigns coffee_df, so running it a second time raises a
# KeyError because the labels are already gone.
columns_to_drop = [
    'unnamed: 0',
    'owner',
    'lot_number',
    'mill',
    'uniformity',
    'farm_name',
    'producer',
    'total_cup_points',
    'cupper_points',
    'ico_number',
    'company',
    'altitude',
    'number_of_bags',
    'bag_weight',
    'in_country_partner',
    'harvest_year',
    'grading_date',
    'owner_1',
    'moisture',
    'category_one_defects',
    'quakers',
    'color',
    'category_two_defects',
    'expiration',
    'certification_body',
    'certification_address',
    'certification_contact',
    'unit_of_measurement',
    'altitude_mean_meters',
]
coffee_df = coffee_df.drop(columns=columns_to_drop)

In [12]:
# Check which columns remain in coffee_df after the drop
coffee_df.columns

Index(['species', 'country_of_origin', 'region', 'variety',
       'processing_method', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body',
       'balance', 'clean_cup', 'sweetness', 'altitude_low_meters',
       'altitude_high_meters'],
      dtype='object')

In [None]:
# Label missing 'variety' entries as 'Unknown' instead of leaving them null
variety_is_present = coffee_df['variety'].notna()
coffee_df['variety'] = coffee_df['variety'].where(variety_is_present, 'Unknown')

In [None]:
# Remove every row of the combined DataFrame that still has a null in any column
coffee_df = coffee_df.dropna(axis=0, how='any')

In [None]:
# Descriptive statistics for the numeric columns, printed under a heading
numeric_summary = coffee_df.describe()
print("Numerical Summary:\n")
print(numeric_summary)

# Data Cleaning

In [None]:
# Keep only the part of 'processing_method' that precedes the first '/'
method_parts = coffee_df['processing_method'].str.split('/', n=1)
coffee_df['processing_method'] = method_parts.str.get(0)

In [None]:
# Cast the 'altitude_low_meters' column to the int dtype
coffee_df = coffee_df.astype({'altitude_low_meters': int})

In [None]:
# Cast the 'altitude_high_meters' column to the int dtype
coffee_df = coffee_df.astype({'altitude_high_meters': int})

In [None]:
# List the distinct values found in every column
# ------------------------------------------------

# Map each column name to the array of its unique values
unique_values = {column: coffee_df[column].unique() for column in coffee_df.columns}

# Print each column name, its unique values, and two blank lines as a separator
for column, values in unique_values.items():
    print(f'Column: {column}', values, '\n', sep='\n')

In [None]:
# Harmonise inconsistent spellings in the 'region' and 'variety' columns.
# Each key is an exact (case-sensitive, whole-cell) value to look for; its value
# is what that cell is replaced with.
replacement_dict = {'Ethiopian Yirgacheffe': 'Yirgacheffe',
                    'Ethiopian Heirlooms': 'Heirloom',
                    'Sumatra Lintong': 'Lintong',
                    'ada okinawa japan': 'okinawa',
                    'lintong': 'sumatra',
                    'sumatra brastagi': 'sumatra',
                    'snnprg; kafa; telo woreda; shada kebele': 'kaffa zone'
                   }

# Series.replace accepts the whole mapping, so each column is rewritten in a
# single call instead of once per dictionary entry. As before, both columns are
# checked against every key. No replacement value equals any key, so applying
# the mapping at once gives the same result as applying the entries one by one.
for column_name in ('region', 'variety'):
    coffee_df[column_name] = coffee_df[column_name].replace(replacement_dict)

In [None]:
# Trim every 'region' value at its first comma, keeping only the leading part
# (for example 'a, b, c' becomes 'a').
#
# Why this is shorter than before while producing the same result:
# - The former second pattern '([^,]*,[^,]*),.*' could never match: it needs a
#   comma, and ',.*' has already removed every comma by the time it runs.
# - The str.contains() pre-check was redundant, since a regex replace with no
#   match leaves the values untouched.
# - The loop now uses the dictionary key instead of a hard-coded 'region', so
#   adding another column to the mapping works as expected.
patterns_for_column = {'region': [r',.*']}

# Apply each column's patterns in order, deleting whatever they match
for column, patterns in patterns_for_column.items():
    for pattern in patterns:
        coffee_df[column] = coffee_df[column].replace(pattern, '', regex=True)

# Display the result DataFrame
coffee_df.head()

In [None]:
# Standardise every text cell in a single pass over the DataFrame
def _normalize_text(value):
    """Return a cleaned copy of ``value`` if it is a string, else ``value`` unchanged.

    Cleaning steps, in order:
      1. strip leading and trailing whitespace
      2. convert to lower case
      3. replace each space, ';' and '-' with '_'
    """
    if not isinstance(value, str):
        return value
    cleaned = value.strip().lower()
    for separator in (' ', ';', '-'):
        cleaned = cleaned.replace(separator, '_')
    return cleaned


# Newer pandas releases deprecate DataFrame.applymap in favour of DataFrame.map,
# while older releases only have applymap, so use whichever one exists.
_apply_elementwise = coffee_df.map if hasattr(pd.DataFrame, 'map') else coffee_df.applymap
coffee_df = _apply_elementwise(_normalize_text)
coffee_df

In [None]:
# Write the final DataFrame to the Resources folder, leaving out the index column
output_csv_path = '../Resources/coffee_final.csv'
coffee_df.to_csv(output_csv_path, index=False)