# Summary of Analysis and Cleaning strategy 
- Review all columns and null values
- Change column names to lower case
- Replace '.' with '_' in column names for easier reading
- Drop columns not needed for the analysis and handle the remaining null values

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Reading in Dataset 

In [2]:
# Load the cleaned Arabica CSV (path is relative to the notebook) into a
# DataFrame and preview the first rows.
csv_path = 'Resources/arabica_data_cleaned.csv'
coffee_df = pd.read_csv(csv_path)
coffee_df.head()

Unnamed: 0.1,Unnamed: 0,Species,Owner,Country.of.Origin,Farm.Name,Lot.Number,Mill,ICO.Number,Company,Altitude,...,Color,Category.Two.Defects,Expiration,Certification.Body,Certification.Address,Certification.Contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
0,1,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,0,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
1,2,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,1,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
2,3,Arabica,grounds for health admin,Guatemala,"san marcos barrancas ""san cristobal cuch",,,,,1600 - 1800 m,...,,0,"May 31st, 2011",Specialty Coffee Association,36d0d00a3724338ba7937c52a378d085f2172daa,0878a7d4b9d35ddbf0fe2ce69a2062cceb45a660,m,1600.0,1800.0,1700.0
3,4,Arabica,yidnekachew dabessa,Ethiopia,yidnekachew dabessa coffee plantation,,wolensu,,yidnekachew debessa coffee plantation,1800-2200,...,Green,2,"March 25th, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1800.0,2200.0,2000.0
4,5,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,...,Green,2,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0


# Exploring the coffee_df 

In [3]:
# Print a concise summary of coffee_df: the index range plus each column's
# non-null count and dtype. Used here to spot columns with missing values.
coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1311 entries, 0 to 1310
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             1311 non-null   int64  
 1   Species                1311 non-null   object 
 2   Owner                  1304 non-null   object 
 3   Country.of.Origin      1310 non-null   object 
 4   Farm.Name              955 non-null    object 
 5   Lot.Number             270 non-null    object 
 6   Mill                   1001 non-null   object 
 7   ICO.Number             1165 non-null   object 
 8   Company                1102 non-null   object 
 9   Altitude               1088 non-null   object 
 10  Region                 1254 non-null   object 
 11  Producer               1081 non-null   object 
 12  Number.of.Bags         1311 non-null   int64  
 13  Bag.Weight             1311 non-null   object 
 14  In.Country.Partner     1311 non-null   object 
 15  Harv

In [4]:
# List the original column labels so we can see which ones need renaming
# (mixed case, '.' used as a word separator).
coffee_df.columns

Index(['Unnamed: 0', 'Species', 'Owner', 'Country.of.Origin', 'Farm.Name',
       'Lot.Number', 'Mill', 'ICO.Number', 'Company', 'Altitude', 'Region',
       'Producer', 'Number.of.Bags', 'Bag.Weight', 'In.Country.Partner',
       'Harvest.Year', 'Grading.Date', 'Owner.1', 'Variety',
       'Processing.Method', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body',
       'Balance', 'Uniformity', 'Clean.Cup', 'Sweetness', 'Cupper.Points',
       'Total.Cup.Points', 'Moisture', 'Category.One.Defects', 'Quakers',
       'Color', 'Category.Two.Defects', 'Expiration', 'Certification.Body',
       'Certification.Address', 'Certification.Contact', 'unit_of_measurement',
       'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters'],
      dtype='object')

# Refining column names 
- changing all column names to lower case 
- changing all '.' to '_' for easier reading

In [5]:
# Normalize the column labels in one pass: lower-case them for ease of access,
# then replace every '.' with '_' for easier reading.
# regex=False makes '.' a literal dot instead of the regex "any character"
# wildcard, and avoids depending on the version-specific default of `regex`
# in pandas' str.replace.
coffee_df.columns = (
    coffee_df.columns
    .str.lower()
    .str.replace('.', '_', regex=False)
)

# confirm all columns renamed
coffee_df.columns

  coffee_df.columns = coffee_df.columns.str.replace('.','_')


Index(['unnamed: 0', 'species', 'owner', 'country_of_origin', 'farm_name',
       'lot_number', 'mill', 'ico_number', 'company', 'altitude', 'region',
       'producer', 'number_of_bags', 'bag_weight', 'in_country_partner',
       'harvest_year', 'grading_date', 'owner_1', 'variety',
       'processing_method', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body',
       'balance', 'uniformity', 'clean_cup', 'sweetness', 'cupper_points',
       'total_cup_points', 'moisture', 'category_one_defects', 'quakers',
       'color', 'category_two_defects', 'expiration', 'certification_body',
       'certification_address', 'certification_contact', 'unit_of_measurement',
       'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters'],
      dtype='object')

In [6]:
# Drop the columns that are not used in the analysis. Only the species, origin,
# region, variety, processing method, sensory scores and the low/high altitude
# columns remain afterwards.
# Each label is listed exactly once (the original list repeated 'cupper_points').
columns_to_drop = [
    'unnamed: 0',
    'owner',
    'lot_number',
    'mill',
    'uniformity',
    'farm_name',
    'producer',
    'total_cup_points',
    'cupper_points',
    'ico_number',
    'company',
    'altitude',
    'number_of_bags',
    'bag_weight',
    'in_country_partner',
    'harvest_year',
    'grading_date',
    'owner_1',
    'moisture',
    'category_one_defects',
    'quakers',
    'color',
    'category_two_defects',
    'expiration',
    'certification_body',
    'certification_address',
    'certification_contact',
    'unit_of_measurement',
    'altitude_mean_meters',
]
coffee_df = coffee_df.drop(columns=columns_to_drop)

In [7]:
# Confirm that only the columns kept for the analysis remain after the drop
coffee_df.columns

Index(['species', 'country_of_origin', 'region', 'variety',
       'processing_method', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body',
       'balance', 'clean_cup', 'sweetness', 'altitude_low_meters',
       'altitude_high_meters'],
      dtype='object')

In [8]:
# Replace missing entries in the 'variety' column with the label 'Unknown'
variety_filled = coffee_df['variety'].fillna('Unknown')
coffee_df['variety'] = variety_filled

In [9]:
# Drop every row that still contains a null in ANY remaining column.
# NOTE: this is broader than the altitude columns alone - no `subset` is
# passed, so a null in any column removes the row.
coffee_df = coffee_df.dropna(axis=0, how='any')

In [10]:
# Display the DataFrame after null handling (the notebook renders a truncated
# view of long frames; the original row index is kept, so labels have gaps).
coffee_df

Unnamed: 0,species,country_of_origin,region,variety,processing_method,aroma,flavor,aftertaste,acidity,body,balance,clean_cup,sweetness,altitude_low_meters,altitude_high_meters
0,Arabica,Ethiopia,guji-hambela,Unknown,Washed / Wet,8.67,8.83,8.67,8.75,8.50,8.42,10.00,10.00,1950.00,2200.00
1,Arabica,Ethiopia,guji-hambela,Other,Washed / Wet,8.75,8.67,8.50,8.58,8.42,8.42,10.00,10.00,1950.00,2200.00
3,Arabica,Ethiopia,oromia,Unknown,Natural / Dry,8.17,8.58,8.42,8.42,8.50,8.25,10.00,10.00,1800.00,2200.00
4,Arabica,Ethiopia,guji-hambela,Other,Washed / Wet,8.25,8.50,8.25,8.50,8.42,8.33,10.00,10.00,1950.00,2200.00
9,Arabica,Ethiopia,"snnp/kaffa zone,gimbowereda",Other,Natural / Dry,8.08,8.58,8.50,8.50,7.67,8.42,10.00,10.00,1795.00,1850.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,Arabica,Honduras,marcala,Catuai,Washed / Wet,7.00,6.33,6.17,6.50,6.67,6.17,8.00,8.00,1450.00,1450.00
1306,Arabica,Mexico,juchique de ferrer,Bourbon,Washed / Wet,7.08,6.83,6.25,7.42,7.25,6.75,0.00,10.00,900.00,900.00
1307,Arabica,Haiti,"department d'artibonite , haiti",Typica,Natural / Dry,6.75,6.58,6.42,6.67,7.08,6.67,6.00,6.00,350.00,350.00
1308,Arabica,Nicaragua,jalapa,Caturra,Other,7.25,6.58,6.33,6.25,6.42,6.08,6.00,6.00,1100.00,1100.00


In [11]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns, printed as plain text under a heading.
numeric_summary = coffee_df.describe()
print("Numerical Summary:\n")
print(numeric_summary)

Numerical Summary:

             aroma       flavor   aftertaste      acidity         body  \
count  1002.000000  1002.000000  1002.000000  1002.000000  1002.000000   
mean      7.566806     7.513882     7.385679     7.526088     7.504531   
std       0.300916     0.323882     0.331790     0.307696     0.272921   
min       5.080000     6.170000     6.170000     5.250000     6.330000   
25%       7.420000     7.330000     7.170000     7.330000     7.330000   
50%       7.580000     7.500000     7.420000     7.500000     7.500000   
75%       7.750000     7.670000     7.580000     7.670000     7.670000   
max       8.750000     8.830000     8.670000     8.750000     8.500000   

           balance    clean_cup    sweetness  altitude_low_meters  \
count  1002.000000  1002.000000  1002.000000          1002.000000   
mean      7.499721     9.856866     9.931387          1799.213877   
std       0.340405     0.739259     0.423847          9117.804902   
min       6.080000     0.000000     1

# Data Cleaning - Reviewing unique values per column

In [12]:
# Map every column name to the array of distinct values it contains
unique_values = {name: coffee_df[name].unique() for name in coffee_df.columns}

# Print each column's distinct values, followed by blank lines as a separator
for column, values in unique_values.items():
    print(f'Column: {column}', values, '\n', sep='\n')

Column: species
['Arabica']


Column: country_of_origin
['Ethiopia' 'United States' 'China' 'Costa Rica' 'Mexico' 'Brazil'
 'Uganda' 'Taiwan' 'Kenya' 'Thailand' 'Colombia' 'Panama' 'Guatemala'
 'Papua New Guinea' 'El Salvador' 'Indonesia'
 'Tanzania, United Republic Of' 'Honduras' 'Japan' 'Nicaragua' 'Ecuador'
 'United States (Puerto Rico)' 'Haiti' 'Burundi' 'Vietnam' 'Philippines'
 'Rwanda' 'Malawi' 'Laos' 'Zambia' 'Myanmar' 'Cote d?Ivoire' 'Peru']


Column: region
['guji-hambela' 'oromia' 'snnp/kaffa zone,gimbowereda' 'antioquia'
 'yunnan' 'gedio' 'san ramon' 'xalapa' 'south of minas'
 'kapchorwa eastern' 'leye, alishan township, chiayi county'
 'vale da grama' 'west and central valley' 'muranga' 'chiang rai'
 'natou county' 'nyeri' 'tolima' 'kiambu' 'sipi, mt elgon' 'eastern'
 'huila' 'boquete' 'acatenango' 'nuevo oriente'
 'eastern highlands province' 'apaneca'
 'ataco, apaneca - ilamatepec mountain range' 'kirinyaga'
 'bulambuli eastern region' 'huehuetenango' 'kapchorwa' 'west va

In [13]:
# Keep only the text before the first '/' in 'processing_method'
# (e.g. 'Washed / Wet' -> 'Washed'). Values without a '/' are left as they are.
# The split alone would leave the space that precedes the '/', so the result
# is stripped to avoid labels with trailing whitespace such as 'Washed '.
coffee_df['processing_method'] = (
    coffee_df['processing_method']
    .str.split('/')
    .str[0]
    .str.strip()
)

In [14]:
# Cast altitude_low_meters from float to int; the other columns keep their dtypes
coffee_df = coffee_df.astype({'altitude_low_meters': int})

In [15]:
# Cast altitude_high_meters from float to int; the other columns keep their dtypes
coffee_df = coffee_df.astype({'altitude_high_meters': int})

In [16]:
# Disabled experiment: extract the text between '/' and ',' in 'region'.
# Left commented out - as written, any region value that does not contain a '/'
# followed later by a ',' has no regex match and would be replaced with None.
#coffee_df['region'] = coffee_df['region'].apply(lambda x: re.search('/(.*?),', x).group(1) if re.search('/(.*?),', x) else None)

In [17]:
# Display the DataFrame after shortening processing_method and casting the
# altitude columns to int (rendered as a truncated view).
coffee_df

Unnamed: 0,species,country_of_origin,region,variety,processing_method,aroma,flavor,aftertaste,acidity,body,balance,clean_cup,sweetness,altitude_low_meters,altitude_high_meters
0,Arabica,Ethiopia,guji-hambela,Unknown,Washed,8.67,8.83,8.67,8.75,8.50,8.42,10.00,10.00,1950,2200
1,Arabica,Ethiopia,guji-hambela,Other,Washed,8.75,8.67,8.50,8.58,8.42,8.42,10.00,10.00,1950,2200
3,Arabica,Ethiopia,oromia,Unknown,Natural,8.17,8.58,8.42,8.42,8.50,8.25,10.00,10.00,1800,2200
4,Arabica,Ethiopia,guji-hambela,Other,Washed,8.25,8.50,8.25,8.50,8.42,8.33,10.00,10.00,1950,2200
9,Arabica,Ethiopia,"snnp/kaffa zone,gimbowereda",Other,Natural,8.08,8.58,8.50,8.50,7.67,8.42,10.00,10.00,1795,1850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,Arabica,Honduras,marcala,Catuai,Washed,7.00,6.33,6.17,6.50,6.67,6.17,8.00,8.00,1450,1450
1306,Arabica,Mexico,juchique de ferrer,Bourbon,Washed,7.08,6.83,6.25,7.42,7.25,6.75,0.00,10.00,900,900
1307,Arabica,Haiti,"department d'artibonite , haiti",Typica,Natural,6.75,6.58,6.42,6.67,7.08,6.67,6.00,6.00,350,350
1308,Arabica,Nicaragua,jalapa,Caturra,Other,7.25,6.58,6.33,6.25,6.42,6.08,6.00,6.00,1100,1100
