## Clean up script for combined_raw
Decisions around data management will be recorded here.


In [1]:
# Dependencies
import pandas as pd
import numpy as np
import re

In [3]:
# Read CSV into pandas dataframe.
# The path is relative, so 'combined_raw.csv' must sit in the notebook's
# working directory.
df = pd.read_csv('combined_raw.csv')
# Preview the first rows as the cell output.
df.head()

Unnamed: 0.1,Unnamed: 0,quality_score,view_certificate_1,view_certificate_2,Cupping Protocol and Descriptors,View Green Analysis Details,Request a Sample,Species,Owner,Country of Origin,...,Quakers,Color,Category Two Defects,NA.3,Expiration,Certification Body,Certification Address,Certification Contact,Unnamed: 51,Notes
0,0,90.58,,,,,,Arabica,metad plc,Ethiopia,...,0.0,Green,0 full defects,,"April 3rd, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,
1,1,89.92,,,,,,Arabica,metad plc,Ethiopia,...,0.0,Green,1 full defects,,"April 3rd, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,
2,2,89.75,,,,,,Arabica,Grounds for Health Admin,Guatemala,...,0.0,,0 full defects,,"May 31st, 2011",Specialty Coffee Association,"117 W 4th St, Suite 300 Santa Ana, CA 92701",Chris Buck - (562) 624-4100,,
3,3,89.0,,,,,,Arabica,Yidnekachew Dabessa,Ethiopia,...,0.0,Green,2 full defects,,"March 25th, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,
4,4,88.83,,,,,,Arabica,metad plc,Ethiopia,...,0.0,Green,2 full defects,,"April 3rd, 2016",METAD Agricultural Development plc,"BAWA Center, 3rd Floor (Gerji), Addis Ababa, E...","Aman Adinew (Emebet Dinku) - +251-116-292534, ...",,


In [4]:
## Drop the columns the authors found to carry no data (certificate / sample
## links, placeholder "NA" columns, notes) plus the duplicated owner column and
## the index columns left over from earlier CSV exports.
df1 = df.drop(
    columns=[
        'Unnamed: 0',
        'Owner.1',
        'NA',
        'Unnamed: 51',
        'Notes',
        'view_certificate_1',
        'view_certificate_2',
        'Cupping Protocol and Descriptors',
        'View Green Analysis Details',
        'Request a Sample',
        'NA.1',
        'NA.2',
        'NA.3',
    ]
)
# Show the remaining column names as the cell output.
df1.columns

Index(['quality_score', 'Species', 'Owner', 'Country of Origin', 'Farm Name',
       'Lot Number', 'Mill', 'ICO Number', 'Company', 'Altitude', 'Region',
       'Producer', 'Number of Bags', 'Bag Weight', 'In-Country Partner',
       'Harvest Year', 'Grading Date', 'Variety', 'Status',
       'Processing Method', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body',
       'Balance', 'Uniformity', 'Clean Cup', 'Sweetness', 'Cupper Points',
       'Total Cup Points', 'Moisture', 'Category One Defects', 'Quakers',
       'Color', 'Category Two Defects', 'Expiration', 'Certification Body',
       'Certification Address', 'Certification Contact'],
      dtype='object')

In [5]:
## Normalise missing-value markers to an empty string:
##   1. every occurrence of the text 'n/a' (pattern match, so also inside
##      longer strings),
##   2. every NaN.
df1 = (
    df1
    .replace('n/a', "", regex=True)
    .replace(np.nan, "", regex=True)
)


In [6]:
# Give the row index the name 'ID'.
df1.index.name = 'ID'


In [7]:
## quality score convert to int
# NOTE(review): astype() returns a new DataFrame and its result is not assigned
# here, so df1 itself is left unchanged; this cell only displays the dtypes the
# conversion would produce.
df1.astype({'quality_score':'int32'}).dtypes

quality_score              int32
Species                   object
Owner                     object
Country of Origin         object
Farm Name                 object
Lot Number                object
Mill                      object
ICO Number                object
Company                   object
Altitude                  object
Region                    object
Producer                  object
Number of Bags             int64
Bag Weight                object
In-Country Partner        object
Harvest Year              object
Grading Date              object
Variety                   object
Status                    object
Processing Method         object
Aroma                    float64
Flavor                   float64
Aftertaste               float64
Acidity                  float64
Body                     float64
Balance                  float64
Uniformity               float64
Clean Cup                float64
Sweetness                float64
Cupper Points            float64
Total Cup 

In [8]:
# Two independent working copies of the blank-normalised frame:
# df2 is the frame the bag-weight and altitude cleaning below writes into;
# df3 is edited and scanned for 'lbs' rows in the next cells, and is later
# reassigned from df2.
df2 = df1.copy()
df3 = df1.copy()

In [9]:
## Blank out bag weights in df3 that mention BOTH "kg" and "lbs": their unit is
## ambiguous, so they must not be picked up by the later scan for "lbs" rows.

# Resolve the column position by name instead of hard-coding 13, so this cell
# keeps writing to 'Bag Weight' even if columns are added, dropped or reordered.
bag_weight_col = df3.columns.get_loc('Bag Weight')

for counter, row in enumerate(df3['Bag Weight']):
    if "kg" in row and "lbs" in row:
        df3.iloc[counter, bag_weight_col] = ""

In [10]:
## Remember the positions of every bag weight expressed in lbs so those rows
## can be converted to kg later on.
idx_lst = [
    position
    for position, weight in enumerate(df3['Bag Weight'])
    if "lbs" in weight
]
# Number of lbs rows found.
counter = len(idx_lst)

In [11]:
# In df2, replace bag weights that mention BOTH "kg" and "lbs" with the
# integer 0. The next cell recognises 0 and skips those rows.

# Resolve the column position by name instead of hard-coding 13, so this cell
# keeps writing to 'Bag Weight' even if columns are added, dropped or reordered.
bag_weight_col = df2.columns.get_loc('Bag Weight')

for counter, row in enumerate(df2['Bag Weight']):
    if "kg" in row and "lbs" in row:
        df2.iloc[counter, bag_weight_col] = 0

In [12]:
## Bag Weight: strip the unit text so only the number is left (as a string).
# Rows that contained both lbs and kg were set to 0 earlier and are skipped.

# Resolve the column position by name rather than the hard-coded 13.
bag_weight_col = df2.columns.get_loc('Bag Weight')

for counter, row in enumerate(df2['Bag Weight']):
    if row == 0:
        continue
    if "kg" in row or "lbs" in row:
        # re.sub's 4th positional argument is `count`, not `flags`: the original
        # call passed re.I there, which capped the number of substitutions at 2
        # and never enabled case-insensitive matching. Pass it as `flags=`.
        fixed = re.sub(r'[a-z]+', '', row, flags=re.I)
        # Drop the whitespace that separated the number from its unit.
        df2.iloc[counter, bag_weight_col] = fixed.strip()


In [14]:
#type conversion
# Cast the unit-stripped 'Bag Weight' values to integers.
# NOTE(review): presumably raises if any remaining value is not an integer
# string (e.g. a blank) — confirm no such rows are left at this point.

df2['Bag Weight'] = df2['Bag Weight'].astype(int)

In [15]:
## convert lbs to kg
# NOTE: not idempotent — re-running this cell converts the same rows again.
LBS_TO_KG = .453592

# Resolve the column position by name rather than the hard-coded 13.
bag_weight_col = df2.columns.get_loc('Bag Weight')

if idx_lst:
    # Make the column float up front; writing floats one cell at a time into
    # an integer column relies on an implicit upcast that pandas deprecates.
    df2['Bag Weight'] = df2['Bag Weight'].astype(float)
    # idx_lst holds row positions, so assign positionally via a plain array.
    df2.iloc[idx_lst, bag_weight_col] = (
        df2.iloc[idx_lst, bag_weight_col].to_numpy() * LBS_TO_KG
    )



## Altitude String Observations

### Observed
1. Since this data was collected from multilingual participants, the unit abbreviations differ.
    - MSNM: Spanish — meters above sea level
    - MASL: English — meters above sea level
    - F: English — feet
    - 公尺: Chinese — meter
2. There is the appearance of ranges (i.e. 640m-1400m).
    - This is difficult because it requires making a decision about the data. Maybe we set the data for ranges in separate columns.
    
### Conclusions
The majority of the data is in meters, so it will be our goal to convert all values to meters, therefore:
1. First, cleaned of any debris (non-values).
2. Second, the data must be sorted to find values that are not meters and convert them.
3. Third, the data must be split by lower and upper ranges.

### Results
After printing using a for loop and if statements, it was discovered:
<br>
 - 'm' is present 360 times. <br>
 - 'f' is present 26 times. <br>
 - Otherwise, a number is present 700 times. <br>
 - Garbage present 226 times. <br> <br>

Total count: 1312 <br>
Expected count: 1312

In [16]:
# Row count of df2, kept as the reference total for the altitude sanity check.
len_index = df2.shape[0]

## Cleaning Altitude Code
The following code iterates over each element in 'Altitude' and checks it against some rules. Does it contain 'm' for meter, 'f' for foot, neither? Once checked, it appends a corrected string into a corresponding new column for replacement.

In [17]:
##Fix altitude data

# Helper: does a string contain at least one digit?
def hasNumbers(inputString):
    """Return True if any character of ``inputString`` is a digit, else False."""
    for character in inputString:
        if character.isdigit():
            return True
    return False

# Assign alt column to variable (as series)
alt = df2['Altitude']

# Bare numbers (no unit marker) above this value are assumed to be feet.
FEET_THRESHOLD = 3000

# Per-category tallies, cross-checked against the row count at the end.
count_y = 0        # entries containing 'm' / '公尺' (treated as meters)
count_y_num = 0    # entries with digits but no 'm' and no 'f'
count_y_f = 0      # entries with digits and an 'f' (treated as feet)
count_garb = 0     # entries without any digit

str_meter = []
str_feet = []
str_garb = []
str_num = []

# New column for appending
canon_col = []


def _split_digits(text):
    """Split ``text`` into (digits and hyphens, every other character)."""
    kept = ""
    dropped = ""
    for char in text:
        if char.isdigit() or char == "-":
            kept += char
        else:
            dropped += char
    return kept, dropped


def _range_upper_bound(digits):
    """Return the largest number in a hyphen-separated string (0 if none parse)."""
    bounds = [int(part) for part in digits.split("-") if part.isdecimal()]
    return max(bounds, default=0)


for elt in alt:
    # if element contains m (suggesting meters)
    if 'm' in elt.lower() or '公尺' in elt.lower():
        count_y += 1

        ### --- Begin Canonicalizing --- ###

        # Print for user
        print(f"""
        ---(String Conversion 1)---
        Working on [{elt}]
        -------------------------
        """)

        # Keep digits and hyphens; everything else is discarded as garbage
        elt_new, elt_garb = _split_digits(elt)
        elt_new = elt_new + " meters"

        print(f"""
        Digit result: {elt_new}
        Non-digit result: {elt_garb}
        """)

        # Append canon_col with new string
        canon_col.append(elt_new)

        ### --- End Canonicalizing --- ###

    # if not, it must be a different type
    else:

        # if element has numbers, what kind?
        if hasNumbers(elt):

            # are they feet?
            if 'f' in elt.lower():
                count_y_f += 1
                str_feet.append(elt)

                ### --- Begin Canonicalizing --- ###

                # Print for user
                print(f"""
                ---(String Conversion 2)---
                Working on [{elt}]
                -------------------------
                """)

                # Keep digits and hyphens; everything else is garbage
                elt_new, elt_garb = _split_digits(elt)
                elt_new = elt_new + " feet"

                print(f"""
                Digit result: {elt_new}
                Non-digit result: {elt_garb}
                """)

                # Append canon_col with new string
                canon_col.append(elt_new)

                ### --- End Canonicalizing --- ###

            # otherwise: a number with no unit marker at all
            else:
                count_y_num += 1

                ### --- Begin Canonicalizing --- ###

                # Print for user
                print(f"""
                ---(String Conversion 3)---
                Working on [{elt}]
                -------------------------
                """)

                # Keep digits and hyphens; everything else is garbage
                elt_new, elt_garb = _split_digits(elt)

                # A single integer: guess the unit from its magnitude
                try:
                    if int(elt_new) > FEET_THRESHOLD:
                        elt_new = elt_new + " feet"
                    else:
                        elt_new = elt_new + " meters"
                    print(f"""
                    Digit result: {elt_new}
                    Non-digit result: {elt_garb}
                    """)

                # Not a single integer (contains '-'): treat it as a range.
                # Only ValueError is caught so unrelated errors still surface.
                except ValueError:

                    # Guess the unit from the largest bound. Empty or
                    # unparsable parts (e.g. a trailing '-') are ignored
                    # instead of raising.
                    if _range_upper_bound(elt_new) > FEET_THRESHOLD:
                        elt_new = elt_new + " feet"
                    else:
                        elt_new = elt_new + " meters"

                    print(f"!!! — Converted range: [{elt_new}]")

                # Append canon_col with new string
                canon_col.append(elt_new)

                ### --- End Canonicalizing --- ###
        else:
            # No digits at all: keep the raw text so the row count still matches
            count_garb += 1
            canon_col.append(elt)

# Sanity check: every row must be counted once and have one canonical value
q_sum = count_y+count_y_f+count_y_num+count_garb

if q_sum != len_index or len(canon_col) != len_index:
    print("ERROR")
else:
    print("----- SUCCESS -----")
            
print(f"""
Meters is present {count_y} times.
Feet is present {count_y_f} times.
Otherwise, number is present {count_y_num} times.
Garbage present {count_garb} times.
---
Process count: {count_y+count_y_f+count_y_num+count_garb}
Canon count (canon_col): {len(canon_col)}
Expected count: {len_index}
""")

      -------------------------
                

                    Digit result: 1400 meters
                    Non-digit result: 
                    

                ---(String Conversion 3)---
                Working on [1200]
                -------------------------
                

                    Digit result: 1200 meters
                    Non-digit result: 
                    

                ---(String Conversion 3)---
                Working on [1500]
                -------------------------
                

                    Digit result: 1500 meters
                    Non-digit result: 
                    

        ---(String Conversion 1)---
        Working on [4000 P.S.N.M]
        -------------------------
        

        Digit result: 4000 meters
        Non-digit result:  P.S.N.M
        

                ---(String Conversion 3)---
                Working on [4300]
                -------------------------
                

                    Di

In [18]:
# Split the canonical "<value> <unit>" strings into a value and a unit column
df2['New_Altitude'] = canon_col
# n=1: split on the first space only, so the frame has at most two columns.
list_split = df2["New_Altitude"].str.split(" ", n=1, expand = True)
df2['New_Altitude'] = list_split[0]
# Keep only real units. Free-text entries with several words would otherwise
# leak their remaining words into Altitude_Units.
df2['Altitude_Units'] = list_split[1].where(list_split[1].isin(["meters", "feet"]))

In [19]:
# Get rid of random string values: blank every entry that has no digit in it
new_altitude_col = df2.columns.get_loc('New_Altitude')

for idx, i in enumerate(df2['New_Altitude']):
    if not hasNumbers(i):
        # Write through a single .iloc call. The chained form
        # df2['New_Altitude'][idx] = ... may assign to a temporary copy.
        df2.iloc[idx, new_altitude_col] = ""
    # Show the value as it was before any blanking
    print(i)

1950-2200
1950-2200
1600-1800
1800-2200
1950-2200


1570-1700
1570-1700
1795-1850
1855-1955
1872
1943
2000
1570-1700
2080
1200-1800

1450
1700-2000
2019
1300
1320
2112

1250

1950
1400
1200


1300
1300
1750-1800
1800



1941
1300
12
12
1000
1754
1250

900-1500
1520-2200
1400-1900
1500-2000
1400

1400-1900
1800
1600
1800-2000
5000
4650


1700
1500
13001400

1680
1900
1800-2000
1600
5600-5760
1800-2000
1950

1250
1400
1700
1770
1300
1300
1600
1200



1550
1550
1250-1400
1400

2560
1700

2136



1800

1900
1300
-1
1580
1100
1400

3500-5200

16001950
800
1620
16001950
1200
1200
1000

350
1400-1900

1500-2000
1500-2000

1800
1450
1200
800-1050
1350
1200
170
1150
1600
1750
1900
1750
1200

1000
1550
2000
4300
5000
1200
1700-1850

1500-2000
1754
1500

1800
5000
4000
1550
16001900

1750
14501700
1800
1300
1100
1200
1750
1680
1700
1800
1100
4300
2000
1350
1500


TEST
1300
1400

16001950
1200

900

1800
1500

1500

442
1800
1700
12001350
900-1200
3600-6200
4400-4700
1000
1200
2560
1500
3607
1400


In [20]:
# Repair single-number altitudes that are implausible:
#   > 9000 : assumed to be a range whose hyphen was lost, so it is re-split
#            after the 4th digit (e.g. "13001400" -> "1300-1400")
#   < 500  : blanked
# NOTE(review): the fixed 4-character split gives odd results for values that
# are not two fused 4-digit numbers (e.g. "11000" -> "1100-0") — confirm
# whether such rows should be blanked instead.
new_altitude_col = df2.columns.get_loc('New_Altitude')

for idx, i in enumerate(df2['New_Altitude']):
    try:
        altitude = int(i)
    except (ValueError, TypeError):
        # Ranges ("1200-1800"), blanks and missing values are left untouched.
        continue

    if altitude > 9000:
        fixed_range = i[:4]+'-'+i[4:]
        print(fixed_range)
        # Store the string itself. The original stored the return value of
        # print(), i.e. None, which wiped the entry.
        df2.iloc[idx, new_altitude_col] = fixed_range
    elif altitude < 500:
        df2.iloc[idx, new_altitude_col] = ""

1300-1400
1600-1950
1600-1950
1600-1900
1450-1700
1600-1950
1200-1350
1600-1950
1600-1950
1600-1950
9001-100
1600-1950
1600-1950
1600-1950
1600-1950
1600-1950
1600-1950
1600-1950
1600-1950
1600-1950
1600-1950
1600-1950
1600-1950
1300-1800
1600-1950
1100-0
1600-1950
1600-1950
1400-1850
1600-1950
1800-5900
1600-1950
1600-1950
1600-1950
1200-1400
1600-1950
1600-1950
1300-1800
1600-1950
1600-1950
1901-64
1002-000
7001-400
1100-00
1300-1500
1901-64
1300-1800


In [21]:
# Break "lower-upper" altitude strings apart on the hyphen and store the first
# two pieces as the lower and upper bound columns.
list_split = df2["New_Altitude"].str.split("-", expand = True)
for part_idx, bound_name in enumerate(('Lower_Altitude', 'Upper_Altitude')):
    df2[bound_name] = list_split[part_idx]



In [22]:
# Build df3 from the cleaned df2 without the raw altitude columns and the
# listed identifier columns. This replaces the earlier scratch df3.
df3 = df2.drop(columns=['Altitude', 'New_Altitude', 'ICO Number', 'Species', 'Lot Number', 'Mill'])
# Write an intermediate snapshot. It is taken before the Harvest Year cleaning
# below, and the file name has no extension.
# NOTE(review): confirm 'clean_cqi_' is the intended name.
df3.to_csv('clean_cqi_')

In [23]:
## Blank every Harvest Year entry that is not in the YYYY format
# Look the column up by name. The original wrote to position 15, which is
# 'Harvest Year' in df2 but 'Aroma' in df3 (five columns ahead of it were
# dropped), so the Aroma score was blanked instead of the year.
harvest_year_col = df3.columns.get_loc('Harvest Year')

for counter, row in enumerate(df3['Harvest Year']):
    # Anything that is not exactly 4 characters, and the literal "TEST"
    # placeholder, is not a usable year.
    if len(row) != 4 or row == "TEST":
        df3.iloc[counter, harvest_year_col] = ""

In [24]:
# Replace every Harvest Year value that does NOT start with four digits by NaN.
# The negative look-ahead rejects a 4-digit prefix; an empty string also
# matches the pattern and therefore becomes NaN.
df3['Harvest Year']= df3['Harvest Year'].replace(r'^(?![0-9]{4}).*$', np.nan, regex=True)

In [25]:
# Work on a copy whose column names use underscores instead of spaces.
df3_rename = df3.copy()

# Iterate the columns directly. The original bound them to a variable named
# `list`, which shadowed the builtin for the rest of the notebook session.
res = [sub.replace(' ', '_') for sub in df3_rename.columns]
df3_rename.columns = res
df3_rename.columns

Index(['quality_score', 'Owner', 'Country_of_Origin', 'Farm_Name', 'Company',
       'Region', 'Producer', 'Number_of_Bags', 'Bag_Weight',
       'In-Country_Partner', 'Harvest_Year', 'Grading_Date', 'Variety',
       'Status', 'Processing_Method', 'Aroma', 'Flavor', 'Aftertaste',
       'Acidity', 'Body', 'Balance', 'Uniformity', 'Clean_Cup', 'Sweetness',
       'Cupper_Points', 'Total_Cup_Points', 'Moisture', 'Category_One_Defects',
       'Quakers', 'Color', 'Category_Two_Defects', 'Expiration',
       'Certification_Body', 'Certification_Address', 'Certification_Contact',
       'Altitude_Units', 'Lower_Altitude', 'Upper_Altitude'],
      dtype='object')

In [34]:
# Take the first four characters of each Harvest_Year value (after casting to
# str) as the candidate year; the result is stored in a new column.
df3_rename['Harvest Year Corrected'] = df3_rename['Harvest_Year'].astype(str).str[:4]

In [37]:
# Parse the candidate years as numbers; anything unparsable becomes NaN.
corrected_year = pd.to_numeric(df3_rename['Harvest Year Corrected'], errors='coerce')
df3_rename['Harvest Year Corrected'] = corrected_year
# Keep only the rows that ended up with a valid year.
df3_rename = df3_rename[df3_rename['Harvest Year Corrected'].notna()]

In [39]:
# Write the final cleaned table; index=False leaves the row index out of the file.
df3_rename.to_csv('Combined_Quality_Cleaned.csv', index = False)