## This is the notebook for the new coffee.csv file we found.

Cleaning and exploration code below...

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim
import re

In [2]:
# Read the csv into a Pandas DataFrame
coffee_df = pd.read_csv('../Resources/coffee.csv', encoding='utf-8')
coffee_df.head()

Unnamed: 0,all_text,name,rating,roaster,slug,region_africa_arabia,region_caribbean,region_central_america,region_hawaii,region_asia_pacific,...,aroma,acid,body,flavor,aftertaste,with_milk,desc_1,desc_2,desc_3,desc_4
0,\n\n\n\n \n93\nFlight Coffee Co.\nEthiopia Der...,Ethiopia Deri Kochoha,93,Flight Coffee Co.,/review/ethiopia-deri-kochoha-2,1,0,0,0,0,...,9.0,8.0,9.0,9.0,8.0,,"Bright, crisp, sweetly tart. Citrus medley, ca...",From the Deri Kochoha mill in the Hagere Marya...,A poised and melodic wet-processed Ethiopia co...,
1,\n\n\n\n\n91\nDoi Chaang Coffee\nEspresso\nLoc...,Espresso,91,Doi Chaang Coffee,/review/espresso-14,0,0,0,0,1,...,8.0,,8.0,8.0,8.0,9.0,"Evaluated as espresso. Deeply rich, sweetly ro...",Doi Chaang is a single-estate coffee produced ...,"A rich, resonant espresso from Thailand, espec...",
2,\n\n\n\n \n95\nTemple Coffee and Tea\nKenya Ru...,Kenya Ruthaka Peaberry,95,Temple Coffee and Tea,/review/kenya-ruthaka-peaberry,1,0,0,0,0,...,9.0,8.0,9.0,10.0,8.0,,"Deeply sweet, richly savory. Dark chocolate, p...",Despite challenges ranging from contested gove...,"A high-toned, nuanced Kenya cup, classic in it...",
3,\n\n\n\n \n93\nTemple Coffee and Tea\nEthiopia...,Ethiopia Gora Kone Sidamo,93,Temple Coffee and Tea,/review/ethiopia-gora-kone-sidamo,1,0,0,0,0,...,9.0,8.0,9.0,9.0,8.0,,"Fruit-forward, richly chocolaty. Raspberry cou...",Southern Ethiopia coffees like this one are la...,"A playful, unrestrained fruit bomb of a coffee...",
4,\n\n\n\n\n93\nChoosy Gourmet\nSpecialty Coffee...,Specialty Coffee Blend Espresso,93,Choosy Gourmet,/review/specialty-coffee-blend-espresso,0,0,0,0,0,...,9.0,,8.0,9.0,8.0,9.0,"Evaluated as espresso. Rich, chocolaty, sweetl...",A blend of coffees from Ethiopia (natural-proc...,An espresso blend in which spice notes — in pa...,


In [3]:
# Get a brief summary of DataFrame
coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5124 entries, 0 to 5123
Data columns (total 34 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   all_text                5124 non-null   object 
 1   name                    5124 non-null   object 
 2   rating                  5124 non-null   object 
 3   roaster                 5124 non-null   object 
 4   slug                    5124 non-null   object 
 5   region_africa_arabia    5124 non-null   int64  
 6   region_caribbean        5124 non-null   int64  
 7   region_central_america  5124 non-null   int64  
 8   region_hawaii           5124 non-null   int64  
 9   region_asia_pacific     5124 non-null   int64  
 10  region_south_america    5124 non-null   int64  
 11  type_espresso           5124 non-null   int64  
 12  type_organic            5124 non-null   int64  
 13  type_fair_trade         5124 non-null   int64  
 14  type_decaffeinated      5124 non-null   

In [4]:
# drop unnecessary columns 
coffee_df = coffee_df.drop(columns=['all_text', 'est_price', 'review_date', 'agtron', 'location', 'with_milk', 'desc_3', 'desc_4'])

In [5]:
#coffee_df.head()

In [6]:
# Specify the columns to clean
columns_to_clean = ['roaster','desc_1', 'desc_2']

# Define a regular expression to match special characters
special_characters_pattern = r'[^a-zA-Z0-9\s,]'

# Clean each specified column
for column in columns_to_clean:
    coffee_df[column] = coffee_df[column].str.replace(special_characters_pattern, '', regex=True)

TypeError: expected string or bytes-like object, got 'list'

In [None]:
# Apply the lower method to all columns using applymap
coffee_df = coffee_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [None]:
# Replace hyphens with underscores in the specified column
coffee_df['roast'] = coffee_df['roast'].str.replace('-', '_')

In [None]:
coffee_df.info()

### Work with the 'origin' column

In [None]:
coffee_df['origin'].value_counts()

In [None]:
# Remove the dot at the end of string in new 'country_of_origin' column
coffee_df['origin'] = coffee_df['origin'].str.rstrip('.')
coffee_df['origin'].value_counts()

In [None]:
# Delete rows where country is listed as 'not disclosed'
coffee_df = coffee_df[coffee_df['origin'] != 'Not disclosed']
coffee_df = coffee_df[coffee_df['origin'] != 'Not Disclosed']

In [None]:
# Delete rows with NaN values
coffee_df = coffee_df.dropna(subset=['origin'])

In [None]:
# Specify the column with strings
columnName = 'origin'

# Create a new column to store the split result
newColumnName = 'country_of_origin'

# Function to split the last word and add it to a new column
def split_last_word_except_semicolon(text):
    if pd.isna(text):  # Check if the value is NaN
        return ''
    words = re.split(r'\s*;\s*|\s+', text)
    return words[-1]

# Apply the function to create a new column
coffee_df['country_of_origin'] = coffee_df['origin'].apply(split_last_word_except_semicolon)

# Print the updated DataFrame
coffee_df.head()

In [None]:
coffee_df['country_of_origin'].value_counts()

In [None]:
# Change the datatype of the specified column to string
coffee_df['country_of_origin'] = coffee_df['country_of_origin'].astype(str)

# Create a function to identify instances of 'origin' where the value count is less than a certain number
def single_instances_grouped(instances, cutoff):
    instances_map = {}
    for i in range(len(instances)):
        if instances.values[i] >= cutoff:
            instances_map[instances.index[i]] = instances.index[i]
        else:
            instances_map[instances.index[i]] = 'other'
    return instances_map

origin_map = single_instances_grouped(coffee_df.country_of_origin.value_counts(), 12)
coffee_df['country_of_origin'] = coffee_df['country_of_origin'].map(origin_map)
coffee_df.country_of_origin.value_counts()

In [None]:
# Drop the rows with blank values from the original DataFrame
rows_with_blank_values = coffee_df[coffee_df['country_of_origin'].isna() | (coffee_df['country_of_origin'] == '')]

if not rows_with_blank_values.empty:
    coffee_df.drop(rows_with_blank_values.index, inplace=True)

coffee_df.country_of_origin.value_counts()

In [None]:
coffee_df.country_of_origin.value_counts().sum()

In [None]:
coffee_df = coffee_df.drop(columns='origin')

In [None]:
coffee_df.info()

In [None]:
# Reorder columns in the DataFrame
coffee_df = coffee_df[['slug', 'name', 'roaster', 'roast', 'country_of_origin', 'desc_1', 'desc_2', 'rating', 
                      'region_africa_arabia', 'region_caribbean', 'region_central_america', 'region_hawaii', 'region_asia_pacific', 'region_south_america', 
                      'type_espresso', 'type_organic', 'type_fair_trade', 'type_decaffeinated', 'type_pod_capsule', 'type_blend', 'type_estate', 
                      'aroma', 'acid', 'body', 'flavor', 'aftertaste']]

In [None]:
coffee_df.info()

In [None]:
# # Function to get latitude and longitude
# def get_lat_lon(country_name):
#     geolocator = Nominatim(user_agent="my_geocoder")
    
#     try:
#         location = geolocator.geocode(country_name)
#         if location is not None:
#             return location.latitude, location.longitude
#         else:
#             return None, None
#     except Exception as e:
#         print(f"Error: {e}")
#         return None, None

# # Apply the function to the 'Country' column
# coffee_df[['Latitude', 'Longitude']] = coffee_df['country_of_origin'].apply(lambda x: pd.Series(get_lat_lon(x)))

# # Print the updated DataFrame
# coffee_df.head()

In [None]:
# # Create lat and lon columns and use GeoFy library to extract the coordinates of each country

# def get_lat_lon_for_country(country_name):
#     geolocator = Nominatim(user_agent="country_locator")
#     location = geolocator.geocode(country_name)
    
#     if location:
#         return location.latitude, location.longitude
#     else:
#         return None

# # Create new columns for latitude and longitude
# coffee_df['latitude'] = None
# coffee_df['longitude'] = None

# # Iterate through rows and geocode each country
# for index, row in coffee_df.iterrows():
#     country_name = row['country_of_origin']  # Replace 'CountryColumnName' with the actual column name in your CSV
#     lat_lon = get_lat_lon_for_country(country_name)
    
#     if lat_lon:
#         coffee_df.at[index, 'latitude'] = lat_lon[0]
#         coffee_df.at[index, 'longitude'] = lat_lon[1]

# coffee_df.head()

In [None]:
coffee_df.to_csv('../Resources/NEW_coffee_final.csv', index=False)