## This is the notebook for the new coffee.csv file we found.

Cleaning and exploration code below...

In [None]:
#!pip install geopy

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim
import pycountry
import re
from sqlalchemy import create_engine, text

In [2]:
# Read the csv into a Pandas DataFrame
coffee_df = pd.read_csv('../Resources/coffee.csv', encoding='utf-8')
coffee_df.head()

Unnamed: 0,all_text,name,rating,roaster,slug,region_africa_arabia,region_caribbean,region_central_america,region_hawaii,region_asia_pacific,...,aroma,acid,body,flavor,aftertaste,with_milk,desc_1,desc_2,desc_3,desc_4
0,\n\n\n\n \n93\nFlight Coffee Co.\nEthiopia Der...,Ethiopia Deri Kochoha,93,Flight Coffee Co.,/review/ethiopia-deri-kochoha-2,1,0,0,0,0,...,9.0,8.0,9.0,9.0,8.0,,"Bright, crisp, sweetly tart. Citrus medley, ca...",From the Deri Kochoha mill in the Hagere Marya...,A poised and melodic wet-processed Ethiopia co...,
1,\n\n\n\n\n91\nDoi Chaang Coffee\nEspresso\nLoc...,Espresso,91,Doi Chaang Coffee,/review/espresso-14,0,0,0,0,1,...,8.0,,8.0,8.0,8.0,9.0,"Evaluated as espresso. Deeply rich, sweetly ro...",Doi Chaang is a single-estate coffee produced ...,"A rich, resonant espresso from Thailand, espec...",
2,\n\n\n\n \n95\nTemple Coffee and Tea\nKenya Ru...,Kenya Ruthaka Peaberry,95,Temple Coffee and Tea,/review/kenya-ruthaka-peaberry,1,0,0,0,0,...,9.0,8.0,9.0,10.0,8.0,,"Deeply sweet, richly savory. Dark chocolate, p...",Despite challenges ranging from contested gove...,"A high-toned, nuanced Kenya cup, classic in it...",
3,\n\n\n\n \n93\nTemple Coffee and Tea\nEthiopia...,Ethiopia Gora Kone Sidamo,93,Temple Coffee and Tea,/review/ethiopia-gora-kone-sidamo,1,0,0,0,0,...,9.0,8.0,9.0,9.0,8.0,,"Fruit-forward, richly chocolaty. Raspberry cou...",Southern Ethiopia coffees like this one are la...,"A playful, unrestrained fruit bomb of a coffee...",
4,\n\n\n\n\n93\nChoosy Gourmet\nSpecialty Coffee...,Specialty Coffee Blend Espresso,93,Choosy Gourmet,/review/specialty-coffee-blend-espresso,0,0,0,0,0,...,9.0,,8.0,9.0,8.0,9.0,"Evaluated as espresso. Rich, chocolaty, sweetl...",A blend of coffees from Ethiopia (natural-proc...,An espresso blend in which spice notes — in pa...,


In [3]:
# Get a brief summary of DataFrame
coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5124 entries, 0 to 5123
Data columns (total 34 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   all_text                5124 non-null   object 
 1   name                    5124 non-null   object 
 2   rating                  5124 non-null   object 
 3   roaster                 5124 non-null   object 
 4   slug                    5124 non-null   object 
 5   region_africa_arabia    5124 non-null   int64  
 6   region_caribbean        5124 non-null   int64  
 7   region_central_america  5124 non-null   int64  
 8   region_hawaii           5124 non-null   int64  
 9   region_asia_pacific     5124 non-null   int64  
 10  region_south_america    5124 non-null   int64  
 11  type_espresso           5124 non-null   int64  
 12  type_organic            5124 non-null   int64  
 13  type_fair_trade         5124 non-null   int64  
 14  type_decaffeinated      5124 non-null   

In [4]:
# drop unnecessary columns 
coffee_df = coffee_df.drop(columns=['all_text', 'est_price', 'review_date', 'agtron', 'location', 'with_milk', 'desc_3', 'desc_4'])

In [5]:
#coffee_df.head()

In [6]:
# Specify the columns to clean
columns_to_clean = ['roaster','desc_1', 'desc_2']

# Define a regular expression to match special characters
special_characters_pattern = r'[^a-zA-Z0-9\s,]'

# Clean each specified column
for column in columns_to_clean:
    coffee_df[column] = coffee_df[column].str.replace(special_characters_pattern, '', regex=True)

In [7]:
# Apply the lower method to all columns using applymap
coffee_df = coffee_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [8]:
# Replace hyphens with underscores in the specified column
coffee_df['roast'] = coffee_df['roast'].str.replace('-', '_')

In [9]:
coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5124 entries, 0 to 5123
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    5124 non-null   object 
 1   rating                  5124 non-null   object 
 2   roaster                 5124 non-null   object 
 3   slug                    5124 non-null   object 
 4   region_africa_arabia    5124 non-null   int64  
 5   region_caribbean        5124 non-null   int64  
 6   region_central_america  5124 non-null   int64  
 7   region_hawaii           5124 non-null   int64  
 8   region_asia_pacific     5124 non-null   int64  
 9   region_south_america    5124 non-null   int64  
 10  type_espresso           5124 non-null   int64  
 11  type_organic            5124 non-null   int64  
 12  type_fair_trade         5124 non-null   int64  
 13  type_decaffeinated      5124 non-null   int64  
 14  type_pod_capsule        5124 non-null   

### Work with the 'origin' column

In [10]:
coffee_df['origin'].value_counts()

not disclosed.                                    376
not disclosed                                     166
yirgacheffe growing region, southern ethiopia.    113
boquete growing region, western panama            109
yirgacheffe growing region, southern ethiopia      94
                                                 ... 
la paz department, western bolivia.                 1
gakui, central kenya.                               1
colombia; brazil.                                   1
africa; central and south america.                  1
indonesia, central and south america                1
Name: origin, Length: 1446, dtype: int64

In [11]:
# Remove the dot at the end of string in new 'country_of_origin' column
coffee_df['origin'] = coffee_df['origin'].str.rstrip('.')
coffee_df['origin'].value_counts()

not disclosed                                                     542
yirgacheffe growing region, southern ethiopia                     207
south-central kenya                                               126
boquete growing region, western panama                            120
nyeri growing region, south-central kenya                          83
                                                                 ... 
embu county, kenya                                                  1
el salvador; kenya                                                  1
gedeo zone, yirgacheffe growing region, south-central ethiopia      1
kenya; papua_new_guinea                                             1
java                                                                1
Name: origin, Length: 1271, dtype: int64

In [12]:
# Delete rows where country is listed as 'not disclosed'
coffee_df = coffee_df[coffee_df['origin'] != 'Not disclosed']
coffee_df = coffee_df[coffee_df['origin'] != 'Not Disclosed']

In [13]:
# Delete rows with NaN values
coffee_df = coffee_df.dropna(subset=['origin'])

In [14]:
# Specify the column with strings
columnName = 'origin'

# Create a new column to store the split result
newColumnName = 'country_of_origin'

# Function to split the last word and add it to a new column
def split_last_word_except_semicolon(text):
    if pd.isna(text):  # Check if the value is NaN
        return ''
    words = re.split(r'\s*;\s*|\s+', text)
    return words[-1]

# Apply the function to create a new column
coffee_df['country_of_origin'] = coffee_df['origin'].apply(split_last_word_except_semicolon)

# Print the updated DataFrame
coffee_df.head()

Unnamed: 0,name,rating,roaster,slug,region_africa_arabia,region_caribbean,region_central_america,region_hawaii,region_asia_pacific,region_south_america,...,origin,roast,aroma,acid,body,flavor,aftertaste,desc_1,desc_2,country_of_origin
0,ethiopia deri kochoha,93,flight coffee co,/review/ethiopia-deri-kochoha-2,1,0,0,0,0,0,...,"west guji zone, oromia region, southeastern et...",medium_light,9.0,8.0,9.0,9.0,8.0,"bright, crisp, sweetly tart citrus medley, cac...",from the deri kochoha mill in the hagere marya...,ethiopia
1,espresso,91,doi chaang coffee,/review/espresso-14,0,0,0,0,1,0,...,northern thailand,medium,8.0,,8.0,8.0,8.0,"evaluated as espresso deeply rich, sweetly roa...",doi chaang is a singleestate coffee produced b...,thailand
2,kenya ruthaka peaberry,95,temple coffee and tea,/review/kenya-ruthaka-peaberry,1,0,0,0,0,0,...,"nyeri growing region, south-central kenya",medium,9.0,8.0,9.0,10.0,8.0,"deeply sweet, richly savory dark chocolate, pi...",despite challenges ranging from contested gove...,kenya
3,ethiopia gora kone sidamo,93,temple coffee and tea,/review/ethiopia-gora-kone-sidamo,1,0,0,0,0,0,...,"sidamo (also sidama) growing region, south-cen...",medium_light,9.0,8.0,9.0,9.0,8.0,"fruitforward, richly chocolaty raspberry couli...",southern ethiopia coffees like this one are la...,ethiopia
4,specialty coffee blend espresso,93,choosy gourmet,/review/specialty-coffee-blend-espresso,0,0,0,0,0,0,...,ethiopia; colombia; kenya,medium_light,9.0,,8.0,9.0,8.0,"evaluated as espresso rich, chocolaty, sweetly...",a blend of coffees from ethiopia naturalproces...,kenya


In [45]:
coffee_df['country_of_origin']

0        ethiopia
1        thailand
2           kenya
3        ethiopia
4           kenya
          ...    
5074    guatemala
5095      sumatra
5098        other
5114      america
5115     colombia
Name: country_of_origin, Length: 4509, dtype: object

In [16]:
# Change the datatype of the specified column to string
coffee_df['country_of_origin'] = coffee_df['country_of_origin'].astype(str)

# Create a function to identify instances of 'origin' where the value count is less than a certain number
def single_instances_grouped(instances, cutoff):
    instances_map = {}
    for i in range(len(instances)):
        if instances.values[i] >= cutoff:
            instances_map[instances.index[i]] = instances.index[i]
        else:
            instances_map[instances.index[i]] = 'other'
    return instances_map

origin_map = single_instances_grouped(coffee_df.country_of_origin.value_counts(), 12)
coffee_df['country_of_origin'] = coffee_df['country_of_origin'].map(origin_map)
coffee_df.country_of_origin.value_counts()

ethiopia                        743
disclosed                       545
kenya                           424
indonesia                       338
colombia                        334
guatemala                       272
panama                          221
costa_rica                      190
brazil                          168
salvador                        165
other                           137
hawaii                          112
rwanda                          101
nicaragua                        97
honduras                         72
papua_new_guinea                 63
burundi                          57
mexico                           55
america                          53
peru                             51
thailand                         49
bolivia                          43
tanzania                         40
india                            37
sumatra                          32
africa                           23
ecuador                          22
jamaica                     

In [17]:
# Drop the rows with blank values from the original DataFrame
rows_with_blank_values = coffee_df[coffee_df['country_of_origin'].isna() | (coffee_df['country_of_origin'] == '')]

if not rows_with_blank_values.empty:
    coffee_df.drop(rows_with_blank_values.index, inplace=True)

coffee_df.country_of_origin.value_counts()

ethiopia                        743
disclosed                       545
kenya                           424
indonesia                       338
colombia                        334
guatemala                       272
panama                          221
costa_rica                      190
brazil                          168
salvador                        165
other                           137
hawaii                          112
rwanda                          101
nicaragua                        97
honduras                         72
papua_new_guinea                 63
burundi                          57
mexico                           55
america                          53
peru                             51
thailand                         49
bolivia                          43
tanzania                         40
india                            37
sumatra                          32
africa                           23
ecuador                          22
jamaica                     

In [18]:
coffee_df.country_of_origin.value_counts().sum()

4509

In [19]:
coffee_df = coffee_df.drop(columns='origin')

In [20]:
coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4509 entries, 0 to 5115
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    4509 non-null   object 
 1   rating                  4509 non-null   object 
 2   roaster                 4509 non-null   object 
 3   slug                    4509 non-null   object 
 4   region_africa_arabia    4509 non-null   int64  
 5   region_caribbean        4509 non-null   int64  
 6   region_central_america  4509 non-null   int64  
 7   region_hawaii           4509 non-null   int64  
 8   region_asia_pacific     4509 non-null   int64  
 9   region_south_america    4509 non-null   int64  
 10  type_espresso           4509 non-null   int64  
 11  type_organic            4509 non-null   int64  
 12  type_fair_trade         4509 non-null   int64  
 13  type_decaffeinated      4509 non-null   int64  
 14  type_pod_capsule        4509 non-null   

In [21]:
# Reorder columns in the DataFrame
coffee_df = coffee_df[['slug', 'name', 'roaster', 'roast', 'country_of_origin', 'desc_1', 'desc_2', 'rating',
                       'aroma', 'acid', 'body', 'flavor', 'aftertaste', 
                       'region_africa_arabia', 'region_caribbean', 'region_central_america', 'region_hawaii', 'region_asia_pacific', 'region_south_america', 
                       'type_espresso', 'type_organic', 'type_fair_trade', 'type_decaffeinated', 'type_pod_capsule', 'type_blend', 'type_estate']]

In [22]:
coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4509 entries, 0 to 5115
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   slug                    4509 non-null   object 
 1   name                    4509 non-null   object 
 2   roaster                 4509 non-null   object 
 3   roast                   4399 non-null   object 
 4   country_of_origin       4509 non-null   object 
 5   desc_1                  4509 non-null   object 
 6   desc_2                  4509 non-null   object 
 7   rating                  4509 non-null   object 
 8   aroma                   4475 non-null   float64
 9   acid                    3685 non-null   float64
 10  body                    4499 non-null   float64
 11  flavor                  4496 non-null   float64
 12  aftertaste              4091 non-null   float64
 13  region_africa_arabia    4509 non-null   int64  
 14  region_caribbean        4509 non-null   

In [23]:
# Create a DataFrame of the 'country_of_origin' column for cleaning
coffee_countries = coffee_df[['country_of_origin']].copy()
coffee_countries.head()

Unnamed: 0,country_of_origin
0,ethiopia
1,thailand
2,kenya
3,ethiopia
4,kenya


In [24]:
# Extract unique values from the 'Category' column
unique_categories = coffee_countries['country_of_origin'].unique()

# Create a new DataFrame with unique values
unique_df = pd.DataFrame({'country': unique_categories})

# Display the result
unique_df

Unnamed: 0,country
0,ethiopia
1,thailand
2,kenya
3,honduras
4,democratic_republic_of_congo
5,brazil
6,panama
7,colombia
8,guatemala
9,indonesia


In [55]:
def get_coordinates(country):
    try:
        country_obj = pycountry.countries.get(name=country)
        geolocator = Nominatim(user_agent="coffee_countries", timeout=20)
        location = geolocator.geocode(country_obj.name)
        return location.latitude, location.longitude
    except AttributeError:
        return None, None

In [56]:
unique_df[['latitude', 'longitude']] = unique_df['country'].apply(get_coordinates).apply(pd.Series)

In [57]:
unique_df

Unnamed: 0,country,latitude,longitude
0,ethiopia,10.21167,38.65212
1,thailand,13.038762,101.700176
2,kenya,1.441968,38.431398
3,honduras,15.257243,-86.075514
4,democratic_republic_of_congo,,
5,brazil,-10.333333,-53.2
6,panama,8.559559,-81.130843
7,colombia,4.099917,-72.908813
8,guatemala,15.585555,-90.345759
9,indonesia,-2.483383,117.890285


In [43]:
list(pycountry.countries)

[Country(alpha_2='AW', alpha_3='ABW', flag='🇦🇼', name='Aruba', numeric='533'),
 Country(alpha_2='AF', alpha_3='AFG', flag='🇦🇫', name='Afghanistan', numeric='004', official_name='Islamic Republic of Afghanistan'),
 Country(alpha_2='AO', alpha_3='AGO', flag='🇦🇴', name='Angola', numeric='024', official_name='Republic of Angola'),
 Country(alpha_2='AI', alpha_3='AIA', flag='🇦🇮', name='Anguilla', numeric='660'),
 Country(alpha_2='AX', alpha_3='ALA', flag='🇦🇽', name='Åland Islands', numeric='248'),
 Country(alpha_2='AL', alpha_3='ALB', flag='🇦🇱', name='Albania', numeric='008', official_name='Republic of Albania'),
 Country(alpha_2='AD', alpha_3='AND', flag='🇦🇩', name='Andorra', numeric='020', official_name='Principality of Andorra'),
 Country(alpha_2='AE', alpha_3='ARE', flag='🇦🇪', name='United Arab Emirates', numeric='784'),
 Country(alpha_2='AR', alpha_3='ARG', flag='🇦🇷', name='Argentina', numeric='032', official_name='Argentine Republic'),
 Country(alpha_2='AM', alpha_3='ARM', flag='🇦🇲', 

In [None]:
# # Function to get latitude and longitude
# def get_lat_lon(country_name):
#     geolocator = Nominatim(user_agent="my_geocoder")
    
#     try:
#         location = geolocator.geocode(country_name)
#         if location is not None:
#             return location.latitude, location.longitude
#         else:
#             return None, None
#     except Exception as e:
#         print(f"Error: {e}")
#         return None, None

# # Apply the function to the 'Country' column
# coffee_df[['Latitude', 'Longitude']] = coffee_df['country_of_origin'].apply(lambda x: pd.Series(get_lat_lon(x)))

# # Print the updated DataFrame
# coffee_df.head()

In [None]:
# # Create lat and lon columns and use GeoFy library to extract the coordinates of each country

# def get_lat_lon_for_country(country_name):
#     geolocator = Nominatim(user_agent="country_locator")
#     location = geolocator.geocode(country_name)
    
#     if location:
#         return location.latitude, location.longitude
#     else:
#         return None

# # Create new columns for latitude and longitude
# coffee_df['latitude'] = None
# coffee_df['longitude'] = None

# # Iterate through rows and geocode each country
# for index, row in coffee_df.iterrows():
#     country_name = row['country_of_origin']  # Replace 'CountryColumnName' with the actual column name in your CSV
#     lat_lon = get_lat_lon_for_country(country_name)
    
#     if lat_lon:
#         coffee_df.at[index, 'latitude'] = lat_lon[0]
#         coffee_df.at[index, 'longitude'] = lat_lon[1]

# coffee_df.head()

In [None]:
coffee_df.to_csv('../Resources/NEW_coffee_final.csv', index=False)

In [None]:
engine = create_engine('sqlite:///Data_Engineering.db')

In [None]:
# Drop the existing table and create a new one with the desired primary key
with engine.connect() as con:
    con.execute(text('''
        CREATE TABLE IF NOT EXISTS coffee_data (
            "slug" VARCHAR, 
            "name" VARCHAR, 
            "roaster" VARCHAR, 
            "roast" VARCHAR, 
            "country_of_origin" VARCHAR, 
            "desc_1" VARCHAR, 
            "desc_2" VARCHAR, 
            "rating" INTEGER,
            "aroma" FLOAT, 
            "acid" FLOAT, 
            "body" FLOAT, 
            "flavor" FLOAT, 
            "aftertaste" FLOAT,
            "region_africa_arabia" INTEGER, 
            "region_caribbean" INTEGER, 
            "region_central_america" INTEGER, 
            "region_hawaii" INTEGER, 
            "region_asia_pacific" INTEGER, 
            "region_south_america" INTEGER, 
            "type_espresso" INTEGER, 
            "type_organic" INTEGER, 
            "type_fair_trade" INTEGER, 
            "type_decaffeinated" INTEGER, 
            "type_pod_capsule" INTEGER, 
            "type_blend" INTEGER, 
            "type_estate" INTEGER,
            PRIMARY KEY ("slug")
        )
    '''))

# Output to the database 
coffee_df.to_sql(name='coffee_data', con=engine, if_exists='replace', index=False)

In [None]:
# Use the engine to create a connection
with engine.connect() as con:
    # Use the connection to execute SQL queries
    row_count = con.execute(text('SELECT COUNT(*) FROM coffee_data')).scalar()

# Print the number of rows in the coffee_data table
print("Number of rows in coffee_data table:", row_count)