# So, How Do You Like *Your* Coffee?

A machine learning project built by Jesslyn Lengkong, Cayley Morrow and Dominique Spencer

---

### Initial steps:

##### Import dependencies and read in csv file.

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim
import pycountry
import re
from sqlalchemy import create_engine, text, inspect

In [2]:
# Read the csv into a Pandas DataFrame
coffee_df = pd.read_csv('../Resources/coffee.csv', encoding='utf-8')
coffee_df.head()

Unnamed: 0,all_text,name,rating,roaster,slug,region_africa_arabia,region_caribbean,region_central_america,region_hawaii,region_asia_pacific,...,aroma,acid,body,flavor,aftertaste,with_milk,desc_1,desc_2,desc_3,desc_4
0,\n\n\n\n \n93\nFlight Coffee Co.\nEthiopia Der...,Ethiopia Deri Kochoha,93,Flight Coffee Co.,/review/ethiopia-deri-kochoha-2,1,0,0,0,0,...,9.0,8.0,9.0,9.0,8.0,,"Bright, crisp, sweetly tart. Citrus medley, ca...",From the Deri Kochoha mill in the Hagere Marya...,A poised and melodic wet-processed Ethiopia co...,
1,\n\n\n\n\n91\nDoi Chaang Coffee\nEspresso\nLoc...,Espresso,91,Doi Chaang Coffee,/review/espresso-14,0,0,0,0,1,...,8.0,,8.0,8.0,8.0,9.0,"Evaluated as espresso. Deeply rich, sweetly ro...",Doi Chaang is a single-estate coffee produced ...,"A rich, resonant espresso from Thailand, espec...",
2,\n\n\n\n \n95\nTemple Coffee and Tea\nKenya Ru...,Kenya Ruthaka Peaberry,95,Temple Coffee and Tea,/review/kenya-ruthaka-peaberry,1,0,0,0,0,...,9.0,8.0,9.0,10.0,8.0,,"Deeply sweet, richly savory. Dark chocolate, p...",Despite challenges ranging from contested gove...,"A high-toned, nuanced Kenya cup, classic in it...",
3,\n\n\n\n \n93\nTemple Coffee and Tea\nEthiopia...,Ethiopia Gora Kone Sidamo,93,Temple Coffee and Tea,/review/ethiopia-gora-kone-sidamo,1,0,0,0,0,...,9.0,8.0,9.0,9.0,8.0,,"Fruit-forward, richly chocolaty. Raspberry cou...",Southern Ethiopia coffees like this one are la...,"A playful, unrestrained fruit bomb of a coffee...",
4,\n\n\n\n\n93\nChoosy Gourmet\nSpecialty Coffee...,Specialty Coffee Blend Espresso,93,Choosy Gourmet,/review/specialty-coffee-blend-espresso,0,0,0,0,0,...,9.0,,8.0,9.0,8.0,9.0,"Evaluated as espresso. Rich, chocolaty, sweetl...",A blend of coffees from Ethiopia (natural-proc...,An espresso blend in which spice notes — in pa...,


In [3]:
# Get a brief summary of DataFrame
coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5124 entries, 0 to 5123
Data columns (total 34 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   all_text                5124 non-null   object 
 1   name                    5124 non-null   object 
 2   rating                  5124 non-null   object 
 3   roaster                 5124 non-null   object 
 4   slug                    5124 non-null   object 
 5   region_africa_arabia    5124 non-null   int64  
 6   region_caribbean        5124 non-null   int64  
 7   region_central_america  5124 non-null   int64  
 8   region_hawaii           5124 non-null   int64  
 9   region_asia_pacific     5124 non-null   int64  
 10  region_south_america    5124 non-null   int64  
 11  type_espresso           5124 non-null   int64  
 12  type_organic            5124 non-null   int64  
 13  type_fair_trade         5124 non-null   int64  
 14  type_decaffeinated      5124 non-null   

##### Drop unwanted columns and clean text values using a regex pattern.

In [4]:
# Drop unnecessary columns 
coffee_df = coffee_df.drop(columns=['all_text', 'est_price', 'review_date', 'agtron', 'location', 'with_milk', 'desc_3', 'desc_4'])

In [5]:
# Specify the columns to clean with regex
columns_to_clean = ['roaster','desc_1', 'desc_2']

# Define a regex pattern to match special characters
special_characters_pattern = r'[^a-zA-Z0-9\s,.-é"]'

# Clean chosen columns
for column in columns_to_clean:
    coffee_df[column] = coffee_df[column].str.replace(special_characters_pattern, '', regex=True)

### Work with the 'origin' column:

In [6]:
# Get details about 'origin' column
coffee_df['origin'].value_counts()

Not disclosed.                                    376
Not disclosed                                     117
Yirgacheffe growing region, southern Ethiopia.    113
Boquete growing region, western Panama            109
Yirgacheffe growing region, southern Ethiopia      94
                                                 ... 
Bururi Province, Burundi.                           1
La Paz Department, Western Bolivia.                 1
Gakui, Central Kenya.                               1
Rusizi District, Western Rwanda.                    1
Indonesia, Central and South America                1
Name: origin, Length: 1460, dtype: int64

In [7]:
# Remove the dot at the end of string in 'origin' column
coffee_df['origin'] = coffee_df['origin'].str.rstrip('.')
coffee_df['origin'].value_counts()

Not disclosed                                                     493
Yirgacheffe growing region, southern Ethiopia                     207
South-central Kenya                                               124
Boquete growing region, western Panama                            120
Nyeri growing region, south-central Kenya                          83
                                                                 ... 
Nyeri County, Central Kenya                                         1
El Salvador; Kenya                                                  1
Gedeo zone, Yirgacheffe growing region, south-central Ethiopia      1
Kenya; Papua_New_Guinea                                             1
Java                                                                1
Name: origin, Length: 1288, dtype: int64

##### Delete all rows with different iterations of 'not disclosed' from 'origin' column.

In [8]:
# Define a regex pattern
pattern = re.compile(r'not\s*disclosed', flags=re.IGNORECASE)

# Apply this to 'origin' column
coffee_df = coffee_df[~coffee_df['origin'].str.contains(pattern, na=False) | coffee_df['origin'].isna()]

In [9]:
# Delete rows with NaN values from 'origin' column
coffee_df = coffee_df.dropna(subset=['origin'])

##### Create a new column with specific 'country of origin' values.

In [10]:
# Define the 'origin' column 
old_origin_column = 'origin'

# Create a new column to store the split result
new_origin_column = 'country_of_origin'

# Function to split the last word and add it to the new column
def split_last_word_except_semicolon(text):
    if pd.isna(text):  # Check if the value is NaN
        return ''
    words = re.split(r'\s*;\s*|\s+', text)
    return words[-1]

# Apply the function to create a new column
coffee_df['country_of_origin'] = coffee_df['origin'].apply(split_last_word_except_semicolon)

# Check the updated DataFrame
coffee_df.head()

Unnamed: 0,name,rating,roaster,slug,region_africa_arabia,region_caribbean,region_central_america,region_hawaii,region_asia_pacific,region_south_america,...,origin,roast,aroma,acid,body,flavor,aftertaste,desc_1,desc_2,country_of_origin
0,Ethiopia Deri Kochoha,93,Flight Coffee Co.,/review/ethiopia-deri-kochoha-2,1,0,0,0,0,0,...,"West Guji Zone, Oromia Region, southeastern Et...",Medium-Light,9.0,8.0,9.0,9.0,8.0,"Bright, crisp, sweetly tart. Citrus medley, ca...",From the Deri Kochoha mill in the Hagere Marya...,Ethiopia
1,Espresso,91,Doi Chaang Coffee,/review/espresso-14,0,0,0,0,1,0,...,Northern Thailand,Medium,8.0,,8.0,8.0,8.0,"Evaluated as espresso. Deeply rich, sweetly ro...",Doi Chaang is a singleestate coffee produced b...,Thailand
2,Kenya Ruthaka Peaberry,95,Temple Coffee and Tea,/review/kenya-ruthaka-peaberry,1,0,0,0,0,0,...,"Nyeri growing region, south-central Kenya",Medium,9.0,8.0,9.0,10.0,8.0,"Deeply sweet, richly savory. Dark chocolate, p...",Despite challenges ranging from contested gove...,Kenya
3,Ethiopia Gora Kone Sidamo,93,Temple Coffee and Tea,/review/ethiopia-gora-kone-sidamo,1,0,0,0,0,0,...,"Sidamo (also Sidama) growing region, south-cen...",Medium-Light,9.0,8.0,9.0,9.0,8.0,"Fruitforward, richly chocolaty. Raspberry coul...",Southern Ethiopia coffees like this one are la...,Ethiopia
4,Specialty Coffee Blend Espresso,93,Choosy Gourmet,/review/specialty-coffee-blend-espresso,0,0,0,0,0,0,...,Ethiopia; Colombia; Kenya,Medium-Light,9.0,,8.0,9.0,8.0,"Evaluated as espresso. Rich, chocolaty, sweetl...",A blend of coffees from Ethiopia naturalproces...,Kenya


In [11]:
# Check values
coffee_df['country_of_origin'].value_counts()

Ethiopia     743
Kenya        424
Indonesia    338
Colombia     334
Guatemala    272
            ... 
Asia           1
Micro-Lot      1
Malawi         1
Laos           1
USA            1
Name: country_of_origin, Length: 76, dtype: int64

##### Remove all rows where there are 12 or fewer value counts.

In [12]:
# Change the datatype of the specified column to string
coffee_df['country_of_origin'] = coffee_df['country_of_origin'].astype(str)

# Set the cutoff value
cutoff = 12

# Create a mask for rows to keep
keep_mask = coffee_df['country_of_origin'].map(coffee_df['country_of_origin'].value_counts()) >= cutoff

# Keep only the rows where the count is at least the cutoff
coffee_df = coffee_df[keep_mask]

In [13]:
# Get a refreshed summary of DataFrame
coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3844 entries, 0 to 5115
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    3844 non-null   object 
 1   rating                  3844 non-null   object 
 2   roaster                 3844 non-null   object 
 3   slug                    3844 non-null   object 
 4   region_africa_arabia    3844 non-null   int64  
 5   region_caribbean        3844 non-null   int64  
 6   region_central_america  3844 non-null   int64  
 7   region_hawaii           3844 non-null   int64  
 8   region_asia_pacific     3844 non-null   int64  
 9   region_south_america    3844 non-null   int64  
 10  type_espresso           3844 non-null   int64  
 11  type_organic            3844 non-null   int64  
 12  type_fair_trade         3844 non-null   int64  
 13  type_decaffeinated      3844 non-null   int64  
 14  type_pod_capsule        3844 non-null   

##### Drop all rows with blank values, and general country name 'Africa'.

In [14]:
# Drop the rows with blank values from the original DataFrame
rows_with_blank_values = coffee_df[coffee_df['country_of_origin'].isna() | (coffee_df['country_of_origin'] == '')]

if not rows_with_blank_values.empty:
    coffee_df.drop(rows_with_blank_values.index, inplace=True)
    
# Drop all rows containing 'Africa' as a country of origin
string_to_drop = 'Africa'
coffee_df = coffee_df[~coffee_df['country_of_origin'].str.contains(string_to_drop, case=False, na=False)]

# Check the updated DataFrame
coffee_df.head()

Unnamed: 0,name,rating,roaster,slug,region_africa_arabia,region_caribbean,region_central_america,region_hawaii,region_asia_pacific,region_south_america,...,origin,roast,aroma,acid,body,flavor,aftertaste,desc_1,desc_2,country_of_origin
0,Ethiopia Deri Kochoha,93,Flight Coffee Co.,/review/ethiopia-deri-kochoha-2,1,0,0,0,0,0,...,"West Guji Zone, Oromia Region, southeastern Et...",Medium-Light,9.0,8.0,9.0,9.0,8.0,"Bright, crisp, sweetly tart. Citrus medley, ca...",From the Deri Kochoha mill in the Hagere Marya...,Ethiopia
1,Espresso,91,Doi Chaang Coffee,/review/espresso-14,0,0,0,0,1,0,...,Northern Thailand,Medium,8.0,,8.0,8.0,8.0,"Evaluated as espresso. Deeply rich, sweetly ro...",Doi Chaang is a singleestate coffee produced b...,Thailand
2,Kenya Ruthaka Peaberry,95,Temple Coffee and Tea,/review/kenya-ruthaka-peaberry,1,0,0,0,0,0,...,"Nyeri growing region, south-central Kenya",Medium,9.0,8.0,9.0,10.0,8.0,"Deeply sweet, richly savory. Dark chocolate, p...",Despite challenges ranging from contested gove...,Kenya
3,Ethiopia Gora Kone Sidamo,93,Temple Coffee and Tea,/review/ethiopia-gora-kone-sidamo,1,0,0,0,0,0,...,"Sidamo (also Sidama) growing region, south-cen...",Medium-Light,9.0,8.0,9.0,9.0,8.0,"Fruitforward, richly chocolaty. Raspberry coul...",Southern Ethiopia coffees like this one are la...,Ethiopia
4,Specialty Coffee Blend Espresso,93,Choosy Gourmet,/review/specialty-coffee-blend-espresso,0,0,0,0,0,0,...,Ethiopia; Colombia; Kenya,Medium-Light,9.0,,8.0,9.0,8.0,"Evaluated as espresso. Rich, chocolaty, sweetl...",A blend of coffees from Ethiopia naturalproces...,Kenya


In [15]:
# Drop original 'origin' column
coffee_df = coffee_df.drop(columns='origin')

In [16]:
# Check total values of new column 'country_of_origin'
coffee_df.country_of_origin.value_counts()

Ethiopia                        743
Kenya                           424
Indonesia                       338
Colombia                        334
Guatemala                       272
Panama                          221
Costa_Rica                      190
Brazil                          168
Salvador                        165
Hawaii                          112
Rwanda                          101
Nicaragua                        97
Honduras                         72
Papua_New_Guinea                 63
Burundi                          57
Mexico                           55
America                          53
Peru                             51
Thailand                         49
Bolivia                          43
Tanzania                         40
India                            37
Sumatra                          32
Ecuador                          22
Jamaica                          22
Yemen                            18
Democratic_Republic_of_Congo     13
Taiwan                      

In [17]:
# Get a refreshed summary of DataFrame
coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3804 entries, 0 to 5115
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    3804 non-null   object 
 1   rating                  3804 non-null   object 
 2   roaster                 3804 non-null   object 
 3   slug                    3804 non-null   object 
 4   region_africa_arabia    3804 non-null   int64  
 5   region_caribbean        3804 non-null   int64  
 6   region_central_america  3804 non-null   int64  
 7   region_hawaii           3804 non-null   int64  
 8   region_asia_pacific     3804 non-null   int64  
 9   region_south_america    3804 non-null   int64  
 10  type_espresso           3804 non-null   int64  
 11  type_organic            3804 non-null   int64  
 12  type_fair_trade         3804 non-null   int64  
 13  type_decaffeinated      3804 non-null   int64  
 14  type_pod_capsule        3804 non-null   

### Get latitude and longitude for countries_of_origin:

In [20]:
# Check unique values from the 'country_of_origin' column
coffee_df['country_of_origin'].unique()

array(['Ethiopia', 'Thailand', 'Kenya', 'Honduras',
       'Congo, The Democratic Republic of the', 'Brazil', 'Panama',
       'Colombia', 'Guatemala', 'Indonesia', 'El Salvador', 'Costa Rica',
       'United States', 'Burundi', 'Nicaragua', 'Yemen',
       'Tanzania, United Republic of', 'Peru', 'Ecuador', 'Rwanda',
       'China', 'Mexico', 'Papua New Guinea',
       'Bolivia, Plurinational State of', 'India', 'Jamaica'],
      dtype=object)

##### Change country names to match country names in pycountry library.

In [21]:
# Define a dictionary of old and new values
values_to_update = {'America': 'United States',
                    'Hawaii': 'United States', 
                    'Democratic_Republic_of_Congo' : 'Congo, The Democratic Republic of the', 
                    'Salvador': 'El Salvador', 
                    'Costa_Rica': 'Costa Rica', 
                    'Tanzania': 'Tanzania, United Republic of', 
                    'Papua_New_Guinea': 'Papua New Guinea', 
                    'Bolivia': 'Bolivia, Plurinational State of', 
                    'Sumatra': 'Indonesia', 
                    'Taiwan': 'China'
                   }

# Create a mask for rows that need updating
update_mask = coffee_df['country_of_origin'].isin(values_to_update.keys())

# Update the values in the 'country_of_origin' column
coffee_df.loc[update_mask, 'country_of_origin'] = coffee_df.loc[update_mask, 'country_of_origin'].replace(values_to_update)

# Check the updated DataFrame
coffee_df.head()

Unnamed: 0,name,rating,roaster,slug,region_africa_arabia,region_caribbean,region_central_america,region_hawaii,region_asia_pacific,region_south_america,...,type_estate,roast,aroma,acid,body,flavor,aftertaste,desc_1,desc_2,country_of_origin
0,Ethiopia Deri Kochoha,93,Flight Coffee Co.,/review/ethiopia-deri-kochoha-2,1,0,0,0,0,0,...,0,Medium-Light,9.0,8.0,9.0,9.0,8.0,"Bright, crisp, sweetly tart. Citrus medley, ca...",From the Deri Kochoha mill in the Hagere Marya...,Ethiopia
1,Espresso,91,Doi Chaang Coffee,/review/espresso-14,0,0,0,0,1,0,...,1,Medium,8.0,,8.0,8.0,8.0,"Evaluated as espresso. Deeply rich, sweetly ro...",Doi Chaang is a singleestate coffee produced b...,Thailand
2,Kenya Ruthaka Peaberry,95,Temple Coffee and Tea,/review/kenya-ruthaka-peaberry,1,0,0,0,0,0,...,0,Medium,9.0,8.0,9.0,10.0,8.0,"Deeply sweet, richly savory. Dark chocolate, p...",Despite challenges ranging from contested gove...,Kenya
3,Ethiopia Gora Kone Sidamo,93,Temple Coffee and Tea,/review/ethiopia-gora-kone-sidamo,1,0,0,0,0,0,...,0,Medium-Light,9.0,8.0,9.0,9.0,8.0,"Fruitforward, richly chocolaty. Raspberry coul...",Southern Ethiopia coffees like this one are la...,Ethiopia
4,Specialty Coffee Blend Espresso,93,Choosy Gourmet,/review/specialty-coffee-blend-espresso,0,0,0,0,0,0,...,0,Medium-Light,9.0,,8.0,9.0,8.0,"Evaluated as espresso. Rich, chocolaty, sweetl...",A blend of coffees from Ethiopia naturalproces...,Kenya


##### Minimise dataframe to only unique values for pycountry function efficiency.

In [20]:
# Create a new DataFrame of the 'country_of_origin' column for getting lat and lon
coffee_countries = coffee_df[['country_of_origin']].copy()

# Extract unique values from the 'Category' column
unique_categories = coffee_countries['country_of_origin'].unique()

# Create a new DataFrame with unique values
unique_df = pd.DataFrame({'country_of_origin': unique_categories})

In [21]:
# Function to get coordinates using pycountry library
def get_coordinates(country):
    try:
        country_obj = pycountry.countries.get(name=country)
        geolocator = Nominatim(user_agent="coffee_countries", timeout=20)
        location = geolocator.geocode(country_obj.name)
        return location.latitude, location.longitude
    except AttributeError:
        return None, None

# Apply function to new Dataframe    
unique_df[['latitude', 'longitude']] = unique_df['country_of_origin'].apply(get_coordinates).apply(pd.Series)

# Check the updated DataFrame
unique_df

Unnamed: 0,country_of_origin,latitude,longitude
0,Ethiopia,6.7678,35.634371
1,Thailand,13.038762,101.700176
2,Kenya,1.441968,38.431398
3,Honduras,15.257243,-86.075514
4,"Congo, The Democratic Republic of the",-2.981434,23.822264
5,Brazil,-10.333333,-53.2
6,Panama,8.559559,-81.130843
7,Colombia,4.099917,-72.908813
8,Guatemala,15.585555,-90.345759
9,Indonesia,-2.483383,117.890285


In [22]:
# Merge the DataFrames based on the common column
new_coffee_df = pd.merge(coffee_df, unique_df, on='country_of_origin', how='left')

In [23]:
# Check the updated DataFrame
new_coffee_df.head()

Unnamed: 0,name,rating,roaster,slug,region_africa_arabia,region_caribbean,region_central_america,region_hawaii,region_asia_pacific,region_south_america,...,aroma,acid,body,flavor,aftertaste,desc_1,desc_2,country_of_origin,latitude,longitude
0,Ethiopia Deri Kochoha,93,Flight Coffee Co.,/review/ethiopia-deri-kochoha-2,1,0,0,0,0,0,...,9.0,8.0,9.0,9.0,8.0,"Bright, crisp, sweetly tart. Citrus medley, ca...",From the Deri Kochoha mill in the Hagere Marya...,Ethiopia,6.7678,35.634371
1,Espresso,91,Doi Chaang Coffee,/review/espresso-14,0,0,0,0,1,0,...,8.0,,8.0,8.0,8.0,"Evaluated as espresso. Deeply rich, sweetly ro...",Doi Chaang is a singleestate coffee produced b...,Thailand,13.038762,101.700176
2,Kenya Ruthaka Peaberry,95,Temple Coffee and Tea,/review/kenya-ruthaka-peaberry,1,0,0,0,0,0,...,9.0,8.0,9.0,10.0,8.0,"Deeply sweet, richly savory. Dark chocolate, p...",Despite challenges ranging from contested gove...,Kenya,1.441968,38.431398
3,Ethiopia Gora Kone Sidamo,93,Temple Coffee and Tea,/review/ethiopia-gora-kone-sidamo,1,0,0,0,0,0,...,9.0,8.0,9.0,9.0,8.0,"Fruitforward, richly chocolaty. Raspberry coul...",Southern Ethiopia coffees like this one are la...,Ethiopia,6.7678,35.634371
4,Specialty Coffee Blend Espresso,93,Choosy Gourmet,/review/specialty-coffee-blend-espresso,0,0,0,0,0,0,...,9.0,,8.0,9.0,8.0,"Evaluated as espresso. Rich, chocolaty, sweetl...",A blend of coffees from Ethiopia naturalproces...,Kenya,1.441968,38.431398


### Complete final cleaning and exporting:

In [24]:
# Check the updated DataFrame value counts
new_coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3804 entries, 0 to 3803
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    3804 non-null   object 
 1   rating                  3804 non-null   object 
 2   roaster                 3804 non-null   object 
 3   slug                    3804 non-null   object 
 4   region_africa_arabia    3804 non-null   int64  
 5   region_caribbean        3804 non-null   int64  
 6   region_central_america  3804 non-null   int64  
 7   region_hawaii           3804 non-null   int64  
 8   region_asia_pacific     3804 non-null   int64  
 9   region_south_america    3804 non-null   int64  
 10  type_espresso           3804 non-null   int64  
 11  type_organic            3804 non-null   int64  
 12  type_fair_trade         3804 non-null   int64  
 13  type_decaffeinated      3804 non-null   int64  
 14  type_pod_capsule        3804 non-null   

In [25]:
# Reorder columns
coffee_df = new_coffee_df[['slug', 'name', 'roaster', 'roast', 'country_of_origin', 'desc_1', 'desc_2', 'latitude', 'longitude', 'rating',
                       'aroma', 'acid', 'body', 'flavor', 'aftertaste',
                       'region_africa_arabia', 'region_caribbean', 'region_central_america', 'region_hawaii', 'region_asia_pacific', 'region_south_america', 
                       'type_espresso', 'type_organic', 'type_fair_trade', 'type_decaffeinated', 'type_pod_capsule', 'type_blend', 'type_estate']]

In [26]:
# Drop rows with null values 
coffee_df = coffee_df.dropna()

In [27]:
# Review the updated DataFrame after all previous changes
coffee_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3001 entries, 0 to 3519
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   slug                    3001 non-null   object 
 1   name                    3001 non-null   object 
 2   roaster                 3001 non-null   object 
 3   roast                   3001 non-null   object 
 4   country_of_origin       3001 non-null   object 
 5   desc_1                  3001 non-null   object 
 6   desc_2                  3001 non-null   object 
 7   latitude                3001 non-null   float64
 8   longitude               3001 non-null   float64
 9   rating                  3001 non-null   object 
 10  aroma                   3001 non-null   float64
 11  acid                    3001 non-null   float64
 12  body                    3001 non-null   float64
 13  flavor                  3001 non-null   float64
 14  aftertaste              3001 non-null   

In [28]:
# Save Dataframe to Resources folder
coffee_df.to_csv('../Resources/NEW_coffee_final.csv', index=False)

### Create SQL database:

In [29]:
# Define engine path
engine = create_engine('sqlite:///Data_Engineering.db')

In [30]:
# Drop the existing table and create a new one with the desired primary key
with engine.connect() as con:
    con.execute(text('''
        CREATE TABLE IF NOT EXISTS coffee_data (
            "slug" VARCHAR, 
            "name" VARCHAR, 
            "roaster" VARCHAR, 
            "roast" VARCHAR, 
            "country_of_origin" VARCHAR, 
            "desc_1" VARCHAR, 
            "desc_2" VARCHAR,
            "latitude" FLOAT,
            "longitude" FLOAT,
            "rating" INTEGER,
            "aroma" FLOAT, 
            "acid" FLOAT, 
            "body" FLOAT, 
            "flavor" FLOAT, 
            "aftertaste" FLOAT,
            "region_africa_arabia" INTEGER, 
            "region_caribbean" INTEGER, 
            "region_central_america" INTEGER, 
            "region_hawaii" INTEGER, 
            "region_asia_pacific" INTEGER, 
            "region_south_america" INTEGER, 
            "type_espresso" INTEGER, 
            "type_organic" INTEGER, 
            "type_fair_trade" INTEGER, 
            "type_decaffeinated" INTEGER, 
            "type_pod_capsule" INTEGER, 
            "type_blend" INTEGER, 
            "type_estate" INTEGER,
            PRIMARY KEY ("slug")
        )
    '''))

# Output to the database 
coffee_df.to_sql(name='coffee_data', con=engine, if_exists='replace', index=False)

3001

In [31]:
# Create a connection
connection = engine.connect()

# Create an Inspector and get the table names
inspector = inspect(engine)
table_names = inspector.get_table_names()

# Print the table names and some sample data
for table_name in table_names:
    print(f"Table: {table_name}")

    # Use text() to create a SQL expression
    query = text(f"SELECT * FROM {table_name} LIMIT 5")
    sample_data = connection.execute(query).fetchall()

    print("Sample Data:")
    for row in sample_data:
        print(row)
    print("\n")

# Close the connection
connection.close()

Table: coffee_data
Sample Data:
('/review/ethiopia-deri-kochoha-2', 'Ethiopia Deri Kochoha', 'Flight Coffee Co.', 'Medium-Light', 'Ethiopia', 'Bright, crisp, sweetly tart. Citrus medley, cacao nib, pink peppercorn, dogwood, almond in aroma and cup. Sweettart structure with lively acidity; crisp, satiny mouthfeel. The short finish is intense with notes of citrus and pink peppercorn, while the long softens to almondtoned florals.', 'From the Deri Kochoha mill in the Hagere Maryame District, a little south of the famous Yirgacheffe region, where coffees largely from trees of indig ... (291 characters truncated) ...  coffeeloving, qualityfocused roasting company based in Bedford, New Hampshire. Visit www.flightcoffeeco.com or call 6038366228 for more information.', 6.7678, 35.6343712, '93', 9.0, 8.0, 9.0, 9.0, 8.0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
('/review/kenya-ruthaka-peaberry', 'Kenya Ruthaka Peaberry', 'Temple Coffee and Tea', 'Medium', 'Kenya', 'Deeply sweet, richly savory. Dar