## This is the notebook for the new coffee.csv file we found.

Cleaning and exploration code below...

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim
import re

In [None]:
# Load the coffee dataset from the Resources folder into a pandas DataFrame
coffee_df = pd.read_csv('../Resources/coffee.csv')
# Preview the first rows to confirm the file loaded as expected
coffee_df.head()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
coffee_df.info()

In [None]:
# Remove the columns this analysis does not need
unused_columns = ['all_text', 'est_price', 'review_date', 'agtron', 'location']
coffee_df = coffee_df.drop(unused_columns, axis=1)

In [None]:
# Preview the DataFrame after the column drop
coffee_df.head()

### Work with the 'origin' column

In [None]:
# Count how often each distinct 'origin' value appears
coffee_df['origin'].value_counts()

In [None]:
# Strip trailing '.' characters from the strings in the 'origin' column.
# This overwrites 'origin' itself; 'country_of_origin' is only created later.
coffee_df['origin'] = coffee_df['origin'].str.rstrip('.')
# Re-check the distinct values after the cleanup
coffee_df['origin'].value_counts()

In [None]:
# Delete rows where the origin is listed as 'not disclosed'.
# The comparison runs on a trimmed, case-folded copy so that every
# capitalisation ('Not disclosed', 'Not Disclosed', 'NOT DISCLOSED', ...) and
# any stray surrounding whitespace is caught by a single filter.
# NaN origins compare as "not equal" and are therefore kept by this step;
# the separate dropna step removes them.
origin_normalized = coffee_df['origin'].str.strip().str.casefold()
coffee_df = coffee_df[origin_normalized != 'not disclosed']

In [None]:
# Delete rows whose 'origin' value is NaN (other columns are not checked)
coffee_df = coffee_df.dropna(subset=['origin'])

In [None]:
# Name of the source column that holds the raw origin strings
columnName = 'origin'

# Name of the new column that will hold the extracted last word (the country)
newColumnName = 'country_of_origin'

# Function to split the last word and add it to a new column
def split_last_word_except_semicolon(text):
    """Return the last word of *text*, treating ';' and whitespace as separators.

    Separators at the edges of the string are ignored, so ``'Kenya '`` and
    ``'Kenya;'`` both yield ``'Kenya'`` rather than an empty string.

    Note that only the final word is returned: a multi-word name such as
    ``'Costa Rica'`` yields ``'Rica'``.

    Parameters
    ----------
    text : str or NaN/None
        The raw origin string.

    Returns
    -------
    str
        The final word, or ``''`` when *text* is missing or contains no words.
    """
    if pd.isna(text):  # Missing values have no last word
        return ''
    # Collect the runs of non-separator characters instead of splitting, so a
    # trailing separator cannot produce an empty final token.
    words = re.findall(r'[^;\s]+', text)
    return words[-1] if words else ''

# Derive the country column by applying the last-word extractor to every
# origin string. The column names come from the variables declared earlier in
# this cell, so the literals are not duplicated here.
coffee_df[newColumnName] = coffee_df[columnName].apply(split_last_word_except_semicolon)

# Display the first rows of the updated DataFrame
coffee_df.head()

In [None]:
# Inspect the distribution of the extracted 'country_of_origin' values
coffee_df['country_of_origin'].value_counts()

In [None]:
# Cast 'country_of_origin' to str so every entry is a plain string before the
# values are counted and mapped below
coffee_df['country_of_origin'] = coffee_df['country_of_origin'].astype(str)

# Create a function to identify instances of 'origin' where the value count is less than a certain number
def single_instances_grouped(instances, cutoff):
    """Build a mapping that folds infrequent labels into ``'other'``.

    Parameters
    ----------
    instances : pandas.Series or mapping
        Counts keyed by label, e.g. the result of ``Series.value_counts()``.
        Anything exposing ``.items()`` yielding ``(label, count)`` pairs works.
    cutoff : int
        Minimum count a label needs in order to keep its own name.

    Returns
    -------
    dict
        Maps each label to itself when its count is ``>= cutoff`` and to the
        string ``'other'`` otherwise.
    """
    # Iterate the (label, count) pairs directly rather than indexing by
    # position into .values / .index.
    return {
        label: (label if count >= cutoff else 'other')
        for label, count in instances.items()
    }

# Relabel every country that appears in fewer than 12 rows as 'other'
country_counts = coffee_df['country_of_origin'].value_counts()
origin_map = single_instances_grouped(country_counts, 12)
coffee_df['country_of_origin'] = coffee_df['country_of_origin'].map(origin_map)
coffee_df['country_of_origin'].value_counts()

In [None]:
# Remove the rows whose 'country_of_origin' is missing or an empty string.
# NOTE(review): this runs after the low-frequency grouping, so empty strings
# that occurred fewer than 12 times were already relabelled 'other' and are
# not removed here - confirm that is intended.
country_col = coffee_df['country_of_origin']
rows_with_blank_values = coffee_df[country_col.isna() | country_col.eq('')]

# Only call drop when at least one blank row was found
if len(rows_with_blank_values.index) > 0:
    coffee_df.drop(index=rows_with_blank_values.index, inplace=True)

coffee_df['country_of_origin'].value_counts()

In [None]:
# Total number of rows that have a non-null 'country_of_origin' value
coffee_df.country_of_origin.value_counts().sum()

In [None]:
# Drop the raw 'origin' column now that 'country_of_origin' has been derived from it
coffee_df = coffee_df.drop(columns='origin')

In [None]:
# # Function to get latitude and longitude
# def get_lat_lon(country_name):
#     geolocator = Nominatim(user_agent="my_geocoder")
    
#     try:
#         location = geolocator.geocode(country_name)
#         if location is not None:
#             return location.latitude, location.longitude
#         else:
#             return None, None
#     except Exception as e:
#         print(f"Error: {e}")
#         return None, None

# # Apply the function to the 'country_of_origin' column
# coffee_df[['Latitude', 'Longitude']] = coffee_df['country_of_origin'].apply(lambda x: pd.Series(get_lat_lon(x)))

# # Print the updated DataFrame
# coffee_df.head()

In [None]:
# # Create lat and lon columns and use the geopy library to extract the coordinates of each country

# def get_lat_lon_for_country(country_name):
#     geolocator = Nominatim(user_agent="country_locator")
#     location = geolocator.geocode(country_name)
    
#     if location:
#         return location.latitude, location.longitude
#     else:
#         return None

# # Create new columns for latitude and longitude
# coffee_df['latitude'] = None
# coffee_df['longitude'] = None

# # Iterate through rows and geocode each country
# for index, row in coffee_df.iterrows():
#     country_name = row['country_of_origin']  # Column that holds the country name
#     lat_lon = get_lat_lon_for_country(country_name)
    
#     if lat_lon:
#         coffee_df.at[index, 'latitude'] = lat_lon[0]
#         coffee_df.at[index, 'longitude'] = lat_lon[1]

# coffee_df.head()

In [None]:
# Write the cleaned DataFrame to a new CSV file, leaving out the index column
coffee_df.to_csv('../Resources/NEW_coffee_final.csv', index=False)