In [23]:
# Import Libraries
import pandas as pd
import numpy as np

# Machine Learning Libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Read the beer review dataset from CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="data/data_with_reviews.csv")

In [25]:
# Show the first three rows as a quick check of what was loaded
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,Beer Names,Rating,Locations,Reviews,Users,Image,City,Lats,Longs
0,0,Cloudwater A Scabrous Edge Of The Sky,3.9,"Manchester, Greater Manchester","Light fresh fruit, light floral note, hazy gol...",Trolleo,https://res.cloudinary.com/ratebeer/image/uplo...,Manchester,53.5004,-2.248
1,1,Temperance Might Meets Right: Hot Cocoa,3.8,"Evanston, Illinois",Aromas of chocolate bourbon oak toffee cocoa t...,Thisis12ptfont,https://res.cloudinary.com/ratebeer/image/uplo...,Evanston,42.0463,-87.6942
2,2,Pipeworks Barrel Aged The Brown and Stirred,4.5,"Chicago, Illinois","Pours opaque cinnamon, no head. Aroma features...",Cybercat,https://res.cloudinary.com/ratebeer/image/uplo...,Chicago,41.8373,-87.6862


In [26]:
# Discard the columns that are not needed from here on
# (rating, reviewer name and the latitude/longitude pair)
clean_df = df.drop(columns=['Rating', 'Users', 'Lats', 'Longs'])
clean_df.head(n=3)

Unnamed: 0.1,Unnamed: 0,Beer Names,Locations,Reviews,Image,City
0,0,Cloudwater A Scabrous Edge Of The Sky,"Manchester, Greater Manchester","Light fresh fruit, light floral note, hazy gol...",https://res.cloudinary.com/ratebeer/image/uplo...,Manchester
1,1,Temperance Might Meets Right: Hot Cocoa,"Evanston, Illinois",Aromas of chocolate bourbon oak toffee cocoa t...,https://res.cloudinary.com/ratebeer/image/uplo...,Evanston
2,2,Pipeworks Barrel Aged The Brown and Stirred,"Chicago, Illinois","Pours opaque cinnamon, no head. Aroma features...",https://res.cloudinary.com/ratebeer/image/uplo...,Chicago


In [27]:
# Give the leftover "Unnamed: 0" column a meaningful name
clean_df = clean_df.rename({"Unnamed: 0": "Index"}, axis="columns")
clean_df.head(n=3)

Unnamed: 0,Index,Beer Names,Locations,Reviews,Image,City
0,0,Cloudwater A Scabrous Edge Of The Sky,"Manchester, Greater Manchester","Light fresh fruit, light floral note, hazy gol...",https://res.cloudinary.com/ratebeer/image/uplo...,Manchester
1,1,Temperance Might Meets Right: Hot Cocoa,"Evanston, Illinois",Aromas of chocolate bourbon oak toffee cocoa t...,https://res.cloudinary.com/ratebeer/image/uplo...,Evanston
2,2,Pipeworks Barrel Aged The Brown and Stirred,"Chicago, Illinois","Pours opaque cinnamon, no head. Aroma features...",https://res.cloudinary.com/ratebeer/image/uplo...,Chicago


In [28]:
# Report the frame's dimensions as (number of rows, number of columns)
(len(clean_df.index), len(clean_df.columns))

(10110, 6)

In [29]:
# Columns the comparison between products will be based on
# (i.e. what one beer should be compared to the others by).
# Alternative selection kept for reference:
# features = ["review_aroma", "review_appearance", "review_palate", "review_taste"]
features = ['Reviews', 'City']
clean_df.loc[:, features].head(n=3)

Unnamed: 0,Reviews,City
0,"Light fresh fruit, light floral note, hazy gol...",Manchester
1,Aromas of chocolate bourbon oak toffee cocoa t...,Evanston
2,"Pours opaque cinnamon, no head. Aroma features...",Chicago


In [30]:
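# Clean and process the data (currently disabled)
# NOTE: while this stays commented out, missing values in the feature columns are not filled here.
# for feature in features:
#     clean_df[feature] = clean_df[feature].fillna('') # Fill any missing values with the empty string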

In [34]:
# Create a function to combine the values of the important columns into a single string
def combine_features(row, columns=('Reviews', 'City')):
    """Join the selected fields of one row into a single space-separated string.

    Parameters
    ----------
    row : mapping-like
        Object supporting ``row[name]`` lookup, e.g. the pandas Series passed
        by ``DataFrame.apply(..., axis=1)`` or a plain dict.
    columns : iterable of str, optional
        Names of the fields to combine, in order. Defaults to
        ``('Reviews', 'City')``.

    Returns
    -------
    str
        Each value converted with ``str`` and joined by a single space.
        Missing values (``None`` / ``NaN``) contribute an empty string instead
        of the literal text ``"None"`` / ``"nan"``, so they do not add spurious
        tokens to the combined text.

    Raises
    ------
    KeyError
        If a dict or Series ``row`` does not contain one of ``columns``.
    """
    parts = []
    for name in columns:
        value = row[name]
        # str(NaN) would yield the word "nan"; treat missing values as empty text.
        parts.append('' if pd.isna(value) else str(value))
    return ' '.join(parts)

In [35]:
# Run combine_features over every row and keep the resulting strings
# in a new column named "combined_features"
clean_df["combined_features"] = clean_df.apply(func=combine_features, axis="columns")

In [36]:
# Display the first five rows, now including the combined_features column
clean_df.head(n=5)

Unnamed: 0,Index,Beer Names,Locations,Reviews,Image,City,combined_features
0,0,Cloudwater A Scabrous Edge Of The Sky,"Manchester, Greater Manchester","Light fresh fruit, light floral note, hazy gol...",https://res.cloudinary.com/ratebeer/image/uplo...,Manchester,"Light fresh fruit, light floral note, hazy gol..."
1,1,Temperance Might Meets Right: Hot Cocoa,"Evanston, Illinois",Aromas of chocolate bourbon oak toffee cocoa t...,https://res.cloudinary.com/ratebeer/image/uplo...,Evanston,Aromas of chocolate bourbon oak toffee cocoa t...
2,2,Pipeworks Barrel Aged The Brown and Stirred,"Chicago, Illinois","Pours opaque cinnamon, no head. Aroma features...",https://res.cloudinary.com/ratebeer/image/uplo...,Chicago,"Pours opaque cinnamon, no head. Aroma features..."
3,3,Camba Bavaria 4 Sessions,"Seeon, Bavaria","Schaum: fein, gut.\nFarbe: orange-Gold, leicht...",https://res.cloudinary.com/ratebeer/image/uplo...,Seeon,"Schaum: fein, gut.\nFarbe: orange-Gold, leicht..."
4,4,Rock & Roll Bramble On,"Birmingham - Hockley, West Midlands",Cask @ the brewhouse. Nice head with good dura...,https://res.cloudinary.com/ratebeer/image/uplo...,Birmingham - Hockley,Cask @ the brewhouse. Nice head with good dura...


In [37]:
# Save the frame, including the combined_features column, to a new CSV file
# NOTE(review): to_csv also writes the DataFrame index by default — confirm downstream readers expect that
clean_df.to_csv(path_or_buf='combined_features_by_reviews_city.csv')