In [13]:
# Imports
import numpy as np
import pandas as pd
import csv
import re 
import string
import nltk
from nltk.corpus import words, stopwords

# Setting options

# Show full cell contents instead of truncating long titles.
# `None` means "no limit" (pandas >= 1.0); the old -1 sentinel was deprecated
# in pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Fetch the NLTK corpora used below (reported as already up-to-date when present).
nltk.download('stopwords')
nltk.download('words')

# Load stop words
stop_words = stopwords.words('english')
wordlist = words.words()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielwilentz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/danielwilentz/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Read in Data

### First, we'll read in the dataset from the prior Reddit project, which used r/foodporn

In [2]:
# Load the pickled dataset produced by the prior Reddit project.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
reddit_data = pd.read_pickle('../data/cleaned_data.pkl')

In [5]:
# Check the length (number of rows) of the Reddit data:
len(reddit_data)

42234

In [4]:
# Check out a random sample of the data.
# No random_state is passed, so the selection is not fixed between runs
# unless NumPy's global seed is set elsewhere.
reddit_data.sample(20)

Unnamed: 0,cleaned_title
41777,beefsteak mashed potatoes cooking something passion ish
39977,try mac cheese pic left comments
44031,sweetened reduced milk simple yet delish
28263,apple
2021,fry pork chop
30246,steak pie smoked applewood mash
27337,linguini marinara beef sausage mixture
38499,leg lamb dinner
41064,breakfast burrito
19207,farmers market score pulled pork carolina style bbq sauce mac cheese steamed kale


### Second, we'll read in the Kaggle dataset

In [20]:
# Collect the value of the second CSV column from every row after the first.
# newline='' is what the csv module requires so that line breaks embedded in
# quoted fields are parsed correctly; the explicit encoding keeps the read
# independent of the platform's default encoding.
# NOTE(review): assumes Dish.csv is UTF-8 encoded -- confirm against the source file.
with open('../data/Dish.csv', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skip the first row (presumably the header)
    kaggle_list = [row[1] for row in csv_reader]

print(len(kaggle_list))

422039


In [21]:
# Convert to a single-column dataframe, naming the column at construction time.
# (Assigning `.columns` afterwards fails with a length mismatch when the list
# is empty; building from a dict yields a well-formed empty frame instead.)
kaggle_data = pd.DataFrame({'Title': kaggle_list})

In [25]:
# Check out a random sample of the Kaggle titles.
# No random_state is passed, so the selection is not fixed between runs
# unless NumPy's global seed is set elsewhere.
kaggle_data.sample(20)

Unnamed: 0,Title
99850,Cabinet Pudding Lemon Sauce
281874,Boned Squab Chicken Ecossaise
195654,Bourgogne Rose Carafe 13/6 Half-Carafe 7/6
265674,"Veuve Clicquot, La Grande Dame, brut, 1990"
215938,Melrose Rye Whiskey
320529,Moules (Mussels) Mariniere
326695,"Soft clams in brochette, Mexicaine"
11082,Nuttolene Toast
11677,"St. Julien, B. & G., 1887"
358061,Uerziger Wurzgarten


We'll have to run this Kaggle dataset through a cleaning process as well: the sampled titles above still contain punctuation, digits, and mixed-case text.

# Define some cleaning functions

In [26]:
# Function for removing punctuation
def drop_punc(my_text):
    """Return `my_text` with each `string.punctuation` character replaced by a space.

    Punctuation is replaced rather than deleted, so the text keeps its length
    and words joined only by punctuation (e.g. "13/6") end up separated.
    """
    # Character class matching any single punctuation character.
    punct_class = '[%s]' % re.escape(string.punctuation)
    punct_matcher = re.compile(punct_class)
    return punct_matcher.sub(' ', my_text)

In [27]:
# Function for making all text lowercase
def lower(my_text):
    """Return the result of calling `.lower()` on `my_text`."""
    return my_text.lower()

In [28]:
# Function for removing every token that contains a digit
def remove_numbers(my_text):
    """Delete every run of word characters that contains at least one digit.

    The whole run is removed, not only the digits: "b12" disappears entirely.
    Surrounding whitespace is left untouched, so removing a token between two
    spaces leaves both spaces behind.

    Parameters
    ----------
    my_text : str
        Text to clean.

    Returns
    -------
    str
        `my_text` without any digit-containing tokens.
    """
    # Raw string: '\w' and '\d' are invalid escape sequences in a plain string
    # literal and trigger a warning on modern Python versions.
    return re.sub(r'\w*\d\w*', '', my_text)

In [29]:
# Function for removing emojis
def deEmojify(inputString):
    """Return `inputString` with every character that is not ASCII-encodable removed.

    Emojis are dropped this way, but so is any other non-ASCII character
    (accented letters, for example).
    """
    # Encoding with errors='ignore' silently discards characters outside ASCII.
    ascii_only = inputString.encode('ascii', errors='ignore')
    return ascii_only.decode('ascii')

In [30]:
# Function for removing stop words
def remove_stop(my_text):
    """Drop every whitespace-separated token that appears in `stop_words`.

    The surviving tokens are re-joined with single spaces, so runs of
    whitespace in the input are collapsed. The comparison is an exact,
    case-sensitive match against the module-level `stop_words`.
    """
    kept_tokens = []
    for token in my_text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [31]:
# Function for stripping whitespace
def my_strip(my_text):
    """Return `my_text.strip()`, or None when that call raises any Exception.

    Best-effort helper: inputs without a usable `.strip()` (None, numbers, ...)
    yield None instead of propagating the error.
    """
    try:
        stripped = my_text.strip()
    except Exception:
        return None
    return stripped

In [None]:
# Curated list of additional stop-words for this project
my_stop_words = ['menu']

# Function for removing my stop words
def remove_my_stop(my_text):
    """Drop every whitespace-separated token that appears in `my_stop_words`.

    Matching is exact and case-sensitive; the remaining tokens are re-joined
    with single spaces.
    """
    kept_tokens = filter(lambda token: token not in my_stop_words, my_text.split())
    return ' '.join(kept_tokens)