In [48]:
# Imports
import numpy as np
import pandas as pd
import csv
import re 
import string
import nltk
from nltk.corpus import words, stopwords
from langdetect import detect

# Setting options

# Show full cell text in dataframe displays. None is the supported "no limit"
# value; the old -1 sentinel was deprecated in favour of None (needs pandas >= 1.0).
pd.set_option('display.max_colwidth', None)

# langdetect's detection is randomised; fixing the factory seed makes the
# in_english flags computed later repeatable across kernel restarts.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# Fetch the nltk corpora used below (no-op when already up to date)
nltk.download('stopwords')
nltk.download('words')

# Load stop words
stop_words = stopwords.words('english')
wordlist = words.words()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielwilentz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/danielwilentz/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Read in Data

### Firstly, we'll read in the dataset from the prior reddit project using r/foodporn

In [49]:
# Load the reddit titles saved by the prior project.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# this project produced itself.
reddit_data = pd.read_pickle('../data/cleaned_data.pkl')

In [50]:
# Number of rows in the reddit data
reddit_data.shape[0]

42234

In [51]:
# Check out a random sample of the data
# (no random_state is passed, so the rows shown differ on every run)
reddit_data.sample(20)

Unnamed: 0,cleaned_title
44607,convenience store sushi tokyo zoom additional happiness
31279,love yolk break
39050,hand octopus turned pretty
38133,egg clouds top kale hummus dressing
30174,kottu captures spirit sri lanka
41097,crispy bacon smashed avo poached eggs hollandaise
36481,chicken butter restuarant style recipie
1296,bolton drink festival lots see try plenty
32734,tomato chutney burgers bacon egg lovely melty cheese rocket
13739,bob burger butcher bar queens lettuce tomato sauteed mushrooms sauteed onion avocado sharp cheddar cheese applewood bacon inside top beef patty


### Secondly, we'll read in the kaggle dataset

In [52]:
# Collect the second column of every row after the header (it becomes the
# 'title' column below).
# newline='' is what the csv module expects for files handed to csv.reader;
# the encoding is explicit because the titles contain accented characters and
# the platform default encoding is not guaranteed to be UTF-8.
with open('../data/Dish.csv', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skip the header row (no error on an empty file)
    kaggle_list = [row[1] for row in csv_reader]

print(len(kaggle_list))

422039


In [53]:
# Convert to a single-column dataframe
kaggle_data = pd.DataFrame({'title': kaggle_list})

# Work on a random subset of 60,000 titles; the full list (422,039 entries)
# is far more than we need. random_state pins the subset so that a
# Restart & Run All selects the same rows every time.
kaggle_data = kaggle_data.sample(60000, random_state=42)

In [54]:
# Confirm the size of the subset
kaggle_data.shape[0]

60000

We'll need to run the Kaggle set through a cleaning process as well, so that it matches the already-cleaned reddit titles.

# Define some cleaning functions

In [55]:
# Function for removing punctuation
def drop_punc(my_text):
    """Return ``my_text`` with every ``string.punctuation`` character replaced by a space."""
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    pattern = re.compile(punctuation_class)
    return pattern.sub(' ', my_text)

In [56]:
# Function for making all text lowercase
def lower(my_text):
    """Return a lowercase copy of ``my_text``."""
    return my_text.lower()

In [57]:
# Function for removing all numbers
def remove_numbers(my_text):
    """Return ``my_text`` with every token that contains a digit removed.

    A "token" here is a run of word characters, so '1962', '2nd' and 'a1b' are
    all deleted entirely. The surrounding whitespace is left untouched.
    """
    # Raw string: '\w' and '\d' are not valid Python string escapes, and a
    # non-raw literal triggers an invalid-escape warning on modern interpreters.
    clean_text = re.sub(r'\w*\d\w*', '', my_text)
    return clean_text

In [58]:
# Function for removing stop words
def remove_stop(my_text):
    """Return ``my_text`` rebuilt from its whitespace-separated words, minus any in ``stop_words``."""
    kept_words = []
    for token in my_text.split():
        if token in stop_words:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)

In [59]:
# Function for stripping whitespace
def my_strip(my_text):
    """Return ``my_text.strip()``, or None when the call raises (e.g. for a non-string value)."""
    try:
        stripped = my_text.strip()
    except Exception:
        return None
    return stripped

In [83]:
# Curated list of additional stop-words for this project:
# 'menu', 'etc' and every single letter from 'b' to 'z' ('a' is not in the list)
my_stop_words = ['menu', 'etc'] + list(string.ascii_lowercase[1:])

# Function for removing my stop words
def remove_my_stop(my_text):
    """Return ``my_text`` rebuilt from its whitespace-separated words, minus any in ``my_stop_words``."""
    kept_words = filter(lambda token: token not in my_stop_words, my_text.split())
    return ' '.join(kept_words)

In [61]:
# Function to detect english
def is_english(my_text):
    """Return True when ``detect`` classifies ``my_text`` as English ('en'), else False.

    Any ordinary error raised by ``detect`` is treated as "not English" and
    yields False.
    """
    try:
        return detect(my_text) == 'en'
    # Exception rather than a bare except: KeyboardInterrupt / SystemExit must
    # still be able to stop a long .apply() over the whole dataset.
    except Exception:
        return False

# Let's see how well our functions work!

In [62]:
# Draw 50 random titles to spot-check the cleaning helpers on
# (no random_state is passed, so the rows differ on every run)
test = kaggle_data.sample(50)

In [63]:
# Print each sampled title followed by whether its cleaned form is detected as English.
# Iterating the 'title' column directly avoids row[0], a positional lookup on a
# label-indexed Series that newer pandas versions deprecate.
for text in test['title']:
    print(text)
    cleaned = remove_numbers(drop_punc(lower(text)))
    print(is_english(cleaned))
    print()

Huhnerbrust mit Morcheln
False

Fried smelts, Remoulade sauce
False

Timbale de poires Polonaise
False

Bass' Ale, own bottling
True

Rhine Wines, Niersteiner
False

Steinbutte mit
False

Nordsee-Steinbutt gebraten "Grenobloise"
False

Broiled Opakapaka, Maitre d'Hotel, Cole Slaw Salad
False

croquette of lamb with peas
True

Bonne Mares 1962
False

Blumenkohl, Sauce Béarnaise
False

Crab Legs Appetizer
False

roquefort
False

Sherry Served with All Dinners
True

Mainzer Kase mit Kummel
False

Clear soup with vegetables and meat-balls
True

Latte di Mandorla - Eiskalt serviert
False

Castell-Castell, Silvaner QbA - Qualitätswein trocken
False

Trommer's Beer - on Draught, Per glass
True

Chicken Salad on Tomato
True

New apple cake
True

Chicken Gai Pin
True

DRY MONOPOLE, RED TOP
True

Cold Sliced Turkey Sandwich with Mayonnaise, Cranberry Jelly
True

Poussin en cocotie, Forestiere
False

Frozen Beer Pie
False

Fried Eggs, Two
False

St. Germain Potage
False

Bacon or sausage
True

Br

In [64]:
# The helpers work alright (the English detector is the part I'm least happy with).
# Apply them to the dataset: lowercase -> drop punctuation -> drop number tokens -> strip

kaggle_data['cleaned_title'] = (
    kaggle_data['title']
    .map(lambda title: remove_numbers(drop_punc(lower(title))))
    .str.strip()
)

In [65]:
# Compare the raw titles with their cleaned versions
kaggle_data.head(50)

Unnamed: 0,title,cleaned_title
23898,Pippin Cider,pippin cider
387012,Habart tojas,habart tojas
152284,Ginger Ale - Half Bottle,ginger ale half bottle
297147,SIBERIAN PUNCH,siberian punch
27843,""" Dog's Head",dog s head
290212,Golden Trout in Aspic,golden trout in aspic
419739,rissolée potatoes,rissolée potatoes
406270,"Fried Ham and Eggs, coffee and rolls",fried ham and eggs coffee and rolls
360606,Pitcher of Cream,pitcher of cream
116898,Cepes au four,cepes au four


Now let's add a new column using the 'is_english' function to try to pick out which observations are in english and which aren't

In [66]:
# Flag each cleaned title with the result of is_english (True / False)
cleaned_titles = kaggle_data['cleaned_title']
kaggle_data['in_english'] = cleaned_titles.map(is_english)

In [67]:
# Inspect the new in_english flag next to the titles
kaggle_data.head(50)

Unnamed: 0,title,cleaned_title,in_english
23898,Pippin Cider,pippin cider,False
387012,Habart tojas,habart tojas,False
152284,Ginger Ale - Half Bottle,ginger ale half bottle,False
297147,SIBERIAN PUNCH,siberian punch,False
27843,""" Dog's Head",dog s head,False
290212,Golden Trout in Aspic,golden trout in aspic,True
419739,rissolée potatoes,rissolée potatoes,False
406270,"Fried Ham and Eggs, coffee and rolls",fried ham and eggs coffee and rolls,True
360606,Pitcher of Cream,pitcher of cream,True
116898,Cepes au four,cepes au four,False


In [68]:
# See how many data points will be lost: the rows counted under False are the
# ones dropped when only the English titles are kept

kaggle_data['in_english'].value_counts()

False    31751
True     28249
Name: in_english, dtype: int64

### Grab only the cleaned titles of the dataframe that are in english

In [69]:
# Keep only the rows flagged as English, and only their cleaned_title column
english_rows = kaggle_data['in_english']
cleaned_kaggle = kaggle_data.loc[english_rows, ['cleaned_title']].copy()

In [70]:
# Eyeball a random sample of the titles that were kept (rows differ on every run)
cleaned_kaggle.sample(20)

Unnamed: 0,cleaned_title
367407,gervais cheese with individual bar le duc
356129,tournedos of fillet cardinalice
289080,timbale of guinea hen chasseur
170594,oceanburger blend of codfish clams shrimp crabmeat with lettuce on a roll tartar sauce cole slaw french fries
397781,imp draught in pitchers
341594,table water from lithia springs wolf trap va
258125,golden bantam corn on the cob
405355,cherry jubilee
271262,bordeaux white graves superior dry
77384,chocolate eclairs each


In [71]:
# Number of Kaggle titles that survived the English filter
cleaned_kaggle.shape[0]

28249

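#### Finally, let's filter out any entry that contains a standalone "s" (left behind when an apostrophe is replaced by a space, e.g. "Dog's Head" becomes "dog s head")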

In [72]:
# func to detect a standalone "s" in a string
def has_s_alone(my_text):
    """Return True when ``my_text`` contains the single letter 's' as its own word.

    Splitting on whitespace also catches an 's' at the very start or end of the
    text (e.g. 'chef s' after stripping), which a plain ``' s ' in my_text``
    substring test misses.
    """
    return 's' in my_text.split()

In [75]:
# Flag every title that has_s_alone reports as containing a lone "s"
english_titles = cleaned_kaggle['cleaned_title']
cleaned_kaggle['has_s_alone'] = english_titles.map(has_s_alone)

In [None]:
# See how many data points will be lost by getting rid of standalone "s" entries:
# the rows counted under True are the ones removed by the filter below

cleaned_kaggle['has_s_alone'].value_counts()

In [77]:
# Drop the flagged rows and keep just the cleaned_title column
lone_s_rows = cleaned_kaggle['has_s_alone']
final_cleaned_kaggle = cleaned_kaggle.loc[~lone_s_rows, ['cleaned_title']].copy()

In [78]:
# Take 25,000 Kaggle titles; random_state pins the draw so the saved pickle is
# reproducible on a re-run. (sample raises ValueError if fewer than 25,000 rows remain.)
final_dishes = final_cleaned_kaggle.sample(25000, random_state=42)

In [79]:
# Persist the sampled Kaggle titles on their own
final_dishes.to_pickle('../data/kaggle_dishes.pkl')

### Combine with the reddit data

In [96]:
# Take 25,000 reddit titles to match the Kaggle sample size; random_state pins
# the draw so the combined dataset is reproducible on a re-run.
reddit_sample = reddit_data.sample(25000, random_state=42)

In [127]:
# Stack the reddit rows on top of the Kaggle rows (original index labels are kept).
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
all_data = pd.concat([reddit_sample, final_dishes])

In [128]:
# Strip the project-specific stop words from every title
combined_titles = all_data['cleaned_title']
all_data['cleaned_title'] = combined_titles.map(remove_my_stop)

### Finally, add the word " stop" to the end of every row

In [99]:
# Append the literal token ' stop' to every title.
# NOTE(review): this cell's execution count (99) is lower than that of the cells
# that build all_data (127/128), and the sample/head outputs shown further down
# carry no ' stop' suffix. The saved pickle was apparently produced without this
# step; confirm whether it is meant to run.
titles_as_text = all_data['cleaned_title'].astype(str)
all_data['cleaned_title'] = titles_as_text + ' stop'

In [129]:
# Spot-check a few random rows of the combined data (rows differ on every run)
all_data.sample(5)

Unnamed: 0,cleaned_title
248229,charcoal broiled lamb chop new green lima beans lyonnaise potatoes
14776,travelling gili air indonesia told chef something spicy cry disappointed spicy deliciousness
331097,roast chicken dressing
312530,fresh mushroom saute
31852,dinner tonight


In [130]:
# Replace the mixed reddit/Kaggle index labels with a fresh 0..n-1 index.
# reset_index() keeps the old labels in a new 'index' column, which the next
# cell drops. Not idempotent: running this cell twice adds a second column.
all_data = all_data.reset_index()

In [131]:
# Discard the 'index' column holding the old index labels
all_data = all_data.drop(columns=['index'])

In [124]:
# Flag the rows whose index label is a multiple of 20.
# NOTE(review): this cell's execution count (124) predates the cells that rebuild
# all_data (127-131), and the head(50) output below shows no divis_by_20 column.
# On a top-to-bottom run the column would end up in the saved pickle; confirm
# whether it is wanted.
index_mod_20 = all_data.index % 20
all_data['divis_by_20'] = index_mod_20 == 0

In [132]:
# Final look at the combined frame before saving it
all_data.head(50)

Unnamed: 0,cleaned_title
0,pbj adults gourmet peanut butter jelly sandwich
1,eat
2,lobster benedict tomato harrissa
3,ultimate chocolate cake
4,nsfw face ass
5,street tacos lunch tripa lengua carnitas cabeza durham north carolina
6,turkey brine
7,green chile mac cheese deep pizza bacon top
8,garlic challots butter
9,broccoli babe


In [133]:
# Persist the combined reddit + Kaggle titles
all_data.to_pickle('../data/kaggle_and_reddit_dishes.pkl')