In [18]:
# Imports
import numpy as np
import pandas as pd
import csv
import re 
import string
import nltk
from nltk.corpus import words, stopwords
from langdetect import detect

# Setting options

# Show the full contents of every cell when a frame is displayed.
# `None` means "no limit"; the old `-1` sentinel was deprecated in pandas 1.0
# and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Make sure the NLTK corpora loaded below are available locally
nltk.download('stopwords')
nltk.download('words')

# Load stop words
# NOTE(review): neither stop_words nor wordlist is referenced by any cell
# visible in this notebook — confirm they are still needed.
stop_words = stopwords.words('english')
wordlist = words.words()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielwilentz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/danielwilentz/nltk_data...
[nltk_data]   Package words is already up-to-date!


## Read in Data

In [19]:
# Load the dish titles.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle('../data/kaggle_and_reddit_dishes.pkl')

In [20]:
# Rename the incoming "cleaned_title" column to "title"; a new "cleaned_title"
# column is created further down from the space-reduced text.
df = df.rename(columns={"cleaned_title": "title"})

In [21]:
# (rows, columns) of the loaded frame
df.shape

(50000, 1)

In [22]:
# Eyeball 10 random titles (no random_state is set, so the rows differ on every run)
df.sample(10)

Unnamed: 0,title
729,making sushi sushi chef making kit really helped lot
39172,creme de menthe and vanilla bean ice cream
41575,gray hare soup cream of lettuce aux croutons souffle
2199,lightroom presets vol creativetacos
1600,cheesy spiral penne pasta diced wieners
23698,heard karma around anybody knows melt cheese roast potatoes
46236,cream of corn washingtin
2948,poached egg croissant ham bone
17395,meat croatian restaurant berlin
40332,carlshamns flagg punsch


OK, so a big problem with my LSTM has been the presence of extra spaces in the titles. I'm going to try to fix that by collapsing them during the cleaning process here.

In [23]:
def has_multiple_consec_spaces(my_text):
    '''
    Report whether a string contains two or more consecutive spaces.

    Only the literal space character is matched; tabs and newlines do not count.

    Parameters
    ----------
    my_text : str
        Text to inspect.

    Returns
    -------
    bool
        True if a run of 2+ spaces occurs anywhere in my_text, otherwise False.
    '''
    # re.search yields a match object or None; compare against None to get a real bool
    return re.search('[ ]{2,}', my_text) is not None

In [24]:
def reduce_spaces(my_text):
    '''
    Collapse every run of two or more consecutive spaces into a single space.

    Only the literal space character is affected; tabs and newlines are left
    alone. Leading and trailing runs are collapsed to one space, not removed.

    Parameters
    ----------
    my_text : str
        Text to normalise.

    Returns
    -------
    str
        A copy of my_text with each run of 2+ spaces replaced by one space.
    '''
    space_run = re.compile('[ ]{2,}')
    return space_run.sub(' ', my_text)

In [None]:
# Flag the raw titles that contain runs of 2+ spaces, then show the counts so
# the "before cleaning" state is actually visible (the flag column is
# recomputed on the cleaned text further down).
df['has_mult_consec_spaces'] = df['title'].apply(has_multiple_consec_spaces)
df['has_mult_consec_spaces'].value_counts()

In [25]:
# Spot-check 10 random rows (unseeded, so the selection changes between runs)
df.sample(10)

Unnamed: 0,title
40392,roast long island duckling sage dressing candied yams apple sauce vegetables jardiniere
41779,scotch lamb broth with barley
9028,bbq baby back ribs pressure cooker finger lickin
36836,escalopes of bass villeroy
41259,veal cutlet breaded cream gravy or tomato sauce
49112,chicken giblet with noodles
37258,filet and shrimp oz served with salad french fries or baked potato and fresh vegetables
31065,assorted jell chantilly
35787,puree africaine gratin
35766,fried rice sub gum


Let's apply `reduce_spaces` to every title and store the result in a new `cleaned_title` column.

In [26]:
# Collapse runs of spaces in every title; the result goes into a new column,
# leaving the original "title" text untouched.
df['cleaned_title'] = df['title'].apply(reduce_spaces)

In [27]:
# Run the check on the cleaned column; this overwrites any flag values
# previously stored under the same column name.
df['has_mult_consec_spaces'] = df['cleaned_title'].apply(has_multiple_consec_spaces)

In [28]:
# Sanity check: after cleaning, every row should be False
df['has_mult_consec_spaces'].value_counts()

False    50000
Name: has_mult_consec_spaces, dtype: int64

In [29]:
# Compare raw and cleaned titles side by side on 10 random rows
df.sample(10)

Unnamed: 0,title,cleaned_title,has_mult_consec_spaces
6544,cook gongura chicken curry village style street catalog,cook gongura chicken curry village style street catalog,False
45848,fish entrees or roasts on dinner bill,fish entrees or roasts on dinner bill,False
47012,oyster stew when available,oyster stew when available,False
33918,clam chowder fulton market style,clam chowder fulton market style,False
20126,kind allowed saudi upside rice veggies mutton,kind allowed saudi upside rice veggies mutton,False
37376,jellied chicken comsomme,jellied chicken comsomme,False
25052,buzzards bay oysters half shell,buzzards bay oysters half shell,False
42496,scotch lobsters half newburg taknen from the shell sliced cooked in lobster stock brandy and cream served with rice,scotch lobsters half newburg taknen from the shell sliced cooked in lobster stock brandy and cream served with rice,False
29286,fresh blueberry pie,fresh blueberry pie,False
23140,traditional filipino breakfast tapsilog marinated beef garlic fried rice sunny side eggs,traditional filipino breakfast tapsilog marinated beef garlic fried rice sunny side eggs,False


In [30]:
# Keep only the cleaned text: discard the raw titles and the diagnostic flag
output = df.drop(columns=['title', 'has_mult_consec_spaces'])

In [31]:
# Spot-check 20 random rows of the frame that will be written out
output.sample(20)

Unnamed: 0,cleaned_title
739,exactly sure bread look
45612,cold sliced turkey sandwich with mayonnaise cranberry jelly
37379,easter punch
47098,nureyev vodka white creme de cacao
43917,boiled fowls supreme sauce
35414,chicken chop suey with mushrooms
18775,japanese curry steamed rice dumplings
12542,rosemary garlic crusted top sirloin
24599,double shackburger cheese fries
19025,told guys might breakfast steak eggs xposted ketorecipes


In [32]:
# Persist the cleaned titles to the same ../data directory the input was read from
output.to_pickle('../data/kaggle_and_reddit_dishes_no_spaces.pkl')