# Data Mining Project

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
from scipy.special import comb
from itertools import combinations, permutations

import os

# List every dataset file found under the input directory.
dataset_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('../Data/start')
    for name in names
)
for path in dataset_paths:
    print(path)


../Data/start/groceries.csv
../Data/start/test.json
../Data/start/train.json
../Data/start/groceries - groceries.csv


## Utilities

In [2]:
# Unique fields
#print(json_df['cuisine'].unique())
#print(json_df['ingredients'].unique())

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import re

# One-time setup: uncomment on a fresh environment to fetch the NLTK data
# used by stopwords.words() and word_tokenize() in itemParser.
#nltk.download('stopwords')
#nltk.download('punkt')
# Shared Porter stemmer instance; itemParser calls ps.stem() on every token.
ps = PorterStemmer()


def itemParser(row):
    """
    Normalize the item names of one row.

    Each item is cleaned (parenthesised notes, trademarked brand words,
    punctuation and digits are removed), lower-cased, tokenized, stripped of
    English stop words and stemmed with the Porter stemmer; the surviving
    tokens are then re-joined with single spaces.

    Parameters:
    row (list): Item names (str) of one row, e.g. the items of one grocery
        line or the ingredients of one recipe.

    Returns:
    list: Normalized item strings, in the same order and with the same
        length as ``row`` (an item may normalize to the empty string).

    """
    # Loop-invariant: build the stop-word set once per call, not once per item.
    stop_words = set(stopwords.words('english'))

    word_ps = []
    for s in row:
        # Remove content inside parentheses. This has to run BEFORE the
        # punctuation pass: that pass deletes the brackets themselves, after
        # which this pattern could never match and the content would stay.
        s = re.sub(r'\([^)]*\)', '', s)

        # Remove brand names, i.e. the word directly attached to a trademark
        # sign. Also has to precede the punctuation pass, which strips the sign.
        s = re.sub(r'\w*\u2122', '', s)

        # Remove punctuation
        s = re.sub(r'[^\w\s]', '', s)

        # Remove digits
        s = re.sub(r'\d', '', s)

        # Lower-case before the stop-word lookup (the stop-word list is lower-case)
        s = s.lower()

        # Drop stop words, stem what is left, and rebuild the item string
        word_tokens = word_tokenize(s)
        filtered_sentence = [ps.stem(w) for w in word_tokens if w not in stop_words]
        word_ps.append(' '.join(filtered_sentence))
    return word_ps



## Dataset ETL

### Groceries

In [13]:
# Read the groceries file: one comma-separated row of items per line,
# each row normalized with itemParser.
chart = []
with open('../Data/start/groceries.csv', 'r') as f:
    for line in f:
        # Strip only the line terminator. Slicing with [:-1] would cut the
        # last character of the final item when the file does not end with
        # a newline.
        line = line.rstrip('\n')
        if not line:
            # A blank line carries no items; skip it so it does not add an
            # empty-string item to the data.
            continue
        chart.append(itemParser(line.split(',')))

In [14]:
# Distinct normalized item names across all grocery rows.
# (The former leading `chart[0:10]` expression was removed: it was not the
# last expression of the cell, so it was evaluated but never displayed.)
groceries_items = {item for row in chart for item in row}

print(len(groceries_items))

169


### Recipes

In [8]:
# Load the recipes training set into a DataFrame.
train_json_path = '../Data/start/train.json'
json_df = pd.read_json(train_json_path)

In [15]:
# Distinct raw (not yet normalized) ingredient names across all recipes.
list_ingredients = {
    name
    for recipe_ingredients in json_df.ingredients
    for name in recipe_ingredients
}

print(len(list_ingredients))

6714


In [16]:
# Normalize every recipe's ingredient list with itemParser.
# Only the 'ingredients' column is needed, so apply the function to that
# Series directly instead of a row-wise DataFrame.apply(axis=1), which
# materializes a Series object for every row just to read one field.
json_df['new_ingredients'] = json_df['ingredients'].apply(itemParser)
json_df.head()

Unnamed: 0,id,cuisine,ingredients,new_ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[romain lettuc, black oliv, grape tomato, garl..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomato, gro..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[egg, pepper, salt, mayonais, cook oil, green ..."
3,22213,indian,"[water, vegetable oil, wheat, salt]","[water, veget oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallot, cornflour, cayenn pepp..."


In [83]:
# Persist the cleaned recipes DataFrame (including 'new_ingredients') as CSV.
# NOTE: list-valued columns are written as their text representation.
clean_csv_path = './data/train-clean.csv'
json_df.to_csv(clean_csv_path, sep=",")

## Dataset Similar Items
We test the similarity of the two datasets by computing the intersection of their normalized item sets.

In [17]:
# Distinct normalized ingredient names across all recipes
# (the recipe-side counterpart of groceries_items).
ingredients_items = {
    ingredient
    for recipe_ingredients in json_df['new_ingredients']
    for ingredient in recipe_ingredients
}

In [18]:
# Items present in both vocabularies.
common = groceries_items & ingredients_items
# Items present in exactly one of the two vocabularies
# (union minus intersection == symmetric difference).
uncommon = groceries_items ^ ingredients_items

# Before: 59 (count recorded from an earlier run -- TODO confirm what changed)
print(len(common))

66
