# Data Mining Project

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
from scipy.special import comb
from itertools import combinations, permutations

import os

# List all datests files
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


./data/groceries_aligned.csv
./data/test.json
./data/train-clean.csv
./data/groceries.csv
./data/train.json
./data/groceries - groceries.csv
./data/.ipynb_checkpoints/groceries-checkpoint.csv


## Utilities

In [2]:
# Unique fields
#print(json_df['cuisine'].unique())
#print(json_df['ingredients'].unique())

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import re

#nltk.download('stopwords')
#nltk.download('punkt')
ps = PorterStemmer()


def itemParser(row):
    """
    Row items cleaner. 
  
    This function is useful to normalize the items of the chart  
    
    Parameters: 
    row (list): Row of chart items
  
    Returns: 
    list: Normalized items 

    """
    word_ps = []
    for s in row:
        
        #REGULAR EXPRESSIONS:

        #Remove punctuations
        s = re.sub(r'[^\w\s]', '', s)

        #Remove Digits
        s = re.sub(r"(\d)", "", s)

        #Remove content inside paranthesis
        s = re.sub(r'\([^)]*\)', '', s)

        #Remove Brand Name
        s = re.sub(u'\w*\u2122', '', s)

        #Convert to lowercase
        s = s.lower()

        #print(ps.stem(s))
        #Remove Stop Words
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(s)
        
        filtered_sentence = [ps.stem(w) for w in word_tokens if not w in stop_words]
        #print(filtered_sentence)
        s = ' '.join(filtered_sentence)
        #print("join:" + s)
        word_ps.append(s)
    return word_ps



In [180]:
def apriori(chart, minimum_support=0.1, confidence=0.22):
    
    #groceries_items = set()
    
    mapping = dict()
    singleton_count = dict()
    uuid = 0
    
    for row in chart:
        for item in row:
            if item not in mapping.keys():
                mapping[item] = uuid
                singleton_count[item] = 0
                uuid += 1
            singleton_count[item] += 1
    #Take mapping 
    singleton_count = {mapping[k]: v for (k,v) in singleton_count.items() if v > minimum_support}
    print(singleton_count)
    #a1_sorted_keys = sorted(singleton_count, key=singleton_count.get, reverse=True)
    


## Dataset ETL

### Groceries

In [109]:
chart = []
with open('./data/groceries.csv', 'r') as f:
    for line in f:
        chart.append(itemParser(line[:-1].split(',')))

In [182]:
#print(chart[:10])
apriori(chart, minimum_support=3, confidence=0.22)

{0: 814, 1: 174, 2: 576, 3: 18, 4: 1032, 5: 1372, 6: 571, 7: 2513, 8: 744, 9: 390, 10: 42, 11: 1903, 12: 101, 13: 368, 14: 545, 15: 75, 16: 35, 17: 1809, 18: 329, 19: 792, 20: 78, 21: 170, 22: 56, 23: 414, 24: 1087, 25: 488, 26: 524, 27: 171, 28: 173, 29: 516, 30: 580, 31: 1715, 32: 422, 33: 333, 34: 711, 35: 785, 36: 128, 37: 269, 38: 275, 39: 875, 40: 163, 41: 189, 42: 1072, 43: 106, 44: 89, 45: 372, 46: 378, 47: 294, 48: 27, 49: 764, 50: 924, 51: 638, 52: 969, 53: 256, 54: 327, 55: 51, 56: 324, 57: 515, 58: 567, 59: 327, 60: 705, 61: 32, 62: 220, 63: 365, 64: 68, 65: 624, 66: 110, 67: 279, 68: 241, 69: 229, 70: 256, 71: 80, 72: 174, 73: 176, 74: 276, 75: 207, 76: 89, 77: 246, 78: 473, 79: 148, 80: 140, 81: 50, 82: 189, 83: 83, 84: 36, 85: 299, 86: 88, 87: 102, 88: 55, 89: 106, 90: 279, 91: 106, 92: 305, 93: 160, 94: 187, 95: 41, 96: 91, 97: 241, 98: 148, 99: 54, 100: 112, 101: 29, 102: 45, 103: 32, 104: 79, 105: 168, 106: 15, 107: 57, 108: 71, 109: 130, 110: 30, 111: 19, 112: 84, 11

In [115]:
chart[0:10]
groceries_items = set()
for row in chart:
    groceries_items.update(row)

print(len(groceries_items))

169


### Recipes

In [130]:
json_df = pd.read_json ('./data/train.json')

In [131]:
list_ingredients = set()
for item in json_df.ingredients:
    #item = [lower(i) for i in item]
    list_ingredients.update(item)
    
print(len(list_ingredients))

6714


In [132]:
json_df['new_ingredients'] = json_df.apply(lambda x: itemParser(x.ingredients), axis=1)
json_df.head()

Unnamed: 0,cuisine,id,ingredients,new_ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...","[romain lettuc, black oliv, grape tomato, garl..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomato, gro..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[egg, pepper, salt, mayonais, cook oil, green ..."
3,indian,22213,"[water, vegetable oil, wheat, salt]","[water, veget oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallot, cornflour, cayenn pepp..."


In [83]:
# Export clean dataframe to csv
json_df.to_csv('./data/train-clean.csv', sep=",")

## Dataset Similar Items
We would test the similarity of the two datasets computing their intersection.

In [137]:
#groceries_items
ingredients_items = []
for row in json_df['new_ingredients']:
    for item in row:
        ingredients_items.append(item)
        
#print("Before: " + str(len(ingredients_items)))
#ingredients_items[0:10]
ingredients_items = set(ingredients_items)
#print("After: " + str(len(ingredients_items)))

In [145]:
common = groceries_items.intersection(ingredients_items)
uncommon = groceries_items.union(ingredients_items) - common

print(ingredients_items)

{'can beer', 'light bulb', 'toilet cleaner', 'liquor appet', 'tidbit', 'curd chees', 'preserv product', 'kitchen utensil', 'soften', 'dog food', 'liver loaf', 'redblush wine', 'season product', 'chocol marshmallow', 'cocoa drink', 'abras cleaner', 'cookwar', 'long life bakeri product', 'dish', 'dental care', 'rub alcohol', 'hair spray', 'misc beverag', 'dish cleaner', 'zwieback', 'decalcifi', 'whippedsour cream', 'can fruit', 'hamburg meat', 'cook chocol', 'specialti chees', 'male cosmet', 'photofilm', 'butter milk', 'finish product', 'readi soup', 'whiski', 'make remov', 'napkin', 'cleaner', 'bathroom cleaner', 'hous keep product', 'spread chees', 'fruitveget juic', 'brown bread', 'specialti chocol', 'can veget', 'uhtmilk', 'nutsprun', 'cling filmbag', 'sweet spread', 'nut snack', 'can fish', 'bag', 'specialti fat', 'salti snack', 'deterg', 'beverag', 'frozen fish', 'newspap', 'pip fruit', 'mayonnais', 'potato product', 'sound storag medium', 'bottl water', 'artif sweeten', 'femal san

## Other

In [18]:
'''# Unique fields
#print(json_df['cuisine'].unique())
#print(json_df['ingredients'].unique())

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import re

#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('english')

ps = PorterStemmer()


new = []
for s in json_df['ingredients']:
    #print(str(s) + "\n")
    s = ','.join(s)
    new.append(s)
    
print(new[0:3])

json_df['ing'] = new

ingredients = list()

l=[]
for s in json_df['ing']:
    #REGULAR EXPRESSIONS:
    
    #Remove punctuations
    #s=re.sub(r'[^\w\s]','',s)
    
    #Remove Digits
    s=re.sub(r"(\d)", "", s)
    
    #Remove content inside paranthesis
    s=re.sub(r'\([^)]*\)', '', s)
    
    #Remove Brand Name
    s=re.sub(u'\w*\u2122', '', s)
    
    #Convert to lowercase
    s=s.lower()
    
    #Remove Stop Words
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(s)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    #print(filtered_sentence)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
            ingredients.append(w)
    s=' '.join(filtered_sentence)
    
    
    #Remove low-content adjectives
    
    
    #Porter Stemmer Algorithm
    words = word_tokenize(s)
    word_ps=[]
    for w in words:
        word_ps.append(ps.stem(w))
    s=' '.join(word_ps)
    ingredients.append(word_ps)
    
    l.append(s)
json_df['ing_mod']=l
print(json_df.head(10))

'''

['romaine lettuce,black olives,grape tomatoes,garlic,pepper,purple onion,seasoning,garbanzo beans,feta cheese crumbles', 'plain flour,ground pepper,salt,tomatoes,ground black pepper,thyme,eggs,green tomatoes,yellow corn meal,milk,vegetable oil', 'eggs,pepper,salt,mayonaise,cooking oil,green chilies,grilled chicken breasts,garlic powder,yellow onion,soy sauce,butter,chicken livers']
       cuisine     id                                        ingredients  \
0        greek  10259  [romaine lettuce, black olives, grape tomatoes...   
1  southern_us  25693  [plain flour, ground pepper, salt, tomatoes, g...   
2     filipino  20130  [eggs, pepper, salt, mayonaise, cooking oil, g...   
3       indian  22213                [water, vegetable oil, wheat, salt]   
4       indian  13162  [black pepper, shallots, cornflour, cayenne pe...   
5     jamaican   6602  [plain flour, sugar, butter, eggs, fresh ginge...   
6      spanish  42779  [olive oil, salt, medium shrimp, pepper, garli...   
7      

In [17]:
ingredientes


NameError: name 'ingredientes' is not defined

In [29]:
import numpy as np
testList = list(set(df.as_matrix().reshape((1,-1)).tolist()[0]))
no_integers = [x for x in testList if not isinstance(x, int) ]
print (no_integers)


[nan, 'other vegetables', 'liquor (appetizer)', 'sweet spreads', 'liquor', 'ice cream', 'syrup', 'tropical fruit', 'bathroom cleaner', 'potted plants', 'shopping bags', 'cling film/bags', 'turkey', 'beverages', 'salad dressing', 'hamburger meat', 'white wine', 'organic sausage', 'salt', 'yogurt', 'brandy', 'cake bar', 'chocolate marshmallow', 'butter', 'snack products', 'baby cosmetics', 'frozen fruits', 'curd', 'pickled vegetables', 'prosecco', 'ready soups', 'sliced cheese', 'honey', 'artif. sweetener', 'hair spray', 'candy', 'napkins', 'soups', 'flower (seeds)', 'dishes', 'pork', 'cooking chocolate', 'rum', 'ham', 'dessert', 'potato products', 'onions', 'berries', 'cat food', 'chocolate', 'kitchen towels', 'newspapers', 'whole milk', 'sparkling wine', 'light bulbs', 'grapes', 'pudding powder', 'nuts/prunes', 'packaged fruit/vegetables', 'Instant food products', 'frozen potato products', 'specialty bar', 'meat spreads', 'cereals', 'oil', 'sausage', 'photo/film', 'frankfurter', 'insta

  


In [4]:
def apyori(df, minimum_support=0.1, confidence=0.22):
    df_values = df.values.astype(str)
    index, counts = np.unique(df_values,return_counts=True)
    df_item = pd.DataFrame(zip(index, counts), columns = ['produto', 'frequencia'])
    df_item.drop(df_item[(df_item['produto'] == 'nan' )|(df_item['produto'] == 'None' )].index, inplace=True)
    df_item.sort_values(by='frequencia', ascending=False, inplace=True)
    df_item.reset_index(drop=True, inplace=True)
    df_item_frequent = df_item[df_item['frequencia']>= minimum_support*len(df)]
    df_itemset_frequencia = pd.DataFrame(columns=['itemset', 'frequencia'])
    for i in range(1, len(df_item_frequent)+1):
        comb = list(combinations(df_item_frequent['produto'].values, i) )
        for w in comb:
            count = 0 
            for instancia in df_values:
                if all(elem in instancia  for elem in w):
                    count = count +1
            if count >= (minimum_support*len(df)/2):#tirar /2
                df_itemset_frequencia = df_itemset_frequencia.append({'itemset':w, 'frequencia':count}, ignore_index=True)
    df_itemset_frequencia.sort_values(by='frequencia', inplace=True, ascending=False)
    confiabilidade = pd.DataFrame(columns=['regras', 'frequencia', 'confiabilidade'])
    for w in df_itemset_frequencia['itemset'].values:
        w_p = list(permutations(w,len(w)))
        for j in w_p:
            #print (len(j[0]))

            p_uniao = []
            for i in range(len(j)):

                count = 0 
                for instancia in df_values:
                    if all(elem in instancia  for elem in j[i:]):
                        count = count +1
                p_uniao.append(count/len(df))

            if len(j) != 1:
                a = p_uniao[-2]/p_uniao[-1]

                for i in range(len(p_uniao)-2):
                    a = p_uniao[-i-3]/a
                j = list(j)
                j.reverse()
                confiabilidade = confiabilidade.append({'regras':j, 'frequency':p_uniao[0], 'confidence':a}, ignore_index=True)
            else:
                confiabilidade = confiabilidade.append({'regras':j, 'frequency':p_uniao[0], 'confidence':p_uniao[0]}, ignore_index=True)
    confiabilidade.sort_values(by='frequency', ascending=False)
    return confiabilidade[confiabilidade['confidence']>=confidence]


apyori(df.drop(columns='Item(s)'))


TypeError: data argument can't be an iterator

## Tests

In [None]:
#Dataframe copy
tmp = json_df[0:5].copy()
print(tmp.head())