In [85]:
'''
import libraries and set print options
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import csv
# Let pandas show up to 1000 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 1000)

In [86]:
## Read the product csv once: `full_data` stays untouched, `data` is the working copy
full_data = pd.read_csv('SR_Legacy_products.csv')
data = full_data.copy()
data.head()  ## preview the first 5 rows

Unnamed: 0,NBDID,full_product_name
0,1001,"Butter, salted"
1,1002,"Butter, whipped, with salt"
2,1003,"Butter oil, anhydrous"
3,1004,"Cheese, blue"
4,1005,"Cheese, brick"


## RULES:
### - First word is the main product
### - Second word is a distinguishing descriptor
### - If 3 words total, don’t need 3rd word
### - If 4+ words, 3rd word is helpful

# Data Cleaning - Process Is Commented

In [87]:
## Columns that are not needed for the title cleaning below
DROP_COLS = [
    'NBDID',
]
## Build the working frame from the untouched `full_data` copy instead of from
## `data` itself: the cell can then be re-run without failing on a column that
## has already been dropped. Re-running resets `data` to this post-drop state.
data = full_data.drop(columns=DROP_COLS)
data.head()

Unnamed: 0,full_product_name
0,"Butter, salted"
1,"Butter, whipped, with salt"
2,"Butter oil, anhydrous"
3,"Cheese, blue"
4,"Cheese, brick"


In [88]:
'''
Find the number of commas in each title
(one fewer than the number of comma-separated parts)
'''
## Vectorised string count; gives the same value as len(title.split(",")) - 1
data['num_commas'] = data['full_product_name'].str.count(',')
## Optional filter on very long titles, currently disabled:
#data = data[data['num_commas'] < 4]

In [89]:
data.shape[0]  ## number of rows in the working frame

7793

In [103]:
def remove_extra_words(x):
    '''
    Build the "long" clean title from a comma-separated product name.

    The name is split on commas, each part is stripped of surrounding
    whitespace, and the leading parts are re-joined in reverse order:

    - 1 part:       the part itself
    - 2 or 3 parts: "<2nd> <1st>", or only "<1st>" when the 2nd part
                    contains the word "raw"
    - 4+ parts:     "<3rd> <2nd> <1st>" (no "raw" check is applied here)

    "raw" is matched as a whole whitespace-delimited, case-sensitive word,
    so a descriptor such as "strawberry" is kept rather than being treated
    as "raw" and thrown away.

    x: full product name (str), e.g. "Butter, whipped, with salt"
    returns: the cleaned title (str), e.g. "whipped Butter"
    '''
    parts = [part.strip() for part in x.split(",")]
    if len(parts) >= 4:
        return parts[2] + ' ' + parts[1] + ' ' + parts[0]
    if len(parts) >= 2:
        # Word-level test: a plain substring test would also hit "strawberry".
        if 'raw' in parts[1].split():
            return parts[0]
        return parts[1] + ' ' + parts[0]
    return parts[0]

In [104]:
def two_words_only(x):
    '''
    Build the "short" clean title from a comma-separated product name.

    Only the first two comma-separated parts are used; everything after the
    second part is ignored. The result is "<2nd> <1st>", or only "<1st>"
    when there is a single part or when the 2nd part contains the word "raw".

    "raw" is matched as a whole whitespace-delimited, case-sensitive word,
    so a descriptor such as "strawberry" is kept rather than being treated
    as "raw" and thrown away.

    x: full product name (str), e.g. "Fish, mackerel, salted"
    returns: the cleaned title (str), e.g. "mackerel Fish"
    '''
    parts = x.split(",")
    main_product = parts[0].strip()
    if len(parts) < 2:
        return main_product
    descriptor = parts[1].strip()
    # Word-level test: a plain substring test would also hit "strawberry".
    if 'raw' in descriptor.split():
        return main_product
    return descriptor + ' ' + main_product

In [105]:
'''
Add the long clean title (built by remove_extra_words) as a new column
'''
## Map the cleaner over the one column it needs instead of applying a lambda row by row
data['clean_long_title'] = data['full_product_name'].map(remove_extra_words)

In [106]:
'''
Add the short clean title (built by two_words_only) as a new column
'''
## Map the cleaner over the one column it needs instead of applying a lambda row by row
data['clean_short_title'] = data['full_product_name'].map(two_words_only)

In [107]:
data.tail(5)  ## show the last 5 rows, including the new title columns

Unnamed: 0,full_product_name,num_commas,clean_long_title,clean_short_title
7788,"Fish, mackerel, salted",2,mackerel Fish,mackerel Fish
7789,"Mollusks, scallop, (bay and sea), cooked, steamed",4,(bay and sea) scallop Mollusks,scallop Mollusks
7790,"Syrup, Cane",1,Cane Syrup,Cane Syrup
7791,"Mollusks, snail, raw",2,snail Mollusks,snail Mollusks
7792,"Turtle, green, raw",2,green Turtle,green Turtle


In [108]:
'''
Turn the dataframe's long clean titles into a list of distinct values
and drop the empty-string case
'''
## dict.fromkeys de-duplicates while keeping first-appearance order, so the list
## (and the json written from it) is the same on every run, unlike set() ordering
clean_long_titles = [title for title in dict.fromkeys(data.clean_long_title) if title != ""]

In [109]:
'''
Turn the dataframe's short clean titles into a list of distinct values
and drop the empty-string case
'''
## dict.fromkeys de-duplicates while keeping first-appearance order, so the list
## (and the json written from it) is the same on every run, unlike set() ordering
clean_short_titles = [title for title in dict.fromkeys(data.clean_short_title) if title != ""]

In [110]:
'''
Save the lists as json files: long titles and short titles each go to their own file
'''
## Each file gets its own list; the previous name `clean_titles` is not defined
## anywhere in this notebook and would have put the same data in both files.
with open('cleaned_long_SR_Legacy_data.json', 'w') as outfile:
    json.dump(clean_long_titles, outfile)

with open('cleaned_short_SR_Legacy_data.json', 'w') as outfile:
    json.dump(clean_short_titles, outfile)

In [111]:
'''
Put clean data in csv so we can visualize it
(one row per product: long clean title, short clean title)
'''
csvfile = 'cleaned_SR_Legacy_data.csv'
## newline='' is what the csv module asks for on files it writes; without it the
## writer's own line endings can be translated again (blank rows on Windows).
with open(csvfile, "w", newline='') as fp:
    wr = csv.writer(fp, dialect='excel')
    wr.writerow(['long clean title', 'short clean title'])
    ## Pair the two columns by position rather than looking rows up by label
    ## 0..n-1, which would fail if `data` were ever filtered or re-indexed.
    wr.writerows(zip(data.clean_long_title, data.clean_short_title))