In [1]:
import csv
import nltk
import re
import string
from collections import defaultdict
from nltk.stem.porter import PorterStemmer

# Make sure the NLTK 'stopwords' corpus is available locally before it is
# loaded via nltk.corpus.stopwords further down in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/jh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# ID Mapping
# Lookup tables read from the mapping CSVs. csv.DictReader yields text, so
# both keys and values are strings. If an id occurs more than once, the
# last row wins.

# recipe_id -> new_recipe_id
with open('Mapping_recipe_id.csv', mode='r', encoding='utf-8') as fr:
    reader = csv.DictReader(fr)
    ridToIdx = {row['recipe_id']: row['new_recipe_id'] for row in reader}

# user_id -> new_user_id
with open('Mapping_user_id.csv', mode='r', encoding='utf-8') as fr:
    reader = csv.DictReader(fr)
    uidToIdx = {row['user_id']: row['new_user_id'] for row in reader}

In [9]:
# Shared resources for text normalization.
punctuation = {ch for ch in string.punctuation}  # characters replaced by spaces
stop_words = {w for w in nltk.corpus.stopwords.words('english')}  # tokens to drop
stemmer = PorterStemmer()  # reduces the remaining tokens to their stems

In [11]:
# Text Preprocessing
def preprocess_text(text):
    """Normalize free text into a space-separated string of stemmed tokens.

    Each punctuation character is replaced by a space and every other
    character is lower-cased; the result is split on whitespace. Tokens that
    are stop words or contain a digit are discarded, and the remaining tokens
    are stemmed with the module-level ``stemmer``.

    Args:
        text: The raw text, or ``None``. ``csv.DictReader`` yields ``None``
            for fields missing from a short row, so ``None`` is treated the
            same as an empty string instead of raising ``TypeError``.

    Returns:
        The surviving stems joined by single spaces, or ``''`` when the input
        is empty or ``None``.
    """
    # Nothing to tokenize; also guards against None from incomplete CSV rows.
    if not text:
        return ''

    # Lower-case per character and blank out punctuation so that contractions
    # and hyphenated words are split into separate tokens.
    normalized = ''.join(
        ' ' if c in punctuation else c.lower()
        for c in text
    )

    stems = [
        stemmer.stem(word)
        for word in normalized.split()
        # Skip stop words and any token containing a digit (quantities, ids, ...).
        if word not in stop_words and not re.search(r'\d', word)
    ]
    return ' '.join(stems)

In [13]:
# Process RAW_recipes.csv
# Copy every row to RAW_recipes_textProc.csv with the same columns, replacing
# the 'description' and 'steps' fields by their preprocessed text.
with open("RAW_recipes.csv", mode='r', encoding='utf-8') as fr:
    with open('RAW_recipes_textProc.csv', mode='w', encoding='utf-8', newline='') as fw:
        reader = csv.DictReader(fr)
        # Reuse the input header so the output keeps the same column layout.
        writer = csv.DictWriter(fw, fieldnames=reader.fieldnames)
        writer.writeheader()
        for row in reader:
            for column in ('description', 'steps'):
                row[column] = preprocess_text(row[column])
            writer.writerow(row)

In [17]:
# Process RAW_interactions.csv
# Copy every row to RAW_interactions_textProc.csv with the same columns:
# user and recipe ids are translated through the mapping tables and the
# 'review' field is replaced by its preprocessed text.
with open("RAW_interactions.csv", mode='r', encoding='utf-8') as fr:
    with open('RAW_interactions_textProc.csv', mode='w', encoding='utf-8', newline='') as fw:
        reader = csv.DictReader(fr)
        # Reuse the input header so the output keeps the same column layout.
        writer = csv.DictWriter(fw, fieldnames=reader.fieldnames)
        writer.writeheader()
        for row in reader:
            # An id absent from the mapping tables raises KeyError here.
            row.update(
                user_id=uidToIdx[row['user_id']],
                recipe_id=ridToIdx[row['recipe_id']],
                review=preprocess_text(row['review']),
            )
            writer.writerow(row)

In [18]:
# Stem Word Mapping for Reviews
# stemWords maps each stem to [word_id, frequency]. Ids are handed out in
# order of first appearance, starting at 0; wIdx is the next free id.
stemWords = {}
wIdx = 0
with open('RAW_interactions_textProc.csv', mode='r', encoding='utf-8') as fr:
    reader = csv.DictReader(fr)
    for row in reader:
        for word in row['review'].split():
            entry = stemWords.get(word)
            if entry is None:
                # First occurrence: assign the next id and start counting.
                stemWords[word] = [wIdx, 1]
                wIdx += 1
            else:
                entry[1] += 1

In [23]:
# Sort and Save Stem Word Statistics - Interactions
# Order the (word, [word_id, frequency]) pairs by descending frequency. The
# sort is stable, so stems with equal counts keep the dictionary's order.
sorted_stemWords = list(stemWords.items())
sorted_stemWords.sort(key=lambda item: item[1][1], reverse=True)

fieldnames = ['word', 'word_id', 'frequency']
with open('Stat_interaction_stemword.csv', mode='w', encoding='utf-8', newline='') as fw:
    writer = csv.DictWriter(fw, fieldnames=fieldnames)
    writer.writeheader()
    # One output row per stem: the stem, its id and its occurrence count.
    writer.writerows(
        {'word': word, 'word_id': word_id, 'frequency': frequency}
        for word, (word_id, frequency) in sorted_stemWords
    )