In [69]:
import gzip
import math
import numpy
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from gensim.models import Word2Vec
import dateutil
from scipy.sparse import lil_matrix

In [71]:
import csv
import nltk
# Download the NLTK 'stopwords' package; the stop-word list is read through
# nltk.corpus.stopwords further down in this notebook.
nltk.download('stopwords')
import re
# NOTE: the name `punctuation` is rebound later in this notebook to
# set(string.punctuation), which is what the text-processing code uses.
from string import punctuation

[nltk_data] Downloading package stopwords to /Users/jh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [105]:
# Load the recipe_id -> new_recipe_id mapping from Mapping_recipe_id.csv.
# Keys and values are kept exactly as read from the CSV (strings). When a
# recipe_id appears more than once, the first occurrence wins.
ridToIdx = {}
with open('Mapping_recipe_id.csv', mode='r', encoding='utf-8') as fr:
    for record in csv.DictReader(fr):
        ridToIdx.setdefault(record['recipe_id'], record['new_recipe_id'])

In [107]:
# Load the user_id -> new_user_id mapping from Mapping_user_id.csv.
# Values are the CSV strings, not ints. First occurrence of a user_id wins.
# NOTE: a later cell in this notebook resets `uidToIdx` and rebuilds it from
# RAW_interactions.csv, and Mapping_user_id.csv itself is written by a later
# cell, so this load only succeeds once that file already exists.
uidToIdx = {}
with open('Mapping_user_id.csv', mode='r', encoding='utf-8') as fr:
    for record in csv.DictReader(fr):
        uid = record['user_id']
        new_uid = record['new_user_id']
        if uid in uidToIdx:
            continue
        uidToIdx[uid] = new_uid

In [109]:
# Load the word table from Mapping_stemword.csv as
#   word -> [word_id, frequency]
# Both list entries are the raw CSV strings (no int conversion is done here).
# First occurrence of a word wins.
stemWords = {}
with open('Mapping_stemword.csv', mode='r', encoding='utf-8') as fr:
    for record in csv.DictReader(fr):
        word = record['word']
        entry = [record['word_id'], record['frequency']]
        if word in stemWords:
            continue
        stemWords[word] = entry

In [75]:
# Shared state for the text-processing cells below.
stemmer = PorterStemmer()                                  # Porter stemmer instance
punctuation = set(string.punctuation)                      # set for fast membership tests
stop_words = set(nltk.corpus.stopwords.words('english'))   # English stop words from NLTK
wordCount = defaultdict(int)                               # word -> count, starts empty

In [77]:
# Define Text-Preprocessing functions
def preprocess_text(text) -> str:
    """Normalize free text into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. every punctuation character is replaced by a space; every other
         character is lowercased,
      2. the result is split on whitespace,
      3. stop words and tokens containing a digit are dropped,
      4. the remaining tokens are stemmed with the module-level ``stemmer``.

    Args:
        text: Raw text. ``None`` or an empty string is accepted and yields
            an empty result.

    Returns:
        The stemmed tokens joined by single spaces ('' when nothing remains).
    """
    if not text:
        # csv.DictReader fills fields missing from a short row with None;
        # treat that the same as an empty field instead of raising TypeError.
        return ''
    lowered = ''.join(' ' if c in punctuation else c.lower() for c in text)
    kept = [
        stemmer.stem(word)
        for word in lowered.split()
        # drop stop words and any token that contains a digit
        if word not in stop_words and not re.search(r'\d', word)
    ]
    return ' '.join(kept)

In [79]:
# Copy RAW_recipes.csv to RAW_recipes_textProc.csv, replacing the
# 'description' and 'steps' columns with their preprocess_text() output.
# Every other column is written back as read.
# The input is opened with newline='' (as the csv module documentation
# requires) so that line breaks embedded in quoted fields reach the csv
# reader untranslated and the pass-through columns are preserved.
with open("RAW_recipes.csv", mode='r', encoding='utf-8', newline='') as fr, \
    open('RAW_recipes_textProc.csv', mode='w', encoding='utf-8', newline='') as fw:
    reader = csv.DictReader(fr)
    writer = csv.DictWriter(fw, fieldnames=reader.fieldnames)
    writer.writeheader()

    for row in reader:
        for column in ('description', 'steps'):
            row[column] = preprocess_text(row[column])
        writer.writerow(row)

In [81]:
# Single pass over RAW_interactions.csv that builds two index maps:
#   uidToIdx   : user_id -> consecutive int, in order of first appearance
#   vocabulary : word    -> consecutive int, in order of first appearance
# Review text is lowercased and punctuation characters are removed (not
# replaced by spaces) before splitting; stop words are skipped and tokens
# are not stemmed.
vocabulary = {}  # Will map words to indices
index = 0
uidToIdx = {}
uIdx = 0

with open("RAW_interactions.csv", mode='r', encoding='utf-8') as fr:
    for row in csv.DictReader(fr):
        user = row['user_id']
        if user not in uidToIdx:
            uidToIdx[user] = uIdx
            uIdx += 1

        cleaned = ''.join(ch for ch in row['review'].lower() if ch not in punctuation)
        for token in cleaned.split():
            if token in stop_words or token in vocabulary:
                continue
            vocabulary[token] = index
            index += 1

In [62]:
# Number of distinct words currently held in `vocabulary`.
len(vocabulary)

329651

In [52]:
# Create word count vector using the dictionary
def create_count_vector(datum):
    """Return a word-count vector for one review.

    The review text in ``datum['review']`` is lowercased, stripped of
    punctuation characters and split on whitespace. Tokens are not stemmed.
    Each token that is not a stop word and is present in the module-level
    ``vocabulary`` increments the slot at its vocabulary index.

    Args:
        datum: Mapping with a 'review' string.

    Returns:
        list[int] of length ``len(vocabulary)``.
    """
    counts = [0] * len(vocabulary)
    cleaned = ''.join(ch for ch in datum['review'].lower() if ch not in punctuation)
    for token in cleaned.split():
        if token in stop_words:
            continue
        if token in vocabulary:
            counts[vocabulary[token]] += 1
    return counts

In [85]:
# Rewrite RAW_interactions.csv as RAW_interactions_textProc.csv:
#   user_id   -> its index from uidToIdx
#   recipe_id -> its index from ridToIdx
#   review    -> preprocess_text(review)
# The output keeps exactly the input's columns; all other columns are written
# back as read. A user_id / recipe_id missing from its mapping raises KeyError.
# The input is opened with newline='' (as the csv module documentation
# requires) so that line breaks embedded in quoted fields reach the csv
# reader untranslated.
with open("RAW_interactions.csv", mode='r', encoding='utf-8', newline='') as fr, \
    open('RAW_interactions_textProc.csv', mode='w', encoding='utf-8', newline='') as fw:
    reader = csv.DictReader(fr)
    fieldnames = reader.fieldnames
    writer = csv.DictWriter(fw, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        row['user_id'] = uidToIdx[row['user_id']]
        row['recipe_id'] = ridToIdx[row['recipe_id']]
        row['review'] = preprocess_text(row['review'])
        writer.writerow(row)

In [101]:
# Persist uidToIdx to Mapping_user_id.csv, one (user_id, new_user_id) row per
# entry, in the dictionary's iteration order.
with open('Mapping_user_id.csv', mode='w', encoding='utf-8', newline='') as fw:
    fieldnames = ['user_id', 'new_user_id']
    writer = csv.DictWriter(fw, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(
        {'user_id': uid, 'new_user_id': new_uid}
        for uid, new_uid in uidToIdx.items()
    )

In [93]:
# Scan the 'review' column of RAW_interactions_textProc.csv and build
#   stemWords: token -> [token index, occurrence count]
# Indices are assigned consecutively in order of first appearance.
stemWords = {}
wIdx = 0
with open('RAW_interactions_textProc.csv', mode='r', encoding='utf-8') as fr:
    for record in csv.DictReader(fr):
        for token in record['review'].split():
            if token in stemWords:
                stemWords[token][1] += 1
                continue
            stemWords[token] = [wIdx, 1]
            wIdx += 1

In [103]:
# Persist stemWords to Mapping_stemword.csv with columns
# word, word_id, frequency (one row per word, in dictionary order).
with open('Mapping_stemword.csv', mode='w', encoding='utf-8', newline='') as fw:
    fieldnames = ['word', 'word_id', 'frequency']
    writer = csv.DictWriter(fw, fieldnames=fieldnames)
    writer.writeheader()
    for word, entry in stemWords.items():
        writer.writerow({
            'word': word,
            'word_id': entry[0],
            'frequency': entry[1],
        })

In [99]:
# Number of distinct words in `stemWords`. (The original referenced the
# undefined name `stemWord`, which raises NameError on a fresh kernel.)
len(stemWords)

94828