# Data Processing

We will now keep only the columns that are relevant to us, take a random sample of 1 million rows, and save the result to a CSV file.

In [2]:
import pandas as pd
import random

# load only the three review columns we need; skipping the other columns at
# parse time avoids holding the whole file in memory just to discard most of it
df = pd.read_csv(
    'data/Books_rating.csv',
    usecols=['review/summary', 'review/text', 'review/score'],
)

# usecols keeps the file's own column order, so select explicitly to pin the
# order that the rename below relies on
df = df[['review/summary', 'review/text', 'review/score']]

# rename columns
df.columns = ['summary', 'text', 'score']

# choose up to 1 million random rows (seeded, so the sample is reproducible);
# the size is clamped because sample() raises ValueError when n exceeds the
# number of available rows
df = df.sample(n=min(1000000, len(df)), random_state=1)

# save to new csv file
df.to_csv('data/Books_rating_relevant_columns.csv', index=False)

# print first 5 rows
df.head()

Unnamed: 0,summary,text,score
2896109,Best edition of this classic.,I've always recommended this Yale edition of F...,5.0
2381153,Great Book!!,This is required reading for my 16 yr old son....,5.0
1028690,Not just a book for consultant,"Plain-spoken, finished the book only has taken...",4.0
1945977,Outrageously Bad,Wow... this is one of the most ridiculous stor...,1.0
2812693,Cunning and determination,A crew has mutinied and threatens to hang thei...,4.0


Process the text to remove stop words, lemmatize, strip unneeded characters, then vectorize.

In [7]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk
from tqdm import tqdm

# shared helpers, built once at module level and read by stemmer() on every call
# NOTE: this rebinds the imported `stopwords` corpus reader name to a plain set
stopwords = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

# enable the progress_apply methods used on the pandas objects below
tqdm.pandas()

def stemmer(text):
  """Normalize one piece of review text into a space-joined string of lemmas.

  Despite its name, this function lemmatizes (via the module-level
  WordNetLemmatizer instance) rather than stems. Steps: replace every
  non-alphanumeric character with a space, lowercase, tokenize, drop the
  tokens found in the module-level `stopwords` set, lemmatize the rest and
  join them with single spaces.

  Args:
    text: the raw text. None and NaN are treated as missing; any other
      non-string value is converted with str() before processing.

  Returns:
    The cleaned text as a str, or '' when the input is missing.
  """
  if not isinstance(text, str):
    # NaN is the only value that compares unequal to itself
    if text is None or text != text:
      return ''
    # e.g. a numeric cell; re.sub below only accepts strings
    text = str(text)

  # replace non-alphanumeric characters with space
  text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

  # collapse runs of spaces
  text = re.sub(' +', ' ', text)

  # lowercase
  text = text.lower()

  # tokenize, drop stopwords, then lemmatize what is left
  tokens = [
    lemmatizer.lemmatize(w)
    for w in word_tokenize(text)
    if w not in stopwords
  ]

  # join tokens
  return ' '.join(tokens)

# load the sampled reviews written by the previous cell
df = pd.read_csv('data/Books_rating_relevant_columns.csv')

# normalize review body and summary separately (stemmer returns '' for NaN)
stemmed_text = df['text'].progress_apply(stemmer)
stemmed_summary = df['summary'].progress_apply(stemmer)

# Build the final frame as one chain that returns a new object at each step,
# instead of mutating a filtered slice in place:
#   - join summary and body, then strip the stray separator space that is
#     left when either side is empty (vectorized, no per-row Python call)
#   - replace any remaining NaN with empty string
#   - remove rows whose combined text is empty
#   - drop the raw columns that are no longer needed
df = (
    df
    .assign(stemmed_summary_text=(stemmed_summary + ' ' + stemmed_text).str.strip())
    .fillna('')
    .loc[lambda frame: frame['stemmed_summary_text'] != '']
    .drop(columns=['text', 'summary'])
)

# save to new csv file
df.to_csv('data/Books_rating_stemmed.csv', index=False)

# print first 5 rows
df.head()

KeyboardInterrupt: 