# Preprocess and Split
This notebook takes the original Steam reviews dataset, samples 100,000 entries, preprocesses the review text of each entry, splits the entries into train, dev, and test subsets, and writes each subset to a CSV file.

In [1]:
import os
import re
import unicodedata

import contractions
import numpy as np
import pandas as pd
import pkg_resources
from autocorrect import Speller
from bs4 import BeautifulSoup
from symspellpy import SymSpell, Verbosity

Change the following values to set the input filename and the label used in the output filenames; the label should reflect the level of preprocessing applied.

In [24]:
# input CSV to read; the part before the first "." also prefixes the output file names
filename = "steam.csv"
# label inserted into the output file names to identify this preprocessing variant
file_set = "processed_with_spelling"

In [3]:
# read in data: the file is parsed without a header row, so the column names are
# supplied here; a fixed random_state keeps the 100000-row sample repeatable
column_names = ["GameID", "Review", "Recommend", "FoundHelpful"]
raw_df = (
    pd.read_csv(filename, header=None, names=column_names)
    .sample(n=100000, random_state=1)
)

In [25]:
# build the working frame from raw_df (raw_df itself is left untouched):
# positive sentiment (Recommend == 1) becomes 1.0, negative (Recommend == -1) becomes 0.0,
# and only the review text and the sentiment label are kept
df = (
    raw_df
    .assign(Sentiment=raw_df["Recommend"].map({1: 1.0, -1: 0.0}))
    .drop(columns=["GameID", "Recommend", "FoundHelpful"])
)

# show a few examples
df.head()

Unnamed: 0,Review,Sentiment
5907579,Darksiders is similar in concept to Castlevani...,1.0
257903,An amzing game but your probably better off bu...,1.0
6175973,Great game in which you can also shoot some co...,1.0
6369607,"I acquired RAGE on a sale, and for the price I...",1.0
5064658,Because this is not ripping off agario,0.0


The following cells load the spelling autocorrection resources, define preprocessing functions, and actually clean up the reviews data.

In [26]:
# load symspellpy resources
# this code is based on the symspellpy example here: https://symspellpy.readthedocs.io/en/latest/examples/lookup_compound.html
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
# term_index is the column of the term and count_index is the
# column of the term frequency
# the loaders report a missing file by returning False rather than raising, so check
# the result: an empty dictionary would otherwise silently corrupt every correction
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(
        "symspellpy dictionary not found: {}".format(dictionary_path))
if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2):
    raise FileNotFoundError(
        "symspellpy bigram dictionary not found: {}".format(bigram_path))

# define preprocessing functions
# preprocessing steps are based on functions defined here: https://towardsdatascience.com/deep-transfer-learning-for-natural-language-processing-text-classification-with-universal-1a2c69e5baa9
def strip_html_tags(text):
    """Return ``text`` with HTML markup removed.

    ``iframe`` and ``script`` elements are dropped together with their contents,
    the remaining markup is reduced to its text, and every run of carriage
    returns / line feeds is collapsed into a single newline.
    """
    soup = BeautifulSoup(text, "html.parser")
    # remove embedded frames and scripts entirely, not just their tags
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # only line-break characters belong in this class; the earlier form
    # '[\r|\n|\r\n]' also matched a literal '|' and turned pipes into newlines
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII.

    The text is NFKD-normalised so accented letters split into a base letter
    plus combining marks; anything that still cannot be encoded as ASCII is
    then discarded.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded_text = contractions.fix(text)
    return expanded_text

def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    With ``remove_digits=True`` the digits are deleted as well. Characters are
    removed outright; nothing is inserted in their place.
    """
    if remove_digits:
        disallowed = r'[^a-zA-Z\s]'
    else:
        disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, '', text)

def correct_spelling(text):
    """Return ``text`` with its spelling corrected by symspellpy.

    Uses the module-level ``sym_spell`` instance and its compound lookup with a
    maximum edit distance of 2, and returns the first suggestion's term.
    """
    if not text:
        # nothing to correct; skip the lookup
        return text
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2,
                                            transfer_casing=True)
    # read the corrected phrase from the suggestion itself rather than slicing the
    # "term, distance, count" text produced by str(suggestion)
    return suggestions[0].term
            
def pre_process_document(document):
    """Clean a single review and return the normalised text.

    Steps, in order: strip HTML, lower-case, turn newlines/tabs into spaces,
    remove accents, expand contractions, pad selected punctuation with spaces,
    delete everything that is not an ASCII letter or whitespace (digits
    included), collapse repeated spaces, trim, and correct spelling.
    """
    # strip HTML
    document = strip_html_tags(document)
    # lower case
    document = document.lower()
    # turn newlines, tabs and carriage returns into plain spaces
    document = document.translate(str.maketrans("\n\t\r", "   "))
    # remove accented characters
    document = remove_accented_chars(document)
    # expand contractions
    document = expand_contractions(document)
    # pad these punctuation marks with spaces so the words on either side stay
    # separate once the punctuation itself is deleted below.
    # '-' is escaped on purpose: written as '(-)' inside a character class it is
    # the range '(' to ')', so hyphens were never matched and hyphenated words
    # were glued together (e.g. "well-known" became "wellknown")
    special_char_pattern = re.compile(r'([{.()\-!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    # remove special characters and digits
    document = remove_special_characters(document, remove_digits=True)
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    # correct spelling
    document = correct_spelling(document)

    return document

# element-wise wrapper so pre_process_document can be applied to a whole
# array/Series of reviews in one call
pre_process_corpus = np.vectorize(pre_process_document)

In [27]:
# a few examples of negative reviews (Sentiment == 0.0)
is_negative = df["Sentiment"] == 0.0
df[is_negative].head()

Unnamed: 0,Review,Sentiment
5064658,Because this is not ripping off agario,0.0
6067328,"this game sucks, man. I dont know why people l...",0.0
2553403,I don't know what is worse the developers of t...,0.0
5580024,Dropped this game. I had no trouble getting th...,0.0
4503034,Early Access Review,0.0


In [28]:
# some rows have a missing (null) review
review_missing = df["Review"].isna()
df[review_missing].head()

Unnamed: 0,Review,Sentiment
3631319,,0.0
4039112,,0.0
2417963,,1.0
2276278,,1.0
5907045,,1.0


In [29]:
# replace null reviews with empty strings
# leaving them null causes issues with the models
df["Review"] = df["Review"].fillna("")

In [30]:
# check for missing sentiment labels: the result is expected to be empty
sentiment_missing = df["Sentiment"].isna()
df[sentiment_missing]

Unnamed: 0,Review,Sentiment


In [31]:
# run the full per-review cleaning (pre_process_document) over every review
cleaned_reviews = pre_process_corpus(df["Review"])
df.loc[:, 'Review'] = cleaned_reviews

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


The following cells split the data into train, dev, and test subsets and save each subset to a file.

In [32]:
# split into test, dev, and train (roughly 60% / 20% / 20% of the rows)
# each row gets a uniform random number; a seeded generator is used so that
# re-running the notebook reproduces the same split, like the seeded sample
split_rng = np.random.RandomState(1)
rand = split_rng.rand(len(df))
train_df = df[rand < 0.6]
dev_df = df[(rand >= 0.6) & (rand < 0.8)]
test_df = df[rand >= 0.8]
for subset_name, subset_df in (("train", train_df), ("dev", dev_df), ("test", test_df)):
    print("Number of {} observations: {}".format(subset_name, len(subset_df)))

Number of train observations: 59881
Number of dev observations: 20089
Number of test observations: 20030


In [33]:
# write out datasets as <input base name>_<file_set>_<subset>.csv
# os.path.splitext strips only the final extension, so a path containing other
# dots (e.g. "./data/steam.csv") keeps its full base name
output_prefix = os.path.splitext(filename)[0] + "_" + file_set
for subset_name, subset_df in (("train", train_df), ("dev", dev_df), ("test", test_df)):
    subset_df.to_csv("{}_{}.csv".format(output_prefix, subset_name), index=False)