In [1]:
import os
import re
import unicodedata

import contractions
import numpy as np
import pandas as pd
from autocorrect import Speller
from bs4 import BeautifulSoup

In [2]:
# input CSV of reviews; its name (minus the extension) is also reused as the
# prefix of the train/dev/test files written at the end of the notebook
filename = "steam_sample.csv"

In [3]:
# load the reviews; header=None makes pandas treat the first line as data,
# so the column names are supplied explicitly
column_names = ["GameID", "Review", "Recommend", "FoundHelpful"]
df = pd.read_csv(filename, header=None, names=column_names)

# encode the label: Recommend == 1 -> Sentiment 1.0, Recommend == -1 -> Sentiment 0.0
# (rows with any other Recommend value are left as NaN)
is_recommended = df['Recommend'] == 1
is_not_recommended = df['Recommend'] == -1
df.loc[is_recommended, 'Sentiment'] = 1.0
df.loc[is_not_recommended, 'Sentiment'] = 0.0

# keep only the review text and its label
df = df.drop(columns=["GameID", "Recommend", "FoundHelpful"])

# preview the first rows
df.head()

Unnamed: 0,Review,Sentiment
0,Ruined my life.,1.0
1,This will be more of a ''my experience with th...,1.0
2,This game saved my virginity.,1.0
3,• Do you like original games? • Do you like ga...,1.0
4,"Easy to learn, hard to master.",1.0


In [4]:
# define preprocessing functions
def strip_html_tags(text):
    """Return ``text`` with HTML markup removed.

    ``<iframe>`` and ``<script>`` elements are dropped together with their
    contents; for all other tags only the enclosed text is kept. Every run of
    carriage returns / line feeds in the result is collapsed to one newline.

    Args:
        text: Raw document, possibly containing HTML.

    Returns:
        The plain-text content as a ``str``.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Detach embedded frames and scripts so their contents never reach get_text().
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # This is a character class, so list only the characters to match: a '|'
    # inside the brackets is a literal pipe, not alternation, and would cause
    # pipes in the text to be replaced by newlines.
    return re.sub(r'[\r\n]+', '\n', stripped_text)

def remove_accented_chars(text):
    """Reduce ``text`` to plain ASCII.

    The text is NFKD-normalised so accented letters split into a base letter
    plus combining marks; everything that cannot be encoded as ASCII
    (the marks, and any other non-ASCII character) is then discarded.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.
        remove_digits: When true, digits are deleted as well, leaving only
            ASCII letters and whitespace.

    Returns:
        The filtered string; removed characters are not replaced by anything.
    """
    if remove_digits:
        disallowed = r'[^a-zA-Z\s]'
    else:
        disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, '', text)

def correct_spelling(text):
    """Return ``text`` with English spelling corrected by ``autocorrect``.

    The ``Speller`` is created on the first call and then reused: building it
    loads the language's word data, which is far too costly to repeat for
    every document in a corpus.

    Args:
        text: Input string.

    Returns:
        The corrected string as produced by the ``Speller`` instance.
    """
    speller = getattr(correct_spelling, "_speller", None)
    if speller is None:
        speller = Speller(lang='en')
        # Cache on the function object so no extra module-level name is needed.
        correct_spelling._speller = speller
    return speller(text)
            
def pre_process_document(document):
    """Normalise one raw review into lower-case, letters-only text.

    Steps, in order:
      1. strip HTML markup,
      2. lower-case,
      3. turn newlines, tabs and carriage returns into spaces,
      4. drop accents and any other non-ASCII characters,
      5. expand contractions,
      6. pad ``{ } ( ) . - !`` with spaces so the words on either side stay
         separate once the punctuation is deleted,
      7. delete everything that is not an ASCII letter or whitespace
         (digits are removed too),
      8. collapse runs of spaces and trim both ends.

    Spelling correction (``correct_spelling``) is not applied.

    Args:
        document: Raw review text (``str``).

    Returns:
        The cleaned text; may be the empty string.
    """
    document = strip_html_tags(document)
    document = document.lower()
    # One space per control character keeps the words around a line break apart.
    document = document.translate(str.maketrans("\n\t\r", "   "))
    document = remove_accented_chars(document)
    document = expand_contractions(document)
    # The hyphen must be escaped: an unescaped '(-)' inside a character class
    # is the range '(' to ')', which leaves '-' unpadded and fuses hyphenated
    # words ("well-known" -> "wellknown") when punctuation is deleted below.
    document = re.sub(r'([{.()\-!}])', r' \1 ', document)
    document = remove_special_characters(document, remove_digits=True)
    document = re.sub(' +', ' ', document)
    return document.strip()

# Element-wise version of pre_process_document: accepts an array-like of
# documents and returns a NumPy array holding the cleaned text of each one.
# NOTE(review): np.vectorize is a convenience loop, not a speed-up, and without
# `otypes` it raises on empty input — confirm the corpus is never empty.
pre_process_corpus = np.vectorize(pre_process_document)

In [5]:
# a few reviews labelled as negative (Sentiment == 0.0)
is_negative = df["Sentiment"] == 0.0
df[is_negative].head()

Unnamed: 0,Review,Sentiment
3330,"You are 14 years old, first time trying to pla...",0.0
3725,Game full of cheaters nice valve.,0.0
4357,"Eh, its alright",0.0
4366,"Eh, its alright",0.0
4523,full of russians,0.0


In [6]:
# some rows have no review text at all
review_is_missing = df["Review"].isnull()
df[review_is_missing].head()

Unnamed: 0,Review,Sentiment
505,,1.0
539,,1.0
540,,1.0
588,,1.0
675,,1.0


In [7]:
# replace missing reviews with empty strings so the text cleaning can run on every row
review_is_missing = df["Review"].isnull()
df.loc[review_is_missing, 'Review'] = ""

In [8]:
# sanity check: rows without a sentiment label (expected to be empty)
sentiment_is_missing = df["Sentiment"].isnull()
df[sentiment_is_missing]

Unnamed: 0,Review,Sentiment


In [9]:
# clean every review: strip HTML, drop accented/special characters, normalise whitespace
cleaned_reviews = pre_process_corpus(df["Review"])
df.loc[:, 'Review'] = cleaned_reviews

  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


In [10]:
# split into train (~60%), dev (~20%), and test (~20%)
# a fixed seed makes the partition identical on every Restart & Run All;
# without it each run silently produces different train/dev/test files
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)
# one uniform draw in [0, 1) per row decides which split the row belongs to
rand = rng.random(len(df))
train_df = df[rand < 0.6]
dev_df = df[(rand >= 0.6) & (rand < 0.8)]
test_df = df[rand >= 0.8]
# every row must land in exactly one split
assert len(train_df) + len(dev_df) + len(test_df) == len(df)
print("Number of train observations: {}".format(len(train_df)))
print("Number of dev observations: {}".format(len(dev_df)))
print("Number of test observations: {}".format(len(test_df)))

Number of train observations: 59835
Number of dev observations: 20038
Number of test observations: 20127


In [12]:
# write out datasets as <input name without extension>_<split>.csv
# splitext removes only the final extension, so a dot elsewhere in the path
# (e.g. "../data/reviews.csv" or "reviews.v2.csv") does not truncate the name
output_stem = os.path.splitext(filename)[0]
for split_name, split_df in (("train", train_df), ("dev", dev_df), ("test", test_df)):
    split_df.to_csv("{}_{}.csv".format(output_stem, split_name), index=False)