In [None]:
%%capture
!pip install kaggle
!pip install unidecode
!pip install contractions

In [None]:
%%capture
import pandas as pd
from google.colab import files
from IPython.display import display
from IPython.display import display_html
import nltk
from nltk import wordnet
from nltk.stem.lancaster import LancasterStemmer
from nltk import punkt
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk import casual_tokenize
from nltk import word_tokenize
from nltk import WordNetLemmatizer
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from unidecode import unidecode
import string
from contractions import contractions_dict
from google.colab import drive
import os


In [None]:
#Uploading .json file associated with my Kaggle account
# files.upload() returns the uploaded files' contents; binding the result to `_`
# keeps the API token from being echoed into the saved cell output.
_ = files.upload()

In [None]:
#Kaggle key needed to use their datasets
# Copy the uploaded kaggle.json into ~/.kaggle (created if it does not exist yet).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600: only the owning user may read or write the key file.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d andrewmvd/steam-reviews
!unzip steam-reviews.zip

Downloading steam-reviews.zip to /content
100% 683M/685M [00:08<00:00, 144MB/s]
100% 685M/685M [00:08<00:00, 86.4MB/s]
Archive:  steam-reviews.zip
  inflating: dataset.csv             


In [None]:
# Read the extracted CSV into memory and preview its first rows.
csv_path = 'dataset.csv'
steam_reviews = pd.read_csv(csv_path)

steam_reviews.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1


In [None]:
# Use shorter column names for the label and the review body, then renumber rows from 0.
column_aliases = {'review_score': 'score', 'review_text': 'text'}
steam_reviews = (
    steam_reviews
    .rename(columns=column_aliases)
    .reset_index(drop=True)
)

In [None]:
# Class balance of the raw dataset: absolute count and share of every score value.
# The index is named explicitly before reset_index() because the column that
# value_counts().reset_index() produces is called 'index' on pandas 1.x but carries
# the Series name on pandas 2.x, where merging on 'index' raises a KeyError.
score_counts = steam_reviews['score'].value_counts()
vc = score_counts.rename_axis('score').reset_index(name='count')
vcn = (score_counts / score_counts.sum()).rename_axis('score').reset_index(name='proportion')
rowsold = len(steam_reviews.index)

review_score_statsold = pd.merge(vc, vcn, on='score')

display(review_score_statsold.style.set_caption('Current dataset balance\n'))
print(f'Total number of reviews: {rowsold}\n')


Unnamed: 0,score,count,proportion
0,1,5260420,0.81975
1,-1,1156686,0.18025


Total number of reviews: 6417106



Since we have such a big dataset, sampling an equal number of positive and negative reviews after simple filtering operations could lead to better results than trying to balance the dataset with SMOTE before or after sampling.

In [None]:
#Removing empty instances: first count the missing values in every column

# new_df is a second name for the same frame as steam_reviews (no copy is made).
new_df = steam_reviews
new_df.isna().sum()

app_id               0
app_name        183234
text              7305
score                0
review_votes         0
dtype: int64

In [None]:
# Look at a few random reviews whose game name is missing.
missing_name_mask = new_df['app_name'].isna()
new_df.loc[missing_name_mask].sample(5)

Unnamed: 0,app_id,app_name,text,score,review_votes
1623025,224600,,Can't Stop... Won't Stop..... This game has co...,1,0
6411019,9930,,Really bad game. DONT WASTE YOU MONEY,-1,0
379233,17300,,I remember the first time I played this game. ...,1,1
4593742,352460,,Early Access Review,1,0
292101,12120,,Classic.,1,0


We can see that there are more than 180k instances where the app_name column is empty, meaning we do not know the name of the game that was reviewed. However, this information is not important since we are trying to predict whether the review is positive or negative. Furthermore, the reviews seem to be real and valid reviews.

In [None]:
# Drop the rows that have no review text. The drop happens in place, so the frame
# that new_df was assigned from earlier in the notebook is modified as well.
new_df.dropna(subset=['text'], inplace=True)
# Re-count missing values per column to confirm 'text' has none left.
new_df.isnull().sum()

app_id               0
app_name        183073
text                 0
score                0
review_votes         0
dtype: int64

In [None]:
#Checking the amount of duplicate rows and duplicate reviews

# Rows identical to an earlier row in every column
full_row_duplicates = new_df.duplicated().sum()
print(full_row_duplicates)
# Rows whose review text already occurred in an earlier row
new_df.duplicated(subset=['text']).sum()

1790872


1936328

In [None]:
#Removing duplicate rows and duplicate reviews
new_df.drop_duplicates(inplace=True)
new_df.drop_duplicates(['text'], inplace=True)
print(new_df.duplicated().sum())
new_df.duplicated(['text']).sum()

0


0

In [None]:
# Reviews whose text contains a run of two or more ♥ symbols
has_heart_run = new_df['text'].str.contains('♥{2,}')
sample_rows_with_consecutive_hearts = new_df.loc[has_heart_run].sample(n=5)

# Show only the text of the sampled rows
sample_rows_with_consecutive_hearts[['text']].head(5)


Unnamed: 0,text
6219841,Hard as ♥♥♥♥ 10/10
5652033,"CHEEKI BREEKI I V DAMKE but seriously, it's a..."
6135339,♥♥♥ portal 2 dive right into the colon
5260298,I can fly around as keemstar and bash people w...
1311975,The last game Ubisoft released before becoming...


Steam censors swear words by replacing them with hearts, but it does so very inconsistently. It is therefore debatable whether to delete the hearts, replace them (perhaps with a common swear word), drop the affected reviews from the dataset, or simply leave them as they are. I am going to remove the hearts during pre-processing, later in the pipeline.

In [None]:
# Game name of every review, with the original row label kept as a column
reviews_app_name = new_df['app_name'].reset_index(name='app_name')

# Most frequently reviewed games, largest first
name_counts = reviews_app_name['app_name'].value_counts()
tc = name_counts.reset_index(name='count').rename(columns={'index': 'app_name'})
tc.head(100)

Unnamed: 0,app_name,count
0,Terraria,77362
1,PAYDAY 2,61839
2,Undertale,47154
3,Dota 2,46511
4,Warframe,43452
...,...,...
95,Spec Ops: The Line,8166
96,Sid Meier's Civilization V,8015
97,XCOM 2,8012
98,Company of Heroes 2,8008


Since our final dataset will contain 300k instances in total, we run the risk of a single game making up a hefty share of it. To prevent this, we limit each game to at most 5,000 instances, which is roughly 1.7% of the final dataset.

In [None]:
# Cap every game at MAX_REVIEWS_PER_GAME reviews so that no single title dominates the data.
MAX_REVIEWS_PER_GAME = 5000
SAMPLING_SEED = 42  # same seed as the other sampling steps, so the cap is reproducible

# Number of reviews per game.
# NOTE(review): value_counts() skips missing values, so rows whose app_name is NaN
# belong to neither group below and are left out of new_df_filtered. Confirm that
# dropping them is intended.
app_name_counts = new_df['app_name'].value_counts().reset_index()
app_name_counts.columns = ['app_name', 'count']

# Split the game names into those above the cap and those at or below it
over_cap = app_name_counts['count'] > MAX_REVIEWS_PER_GAME
high_count_app_names = app_name_counts.loc[over_cap, 'app_name']
low_count_app_names = app_name_counts.loc[~over_cap, 'app_name']

# Seeded random subset of MAX_REVIEWS_PER_GAME rows for each over-represented game
app_name_dataframes = [
    new_df[new_df['app_name'] == app_name].sample(MAX_REVIEWS_PER_GAME, random_state=SAMPLING_SEED)
    for app_name in high_count_app_names
]

# Games at or below the cap keep all of their rows
low_count_data = new_df[new_df['app_name'].isin(low_count_app_names)]

# Capped games first, then the rest; a single concat also works when no game
# exceeds the cap (pd.concat refuses an empty list).
new_df_filtered = pd.concat(app_name_dataframes + [low_count_data])

#Check again
reviews_app_name = new_df_filtered['app_name'].reset_index(name='app_name')

# Look at most common instances of review app_name
tc = reviews_app_name['app_name'].value_counts().reset_index(name='count')
tc.rename(columns={'index': 'app_name'}, inplace=True)
tc.head(100)


Unnamed: 0,app_name,count
0,Terraria,5000
1,theHunter Classic,5000
2,One Finger Death Punch,5000
3,POSTAL 2,5000
4,The Wolf Among Us,5000
...,...,...
95,The Binding of Isaac: Rebirth,5000
96,Emily is Away,5000
97,Path of Exile,5000
98,Mount & Blade: Warband,5000


In [None]:
# Draw the same number of reviews from each class.
# 175,000 per class (350,000 in total) leaves headroom for rows lost during preprocessing.
is_positive = new_df_filtered['score'] == 1
is_negative = new_df_filtered['score'] == -1

positive_reviews = new_df_filtered[is_positive].sample(n=175000, random_state=42)
negative_reviews = new_df_filtered[is_negative].sample(n=175000, random_state=42)

# Stack the two samples: negative reviews first, then positive ones
data = pd.concat([negative_reviews, positive_reviews])

In [None]:
def display_side_by_side(*args):
    """Render several tables next to each other in the cell output.

    Args:
        *args: Objects that provide a ``to_html()`` method; they are rendered
            in the order given.

    Returns:
        None. The combined HTML is sent to the notebook via ``display_html``.
    """
    html_str = ''.join(df.to_html() for df in args)
    # Style only the opening <table ...> tags. Replacing every occurrence of the
    # word 'table' also rewrote the closing tags and any caption or cell text
    # that happened to contain it.
    display_html(html_str.replace('<table', '<table style="display:inline"'), raw=True)

# Class balance after resampling: absolute count and share of every score value.
# The index is named explicitly before reset_index() because the column that
# value_counts().reset_index() produces is called 'index' on pandas 1.x but carries
# the Series name on pandas 2.x, where merging on 'index' raises a KeyError.
score_counts_new = data['score'].value_counts()
vc2 = score_counts_new.rename_axis('score').reset_index(name='count')
vcn2 = (score_counts_new / score_counts_new.sum()).rename_axis('score').reset_index(name='proportion')
rowsnew = len(data.index)

score_statsnew = pd.merge(vc2, vcn2, on='score')
# Positive score first
score_statsnew = score_statsnew.sort_values(by='score', ascending=False)

display_side_by_side(review_score_statsold.style.set_caption('Old dataset balance'), score_statsnew.style.set_caption('New dataset balance'))
print(f'Old number of reviews: {rowsold}', f'New number of reviews: {rowsnew}')

Unnamed: 0,score,count,proportion
0,1,5260420,0.81975
1,-1,1156686,0.18025

Unnamed: 0,score,count,proportion
1,1,175000,0.5
0,-1,175000,0.5


Old number of reviews: 6417106 New number of reviews: 350000


In [None]:
#Check for balance of specific games
reviews_app_name = data['app_name'].reset_index(name='app_name')

# Reviews per game in the balanced sample, largest first
sampled_name_counts = reviews_app_name['app_name'].value_counts()
tc = sampled_name_counts.reset_index(name='count').rename(columns={'index': 'app_name'})
tc.head(100)

Unnamed: 0,app_name,count
0,Infestation: Survivor Stories 2020,1072
1,Fallout 4,1055
2,Nether,1028
3,theHunter Classic,998
4,Call of Duty: Ghosts,940
...,...,...
95,DARK SOULS™: Prepare To Die Edition,488
96,Wolfenstein: The New Order,485
97,Mirror's Edge,485
98,Lords Of The Fallen,483


In [None]:
#@title Preprocessing

def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def url_replace(text):
    """Replace every URL in ``text`` with the placeholder token ``URL``.

    A URL is any run of non-whitespace characters that starts with ``www.``,
    ``http://`` or ``https://``.
    """
    url_matcher = re.compile(r'((www\.[\S]+)|(https?://[\S]+))')
    return url_matcher.sub('URL', text)

def replace_emoticons(text):
    """Replace text emoticons and two HTML-escaped symbols with plain words.

    The groups are applied in this order: sad faces -> ``sad``, happy faces ->
    ``happy``, ``&lt;3`` (an escaped ``<3``) -> ``love`` and ``&gt`` (an escaped
    ``>``) -> ``is better than``. Matching is case-sensitive and purely textual.
    """
    substitutions = (
        ('sad', (':(', ':c', ":'(", ":/")),
        ('happy', (':)', ':D', ":'D", ":')", ":P")),
        ('love', ('&lt;3',)),
        ('is better than', ('&gt',)),
    )

    for word, symbols in substitutions:
        for symbol in symbols:
            # str.replace leaves the text unchanged when the symbol is absent
            text = text.replace(symbol, word)

    return text

def replace_ratings(text):
    """Replace numeric ratings such as ``7/10`` or ``85/100`` with a sentiment word.

    A rating is a one- or two-digit number (optionally with one decimal) or
    ``10``/``100``, followed by ``/10`` or ``/100``. The quotient selects the word:
    below 0.5 ``terrible``, below 0.6 ``bad``, below 0.8 ``good``, below 1
    ``great``, otherwise ``excellent``.

    Args:
        text: The review text.

    Returns:
        The text with every matched rating replaced by its word.
    """
    def _rating_word(match):
        # group(1) is the numerator, group(3) the denominator (10 or 100)
        rating = float(match.group(1)) / float(match.group(3))
        if rating < 0.5:
            return 'terrible'
        if rating < 0.6:
            return 'bad'
        if rating < 0.8:
            return 'good'
        if rating < 1:
            return 'great'
        return 'excellent'

    # Substituting match by match replaces exactly the span that was scored.
    # A string-wide replace of e.g. '5/10' would also rewrite the tail of a later
    # '15/10' and leave '1bad' behind.
    return re.sub(r'([0-9][0-9]?(\.[0-9])?|100?)\/(100?)', _rating_word, text)

def remove_emoji(text):
    """Delete emoji and other symbol characters from ``text``.

    Every run of characters that falls into one of the code-point ranges below is
    removed. The ranges are broad: besides emoji they cover the ♥ symbol (U+2665)
    and, through U+24C2-U+1F251 and U+10000-U+10FFFF, large non-emoji parts of
    Unicode as well.
    """
    blocked_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # miscellaneous symbols and pictographs
        u"\U0001F680-\U0001F6FF",  # transport and map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicator (flag) letters
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
        u"\U0001f926-\U0001f937",
        u"\U0001F1F2",
        u"\U0001F1F4",
        u"\U0001F620",
        u"\u200d",                 # zero width joiner
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\ufe0f",                 # variation selector-16
        u"\u3030",
        u"\U00002500-\U00002BEF",
        u"\U00010000-\U0010ffff",
    )
    emoji_pattern = re.compile("[" + "".join(blocked_ranges) + "]+", flags=re.UNICODE)

    return emoji_pattern.sub(r'', text)

def expand_contractions(text, contraction_mapping=contractions_dict):
    """Expand contractions in ``text`` and strip the remaining apostrophes.

    Args:
        text: The text to process.
        contraction_mapping: Mapping from contraction to its expanded form.

    Returns:
        The text with every known contraction expanded (keeping the first
        character exactly as it was written) and all ``'`` characters removed.
    """
    # Longest keys first so that a key which is a prefix of a longer key cannot
    # shadow it in the alternation. Keys are escaped because they are literal
    # strings, not regular expressions.
    alternatives = sorted(contraction_mapping, key=len, reverse=True)
    contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in alternatives)),
                                      flags=re.IGNORECASE|re.DOTALL)

    # The pattern matches case-insensitively, so the lookup must too. Without this
    # fallback a capitalised contraction ("Can't") was matched but never expanded.
    lowercase_mapping = {key.lower(): value for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = contraction_mapping.get(match)
        if expanded_contraction is None:
            expanded_contraction = lowercase_mapping.get(match.lower())
        if not expanded_contraction:
            return match
        # Keep the first character as typed so the original capitalisation survives
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def remove_punctuation(text):
    """Lower-case ``text``, trim it and turn ASCII punctuation into single spaces.

    Surrounding whitespace is stripped first. Each character of
    ``string.punctuation`` then becomes a space, and every run of whitespace is
    collapsed to one space.
    """
    normalized = text.lower().strip()
    # Translation table: code point of each punctuation character -> a space
    punctuation_to_space = dict.fromkeys(map(ord, string.punctuation), ' ')
    spaced = normalized.translate(punctuation_to_space)
    # Squeeze the whitespace runs left behind by the substitution
    return re.sub(r'\s+', ' ', spaced)

def replace_repeated_letters(text):
    """Collapse any character repeated three or more times in a row to one occurrence.

    The pattern matches any character except a newline, so digits and spaces are
    collapsed as well as letters. Runs of exactly two are left alone.
    """
    triple_or_more = re.compile(r'(.)\1{2,}')
    return triple_or_more.sub(lambda run: run.group(1), text)


def preprocess_text(text):
    """Run the complete cleaning pipeline on a single review text.

    The steps run in a fixed order: URLs, emoticons, ratings, emoji,
    contractions, punctuation, repeated characters and finally lower-casing.
    """
    pipeline = (
        url_replace,
        replace_emoticons,
        replace_ratings,
        remove_emoji,
        expand_contractions,
        remove_punctuation,
        replace_repeated_letters,
        lowercase,
    )
    for step in pipeline:
        text = step(text)

    return text



In [None]:
#Apply to dataset
# Clean every review text, one element at a time
data['text'] = data['text'].map(preprocess_text)

In [None]:
#Delete empty and short instances that might have happened due to pre-processing
data.dropna(subset=['text'], inplace=True)
# Keep only reviews with at least three whitespace-separated tokens
token_counts = data['text'].str.split().str.len()
data = data[token_counts >= 3]
len(data)

347337

In [None]:
#Shuffling the data and creating the final partitions
shuf = data.sample(300000, random_state=42)

# Positional cut points for a 70% / 15% / 15% split
n_rows = len(shuf)
train_end = int(0.7*n_rows)
val_end = int(0.85*n_rows)

train = shuf.iloc[:train_end]
val = shuf.iloc[train_end:val_end]
test = shuf.iloc[val_end:]

In [None]:
#Saving for future use
partitions = {'train.csv': train, 'val.csv': val, 'test.csv': test}

# Write all three files first (without the index column) ...
for file_name, partition in partitions.items():
    partition.to_csv(file_name, index=False)

# ... then hand each one to the browser for download
for file_name in partitions:
    files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Confirm the training partition has no missing values in any column
train.isna().sum()

app_id          0
app_name        0
text            0
score           0
review_votes    0
dtype: int64

In [None]:
#@title Apply pre-processing only to 1 row for testing purposes

# Work on a copy so that `data` itself is left untouched.
prepr_data = data.copy()

# Run the pipeline on the text of the row labelled 33681.
# NOTE(review): data['text'] was already passed through preprocess_text in an earlier
# cell, so this applies the pipeline to that row a second time. The lookup also
# raises a KeyError if label 33681 is not among the rows kept in `data`.
prepr_data.loc[33681, 'text'] = preprocess_text(prepr_data.loc[33681, 'text'])

# Display the row that was just processed (index label 33681)
display(prepr_data.loc[[33681]])