In [1]:
# Uncomment these lines if the packages are not installed
# pip install emot
# pip install emoji
# pip install gensim

import csv
import pickle
import re
import pandas as pd
from tqdm import tqdm
from emot.emo_unicode import EMOJI_UNICODE, EMOTICONS_EMO  # For EMOTICONS and EMOJI
from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
import requests
import emoji
import types

def imports():
    """
    Lists the modules bound in the global namespace.

    Yields:
    A ``(module_name, version)`` tuple for every global that is a module and
    exposes ``__version__``; just the module name (a string) when the version
    cannot be read.
    """
    # Iterate over a snapshot: a consumer that binds new globals while the
    # generator is suspended (e.g. a top-level ``for x in imports():``) would
    # otherwise trigger "dictionary changed size during iteration".
    for val in list(globals().values()):
        if not isinstance(val, types.ModuleType):
            continue
        # Only the attribute lookup is guarded. Keeping the ``yield`` out of the
        # ``try`` body means closing the generator early (GeneratorExit) is
        # never swallowed, and KeyboardInterrupt is no longer caught either.
        try:
            version = val.__version__
        except Exception:
            yield val.__name__
        else:
            yield val.__name__, version

# Enumerate the imported modules (and their versions, where available).
# Jupyter only renders this if it is the last expression of the cell.
list(imports())

# Load the filtered reviews from the gzip-compressed CSV.
FPS_reviews = pd.read_csv(
    "../data/raw_data/data_filtered.csv.gz",
    low_memory=False,
    compression="gzip",
)


In [2]:
# Deal with profanities censored as '♥'
# download the .txt file from https://github.com/kast1450/steam-profanity-filter/blob/main/english/english-profanity.txt
_PROFANITY_LIST_PATH = "../data/supplemental_data/english-profanity.txt"


def _uncensor_profanities(selected_reviews, censor_char, profanity_path):
    """
    Replaces runs of a censorship character with a profanity of the same length.

    The censored text does not reveal which word was hidden, so every maximal
    run of ``censor_char`` is replaced by the first word in the profanity list
    that has exactly the run's length. Runs whose length matches no listed word
    are left untouched.

    Matching whole runs (rather than calling ``str.replace`` once per word)
    stops a short word listed early from consuming part of a longer censored
    run, and needs a single pass per review instead of one pass per word.

    Args:
    selected_reviews: A list of reviews to process. Updated in place; every
        entry is converted to ``str``.
    censor_char: The single character used for censoring.
    profanity_path: Path of a UTF-8 text file with one profanity per line.

    Returns:
    The same list object, with profanities uncensored.
    """
    with open(profanity_path, encoding='utf-8') as file:
        profanity_list = [item.strip() for item in file]

    # First listed word wins for each length; blank lines are ignored.
    word_by_length = {}
    for word in profanity_list:
        if word:
            word_by_length.setdefault(len(word), word)

    censored_run = re.compile("(?:" + re.escape(censor_char) + ")+")

    def restore(match):
        run = match.group(0)
        return word_by_length.get(len(run), run)

    for i in tqdm(range(len(selected_reviews))):
        selected_reviews[i] = censored_run.sub(restore, str(selected_reviews[i]))

    return selected_reviews


def identify_profanities_1(selected_reviews, profanity_path=_PROFANITY_LIST_PATH):
    """
    Identifies and replaces censored profanities in reviews, where the censorship character is '♥'.

    Args:
    selected_reviews: A list of reviews to process (updated in place).
    profanity_path: Optional path of the profanity list (one word per line).

    Returns:
    A list of reviews with profanities uncensored.
    """
    return _uncensor_profanities(selected_reviews, "♥", profanity_path)


# Deal with profanities censored as '*'
def identify_profanities_2(selected_reviews, profanity_path=_PROFANITY_LIST_PATH):
    """
    Identifies and replaces censored profanities in reviews, where the censorship character is '*'.

    Args:
    selected_reviews: A list of reviews to process (updated in place).
    profanity_path: Optional path of the profanity list (one word per line).

    Returns:
    A list of reviews with profanities uncensored.
    """
    return _uncensor_profanities(selected_reviews, "*", profanity_path)


In [3]:
# Deal with URLs
def remove_urls_https_and_www(selected_reviews):
    """
    Removes URL fragments from the given list of reviews.

    The scheme (optionally preceded by ``url=``), a leading ``www.`` and
    everything from ``.com`` up to the next whitespace are deleted, so
    ``https://www.example.com/page`` is reduced to ``example``. After each
    pattern the review is stripped of surrounding whitespace and slashes.

    The dots in the patterns are escaped so that only literal ``www.`` and
    ``.com`` match. Unescaped, they matched any character and mangled ordinary
    words such as "recommend" or "awww".

    Args:
    selected_reviews: A list of reviews. Updated in place; every entry is
        converted to ``str``.

    Returns:
    A list of reviews with URLs removed.
    """
    # Compiled once, outside the per-review loop.
    url_patterns = [
        re.compile(pattern)
        for pattern in (
            r"url=https?://(www\.)?",
            r"https?://(www\.)?",
            r"www\.",
            # \b keeps words that merely start with ".com" (".community") intact.
            r"\.com\b\S*",
        )
    ]
    for i in tqdm(range(len(selected_reviews))):
        review = str(selected_reviews[i])
        for url in url_patterns:
            review = url.sub('', review).strip().strip('/')
        selected_reviews[i] = review
    return selected_reviews

# Deal with emojis
def convert_emojis_to_word(selected_review):
    """
    Replaces the emojis in each review with their textual names.

    Every entry is converted to ``str`` and passed through ``emoji.demojize``
    with an empty opening delimiter and a single space as closing delimiter.

    Args:
    selected_review: A list of reviews. Updated in place.

    Returns:
    The same list, with emojis converted to words.
    """
    n_reviews = len(selected_review)
    for idx in tqdm(range(n_reviews)):
        selected_review[idx] = emoji.demojize(
            str(selected_review[idx]), delimiters=("", " ")
        )
    return selected_review

# Deal with emoticons
def convert_emoticons_to_word(selected_review):
    """
    Converts emoticons in the reviews to their word representation.

    Each emoticon listed in ``EMOTICONS_EMO`` is replaced by its description,
    with the spaces in the description turned into underscores.

    Emoticons are applied longest first. Otherwise a short emoticon such as
    ":)" would be replaced inside a longer one such as ":))", leaving a stray
    ")" and losing the longer emoticon's meaning. The sort is stable, so
    emoticons of equal length keep the order of ``EMOTICONS_EMO``.

    Args:
    selected_review: A list of reviews. Updated in place; every entry is
        converted to ``str``.

    Returns:
    A list of reviews with emoticons converted to words.
    """
    # Build the (emoticon, label) pairs once instead of once per review.
    replacements = [
        (emot, EMOTICONS_EMO[emot].replace(" ", "_"))
        for emot in sorted(EMOTICONS_EMO, key=len, reverse=True)
    ]
    for i in tqdm(range(len(selected_review))):
        review = str(selected_review[i])
        for emot, label in replacements:
            review = review.replace(emot, label)
        selected_review[i] = review
    return selected_review

# Deal with casing
def lower_case(selected_reviews):
    """
    Converts all characters in the reviews to lower case.

    Entries are converted to ``str`` first, as the other preprocessing steps
    do, so non-string values (e.g. NaN for a missing review) do not raise
    ``AttributeError``.

    Args:
    selected_reviews: A list of reviews.

    Returns:
    A new list of reviews in lower case.
    """
    return [str(review).lower() for review in tqdm(selected_reviews)]

# Deal with punctuation
def remove_punctuation(selected_reviews):
    """
    Blanks out punctuation in the reviews.

    Every character other than an ASCII letter, an ASCII digit or an underscore
    is replaced by a single space, so the length of each review is unchanged.
    Entries are converted to ``str`` first.

    Args:
    selected_reviews: A list of reviews.

    Returns:
    A new list of reviews with punctuation replaced by spaces.
    """
    disallowed_char = re.compile(r'[^A-Za-z0-9_]')
    cleaned = []
    for entry in tqdm(selected_reviews):
        cleaned.append(disallowed_char.sub(' ', str(entry)))
    return cleaned

# Deal with stop-words
def remove_stop_words(selected_reviews, stopwords_path=None):
    """
    Removes stop words from the reviews.

    By default gensim's built-in stop-word list is used via
    ``remove_stopwords``. This is what the function has always applied: the
    stop-word file it used to read was loaded but never used, so that
    mandatory read has been dropped.

    A custom list can be opted into with ``stopwords_path``. One such list is
    available at
    https://drive.google.com/file/d/1Mg1VFspYOembPVnZ3ocYBi9aC2Y8gRWR/view?usp=sharing
    (previously expected at '../data/supplemental_data/stopwords-english').

    Args:
    selected_reviews: A list of reviews. Entries are converted to ``str``.
    stopwords_path: Optional path of a UTF-8 text file with one stop word per
        line. When given, whitespace-separated tokens found in that file are
        dropped instead of gensim's defaults.

    Returns:
    A new list of reviews with stop words removed.
    """
    if stopwords_path is None:
        return [remove_stopwords(str(review)) for review in tqdm(selected_reviews)]

    with open(stopwords_path, encoding='utf-8') as f:
        custom_stopwords = frozenset(line.strip() for line in f if line.strip())

    return [
        " ".join(word for word in str(review).split() if word not in custom_stopwords)
        for review in tqdm(selected_reviews)
    ]

# Deal with double spacing
def remove_spacing(selected_reviews):
    """
    Collapses runs of whitespace in the reviews.

    Every run of one or more whitespace characters (spaces, tabs, newlines)
    becomes a single space. Leading and trailing whitespace is reduced to one
    space rather than removed. Entries are converted to ``str`` first.

    Args:
    selected_reviews: A list of reviews.

    Returns:
    A new list of reviews with extra spaces removed.
    """
    whitespace_run = re.compile(r'\s+', flags=re.I)
    collapsed = []
    for entry in tqdm(selected_reviews):
        collapsed.append(whitespace_run.sub(' ', str(entry)))
    return collapsed


In [6]:
# Apply every preprocessing step, in order, to the review text and save the result.

preprocessing_steps = (
    identify_profanities_1,
    identify_profanities_2,
    remove_urls_https_and_www,
    convert_emojis_to_word,
    convert_emoticons_to_word,
    lower_case,
    remove_punctuation,
    remove_stop_words,
    remove_spacing,
)
# Each step receives the current column as a plain list and its result is
# written back before the next step runs.
for preprocessing_step in preprocessing_steps:
    FPS_reviews["review"] = preprocessing_step(FPS_reviews["review"].tolist())

# Write the processed reviews to a gzip-compressed CSV (without the index column).
FPS_reviews.to_csv(
    '../data/interim_data/04_text_mining/preprocessed/reviews_preprocessed.csv.gz',
    index=False,
    compression='gzip',
)