In [1]:
import ast
import json
import re
from collections import OrderedDict

import emoji
import pandas as pd

In [16]:
# Read the training reviews from disk and preview the first rows.
train = pd.read_csv("train.csv", encoding='utf8')
train.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1


In [17]:
# Read the test reviews from disk and preview the first rows.
test = pd.read_csv("test.csv", encoding='utf8')
test.head()

Unnamed: 0,review_id,title,year,user_review
0,1603,Counter-Strike: Global Offensive,2015.0,"Nice graphics, new maps, weapons and models. B..."
1,1604,Counter-Strike: Global Offensive,2018.0,I would not recommend getting into this at its...
2,1605,Counter-Strike: Global Offensive,2018.0,Edit 11/12/18I have tried playing CS:GO recent...
3,1606,Counter-Strike: Global Offensive,2015.0,The game is great. But the community is the wo...
4,1607,Counter-Strike: Global Offensive,2015.0,I thank TrulyRazor for buying this for me a lo...


In [4]:
def data_cleaning(data):
    """Add a ``cleaned_user_review`` column derived from ``user_review``.

    The cleaning steps are applied to every review in this order:

    1. expand contractions using the mapping stored in ``CONTRACTION_MAP.txt``
       (read from the current working directory), then drop apostrophes;
    2. strip a leading ``"Early Access Review"`` marker;
    3. remove ``http...`` URLs;
    4. remove emojis;
    5. collapse immediately repeated words (``"good good"`` -> ``"good"``);
    6. keep only the first occurrence of every whitespace-separated token
       in the whole review (later duplicates are dropped, wherever they are);
    7. shorten runs of the same ``! , . + ?`` character to two characters;
    8. delete the characters ``| ~ \\ . / _ - < > # @ ! &``;
    9. replace every non-ASCII character with a space;
    10. collapse whitespace runs into a single space.

    Missing reviews (NaN/None) are cleaned as empty strings.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``user_review`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place by
        adding (or overwriting) the ``cleaned_user_review`` column.

    Raises
    ------
    OSError
        If ``CONTRACTION_MAP.txt`` cannot be opened.
    ValueError, SyntaxError
        If the mapping file does not contain a plain Python literal.
    """
    # The mapping file is data, not code: parse it as a literal instead of
    # eval()-ing it, and close the handle deterministically.
    with open('CONTRACTION_MAP.txt', 'r') as map_file:
        CONTRACTION_MAP = ast.literal_eval(map_file.read())

    # Case-insensitive fallback lookup, so that a match such as "I'M" still
    # resolves when only "I'm" is present in the mapping.
    lowercase_map = {key.lower(): value for key, value in CONTRACTION_MAP.items()}

    # Longest keys first so that "can't've" wins over its prefix "can't";
    # keys are escaped so regex metacharacters in them are taken literally.
    ordered_keys = sorted((key for key in CONTRACTION_MAP if key), key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL,
    ) if ordered_keys else None

    def expand_match(contraction):
        # Replace one matched contraction, keeping the case of its first letter.
        match = contraction.group(0)
        expanded = CONTRACTION_MAP.get(match) or lowercase_map.get(match.lower())
        if expanded is None:
            return match
        return match[0] + expanded[1:]

    def expand_contractions(text):
        # Expand every known contraction, then drop the remaining apostrophes.
        if contractions_pattern is not None:
            text = contractions_pattern.sub(expand_match, text)
        return text.replace("'", "")

    # emoji.get_emoji_regexp() was removed in emoji 2.0; prefer replace_emoji
    # when it exists and fall back to the old regexp on legacy installs.
    if hasattr(emoji, 'replace_emoji'):
        def strip_emojis(text):
            return emoji.replace_emoji(text, replace='')
    else:
        legacy_emoji_pattern = emoji.get_emoji_regexp()

        def strip_emojis(text):
            return legacy_emoji_pattern.sub('', text)

    # Patterns are compiled once per call instead of once per row.
    early_access_prefix = re.compile('^Early Access Review')
    url_pattern = re.compile(r"http\S+")
    adjacent_repeat = re.compile(r'\b(\w+)( \1\b)+')
    punctuation_run = re.compile(r'([!,.+?])\1+')
    # The hyphen is escaped on purpose: written as "|-|" inside a character
    # class it is parsed as a range and hyphens would never be removed.
    selective_punctuation = re.compile(r"[|~\\./_\-<>#@!&]")
    non_ascii = re.compile(r'[^\x00-\x7f]')
    whitespace_run = re.compile(r'\s+')

    def clean_review(text):
        # Apply the documented steps, in order, to a single review string.
        text = expand_contractions(text)
        text = early_access_prefix.sub('', text)
        text = url_pattern.sub('', text)
        text = strip_emojis(text)
        text = adjacent_repeat.sub(r'\1', text)
        # Keeps the first occurrence of each token and drops all later ones.
        text = ' '.join(OrderedDict.fromkeys(text.split()))
        text = punctuation_run.sub(r'\1\1', text)
        text = selective_punctuation.sub('', text)
        text = non_ascii.sub(' ', text)
        return whitespace_run.sub(' ', text)

    # Missing reviews become empty strings so the regex steps never see NaN.
    reviews = data['user_review'].fillna('').astype(str)
    data['cleaned_user_review'] = reviews.apply(clean_review)

    return data

In [18]:
# data_cleaning adds the 'cleaned_user_review' column to the frame it receives
# and returns that same frame, so each result below is the same object as
# `train` / `test` respectively.
cleaned_train_data = data_cleaning(train)
cleaned_test_data = data_cleaning(test)

In [19]:
# Preview only the first rows instead of dumping the whole frame.
cleaned_train_data.head(10)

Unnamed: 0,review_id,title,year,user_review,user_suggestion,cleaned_user_review
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1,I am scared and hearing creepy voices So will ...
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1,"Best game, more better than Sam Peppers YouTub..."
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1,"A littly iffy on the controls, but once you kn..."
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1,"Great game, fun and colorful all thatA side no..."
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1,Not many games have the cute tag right next to...
5,6,Spooky's Jump Scare Mansion,2015.0,"Early Access ReviewIt's pretty cute at first, ...",1,"It is pretty cute at first, but then later get..."
6,7,Spooky's Jump Scare Mansion,2017.0,Great game. it's a cute little horror game tha...,1,Great game it is a cute little horror game tha...
7,8,Spooky's Jump Scare Mansion,2015.0,Spooky's Jump Scare Mansion is a Free Retro ma...,1,Spookys Jump Scare Mansion is a Free Retro maz...
8,9,Spooky's Jump Scare Mansion,2015.0,"Somewhere between light hearted, happy parody ...",0,"Somewhere between light hearted, happy parody ..."
9,10,Spooky's Jump Scare Mansion,2015.0,This game with its cute little out of the wall...,1,This game with its cute little out of the wall...
