# **Data Pre-Processing (Reviews)**

# Importing Libraries

In [None]:
import pandas as pd
import spacy
import scipy
import numpy as np
import uuid

# Importing Functions

In [None]:
from data_cleaning_functions.clean_text import clean_text
from saving_loading_functions.saving_file import saving_file
from saving_loading_functions.loading_file import loading_file

# Declaring Filepaths

In [None]:
# Directory prefixes handed to loading_file / saving_file by the cells below.
raw_filepath = 'data/raw/'  # scraped CSVs read by the dataset-loading cell
uuid_filepath = 'data/processed/uuid_dataframes/'  # per-source and merged frames carrying the uuid column
adhoc_filepath = 'data/processed/adhoc_fixes/'  # missing-date user URLs and their rescraped dates
filtered_filepath = 'data/processed/filtered_dataframes/'  # per-bank poor-review frames
tokenised_filepath = 'data/processed/tokenised_sentences/'  # per-bank tokenised and cleaned sentences

# Loading Datasets from S3 Bucket

In [None]:
# Load the three raw scraped review datasets: digital banks, conventional banks and N26.
# NOTE(review): the third argument (0) is interpreted by loading_file, which is not
# visible here - presumably an index-column setting; confirm against that helper.
dig_df = loading_file(raw_filepath, 'digital_bank_scraped_data.csv', 0)
conven_df = loading_file(raw_filepath, 'conventional_bank_scraped_data.csv', 0)
n26_df = loading_file(raw_filepath, 'n26_bank_scraped_data.csv', 0)

# 1) Assigning UUID

UUID serves as a unique ID for data engineering further down the pipeline

In [None]:
############## assigning UUID ##########################
# Give every review row its own random identifier (uuid4).
# The column is built in one assignment per frame: row-by-row .loc writes are slow
# and only target the intended rows when the index is exactly 0..n-1.
for bank_df in (dig_df, conven_df, n26_df):
    bank_df["uuid"] = [uuid.uuid4() for _ in range(len(bank_df))]

############ saving files onto bucket ####################
# saving the datasets together with their new uuid column
saving_file(dig_df, uuid_filepath, 'digital_bank_scraped_data_uuid.csv')
saving_file(conven_df, uuid_filepath, 'conventional_bank_scraped_data_uuid.csv')
saving_file(n26_df, uuid_filepath, 'n26_bank_scraped_data_uuid.csv')

# 2) Merging Onto Single Dataframe

In [None]:
# merging: stack the digital, conventional and N26 frames row-wise into one frame;
# ignore_index=True discards the per-source row labels and renumbers the rows
df = pd.concat([dig_df, conven_df, n26_df], ignore_index=True)

# saving dataframe (at this point some reviews still have no date, hence the file name)
saving_file(df, uuid_filepath, 'all_bank_scraped_data_missing_dates_uuid.csv')

# 3) Identifying and Rescraping Reviews with Missing Dates

In [None]:
# collect the user_url of every review whose date is missing
missing_date_mask = df["date"].isna()
rem_users_df = df.loc[missing_date_mask, "user_url"]

# persist those URLs so the affected reviews can be rescraped
saving_file(rem_users_df, adhoc_filepath, 'review_missing_dates_user_urls.csv')

In [None]:
# Manual step: this cell is only valid after the users saved above have been rescraped
# and the resulting file has been uploaded to the bucket.
# The loaded frame provides the user_url, company and date columns read by the next cell.
rem_users_data_df = loading_file(adhoc_filepath, 'review_missing_dates_user_urls_data.csv', 0)
rem_users_data_df

Unnamed: 0,user_url,company,date
0,/users/60636a29bd9132001982c2fa,Monzo,2022-06-09T16:57:55.000Z
1,/users/62a2fabe6d0355001101eb11,Wise (formerly TransferWise),2022-06-10T10:23:07.000Z
2,/users/629899df1f28f40011af0c71,Starling Bank,2022-06-09T15:19:53.000Z
3,/users/5ecfa7dd75750c96c6efb734,Starling Bank,2022-06-09T08:23:40.000Z
4,/users/56583efd0000ff0001ee1b9c,Starling Bank,2022-07-05T14:04:25.000Z
...,...,...,...
11377,/users/501c06e400006400011e919f,Wise (formerly TransferWise),2013-06-27T11:04:37.000Z
11378,/users/50168b6100006400011e3e15,Wise (formerly TransferWise),2012-07-30T13:29:25.000Z
11379,/users/50082d9100006400011d94d5,Wise (formerly TransferWise),2012-07-19T16:07:01.000Z
11380,/users/50081f2900006400011d93e9,Wise (formerly TransferWise),2012-07-19T15:15:41.000Z


In [None]:
# replacing the missing dates
# Build a (user_url, company) -> date lookup once, then fill df in a single pass.
# This replaces a full scan of df for every rescraped row. Semantics are unchanged:
# every review matching a rescraped (user_url, company) pair receives that date,
# and when a pair occurs more than once in the rescraped file the last row wins.
rescraped = rem_users_data_df.dropna(subset=["user_url", "company"])  # NaN keys never matched with ==
date_lookup = dict(zip(zip(rescraped["user_url"], rescraped["company"]), rescraped["date"]))

review_keys = list(zip(df["user_url"], df["company"]))
has_rescraped_date = [key in date_lookup for key in review_keys]
if any(has_rescraped_date):
    df.loc[has_rescraped_date, "date"] = [
        date_lookup[key] for key, hit in zip(review_keys, has_rescraped_date) if hit
    ]

print("Remaining number of reviews without a date: {}".format(df["date"].isna().sum()))

Remaining number of reviews without a date: 24


The remaining 24 reviews still have no date because they had been removed by their authors as of 19/08/2022, so their dates could not be rescraped.

In [None]:
# discard the reviews that still have no date, then renumber the rows from 0
df = df.dropna(axis=0, subset=['date']).reset_index(drop=True)

# persist the merged frame now that every remaining review has a date
saving_file(df, uuid_filepath, 'all_bank_scraped_data_filled_dates_uuid.csv')

# 4) Filtering for Poor Reviews, Dropping Duplicates and Replacing Empty Texts with Titles

In [None]:
def filter_df(bank):
    """
    Filter the merged review frame for one bank's poor reviews and save the result.

    Reads the module-level `df`, keeps the rows whose `company` equals `bank` and
    whose `rating` is 1, 2 or 3 stars, drops duplicated reviews, replaces missing
    `text` with the review `title`, and saves the frame to `filtered_filepath` as
    '<bank_name>_df_poor.csv', where bank_name is the first word of `bank` in
    lower case.

    Parameters
    ----------
    bank : str
        Company name exactly as it appears in the `company` column.

    Returns
    -------
    None
        The filtered frame is only written through `saving_file`.
    """
    poor_ratings = [
        'Rated 1 out of 5 stars',
        'Rated 2 out of 5 stars',
        'Rated 3 out of 5 stars',
    ]

    # filtering for poor reviews of that bank name; .copy() gives an independent frame
    # so the assignments below do not write to a slice of df (SettingWithCopyWarning)
    is_bank = df['company'] == bank
    is_poor = df['rating'].isin(poor_ratings)
    bank_poor_df = df.loc[is_bank & is_poor, :].copy()

    # dropping duplicates: drop_duplicates returns a new frame, so the result has to be
    # kept. The uuid column is unique per row by construction and is therefore left out
    # of the comparison, otherwise no two rows could ever compare equal.
    # NOTE(review): any other per-row identifier column would need excluding as well.
    dedupe_cols = [col for col in bank_poor_df.columns if col != 'uuid']
    bank_poor_df = bank_poor_df.drop_duplicates(subset=dedupe_cols or None)
    bank_poor_df = bank_poor_df.reset_index(drop=True)

    # replacing empty texts with titles (i.e. use title instead if no text)
    bank_poor_df['text'] = bank_poor_df['text'].fillna(bank_poor_df['title'])

    bank_name = bank.split(" ")[0].lower()

    # saving dataframe
    saving_file(bank_poor_df, filtered_filepath, f'{bank_name}_df_poor.csv')
    
# iterating over all banks and filtering
# sorted(...unique()) instead of list(set(...)): a set of strings has no stable order
# between kernel sessions, and dropna() keeps a missing company (NaN) from reaching
# filter_df, where bank.split(" ") would fail on a float.
formal_bank_names = sorted(df["company"].dropna().unique())

for bank in formal_bank_names:
    filter_df(bank)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


# 5) Sentence Tokenization and Additional Cleaning

After tokenising the text into sentences, the following data cleaning steps are taken:
 - Removing NAs (sentences that are empty after cleaning)
 - Removing nonsensical sentences (i.e. cleaned sentences with fewer than 3 words)
 - Removing duplicated cleaned sentences

In [None]:
# load English tokenizer, tagger, parser and NER
# `nlp` is used by sent_token_clean below to split each review into sentences.
# NOTE(review): the recorded output of this cell contains a "Token indices sequence length
# is longer than the specified maximum sequence length for this model (3335 > 512)" warning,
# so at least one review is longer than the transformer's window.
nlp = spacy.load("en_core_web_trf")

def sent_token_clean(bank):
    """
    Tokenise one bank's poor reviews into sentences, clean them and save the result.

    Loads '<bank_name>_df_poor.csv' from `filtered_filepath` (bank_name is the first
    word of `bank` in lower case), splits every non-null review text into sentences
    with the module-level spaCy pipeline `nlp`, and cleans each sentence with
    `clean_text`. Cleaned sentences that are empty, have fewer than 3 space-separated
    words, or duplicate an earlier cleaned sentence are removed. The remaining rows
    (columns `text`, `sentences`, `cleaned_sentences`) are saved to
    `tokenised_filepath` as '<bank_name>_tokenized_sentences_df.csv'.

    Parameters
    ----------
    bank : str
        Company name as it appears in the `company` column.

    Returns
    -------
    None
        Row counts after each cleaning step are printed; the frame is only written
        through `saving_file`.
    """

    bank_name = bank.split(" ")[0].lower()

    # loading dataset
    reviews_df = loading_file(filtered_filepath, f'{bank_name}_df_poor.csv', 1)

    # sentence tokenisation
    sent_list = []
    text_list = []
    seen_sentences = set()  # set membership is O(1); scanning sent_list was quadratic overall
    for text in reviews_df["text"]:
        if pd.isnull(text):
            continue
        doc = nlp(text)
        # a sentence is skipped only if an EARLIER review already produced it; the set is
        # updated after the review is processed, so repeats inside one review are kept
        # here and removed later by the duplicate check on the cleaned sentences
        new_sentences = [str(sentence) for sentence in doc.sents if str(sentence) not in seen_sentences]
        sent_list.extend(new_sentences)
        text_list.extend([str(text)] * len(new_sentences))  # one original review per kept sentence
        seen_sentences.update(new_sentences)

    # cleaning tokenised sentences
    cleaned_sent_list = [clean_text(sent, bank_name) for sent in sent_list]

    # creating new dataframe with original review, tokenised sentence and cleaned sentence
    bank_sent_df = pd.DataFrame(
        {"text": text_list, "sentences": sent_list, "cleaned_sentences": cleaned_sent_list}
    )

    print("\n{}".format(bank_name.title()))

    # removing NA and duplicated cleaned sentences
    print("Original number of cleaned sentences: {}".format(len(bank_sent_df)))

    # removing na's (missing values compare as False and are dropped together with empty strings)
    bank_sent_df = bank_sent_df[bank_sent_df['cleaned_sentences'].str.len() > 0]
    print("Number of cleaned sentences after dropping NA: {}".format(len(bank_sent_df)))

    # removing non-sensical sentences (i.e. sentences with length <= 2)
    bank_sent_df = bank_sent_df[bank_sent_df['cleaned_sentences'].str.split(" ").str.len() > 2]
    print("Number of cleaned sentences after dropping non-sensical sentences: {}".format(len(bank_sent_df)))

    # removing duplicated cleaned sentences; reassigning avoids an in-place edit of a filtered frame
    bank_sent_df = bank_sent_df.drop_duplicates(subset=['cleaned_sentences'])
    print("Number of cleaned sentences after dropping duplicates: {}".format(len(bank_sent_df)))

    bank_sent_df = bank_sent_df.reset_index(drop=True)

    # saving file onto bucket
    saving_file(bank_sent_df, tokenised_filepath, f'{bank_name}_tokenized_sentences_df.csv')


# iterating over different banks
# formal_bank_names comes from the filtering cell above, so that cell must have been run
# first: each call reads the '<bank_name>_df_poor.csv' file written there.
for bank in formal_bank_names:
    sent_token_clean(bank)




N26
Original number of cleaned sentences: 13846
Number of cleaned sentences after dropping NA: 13804
Number of cleaned sentences after dropping non-sensical sentences: 13199
Number of cleaned sentences after dropping duplicates: 13078


Token indices sequence length is longer than the specified maximum sequence length for this model (3335 > 512). Running this sequence through the model will result in indexing errors



Wise
Original number of cleaned sentences: 42354
Number of cleaned sentences after dropping NA: 42280
Number of cleaned sentences after dropping non-sensical sentences: 40608
Number of cleaned sentences after dropping duplicates: 40160





Starling
Original number of cleaned sentences: 21571
Number of cleaned sentences after dropping NA: 21524
Number of cleaned sentences after dropping non-sensical sentences: 20680
Number of cleaned sentences after dropping duplicates: 20516





Monzo
Original number of cleaned sentences: 13044
Number of cleaned sentences after dropping NA: 13009
Number of cleaned sentences after dropping non-sensical sentences: 12436
Number of cleaned sentences after dropping duplicates: 12307





Revolut
Original number of cleaned sentences: 60092
Number of cleaned sentences after dropping NA: 60008
Number of cleaned sentences after dropping non-sensical sentences: 57730
Number of cleaned sentences after dropping duplicates: 57055





Hsbc
Original number of cleaned sentences: 32739
Number of cleaned sentences after dropping NA: 32696
Number of cleaned sentences after dropping non-sensical sentences: 31551
Number of cleaned sentences after dropping duplicates: 31151





Lloyds
Original number of cleaned sentences: 13295
Number of cleaned sentences after dropping NA: 13267
Number of cleaned sentences after dropping non-sensical sentences: 12742
Number of cleaned sentences after dropping duplicates: 12664





Natwest
Original number of cleaned sentences: 19988
Number of cleaned sentences after dropping NA: 19949
Number of cleaned sentences after dropping non-sensical sentences: 19135
Number of cleaned sentences after dropping duplicates: 18947





Barclays
Original number of cleaned sentences: 5986
Number of cleaned sentences after dropping NA: 5967
Number of cleaned sentences after dropping non-sensical sentences: 5689
Number of cleaned sentences after dropping duplicates: 5646





Santander
Original number of cleaned sentences: 23378
Number of cleaned sentences after dropping NA: 23340
Number of cleaned sentences after dropping non-sensical sentences: 22512
Number of cleaned sentences after dropping duplicates: 22175
