In [1]:
# To mount Google Drive
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV paths read by this notebook
# all live under '/content/drive/My Drive/...'.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

import re

# Load the per-review results table. The rest of the notebook reads the columns
# asin, acc_num, cleaned_reviews_profile_link, cleaned_reviews_text,
# cleaned_reviews_date_posted, fake_reviews and decision_function from it.
# NOTE(review): judging by the file name these are baseline One-Class SVM
# outputs — confirm against the notebook that writes this CSV.
df = pd.read_csv('/content/drive/My Drive/FYP/FYP Y4S1/review_activity/interim/baseline_ocsvm_results.csv')

In [3]:
# Sum the `fake_reviews` column per reviewer account (`acc_num`), producing one
# row per account with the total stored as `total_fake_reviews`.
#
# The string alias 'sum' is used instead of the callable np.sum: passing NumPy
# callables to groupby().agg() is deprecated in recent pandas releases and
# emits a FutureWarning. Named aggregation also sets the output column name
# directly, so no separate rename step is needed.
fake_reviews_df = (
    df.groupby('acc_num', as_index=False)
      .agg(total_fake_reviews=('fake_reviews', 'sum'))
)
fake_reviews_df

Unnamed: 0,acc_num,total_fake_reviews
0,AE2242WWRDTRHP3Y4WK2K2GQIOUA,0
1,AE22JJYZQC6KT354WG4JSJCU4WXA,0
2,AE22M5XAIOKNXQT7JSZY3LNU3VOQ,0
3,AE22MKKJATCXH6RL42WXLDUEJ5WQ,0
4,AE22RZJ3SCYG6RZTGJ6HVNFRX7PA,0
...,...,...
64628,AHZZCA3IUYD3WHWOFHECMLPNJZLQ,0
64629,AHZZJOVR5WVI2PCSST3A6JQN57SA,0
64630,AHZZONVVQ454QUCD23HMD5E7VNIQ,0
64631,AHZZTMJPOUBTHE3KC3NONEWETDAA,0


In [4]:
# Build the per-reviewer feature table `new_profiles_df` with three columns:
#   acc_num                   - reviewer account id
#   proportion_fake_reviews   - flagged reviews / total reviews posted, capped at 1
#   suspicious_reviewer_score - mean of five profile flags
profiles_df = pd.read_csv('/content/drive/My Drive/FYP/FYP Y4S1/review_activity/interim/consolidated_profiles.csv')

# Left join keeps every profile row; accounts missing from fake_reviews_df get
# NaN for total_fake_reviews and therefore end up as 0 after the fillna below.
profiles_df = pd.merge(profiles_df, fake_reviews_df, on='acc_num', how='left')
profiles_df['proportion_fake_reviews'] = (
    profiles_df['total_fake_reviews'] / profiles_df['cleaned_total_reviews_posted']
)

# Max-scale the average helpful votes (divide by the column maximum). This is a
# plain rescaling; it does not change the shape of the distribution.
# Series.max() skips NaN, whereas the builtin max() would return NaN (and wipe
# out the whole column) whenever the first value happens to be NaN.
helpful_votes = profiles_df['cleaned_average_helpfulVotes']
max_helpful_votes = helpful_votes.max()
profiles_df['cleaned_average_helpfulVotes'] = helpful_votes.astype(float) / max_helpful_votes

# Invert the three brand flags (1 - flag) so they can be averaged together with
# the other two flags in the score below.
profiles_df['cleaned_not_brand_monogamist'] = 1 - profiles_df['cleaned_brand_monogamist']
profiles_df['cleaned_not_brand_loyalist'] = 1 - profiles_df['cleaned_brand_loyalist']
profiles_df['cleaned_not_brand_repeater'] = 1 - profiles_df['cleaned_brand_repeater']

suspicious_reviewer_columns = [
    'cleaned_not_brand_monogamist',
    'cleaned_not_brand_loyalist',
    'cleaned_not_brand_repeater',
    'cleaned_never_verified_reviewer',
    'cleaned_one_hit_wonder',
]
# Unweighted mean of the five flags. skipna=False keeps the behaviour of adding
# the columns with `+`: a missing flag makes the whole score NaN (later -> 0).
profiles_df['suspicious_reviewer_score'] = (
    profiles_df[suspicious_reviewer_columns].sum(axis=1, skipna=False)
    / len(suspicious_reviewer_columns)
)

interested_columns = ['acc_num', 'proportion_fake_reviews', 'suspicious_reviewer_score']
# fillna() returns a new frame, so the assignment below does not write into a
# view of profiles_df.
new_profiles_df = profiles_df[interested_columns].fillna(0)
# Cap the proportion at 1. Values above 1 (including +inf from a zero
# denominator) arise when the flagged count exceeds the profile's review count.
new_profiles_df['proportion_fake_reviews'] = new_profiles_df['proportion_fake_reviews'].clip(upper=1)
new_profiles_df

Unnamed: 0,acc_num,proportion_fake_reviews,suspicious_reviewer_score
0,AG3SZWYYCVNKZFTF5ONOHIR2W3ZQ,0.0,0.8
1,AE2GLRIB53WWNGS3LFPDBK4FHOXA,0.0,1.0
2,AF74PPSRBCD46LSS3AIOH3CZ6UYA,0.0,0.8
3,AHDJNPXWQMD5N2CHG7Q6KQDJPRRA,0.0,0.8
4,AENWE3IEOTCHQNSTWT37HQAAOV3A,0.0,0.8
...,...,...,...
65925,AGGEU6ZFNNZFH6OD2JRZCTWUJFEA,0.0,0.8
65926,AG67I7R63CBOFXW52RVMYJPTACMA,0.0,0.8
65927,AF4VJ7HRBAKTTEE7OMMEDOUALYKQ,0.0,0.8
65928,AEZQ35CJNSPVOB5RGXWHL3W3OJHA,0.0,0.8


In [5]:
# English month name -> month number, built once instead of on every call.
_MONTH_NUMBERS = {
    "January": 1, "February": 2, "March": 3, "April": 4,
    "May": 5, "June": 6, "July": 7, "August": 8,
    "September": 9, "October": 10, "November": 11, "December": 12,
}


def convert_datetime(date):
    """Parse a "<Month> <day> <year>" string into a ``datetime``.

    The string is split on whitespace; the first token must be a full English
    month name, the second token is the day (any non-digit characters such as
    a trailing comma are stripped), and the third token is the year. Any
    further tokens are ignored.

    Parameters
    ----------
    date : str
        Date text, e.g. ``"April 20, 2020"``.

    Returns
    -------
    datetime.datetime
        Midnight of the parsed date.

    Raises
    ------
    IndexError
        If the string has fewer than three whitespace-separated tokens.
    KeyError
        If the first token is not a full English month name.
    ValueError
        If the day or year token is not numeric, or the date is invalid.
    """
    parts = date.split()

    # Raw string: "\D" in a normal string literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    day = int(re.sub(r"\D", "", parts[1]))
    month = _MONTH_NUMBERS[parts[0]]
    year = int(parts[2])
    return datetime(year, month, day)

# Score every review and bucket the scores into three severity tiers.
#
# impact_score = 0.10 * time_score
#              + 0.40 * fake_reviews * normalised decision_function
#              + 0.25 * proportion_fake_reviews
#              + 0.25 * suspicious_reviewer_score
scoring_df = df[['asin','acc_num','cleaned_reviews_profile_link','cleaned_reviews_text','cleaned_reviews_date_posted','fake_reviews','decision_function']]
# Left join: reviews whose account has no profile row get NaN reviewer features.
scoring_df = pd.merge(scoring_df, new_profiles_df, on='acc_num', how='left')
scoring_df['cleaned_reviews_date_posted'] = scoring_df['cleaned_reviews_date_posted'].apply(convert_datetime)

# Age of each review in whole days.
# NOTE(review): this is measured against the wall clock, so time scores (and
# therefore impact scores) change depending on when the notebook is run.
# Consider pinning a fixed reference date for reproducible results.
review_age_days = (datetime.now() - scoring_df['cleaned_reviews_date_posted']).dt.days

# Recency score: the first matching upper bound wins; anything older than the
# last bound scores 0. Building the column in one step as floats avoids
# initialising an int column with a -1 sentinel and then assigning floats into
# it, which recent pandas versions warn about (incompatible dtype).
age_limits_days = [30, 90, 180, 360, 720]     # ~1, 3, 6, 12, 24 months
recency_scores = [1.0, 0.8, 0.6, 0.4, 0.2]
time_score = pd.Series(
    np.select(
        [review_age_days <= limit for limit in age_limits_days],
        recency_scores,
        default=0.0,
    ),
    index=scoring_df.index,
)

bin_labels = {0:'Not Severe', 1:'Severe',2:'Very Severe'}

# Min-max scale the decision function to [0, 1]. Series.min()/max() skip NaN,
# unlike the builtin min()/max(), whose result depends on where a NaN sits.
# NOTE(review): after this scaling a larger value means a larger raw
# decision_function; confirm that larger raw values are meant to raise the
# impact of a flagged review.
decision_function = scoring_df['decision_function'].astype(float)
max_decision_function = decision_function.max()
min_decision_function = decision_function.min()
scoring_df['decision_function'] = (
    (decision_function - min_decision_function)
    / (max_decision_function - min_decision_function)
)

scoring_df['impact_score'] = (
    0.1 * time_score
    + (0.4 * (scoring_df.fake_reviews) * (scoring_df.decision_function))
    + 0.25 * scoring_df.proportion_fake_reviews
    + 0.25 * scoring_df.suspicious_reviewer_score
)

# Rank with method='first' so ties get distinct ranks; qcut can then always
# split the rows into three equal-sized bins (codes 0, 1, 2).
impact_rank = scoring_df['impact_score'].rank(method='first')
scoring_df['impact'] = pd.qcut(impact_rank.values, 3).codes
scoring_df = scoring_df.replace({"impact": bin_labels})

scoring_df = scoring_df.sort_values(by='impact_score', ascending=False)

display(scoring_df)


Unnamed: 0,asin,acc_num,cleaned_reviews_profile_link,cleaned_reviews_text,cleaned_reviews_date_posted,fake_reviews,decision_function,proportion_fake_reviews,suspicious_reviewer_score,impact_score,impact
77325,B01MG4PZK0,AHMHJAVMKXCMTNNQQFZ3XAWKOP4A,https://www.amazon.com/gp/profile/amzn1.accoun...,combination acne prone skin cream moisturizers...,2020-04-20,1,0.850746,1.0,0.6,0.780298,Very Severe
77324,B004BCXAM8,AHA4R3CCVO7V4YVG6ZFHKQBP5QQA,https://www.amazon.com/gp/profile/amzn1.accoun...,love mascara waterproof make lash look amaze,2020-04-20,1,0.847068,1.0,0.6,0.778827,Very Severe
77336,B0014ZWHZQ,AEE2OOAYH3UXEOPJIZYKNKZAB7XA,https://www.amazon.com/gp/profile/amzn1.accoun...,product ha receive package product complete ti...,2020-08-19,1,0.844075,1.0,0.6,0.777630,Very Severe
77321,B0014ZWHZQ,AEU66P4OCNI5C3UQOP3ZCD7LU2NA,https://www.amazon.com/gp/profile/amzn1.accoun...,good product,2020-02-25,1,0.844031,1.0,0.6,0.777612,Very Severe
15626,B000NP7K1E,AEYASIF42HPNELSYVXBG7XT5MF7Q,https://www.amazon.com/gp/profile/amzn1.accoun...,receive product delivery announcement wa late ...,2020-09-06,1,0.637355,1.0,0.8,0.764942,Very Severe
...,...,...,...,...,...,...,...,...,...,...,...
40131,B005EII2R6,AEZKYS3KRHOB4EWQUEAZ3KKOCQGQ,https://www.amazon.com/gp/profile/amzn1.accoun...,first loreal infallible try glad cause beautif...,2013-08-05,0,0.120726,0.0,0.8,0.200000,Not Severe
40136,B001268TE0,AHZQMVUFXRR6KI2DDFK3FMPVFO6Q,https://www.amazon.com/gp/profile/amzn1.accoun...,use since 2010 love,2016-03-09,0,0.096166,0.0,0.8,0.200000,Not Severe
40137,B00AUFS9FS,AHZQMVUFXRR6KI2DDFK3FMPVFO6Q,https://www.amazon.com/gp/profile/amzn1.accoun...,perfect hide scar,2016-03-09,0,0.110783,0.0,0.8,0.200000,Not Severe
40139,B004BCXAM8,AENOWAVGFM7MJ4RAH4REEZI56U3Q,https://www.amazon.com/gp/profile/amzn1.accoun...,voluminous eyelash doe smell like stay lash da...,2014-05-07,0,0.084526,0.0,0.8,0.200000,Not Severe
