# Data Import & Cleaning

### Contents:

- [Data Import](#Data-Import)
- [Data Cleaning](#Data-Cleaning)
- [Feature Engineering](#Feature-Engineering)

### Import Libraries

In [1]:
#import standard libraries
import pandas as pd
import numpy as np

#import emoji
import emoji

from natsort import natsorted, index_natsorted, order_by_index

#import warnings to ignore flags when the project is complete
#import warnings
#warnings.filterwarnings('ignore')

#import pre-processing libraries for data cleaning
import string
import re
import nltk
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer


pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Data Import

**Read scrapped data for the following videos**

In [2]:
va = pd.read_csv('../../data/scrapped_data/va_ebeveadmin_914887879433670.csv')

In [3]:
va

Unnamed: 0,video_for,totalEmojiReaction,views
0,ebeveadmin/videos/914887879433670,30,905


In [4]:
df = pd.read_csv('../../data/scrapped_data/ebeveadmin_914887879433670.csv', encoding='utf-8')

In [5]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
0,Hi,Amir Abdul Majid,0:22
1,Ak20+1,E-Beve,1:00:37
2,"<span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 k7cz35w2 bsnbvmp4""><img alt=""😋"" height=""32"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/t59/2/32/1f60b.png"" width=""32""/></span>",Pauline Ang,1:01:03
3,FLL12,Irene Lee,1:01:08
4,AK+1,Pauline Ang,1:01:28
5,"I have to off line now, thanks . Let me know how much I need to pay you.",Irene Lee,1:01:31
6,"Aiyo I am slow <span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""🤣"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/tf1/2/16/1f923.png"" width=""16""/></span>",Pauline Ang,1:01:51
7,"Thanks <span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""🙏"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/t1f/2/16/1f64f.png"" width=""16""/></span>",Pauline Ang,1:02:23
8,Also can. Tiger prawn. Cost??,Pauline Ang,1:02:57
9,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03


In [6]:
#https://stackoverflow.com/questions/32072076/find-the-unique-values-in-a-column-and-then-sort-them
#check if the seller uses multiple accounts to reply
postCommentAuthor_unique = df['postCommentAuthor'].unique()
print(sorted(postCommentAuthor_unique))

['Amir Abdul Majid', 'Amy Heng', "Anep Q'Ratu", 'Aysha Khamarudin Al Takhi', 'Diana Ng', 'E-Beve', 'Emily Wong', 'Ernest Tan', 'Firdaus Nordin', 'Halim Boncet', 'Irene Lee', 'Jo Jo Koh', 'Lilian Wong', 'Lily Koh', 'Lukie Neo', 'Min Xuan', 'Munchkin Nacki', 'Nurhafyzah Adzehar', 'Pauline Ang', 'Pauline Ng', 'Rey Teo', 'Serene Tan ST', 'Snowy Sue', 'Susan Ng', 'Tay Poh Huat', 'Vincent Wan', 'Wilson Low', 'Wong Chow Ching', 'Wong Wei Yang', 'Zulkifli Bin Osman', 'اسماعيل ڤسچل', 'き リーサン', '梁文斌', '马小玲']


In [7]:
#find comments posted by the seller only
df.loc[df['postCommentAuthor'] == 'E-Beve']

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
1,Ak20+1,E-Beve,1:00:37
17,I take all,E-Beve,1:05:20
20,"Nvm I make for my <span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""🐶"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/tce/2/16/1f436.png"" width=""16""/></span>",E-Beve,1:07:48
21,"today Lazy go out buy Saba for <span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""🐶"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/tce/2/16/1f436.png"" width=""16""/></span>",E-Beve,1:08:04
27,Can potong Saba head pls,E-Beve,1:10:19
31,Yes,E-Beve,1:13:26
35,Chong Arigatou,E-Beve,1:14:44
37,"<span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""👍🏼"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/t57/2/16/1f44d_1f3fc.png"" width=""16""/></span><span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""👍🏼"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/t57/2/16/1f44d_1f3fc.png"" width=""16""/></span><span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""👍🏼"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/t57/2/16/1f44d_1f3fc.png"" width=""16""/></span><span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""👍🏼"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/t57/2/16/1f44d_1f3fc.png"" width=""16""/></span><span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""👍🏼"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/t57/2/16/1f44d_1f3fc.png"" width=""16""/></span>",E-Beve,1:15:57
41,Rat descale and clean can liao,E-Beve,1:17:15
46,Okay!,E-Beve,1:19:33


In [8]:
#check the comments to have a gauge
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' Chong Arigatou ', ' today Lazy go out buy Saba for <span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu"><img alt="🐶" height="16" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/tce/2/16/1f436.png" width="16"/></span>', '1', '17', '18', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '4q', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '88896368', '<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu"><img alt="👍🏼" height="16" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/t57/2/16/1f44d_1f3fc.png" width="16"/></span><span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu"><img alt="👍🏼"

## Data Cleaning

### Convert the emojis to text for easy cleaning

We noticed that the emojis have html parsers attached to it. Hence, we will convert the images of the emojis to text first, to remove the html parsers to the emojis, while retaining the emoji's text. We will convert it back to emoji afterwards.

In [9]:
df['postComment'] = df['postComment'].apply(emoji.demojize)

In [10]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' Chong Arigatou ', ' today Lazy go out buy Saba for <span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu"><img alt=":dog_face:" height="16" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/tce/2/16/1f436.png" width="16"/></span>', '1', '17', '18', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '4q', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '88896368', '<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu"><img alt=":beaming_face_with_smiling_eyes:" height="16" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/tee/2/16/1f601.png" width="16"/></span><span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7

### Clean the comments using Regex

From the above unique values of the comments, there are several cleaning issues that needs to be addressed.

    1. Removal of links or URLs. 
        - URLs are present when the Facebook users manually type a link in the comment. 
        - Additionally, when users are tagged, their Facebook profile URL is printed as a result.
        - Similarly, there is a unique URL linked to each emoji as well. 
        
    2. Removal of HTML special entities
        - Examples of HTML special entities are '&amp' and '&gt'. 
        
    3. Removal of other HTML special terms
        - Examples of other HTML special entities are whitespace HTML special entities like '#x200B' and '#xa0'.
        
    4. Removal of HTML Parsers to the Emojis
        - Previously, the emojis have been demojized to text already. However, the HTML parsers to the mojis remain. Hence, they are required to be removed as well.
        
    5. Removal of HTML Parcers to Whitespaces
        - When a next line is entered in the same comment, there will be a HTML parser to the this whitespace. Hence, this are to be removed as well.
        
    6. Removal of HTML Parsers to tagged names
        - When users are tagged in the comments, in addition to their Facebook profile URL, there will be HTML parsers to the tagged names as well. Hence, this are to be removed as well.
        
    7. Removsl of other HTML Parsers   

In [11]:
def clean(row):
    
    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*\/\w*', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove HTML special entities (e.g.. &amp, &gt;)
    row['postComment'] = re.sub(
        pattern=r'\&\w*;',
        repl='',
        string=row['postComment'])    
    
    # Remove emoji html parsers
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'<span\sclass=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\">',
        repl='', 
        string=row['postComment'],
        flags=re.M)
 
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern = r'<img\salt=\"(\:\w*)(\-?)(\w*\:)\"\s[a-z]{6}=\"\d\d\"\s[a-z]{14}=.{26}\s[a-z]{3}=\">',
        repl=r'\1\2\3',
        string=row['postComment'],
        flags=re.M)
    
    # Remove whitespaces
    row['postComment'] = re.sub(
        pattern=r'<\/div><div\s.*\s.*>',
        repl=' ',
        string=row['postComment'],
        flags=re.M)
    
    # Remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<a\sclass=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\"\shref=\">',
        repl='',
        string=row['postComment'],
        flags=re.M)
    

    # Remove consecutive non-ASCII characters
    # This will remove the chinese comments
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'[^\x00-\x7F]+', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)
    
    return row

In [12]:
df2 = df.apply(clean, axis=1)

In [13]:
postComment_unique2 = df2['postComment'].unique()
print(sorted(postComment_unique2))

[' ', ' :face_savoring_food:', ' ?', ' Chong Arigatou ', ' fillet 1kg.can give some got Belly', ' fillet still has', ' today Lazy go out buy Saba for :dog_face:', '1', '17', '18', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '4q', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '88896368', ':beaming_face_with_smiling_eyes:juz surfaced ', ':face_savoring_food:', ':thumbs_up_medium-light_skin_tone:', 'AGL18+1', 'AK lah ', 'AK+1', 'AK20+1', 'Aiyo I am slow :rolling_on_the_floor_laughing: ', 'Aiyoh:rolling_on_the_floor_laughing:', 'Ak20+1', 'Already  messager  you ', 'Also can. Tiger prawn. Cost??', 'Any lobsters', 'Anymore crabs??', 'BLACK POMFRET 350G-400G $4 BP+1', 'BP+2', 'BTF10', 'BTF10+1', 'Btf+1', 'Buy bcoz of u want to see u', 'Buyers only', 'CENCARU $8 CEN+1', 'CP+1 $12', 'CP+2', 'Can', 'Can ', 'Can cut', 'Can pay COD '

**Convert encoded emoji text back to emojis**

In [14]:
df2['postComment'] = df2['postComment'].apply(emoji.emojize)

In [15]:
postComment_unique2 = df2['postComment'].unique()
print(sorted(postComment_unique2))

[' ', ' ?', ' Chong Arigatou ', ' fillet 1kg.can give some got Belly', ' fillet still has', ' today Lazy go out buy Saba for 🐶', ' 😋', '1', '17', '18', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '4q', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '88896368', 'AGL18+1', 'AK lah ', 'AK+1', 'AK20+1', 'Aiyo I am slow 🤣 ', 'Aiyoh🤣', 'Ak20+1', 'Already  messager  you ', 'Also can. Tiger prawn. Cost??', 'Any lobsters', 'Anymore crabs??', 'BLACK POMFRET 350G-400G $4 BP+1', 'BP+2', 'BTF10', 'BTF10+1', 'Btf+1', 'Buy bcoz of u want to see u', 'Buyers only', 'CENCARU $8 CEN+1', 'CP+1 $12', 'CP+2', 'Can', 'Can ', 'Can cut', 'Can pay COD ', 'Can potong Saba head pls', 'Can send before 4pm today', 'Can wash for me the sotong ', 'Cencaru-1', 'Chong hao same sec sch ', 'Clean, gut n cut into 2 pcs pls, xie xie', 'Cod 11.50 ', 'Cod 42', '

### Drop Empty Posts

Blank comments are dropped to ensure that unique comments are obtained.

In [16]:
#drop empty posts
df2 = df2.loc[((df2['postComment'] != ' ')),:]

In [17]:
postComment_unique_2 = df2['postComment'].unique()
print(sorted(postComment_unique_2))

[' ?', ' Chong Arigatou ', ' fillet 1kg.can give some got Belly', ' fillet still has', ' today Lazy go out buy Saba for 🐶', ' 😋', '1', '17', '18', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '4q', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '88896368', 'AGL18+1', 'AK lah ', 'AK+1', 'AK20+1', 'Aiyo I am slow 🤣 ', 'Aiyoh🤣', 'Ak20+1', 'Already  messager  you ', 'Also can. Tiger prawn. Cost??', 'Any lobsters', 'Anymore crabs??', 'BLACK POMFRET 350G-400G $4 BP+1', 'BP+2', 'BTF10', 'BTF10+1', 'Btf+1', 'Buy bcoz of u want to see u', 'Buyers only', 'CENCARU $8 CEN+1', 'CP+1 $12', 'CP+2', 'Can', 'Can ', 'Can cut', 'Can pay COD ', 'Can potong Saba head pls', 'Can send before 4pm today', 'Can wash for me the sotong ', 'Cencaru-1', 'Chong hao same sec sch ', 'Clean, gut n cut into 2 pcs pls, xie xie', 'Cod 11.50 ', 'Cod 42', 'Cod f

### Reindexing the dataframe 
**New Column to reindex the dataframe in accordance to time**

From the data, we can tell that there is an inconsistent timestamp being used in the column 'postCommentTime'. For example, there are times like '0:57' and '1:00:14' which indicates 0 hour 0 mins 57 secs, and 1 hour 0 mins 14 secs. 

Hence, a new column 'postCommentTime_final' is created to ensure that a timestamp of HH:MM:SS is being used consistently throughout. However, we note that the number of days is being included for TimeDeltaIndex.

As a result, we will read the timestamp, by excluding the number of days.  

In [18]:
#TimedeltaIndex
#https://stackoverflow.com/questions/54877467/pandas-convert-hhmm-and-hhmmss-to-standard-hhmmss-in-python
# for example, time of '0:57' will then be 00:00:57; 0 hours 0 mins 57 secs
df2['postCommentTime_final'] = pd.to_timedelta(np.where(df2['postCommentTime'].str.count(':') == 1, '00:' + df2['postCommentTime'], df2['postCommentTime']))

In [19]:
df2.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Hi,Amir Abdul Majid,0:22,0 days 00:00:22
1,Ak20+1,E-Beve,1:00:37,0 days 01:00:37
2,😋,Pauline Ang,1:01:03,0 days 01:01:03
3,FLL12,Irene Lee,1:01:08,0 days 01:01:08
4,AK+1,Pauline Ang,1:01:28,0 days 01:01:28


In [20]:
df2['postCommentTime_final'] = df2['postCommentTime_final'].astype(str).map(lambda x: x[7:])

In [21]:
df2

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Hi,Amir Abdul Majid,0:22,00:00:22
1,Ak20+1,E-Beve,1:00:37,01:00:37
2,😋,Pauline Ang,1:01:03,01:01:03
3,FLL12,Irene Lee,1:01:08,01:01:08
4,AK+1,Pauline Ang,1:01:28,01:01:28
5,"I have to off line now, thanks . Let me know how much I need to pay you.",Irene Lee,1:01:31,01:01:31
6,Aiyo I am slow 🤣,Pauline Ang,1:01:51,01:01:51
7,Thanks 🙏,Pauline Ang,1:02:23,01:02:23
8,Also can. Tiger prawn. Cost??,Pauline Ang,1:02:57,01:02:57
9,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03


In [22]:
#reindex according to postCommentTime_final
#previous natsort in data collection didnt take into account the different timestamp format
df3 = df2.reindex(index=order_by_index(df2.index, index_natsorted(df2.postCommentTime_final)))

In [23]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Hi,Amir Abdul Majid,0:22,00:00:22
9,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03
32,Hi Babe,き リーサン,1:14,00:01:14
110,Loved n shared,Firdaus Nordin,1:37,00:01:37
128,Very big n deep,Pauline Ng,2:42,00:02:42
129,LNS,Min Xuan,3:13,00:03:13
130,Morning Tiff👋,き リーサン,3:15,00:03:15
131,Lns done toilet auntie,Munchkin Nacki,3:46,00:03:46
132,Lns,Munchkin Nacki,4:36,00:04:36
133,Share to my friends lor,Aysha Khamarudin Al Takhi,4:37,00:04:37


In [24]:
#reset the index for the dataframe
#https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-dataframe

df3 = df3.reset_index(drop=True)

In [25]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Hi,Amir Abdul Majid,0:22,00:00:22
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03
2,Hi Babe,き リーサン,1:14,00:01:14
3,Loved n shared,Firdaus Nordin,1:37,00:01:37
4,Very big n deep,Pauline Ng,2:42,00:02:42
5,LNS,Min Xuan,3:13,00:03:13
6,Morning Tiff👋,き リーサン,3:15,00:03:15
7,Lns done toilet auntie,Munchkin Nacki,3:46,00:03:46
8,Lns,Munchkin Nacki,4:36,00:04:36
9,Share to my friends lor,Aysha Khamarudin Al Takhi,4:37,00:04:37


**Obtain the length of the video**

Assuming that the last comment of the video is the total length of the video, we will input the length of the video from the comments dataframe into the video attributes dataframe.

In [26]:
#retrieve last comment to obtain the length of the video
df3['postCommentTime_final'].iloc[-1]

'01:46:25'

In [27]:
#https://stackoverflow.com/questions/6402812/how-to-convert-an-hmmss-time-string-to-seconds-in-python
def get_sec(time_str):
    """Get Seconds from time."""
    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

In [28]:
#retrieve last comment to obtain the length of the video in seconds
va['videoLength']= get_sec(df3['postCommentTime_final'].iloc[-1])

In [29]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength
0,ebeveadmin/videos/914887879433670,30,905,6385


## Feature Engineering

### Comments made by the Seller

**New Column to identify the number of comments made by the seller in the video**

From the total sum of comments made by the seller, we will input it into the video attributes dataframe.

In [30]:
(df3['postCommentAuthor']=='E-Beve').sum()

39

In [31]:
va['numSellerComments'] = (df3['postCommentAuthor']=='E-Beve').sum()

In [32]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments
0,ebeveadmin/videos/914887879433670,30,905,6385,39


**New Column to identify if the comment is made by the Seller or not**

In [33]:
#create a new column to show if the comment is made by the seller or not
df3['isSeller'] = df3['postCommentAuthor'].map(lambda x:1 if x =='E-Beve' else 0)

In [34]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller
0,Hi,Amir Abdul Majid,0:22,00:00:22,0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03,0
2,Hi Babe,き リーサン,1:14,00:01:14,0
3,Loved n shared,Firdaus Nordin,1:37,00:01:37,0
4,Very big n deep,Pauline Ng,2:42,00:02:42,0


In [35]:
df3['isSeller'].value_counts()

0    420
1     39
Name: isSeller, dtype: int64

In [36]:
#show all the seller's comments
df3.loc[df3['isSeller'] == 1]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller
263,Whatsapp 88896368,E-Beve,12:25,00:12:25,1
264,Wat the delivery fee?,E-Beve,12:25,00:12:25,1
266,Whatsapp 88896368,E-Beve,13:56,00:13:56,1
270,Do you have crab,E-Beve,16:57,00:16:57,1
276,I want to 2,E-Beve,20:33,00:20:33,1
279,Can pay COD,E-Beve,21:37,00:21:37,1
280,Don't answer those numbers with +65 in front.,E-Beve,22:56,00:22:56,1
287,CP+2,E-Beve,26:42,00:26:42,1
290,Rat+3,E-Beve,29:24,00:29:24,1
293,hi sis and admin lns done,E-Beve,31:12,00:31:12,1


### Length of comments

**New Column to identify the length of each comment**

From the comments, we take the length of each comment as the total number of words each comment has.

In [37]:
#length of each comment
#https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe
df3['postCommentLength'] = df3['postComment'].str.split().str.len()

In [38]:
df3.head(10)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength
0,Hi,Amir Abdul Majid,0:22,00:00:22,0,1
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03,0,10
2,Hi Babe,き リーサン,1:14,00:01:14,0,2
3,Loved n shared,Firdaus Nordin,1:37,00:01:37,0,3
4,Very big n deep,Pauline Ng,2:42,00:02:42,0,4
5,LNS,Min Xuan,3:13,00:03:13,0,1
6,Morning Tiff👋,き リーサン,3:15,00:03:15,0,2
7,Lns done toilet auntie,Munchkin Nacki,3:46,00:03:46,0,4
8,Lns,Munchkin Nacki,4:36,00:04:36,0,1
9,Share to my friends lor,Aysha Khamarudin Al Takhi,4:37,00:04:37,0,5


**New Column to identify the total number of comments in the video**

From the total number of comments in the video, we will input it into the video attributes dataframe.

In [39]:
#total number of comments
df3['postCommentLength'].sum()

964

In [40]:
va['numComments'] = df3['postCommentLength'].sum()

In [41]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments
0,ebeveadmin/videos/914887879433670,30,905,6385,39,964


### LNS

LNS is an acronym that stands for 'like and share'. It is a form of customer engagement as it indicates by the customers to the sellers that they have liked and shared the video on their Facebook wall. 

**New Column to identify if Customers are engaging in liking and sharing the video**

In [42]:
#if the customer has commented 'lns' or 'ls' which stands for 'like & shared' & 'like shared' respectively
def lns(comment):
    if re.search(r'(l)(n?)(s)', comment, re.IGNORECASE):
        return int(1)
    else:
        return int(0)

In [43]:
df3['lns'] = df3['postComment'].map(lambda x:lns(x))

In [44]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns
0,Hi,Amir Abdul Majid,0:22,00:00:22,0,1,0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03,0,10,0
2,Hi Babe,き リーサン,1:14,00:01:14,0,2,0
3,Loved n shared,Firdaus Nordin,1:37,00:01:37,0,3,0
4,Very big n deep,Pauline Ng,2:42,00:02:42,0,4,0


**New Column to identify if the number of Customers who explicitly inform the sellers that they are engaging in liking and sharing the video**

In [45]:
#range of customer's engagement for LNS
df3['lns'].value_counts()

0    433
1     26
Name: lns, dtype: int64

In [46]:
(df3['lns']==1).sum()

26

In [47]:
va['lnsQuantity'] = (df3['lns']==1).sum()

In [48]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity
0,ebeveadmin/videos/914887879433670,30,905,6385,39,964,26


## Sales Quantity

**New Columns to identify the quantity of sales made**

From the comments, using regex, we first see an overview of the comments that are related to the sale of the products. 

In [49]:
#overview of the sales
df3[df3['postComment'].str.contains('(\w*)(\s)*(\+)(\s)*(\d*)', regex=True)]

  df3[df3['postComment'].str.contains('(\w*)(\s)*(\+)(\s)*(\d*)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns
268,AGL18+1,Munchkin Nacki,15:52,00:15:52,0,1,0
272,KUKUP GOLDEN POMFRET 450-500G $6 GOL+1,Lily Koh,17:25,00:17:25,0,6,0
273,Gol+1,Munchkin Nacki,18:23,00:18:23,0,1,0
275,GOL +2,Susan Ng,20:25,00:20:25,0,2,0
277,"So long u see + in front for singapore number incoming call can ignore, all scam calls",Munchkin Nacki,20:45,00:20:45,0,17,1
278,KUKUP RED SNAPPER 550G $10 RS+1,Pauline Ng,20:55,00:20:55,0,6,0
280,Don't answer those numbers with +65 in front.,E-Beve,22:56,00:22:56,1,8,0
282,WILD CHINESE POMFRET 450-500G $12 CP+1,Irene Lee,23:17,00:23:17,0,6,0
287,CP+2,E-Beve,26:42,00:26:42,1,1,0
289,RAT GROUPER 250G $6 RAT+1,Snowy Sue,28:38,00:28:38,0,5,0


In [50]:
def sale(comment):
    if re.findall(r'(\+)(\s)?(\d)', comment):
        results = re.findall(r'\+\s?\d', comment)
        total = 0
        for r in results:
            total += int(r[-1])
        return total
    else:
        return int(0)

In [51]:
df3['sales'] = df3['postComment'].apply(lambda x:sale(x))

In [52]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales
0,Hi,Amir Abdul Majid,0:22,00:00:22,0,1,0,0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03,0,10,0,0
2,Hi Babe,き リーサン,1:14,00:01:14,0,2,0,0
3,Loved n shared,Firdaus Nordin,1:37,00:01:37,0,3,0,0
4,Very big n deep,Pauline Ng,2:42,00:02:42,0,4,0,0
5,LNS,Min Xuan,3:13,00:03:13,0,1,1,0
6,Morning Tiff👋,き リーサン,3:15,00:03:15,0,2,0,0
7,Lns done toilet auntie,Munchkin Nacki,3:46,00:03:46,0,4,1,0
8,Lns,Munchkin Nacki,4:36,00:04:36,0,1,1,0
9,Share to my friends lor,Aysha Khamarudin Al Takhi,4:37,00:04:37,0,5,0,0


The cell at row 367 & 428 are ordered using the full product name and/or without the '+' to the product code. Hence, the sales quantity is manually filled in.

The cell at row 280 is not a sales. Hence, it will be manually edited as well

In [53]:
df3.loc[280, 'sales'] = int(0)
df3.loc[367, 'sales'] = int(1)
df3.loc[428, 'sales'] = int(1)

In [54]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales
0,Hi,Amir Abdul Majid,0:22,00:00:22,0,1,0,0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03,0,10,0,0
2,Hi Babe,き リーサン,1:14,00:01:14,0,2,0,0
3,Loved n shared,Firdaus Nordin,1:37,00:01:37,0,3,0,0
4,Very big n deep,Pauline Ng,2:42,00:02:42,0,4,0,0
5,LNS,Min Xuan,3:13,00:03:13,0,1,1,0
6,Morning Tiff👋,き リーサン,3:15,00:03:15,0,2,0,0
7,Lns done toilet auntie,Munchkin Nacki,3:46,00:03:46,0,4,1,0
8,Lns,Munchkin Nacki,4:36,00:04:36,0,1,1,0
9,Share to my friends lor,Aysha Khamarudin Al Takhi,4:37,00:04:37,0,5,0,0


In [55]:
#if the comments consist the sale information for the product, we will indicate it as '0', otherwise '1'
def no_sale_info(comment):
    if re.search(r'(\$)(\s)?(.*)', comment, re.IGNORECASE):
        return int(0)
    else:
        return int(1)

In [56]:
df3['no_sale_info'] = df3['postComment'].map(lambda x:no_sale_info(x))

In [57]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info
0,Hi,Amir Abdul Majid,0:22,00:00:22,0,1,0,0,1
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03,0,10,0,0,1
2,Hi Babe,き リーサン,1:14,00:01:14,0,2,0,0,1
3,Loved n shared,Firdaus Nordin,1:37,00:01:37,0,3,0,0,1
4,Very big n deep,Pauline Ng,2:42,00:02:42,0,4,0,0,1
5,LNS,Min Xuan,3:13,00:03:13,0,1,1,0,1
6,Morning Tiff👋,き リーサン,3:15,00:03:15,0,2,0,0,1
7,Lns done toilet auntie,Munchkin Nacki,3:46,00:03:46,0,4,1,0,1
8,Lns,Munchkin Nacki,4:36,00:04:36,0,1,1,0,1
9,Share to my friends lor,Aysha Khamarudin Al Takhi,4:37,00:04:37,0,5,0,0,1


In [58]:
#sales made by the customers; exclude the seller's comments on the codes for the sales
#https://stackoverflow.com/questions/19914937/applying-function-with-multiple-arguments-to-create-a-new-pandas-column
df3['salesQuantity'] = np.multiply(df3['no_sale_info'], df3['sales'])

In [59]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity
0,Hi,Amir Abdul Majid,0:22,00:00:22,0,1,0,0,1,0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03,0,10,0,0,1,0
2,Hi Babe,き リーサン,1:14,00:01:14,0,2,0,0,1,0
3,Loved n shared,Firdaus Nordin,1:37,00:01:37,0,3,0,0,1,0
4,Very big n deep,Pauline Ng,2:42,00:02:42,0,4,0,0,1,0
5,LNS,Min Xuan,3:13,00:03:13,0,1,1,0,1,0
6,Morning Tiff👋,き リーサン,3:15,00:03:15,0,2,0,0,1,0
7,Lns done toilet auntie,Munchkin Nacki,3:46,00:03:46,0,4,1,0,1,0
8,Lns,Munchkin Nacki,4:36,00:04:36,0,1,1,0,1,0
9,Share to my friends lor,Aysha Khamarudin Al Takhi,4:37,00:04:37,0,5,0,0,1,0


In [60]:
#range of sales quantity
df3['salesQuantity'].value_counts()

0    413
1     31
2      9
3      6
Name: salesQuantity, dtype: int64

In [61]:
#total number of orders made
df3['salesQuantity'].sum()

67

In [62]:
va['salesQuantity'] = df3['salesQuantity'].sum()

In [63]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity
0,ebeveadmin/videos/914887879433670,30,905,6385,39,964,26,67


## Products 

The seller will comment and post the unique product codes for each product. Thereafter, customers who are keen on purchasing the products will explicitly comment out the specific & unique product codes, in addition to the quanityt of each product that they wish to purchase.

**New Columns to identify the products purchased by the Customers**

Regex is used to identify the products being offered and the products being purchased as well.

In [64]:
#function to identify the code of the product bought
def sale2(comment):
    if re.search(r'(\w*)(\s?)\+(\s?)(\d)', comment):
        return str(re.search(r'(\w*)(\s?)\+(\s?)(\d)', comment).group(0)[:-2])
    else:
        return int(0)

In [65]:
#identifies all comments that have the codes of the products purchased by the Customers
#this column will be dropped afterwards.
df3['productBought'] = df3['postComment'].apply(lambda x:sale2(x))

In [66]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought
0,Hi,Amir Abdul Majid,0:22,00:00:22,0,1,0,0,1,0,0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03,0,10,0,0,1,0,0
2,Hi Babe,き リーサン,1:14,00:01:14,0,2,0,0,1,0,0
3,Loved n shared,Firdaus Nordin,1:37,00:01:37,0,3,0,0,1,0,0
4,Very big n deep,Pauline Ng,2:42,00:02:42,0,4,0,0,1,0,0
5,LNS,Min Xuan,3:13,00:03:13,0,1,1,0,1,0,0
6,Morning Tiff👋,き リーサン,3:15,00:03:15,0,2,0,0,1,0,0
7,Lns done toilet auntie,Munchkin Nacki,3:46,00:03:46,0,4,1,0,1,0,0
8,Lns,Munchkin Nacki,4:36,00:04:36,0,1,1,0,1,0,0
9,Share to my friends lor,Aysha Khamarudin Al Takhi,4:37,00:04:37,0,5,0,0,1,0,0


In [67]:
df3['productBought'].unique()

array([0, 'AGL18', 'GOL', 'Gol', 'GOL ', 'RS', 'with ', 'CP', 'RAT',
       'Rat', 'TUNA', 'SB', 'BP', 'LMC', 'C', 'DURI', 'ST', 'SP', 'TP',
       'Tp', 'MUS', 'Mus', 'AK', 'AK20', 'Ak20', 'ST ', 'LEA', 'SAH',
       'CEN', 'SALH', 'Salh', 'SAL', 'Sal', 'RG', 'Rg', 'prawns', 'RGS8',
       'RGs8', 'Mussel', '17', 'BTF10', 'Btf', 'PRAWN '], dtype=object)

The cell at row 280 for the column 'product' is erroneous. Hence, it will be manually edited.

The cell at rows 307 & 437 are not exactly ordered in accordance to the product code. Hence, they would be manually edited.

The cell at row 353 has 3 products in 1 comment and is not advised in accordance to most product codes. Hence, it will be manually edited.

The cells at row 367 & 397 have been ordered using the full name of the product instead of the product code. Hence, they are manually filled in.

The cell at row 428 is ordered without the '+' to the product code. Hence, they are manually filled in.

In [68]:
#https://stackoverflow.com/questions/13842088/set-value-for-particular-cell-in-pandas-dataframe-using-index
df3.loc[280, 'productBought'] = int(0)
df3.loc[307, 'productBought'] = 'LMC'
df3.loc[353, 'productBought'] = 'RAT12, RAT15, RAT18'
df3.loc[367, 'productBought'] = 'CEN'
df3.loc[397, 'productBought'] = 'PRAWN'
df3.loc[428, 'productBought'] = 'WG20'
df3.loc[437, 'productBought'] = 'WG17'
df3.loc[428, 'productBought'] = 'WG20'

In [69]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought
0,Hi,Amir Abdul Majid,0:22,00:00:22,0,1,0,0,1,0,0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03,0,10,0,0,1,0,0
2,Hi Babe,き リーサン,1:14,00:01:14,0,2,0,0,1,0,0
3,Loved n shared,Firdaus Nordin,1:37,00:01:37,0,3,0,0,1,0,0
4,Very big n deep,Pauline Ng,2:42,00:02:42,0,4,0,0,1,0,0
5,LNS,Min Xuan,3:13,00:03:13,0,1,1,0,1,0,0
6,Morning Tiff👋,き リーサン,3:15,00:03:15,0,2,0,0,1,0,0
7,Lns done toilet auntie,Munchkin Nacki,3:46,00:03:46,0,4,1,0,1,0,0
8,Lns,Munchkin Nacki,4:36,00:04:36,0,1,1,0,1,0,0
9,Share to my friends lor,Aysha Khamarudin Al Takhi,4:37,00:04:37,0,5,0,0,1,0,0


In [70]:
df3['productBought'].unique()

array([0, 'AGL18', 'GOL', 'Gol', 'GOL ', 'RS', 'CP', 'RAT', 'Rat', 'TUNA',
       'SB', 'BP', 'LMC', 'DURI', 'ST', 'SP', 'TP', 'Tp', 'MUS', 'Mus',
       'AK', 'AK20', 'Ak20', 'ST ', 'RAT12, RAT15, RAT18', 'LEA', 'SAH',
       'CEN', 'SALH', 'Salh', 'SAL', 'Sal', 'RG', 'Rg', 'PRAWN', 'RGS8',
       'RGs8', 'Mussel', 'WG20', 'WG17', 'BTF10', 'Btf', 'PRAWN '],
      dtype=object)

Change the produce codes to be uppercase for consistency

In [71]:
#change the produce codes to be uppercase for consistency, and since python is case sensitive.
#https://stackoverflow.com/questions/39512002/convert-whole-dataframe-from-lower-case-to-upper-case-with-pandas
df3['productBought'] = df3['productBought'].astype(str).str.upper()

In [72]:
df3['productBought'].unique()

array(['0', 'AGL18', 'GOL', 'GOL ', 'RS', 'CP', 'RAT', 'TUNA', 'SB', 'BP',
       'LMC', 'DURI', 'ST', 'SP', 'TP', 'MUS', 'AK', 'AK20', 'ST ',
       'RAT12, RAT15, RAT18', 'LEA', 'SAH', 'CEN', 'SALH', 'SAL', 'RG',
       'PRAWN', 'RGS8', 'MUSSEL', 'WG20', 'WG17', 'BTF10', 'BTF',
       'PRAWN '], dtype=object)

Remove whitespaces at the end of the string

In [73]:
df3['productBought'] = df3['productBought'].str.rstrip()

In [74]:
df3['productBought'].unique()

array(['0', 'AGL18', 'GOL', 'RS', 'CP', 'RAT', 'TUNA', 'SB', 'BP', 'LMC',
       'DURI', 'ST', 'SP', 'TP', 'MUS', 'AK', 'AK20',
       'RAT12, RAT15, RAT18', 'LEA', 'SAH', 'CEN', 'SALH', 'SAL', 'RG',
       'PRAWN', 'RGS8', 'MUSSEL', 'WG20', 'WG17', 'BTF10', 'BTF'],
      dtype=object)

In [75]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought
0,Hi,Amir Abdul Majid,0:22,00:00:22,0,1,0,0,1,0,0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03,0,10,0,0,1,0,0
2,Hi Babe,き リーサン,1:14,00:01:14,0,2,0,0,1,0,0
3,Loved n shared,Firdaus Nordin,1:37,00:01:37,0,3,0,0,1,0,0
4,Very big n deep,Pauline Ng,2:42,00:02:42,0,4,0,0,1,0,0
5,LNS,Min Xuan,3:13,00:03:13,0,1,1,0,1,0,0
6,Morning Tiff👋,き リーサン,3:15,00:03:15,0,2,0,0,1,0,0
7,Lns done toilet auntie,Munchkin Nacki,3:46,00:03:46,0,4,1,0,1,0,0
8,Lns,Munchkin Nacki,4:36,00:04:36,0,1,1,0,1,0,0
9,Share to my friends lor,Aysha Khamarudin Al Takhi,4:37,00:04:37,0,5,0,0,1,0,0


### Price of Products

**New Column to identify the price of the products**

This new column is created to identify the regex of the unique product codes and their corresponding prices, as adviced by the seller.

In [76]:
#products offered by the seller
df3[df3['postComment'].str.contains('(\$)(\s)?(.*)', regex=True)]

  df3[df3['postComment'].str.contains('(\$)(\s)?(.*)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought
272,KUKUP GOLDEN POMFRET 450-500G $6 GOL+1,Lily Koh,17:25,00:17:25,0,6,0,1,0,0,GOL
278,KUKUP RED SNAPPER 550G $10 RS+1,Pauline Ng,20:55,00:20:55,0,6,0,1,0,0,RS
282,WILD CHINESE POMFRET 450-500G $12 CP+1,Irene Lee,23:17,00:23:17,0,6,0,1,0,0,CP
289,RAT GROUPER 250G $6 RAT+1,Snowy Sue,28:38,00:28:38,0,5,0,1,0,0,RAT
295,TONGKOL/TUNA 1.1KG $10 TUNA+1,Irene Lee,32:27,00:32:27,0,4,0,1,0,0,TUNA
298,SEABASS 650-700G $8 SB+1,Munchkin Nacki,37:12,00:37:12,0,4,0,1,0,0,SB
299,BLACK POMFRET 350G-400G $4 BP+1,Munchkin Nacki,37:25,00:37:25,0,5,0,1,0,0,BP
303,LIVE MUDCRAB 650-750G $24 LMC+1,Munchkin Nacki,40:10,00:40:10,0,5,0,1,0,0,LMC
309,IKAN DURI/CAT FISH $9/KG DURI+1,Irene Lee,42:36,00:42:36,0,5,0,1,0,0,DURI
312,Sotong $18 ST+1,梁文斌,45:13,00:45:13,0,3,0,1,0,0,ST


In [77]:
def price(comment):
    if re.search(r'(\$)(\s)?(.*)', comment):
        return str(re.search(r'(\$)(\s)?(.*)', comment).group(0))
    else:
        return int(0)

In [78]:
df3['productPrice'] = df3['postComment'].apply(lambda x:price(x))

In [79]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought,productPrice
0,Hi,Amir Abdul Majid,0:22,00:00:22,0,1,0,0,1,0,0,0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,1:03,00:01:03,0,10,0,0,1,0,0,0
2,Hi Babe,き リーサン,1:14,00:01:14,0,2,0,0,1,0,0,0
3,Loved n shared,Firdaus Nordin,1:37,00:01:37,0,3,0,0,1,0,0,0
4,Very big n deep,Pauline Ng,2:42,00:02:42,0,4,0,0,1,0,0,0
5,LNS,Min Xuan,3:13,00:03:13,0,1,1,0,1,0,0,0
6,Morning Tiff👋,き リーサン,3:15,00:03:15,0,2,0,0,1,0,0,0
7,Lns done toilet auntie,Munchkin Nacki,3:46,00:03:46,0,4,1,0,1,0,0,0
8,Lns,Munchkin Nacki,4:36,00:04:36,0,1,1,0,1,0,0,0
9,Share to my friends lor,Aysha Khamarudin Al Takhi,4:37,00:04:37,0,5,0,0,1,0,0,0


In [80]:
df3['productPrice'].unique()

array([0, '$6 GOL+1', '$10 RS+1', '$12 CP+1', '$6 RAT+1', '$10 TUNA+1',
       '$8 SB+1', '$4 BP+1', '$24 LMC+1', '$9/KG  DURI+1', '$18 ST+1',
       '$14 SP+1', '$18 TP+1', '$4 MUS+1', '$20 AK+1',
       '$12, $15, $18 RAT12, RAT15, RAT18', '$17 LEA+1', '$4 SAH+1',
       '$42', '$8 CEN+1', '$9 SAL+1', '$6 RG+1', '$17, $20 WG17, WG20',
       '$17+1', '$12', '$14'], dtype=object)

The cell at rows 363, 426, 437, 442 & 446 for the column 'productPrice' are not exactly ordered in accordance to the most product information. Hence, they would be manually edited.

In [81]:
df3.loc[363, 'productPrice']= '$42 CODFISH'
df3.loc[426, 'productPrice']= '$20 WG20'
df3.loc[437, 'productPrice']= '$17 WG17'
df3.loc[442, 'productPrice']= '$12 CP+1'
df3.loc[446, 'productPrice']= '$14 SP+1'

In [82]:
df3['productPrice'].unique()

array([0, '$6 GOL+1', '$10 RS+1', '$12 CP+1', '$6 RAT+1', '$10 TUNA+1',
       '$8 SB+1', '$4 BP+1', '$24 LMC+1', '$9/KG  DURI+1', '$18 ST+1',
       '$14 SP+1', '$18 TP+1', '$4 MUS+1', '$20 AK+1',
       '$12, $15, $18 RAT12, RAT15, RAT18', '$17 LEA+1', '$4 SAH+1',
       '$42 CODFISH', '$8 CEN+1', '$9 SAL+1', '$6 RG+1', '$20 WG20',
       '$17 WG17'], dtype=object)

Excluding the comments where there was no product codes as adviced by the seller, we find the number of unique products offered by the seller.

In [83]:
#number of unique products offered by the seller
int(df3['productPrice'].nunique()) - int(1) + int(2) #since row 353 has 3 product codes instead of 1

25

In [84]:
#total number of products offered
va['numProducts'] = int(df3['productPrice'].nunique()) - int(1) + int(2)

In [85]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts
0,ebeveadmin/videos/914887879433670,30,905,6385,39,964,26,67,25


**Drop irrelevant columns**

The following column was dropped for the following reasons:

1. 'postCommentTime'
- Since a new column 'postCommentTime_final' was created to ensure that a consistent timestamp of HH:MM:SS is used consistently throughout the dataframe, and the dataframe has been thereafter reindex and sorted in accordance to time in ascending order, we dropped the original inconsistent time column 'postCommentTime' as it had varying timestamp formats of HH:MM:SS, MM:SS and M:SS.

2. 'no_sale_info'
- This column 'notSeller' was solely created to calculate the quantity of sales made, and the products purchased by the customers. Hence, we are able to delete it after the quantity of sales have been calculated and the identification of the products purchased by the customers have been identified.

3. 'sales'
- This column identifies the quantity of products to the unique & specific product codes. However, it includes the comments which advises for the product sale information as well. Hence, this column was solely created to be multiplied against the column 'no_sale_info' to calculate the true quantity of sales made by customers only. Hence, we are able to delete it after the quantity of sales have been calculated.

In [86]:
#drop unwanted columns
df3.drop(['postCommentTime', 'no_sale_info', 'sales'], axis=1, inplace=True)

In [87]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought,productPrice
0,Hi,Amir Abdul Majid,00:00:22,0,1,0,0,0,0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,00:01:03,0,10,0,0,0,0
2,Hi Babe,き リーサン,00:01:14,0,2,0,0,0,0
3,Loved n shared,Firdaus Nordin,00:01:37,0,3,0,0,0,0
4,Very big n deep,Pauline Ng,00:02:42,0,4,0,0,0,0


### Revenue from the sale of the products

**Dummify the products bought to find the revenue**

In [88]:
#getdummies the products bought
df3 = pd.get_dummies(df3, columns = ['productBought'], drop_first = True)

In [89]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_SAH,productBought_SAL,productBought_SALH,productBought_SB,productBought_SP,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20
0,Hi,Amir Abdul Majid,00:00:22,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,00:01:03,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Hi Babe,き リーサン,00:01:14,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Loved n shared,Firdaus Nordin,00:01:37,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Very big n deep,Pauline Ng,00:02:42,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
def clean_columns(col):
    return col.replace(', ', '_')

In [91]:
df3.columns = [clean_columns(col) for col in df3.columns]
df3.head(1)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_SAH,productBought_SAL,productBought_SALH,productBought_SB,productBought_SP,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20
0,Hi,Amir Abdul Majid,00:00:22,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
# iterating the columns
for col in df3.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_AGL18
productBought_AK
productBought_AK20
productBought_BP
productBought_BTF
productBought_BTF10
productBought_CEN
productBought_CP
productBought_DURI
productBought_GOL
productBought_LEA
productBought_LMC
productBought_MUS
productBought_MUSSEL
productBought_PRAWN
productBought_RAT
productBought_RAT12_RAT15_RAT18
productBought_RG
productBought_RGS8
productBought_RS
productBought_SAH
productBought_SAL
productBought_SALH
productBought_SB
productBought_SP
productBought_ST
productBought_TP
productBought_TUNA
productBought_WG17
productBought_WG20


After the column 'productBought' has been dummified, the cells will return a '1' if that particular item is bought, and a '0' if it is not. Hence, we will replace all the '1' with the price of the product.

Then, we will create a new revenue column which is a multiplication of the column 'salesQuantity' against the price of the product (i.e. the dummified 'productBought' columnns).

Product AGL18

In [93]:
df3[df3['postComment'].str.contains('AGL18', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_SAH,productBought_SAL,productBought_SALH,productBought_SB,productBought_SP,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20
268,AGL18+1,Munchkin Nacki,00:15:52,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


We noticed that there is no sale information with regards to the product code 'AGL18'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'AGL18'.

Product AK

In [94]:
df3[df3['postComment'].str.contains('AK', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_SAH,productBought_SAL,productBought_SALH,productBought_SB,productBought_SP,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20
334,WILD ANGKAH $20 AK+1,Lily Koh,00:58:44,0,4,0,0,$20 AK+1,0,1,...,0,0,0,0,0,0,0,0,0,0
335,AK20+1,Min Xuan,00:58:54,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
339,AK+1,Pauline Ang,01:01:28,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
345,AK lah,Pauline Ang,01:03:43,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
df3['productBought_AK'] = df3['productBought_AK'].map(lambda x:float(20.00) if x == int(1) else 0)

In [96]:
df3['revenue_AK'] = np.multiply(df3['productBought_AK'], df3['salesQuantity'])

In [97]:
revenue_AK = "The total revenue from the sale of the product {} is ${}". format ("AK", format(df3['revenue_AK'].sum(), '.2f'))
print(revenue_AK)


The total revenue from the sale of the product AK is $20.00


Product AK20

In [98]:
df3[df3['postComment'].str.contains('AK20', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_SAL,productBought_SALH,productBought_SB,productBought_SP,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20,revenue_AK
335,AK20+1,Min Xuan,00:58:54,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0.0


We noticed that there is no sale information with regards to the product code 'AK20'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'AK20'.

Product BP

In [99]:
df3[df3['postComment'].str.contains('BP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_SAL,productBought_SALH,productBought_SB,productBought_SP,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20,revenue_AK
299,BLACK POMFRET 350G-400G $4 BP+1,Munchkin Nacki,00:37:25,0,5,0,0,$4 BP+1,0,0.0,...,0,0,0,0,0,0,0,0,0,0.0
302,BP+2,E-Beve,00:39:31,1,1,0,2,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0.0


In [100]:
df3['productBought_BP'] = df3['productBought_BP'].map(lambda x:float(4.00) if x == int(1) else 0)

In [101]:
df3['revenue_BP'] = np.multiply(df3['productBought_BP'], df3['salesQuantity'])

In [102]:
revenue_BP = "The total revenue from the sale of the product {} is ${}". format ("BP", format(df3['revenue_BP'].sum(), '.2f'))
print(revenue_BP)


The total revenue from the sale of the product BP is $8.00


Product BTF

In [103]:
df3[df3['postComment'].str.contains('BTF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_SALH,productBought_SB,productBought_SP,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20,revenue_AK,revenue_BP
292,BTF10,Irene Lee,00:31:06,0,1,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
443,BTF10+1,E-Beve,01:38:04,1,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0


We noticed that there is no sale information with regards to the product code 'BTF'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'BTF'.

Product BTF10

In [104]:
df3[df3['postComment'].str.contains('BTF10', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_SALH,productBought_SB,productBought_SP,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20,revenue_AK,revenue_BP
292,BTF10,Irene Lee,00:31:06,0,1,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
443,BTF10+1,E-Beve,01:38:04,1,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0


We noticed that there is no sale information with regards to the product code 'BTF10'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'BTF10'.

Product CEN

In [105]:
df3[df3['postComment'].str.contains('CEN', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_SALH,productBought_SB,productBought_SP,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20,revenue_AK,revenue_BP
366,CENCARU $8 CEN+1,Munchkin Nacki,01:14:06,0,3,0,0,$8 CEN+1,0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [106]:
df3['productBought_CEN'] = df3['productBought_CEN'].map(lambda x:float(8.00) if x == int(1) else 0)

In [107]:
df3['revenue_CEN'] = np.multiply(df3['productBought_CEN'], df3['salesQuantity'])

In [108]:
revenue_CEN = "The total revenue from the sale of the product {} is ${}". format ("CEN", format(df3['revenue_CEN'].sum(), '.2f'))
print(revenue_CEN)


The total revenue from the sale of the product CEN is $8.00


Product CP

In [109]:
df3[df3['postComment'].str.contains('CP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_SB,productBought_SP,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20,revenue_AK,revenue_BP,revenue_CEN
282,WILD CHINESE POMFRET 450-500G $12 CP+1,Irene Lee,00:23:17,0,6,0,0,$12 CP+1,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
287,CP+2,E-Beve,00:26:42,1,1,0,2,0,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
442,CP+1 $12,Emily Wong,01:36:55,0,2,0,0,$12 CP+1,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [110]:
df3['productBought_CP'] = df3['productBought_CP'].map(lambda x:float(12.00) if x == int(1) else 0)

In [111]:
df3['revenue_CP'] = np.multiply(df3['productBought_CP'], df3['salesQuantity'])

In [112]:
revenue_CP = "The total revenue from the sale of the product {} is ${}". format ("CP", format(df3['revenue_CP'].sum(), '.2f'))
print(revenue_CP)


The total revenue from the sale of the product CP is $24.00


Product DURI

In [113]:
df3[df3['postComment'].str.contains('DURI', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_SP,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20,revenue_AK,revenue_BP,revenue_CEN,revenue_CP
309,IKAN DURI/CAT FISH $9/KG DURI+1,Irene Lee,00:42:36,0,5,0,0,$9/KG DURI+1,0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [114]:
df3['productBought_DURI'] = df3['productBought_DURI'].map(lambda x:float(9.00) if x == int(1) else 0)

In [115]:
df3['revenue_DURI'] = np.multiply(df3['productBought_DURI'], df3['salesQuantity'])

In [116]:
revenue_DURI = "The total revenue from the sale of the product {} is ${}". format ("DURI", format(df3['revenue_DURI'].sum(), '.2f'))
print(revenue_DURI)


The total revenue from the sale of the product DURI is $0.00


Product GOL

In [117]:
df3[df3['postComment'].str.contains('GOL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_ST,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20,revenue_AK,revenue_BP,revenue_CEN,revenue_CP,revenue_DURI
272,KUKUP GOLDEN POMFRET 450-500G $6 GOL+1,Lily Koh,00:17:25,0,6,0,0,$6 GOL+1,0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
275,GOL +2,Susan Ng,00:20:25,0,2,0,2,0,0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [118]:
df3['productBought_GOL'] = df3['productBought_GOL'].map(lambda x:float(6.00) if x == int(1) else 0)

In [119]:
df3['revenue_GOL'] = np.multiply(df3['productBought_GOL'], df3['salesQuantity'])

In [120]:
revenue_GOL = "The total revenue from the sale of the product {} is ${}". format ("GOL", format(df3['revenue_GOL'].sum(), '.2f'))
print(revenue_GOL)


The total revenue from the sale of the product GOL is $30.00


Product LEA

In [121]:
df3[df3['postComment'].str.contains('LEA', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_TP,productBought_TUNA,productBought_WG17,productBought_WG20,revenue_AK,revenue_BP,revenue_CEN,revenue_CP,revenue_DURI,revenue_GOL
356,LEATHER JACKET 1.5KG $17 LEA+1,Lily Koh,01:08:10,0,5,0,0,$17 LEA+1,0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
359,LEA+1,Lily Koh,01:09:03,0,1,0,1,0,0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
386,LEA+1,梁文斌,01:20:19,0,1,0,1,0,0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
404,LEA cut steak pls,Emily Wong,01:24:54,0,4,1,0,0,0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
df3['productBought_LEA'] = df3['productBought_LEA'].map(lambda x:float(17.00) if x == int(1) else 0)

In [123]:
df3['revenue_LEA'] = np.multiply(df3['productBought_LEA'], df3['salesQuantity'])

In [124]:
revenue_LEA = "The total revenue from the sale of the product {} is ${}". format ("LEA", format(df3['revenue_LEA'].sum(), '.2f'))
print(revenue_LEA)


The total revenue from the sale of the product LEA is $34.00


Product LMC

In [125]:
df3[df3['postComment'].str.contains('LMC', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_TUNA,productBought_WG17,productBought_WG20,revenue_AK,revenue_BP,revenue_CEN,revenue_CP,revenue_DURI,revenue_GOL,revenue_LEA
303,LIVE MUDCRAB 650-750G $24 LMC+1,Munchkin Nacki,00:40:10,0,5,0,0,$24 LMC+1,0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
305,LMC+3,Munchkin Nacki,00:40:32,0,1,0,3,0,0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [126]:
df3['productBought_LMC'] = df3['productBought_LMC'].map(lambda x:float(24.00) if x == int(1) else 0)

In [127]:
df3['revenue_LMC'] = np.multiply(df3['productBought_LMC'], df3['salesQuantity'])

In [128]:
revenue_LMC = "The total revenue from the sale of the product {} is ${}". format ("LMC", format(df3['revenue_LMC'].sum(), '.2f'))
print(revenue_LMC)


The total revenue from the sale of the product LMC is $120.00


Product MUS

In [129]:
df3[df3['postComment'].str.contains('MUS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_WG17,productBought_WG20,revenue_AK,revenue_BP,revenue_CEN,revenue_CP,revenue_DURI,revenue_GOL,revenue_LEA,revenue_LMC
326,MUSSELS $4 MUS+1,Wong Chow Ching,00:54:36,0,3,1,0,$4 MUS+1,0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328,MUS+2,Wong Chow Ching,00:54:55,0,1,0,2,0,0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
df3['productBought_MUS'] = df3['productBought_MUS'].map(lambda x:float(4.00) if x == int(1) else 0)

In [131]:
df3['revenue_MUS'] = np.multiply(df3['productBought_MUS'], df3['salesQuantity'])

In [132]:
revenue_MUS = "The total revenue from the sale of the product {} is ${}". format ("MUS", format(df3['revenue_MUS'].sum(), '.2f'))
print(revenue_MUS)


The total revenue from the sale of the product MUS is $12.00


Product MUSSELS

In [133]:
df3[df3['postComment'].str.contains('MUSSEL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,productBought_WG20,revenue_AK,revenue_BP,revenue_CEN,revenue_CP,revenue_DURI,revenue_GOL,revenue_LEA,revenue_LMC,revenue_MUS
326,MUSSELS $4 MUS+1,Wong Chow Ching,00:54:36,0,3,1,0,$4 MUS+1,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [134]:
df3['productBought_MUSSEL'] = df3['productBought_MUSSEL'].map(lambda x:float(4.00) if x == int(1) else 0)

In [135]:
df3['revenue_MUSSEL'] = np.multiply(df3['productBought_MUSSEL'], df3['salesQuantity'])

In [136]:
revenue_MUSSEL = "The total revenue from the sale of the product {} is ${}". format ("MUSSEL", format(df3['revenue_MUSSEL'].sum(), '.2f'))
print(revenue_MUSSEL)


The total revenue from the sale of the product MUSSEL is $8.00


Product PRAWN

In [137]:
df3[df3['postComment'].str.contains('PRAWN', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_AK,revenue_BP,revenue_CEN,revenue_CP,revenue_DURI,revenue_GOL,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL
317,SMALL PRAWNS $14 SP+1,Lily Koh,00:48:59,0,4,0,0,$14 SP+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446,SMALL PRAWN +1 $14,き リーサン,01:40:46,0,4,0,0,$14 SP+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [138]:
df3[df3['postComment'].str.contains('prawn', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_AK,revenue_BP,revenue_CEN,revenue_CP,revenue_DURI,revenue_GOL,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL
321,Tiger prawns large $18 TP+1,Lily Koh,00:52:19,0,5,0,0,$18 TP+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
343,Also can. Tiger prawn. Cost??,Pauline Ang,01:02:57,0,5,1,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
397,Tiger prawns+1,Amy Heng,01:21:55,0,2,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
408,Got grey prawn?,Munchkin Nacki,01:26:31,0,3,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
447,Tiger prawn $18 TP+1,梁文斌,01:41:27,0,4,0,0,$18 TP+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We noticed that there is no sale information with regards to the product code 'PRAWN'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'PRAWN'.

Product RAT

In [139]:
df3[df3['postComment'].str.contains('RAT', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_AK,revenue_BP,revenue_CEN,revenue_CP,revenue_DURI,revenue_GOL,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL
289,RAT GROUPER 250G $6 RAT+1,Snowy Sue,00:28:38,0,5,0,0,$6 RAT+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353,"RAT GROUPER $12, $15, $18 RAT12, RAT15, RAT18",Lily Koh,01:06:20,0,8,0,0,"$12, $15, $18 RAT12, RAT15, RAT18",0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
df3['productBought_RAT'] = df3['productBought_RAT'].map(lambda x:float(6.00) if x == int(1) else 0)

In [141]:
df3['revenue_RAT'] = np.multiply(df3['productBought_RAT'], df3['salesQuantity'])

In [142]:
revenue_RAT = "The total revenue from the sale of the product {} is ${}". format ("RAT", format(df3['revenue_RAT'].sum(), '.2f'))
print(revenue_RAT)


The total revenue from the sale of the product RAT is $18.00


Product RAT12_RAT15_RAT18

In [143]:
df3[df3['postComment'].str.contains('RAT12', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_BP,revenue_CEN,revenue_CP,revenue_DURI,revenue_GOL,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL,revenue_RAT
353,"RAT GROUPER $12, $15, $18 RAT12, RAT15, RAT18",Lily Koh,01:06:20,0,8,0,0,"$12, $15, $18 RAT12, RAT15, RAT18",0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
df3['productBought_RAT12_RAT15_RAT18'] = df3['productBought_RAT12_RAT15_RAT18'].map(lambda x:float(18.00) if x == int(1) else 0)

In [145]:
df3['revenue_RAT12_RAT15_RAT18'] = np.multiply(df3['productBought_RAT12_RAT15_RAT18'], df3['salesQuantity'])

In [146]:
revenue_RAT12_RAT15_RAT18 = "The total revenue from the sale of the product {} is ${}". format ("RAT12_RAT15_RAT18", format(df3['revenue_RAT12_RAT15_RAT18'].sum(), '.2f'))
print(revenue_RAT12_RAT15_RAT18)


The total revenue from the sale of the product RAT12_RAT15_RAT18 is $0.00


Product RG

In [147]:
df3[df3['postComment'].str.contains('RG', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_CEN,revenue_CP,revenue_DURI,revenue_GOL,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18
381,RED GROUPER 300G-400G $6 RG+1,Emily Wong,01:19:41,0,5,0,0,$6 RG+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
382,RG+3,Wong Wei Yang,01:19:47,0,1,0,3,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
389,RG+2,Emily Wong,01:20:33,0,1,0,2,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
393,RG+1,き リーサン,01:21:16,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
399,RG+10,Pauline Ang,01:22:31,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
405,RGS8+1,Munchkin Nacki,01:25:16,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
406,RGs8+1,Rey Teo,01:26:24,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
407,RGS8+1,き リーサン,01:26:27,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
df3['productBought_RG'] = df3['productBought_RG'].map(lambda x:float(6.00) if x == int(1) else 0)

In [149]:
df3['revenue_RG'] = np.multiply(df3['productBought_RG'], df3['salesQuantity'])

In [150]:
revenue_RG = "The total revenue from the sale of the product {} is ${}". format ("RG", format(df3['revenue_RG'].sum(), '.2f'))
print(revenue_RG)


The total revenue from the sale of the product RG is $96.00


Product RGS8

In [151]:
df3[df3['postComment'].str.contains('RGS8', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_CP,revenue_DURI,revenue_GOL,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG
405,RGS8+1,Munchkin Nacki,01:25:16,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
407,RGS8+1,き リーサン,01:26:27,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [152]:
df3[df3['postComment'].str.contains('rsg8', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_CP,revenue_DURI,revenue_GOL,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG


We noticed that there is no sale information with regards to the product code 'RGS8'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'RGS8'.

Product RS

In [153]:
df3[df3['postComment'].str.contains('RS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_CP,revenue_DURI,revenue_GOL,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG
278,KUKUP RED SNAPPER 550G $10 RS+1,Pauline Ng,00:20:55,0,6,0,0,$10 RS+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
369,RS15,Pauline Ang,01:14:49,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
df3['productBought_RS'] = df3['productBought_RS'].map(lambda x:float(10.00) if x == int(1) else 0)

In [155]:
df3['revenue_RS'] = np.multiply(df3['productBought_RS'], df3['salesQuantity'])

In [156]:
revenue_RS = "The total revenue from the sale of the product {} is ${}". format ("RS", format(df3['revenue_RS'].sum(), '.2f'))
print(revenue_RS)


The total revenue from the sale of the product RS is $0.00


Product SAH

In [157]:
df3[df3['postComment'].str.contains('SAH', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_DURI,revenue_GOL,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG,revenue_RS
357,SABAH $4 SAH+1,Wong Chow Ching,01:08:13,0,3,0,0,$4 SAH+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [158]:
df3['productBought_SAH'] = df3['productBought_SAH'].map(lambda x:float(4.00) if x == int(1) else 0)

In [159]:
df3['revenue_SAH'] = np.multiply(df3['productBought_SAH'], df3['salesQuantity'])

In [160]:
revenue_SAH = "The total revenue from the sale of the product {} is ${}". format ("SAH", format(df3['revenue_SAH'].sum(), '.2f'))
print(revenue_SAH)


The total revenue from the sale of the product SAH is $0.00


Product SAL

In [161]:
df3[df3['postComment'].str.contains('SAL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_GOL,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG,revenue_RS,revenue_SAH
372,SALH+1,Tay Poh Huat,01:16:05,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
376,SALMON FILLET $9 SAL+1,梁文斌,01:17:28,0,4,0,0,$9 SAL+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
391,SAL+1 With some belly pls,Pauline Ang,01:20:59,0,5,1,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [162]:
df3['productBought_SAL'] = df3['productBought_SAL'].map(lambda x:float(9.00) if x == int(1) else 0)

In [163]:
df3['revenue_SAL'] = np.multiply(df3['productBought_SAL'], df3['salesQuantity'])

In [164]:
revenue_SAL = "The total revenue from the sale of the product {} is ${}". format ("SAL", format(df3['revenue_SAL'].sum(), '.2f'))
print(revenue_SAL)


The total revenue from the sale of the product SAL is $45.00


Product SALH

In [165]:
df3[df3['postComment'].str.contains('SALH', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG,revenue_RS,revenue_SAH,revenue_SAL
372,SALH+1,Tay Poh Huat,01:16:05,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [166]:
df3[df3['postComment'].str.contains('salh', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG,revenue_RS,revenue_SAH,revenue_SAL


We noticed that there is no sale information with regards to the product code 'SALH'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'SALH'.

Product SB

In [167]:
df3[df3['postComment'].str.contains('SB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_LEA,revenue_LMC,revenue_MUS,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG,revenue_RS,revenue_SAH,revenue_SAL
298,SEABASS 650-700G $8 SB+1,Munchkin Nacki,00:37:12,0,4,0,0,$8 SB+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
445,SEABASS $8 SB+1,E-Beve,01:40:16,1,3,0,0,$8 SB+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
df3['productBought_SB'] = df3['productBought_SB'].map(lambda x:float(8.00) if x == int(1) else 0)

In [169]:
df3['revenue_SB'] = np.multiply(df3['productBought_SB'], df3['salesQuantity'])

In [170]:
revenue_SB = "The total revenue from the sale of the product {} is ${}". format ("SB", format(df3['revenue_SB'].sum(), '.2f'))
print(revenue_SB)


The total revenue from the sale of the product SB is $0.00


Product SP

In [171]:
df3[df3['postComment'].str.contains('SP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_LMC,revenue_MUS,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG,revenue_RS,revenue_SAH,revenue_SAL,revenue_SB
317,SMALL PRAWNS $14 SP+1,Lily Koh,00:48:59,0,4,0,0,$14 SP+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
319,SP+1,E-Beve,00:51:13,1,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
330,SP+1,Irene Lee,00:57:05,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
df3['productBought_SP'] = df3['productBought_SP'].map(lambda x:float(14.00) if x == int(1) else 0)

In [173]:
df3['revenue_SP'] = np.multiply(df3['productBought_SP'], df3['salesQuantity'])

In [174]:
revenue_SP = "The total revenue from the sale of the product {} is ${}". format ("SP", format(df3['revenue_SP'].sum(), '.2f'))
print(revenue_SP)


The total revenue from the sale of the product SP is $28.00


Product ST

In [175]:
df3[df3['postComment'].str.contains('ST', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_MUS,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG,revenue_RS,revenue_SAH,revenue_SAL,revenue_SB,revenue_SP
312,Sotong $18 ST+1,梁文斌,00:45:13,0,3,0,0,$18 ST+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
316,ST+1,Lily Koh,00:48:35,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
348,ST +1,Pauline Ang,01:04:51,0,2,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [176]:
df3['productBought_ST'] = df3['productBought_ST'].map(lambda x:float(18.00) if x == int(1) else 0)

In [177]:
df3['revenue_ST'] = np.multiply(df3['productBought_ST'], df3['salesQuantity'])

In [178]:
revenue_ST = "The total revenue from the sale of the product {} is ${}". format ("ST", format(df3['revenue_ST'].sum(), '.2f'))
print(revenue_ST)


The total revenue from the sale of the product ST is $36.00


Product TP

In [179]:
df3[df3['postComment'].str.contains('TP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_MUSSEL,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG,revenue_RS,revenue_SAH,revenue_SAL,revenue_SB,revenue_SP,revenue_ST
321,Tiger prawns large $18 TP+1,Lily Koh,00:52:19,0,5,0,0,$18 TP+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
344,TP+1,Pauline Ang,01:03:31,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
346,Ok ok lah take lah TP+1,Pauline Ang,01:04:12,0,6,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
447,Tiger prawn $18 TP+1,梁文斌,01:41:27,0,4,0,0,$18 TP+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [180]:
df3['productBought_TP'] = df3['productBought_TP'].map(lambda x:float(18.00) if x == int(1) else 0)

In [181]:
df3['revenue_TP'] = np.multiply(df3['productBought_TP'], df3['salesQuantity'])

In [182]:
revenue_TP = "The total revenue from the sale of the product {} is ${}". format ("TP", format(df3['revenue_TP'].sum(), '.2f'))
print(revenue_TP)


The total revenue from the sale of the product TP is $54.00


Product TUNA

In [183]:
df3[df3['postComment'].str.contains('TUNA', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_RAT,revenue_RAT12_RAT15_RAT18,revenue_RG,revenue_RS,revenue_SAH,revenue_SAL,revenue_SB,revenue_SP,revenue_ST,revenue_TP
295,TONGKOL/TUNA 1.1KG $10 TUNA+1,Irene Lee,00:32:27,0,4,0,0,$10 TUNA+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [184]:
df3['productBought_TUNA'] = df3['productBought_TUNA'].map(lambda x:float(10.00) if x == int(1) else 0)

In [185]:
df3['revenue_TUNA'] = np.multiply(df3['productBought_TUNA'], df3['salesQuantity'])

In [186]:
revenue_TUNA = "The total revenue from the sale of the product {} is ${}". format ("TUNA", format(df3['revenue_TUNA'].sum(), '.2f'))
print(revenue_TUNA)


The total revenue from the sale of the product TUNA is $0.00


Product WG17

In [187]:
df3[df3['postComment'].str.contains('WG17', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_RAT12_RAT15_RAT18,revenue_RG,revenue_RS,revenue_SAH,revenue_SAL,revenue_SB,revenue_SP,revenue_ST,revenue_TP,revenue_TUNA
426,"Wild grouper $17, $20 WG17, WG20",Wong Wei Yang,01:31:26,0,6,0,0,$20 WG20,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [188]:
df3[df3['postComment'].str.contains('Wild groupa', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_RAT12_RAT15_RAT18,revenue_RG,revenue_RS,revenue_SAH,revenue_SAL,revenue_SB,revenue_SP,revenue_ST,revenue_TP,revenue_TUNA
437,Wild groupa $17+1,Emily Wong,01:34:32,0,3,0,0,$17 WG17,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [189]:
df3['productBought_WG17'] = df3['productBought_WG17'].map(lambda x:float(17.00) if x == int(1) else 0)

In [190]:
df3['revenue_WG17'] = np.multiply(df3['productBought_WG17'], df3['salesQuantity'])

In [191]:
revenue_WG17 = "The total revenue from the sale of the product {} is ${}". format ("WG17", format(df3['revenue_WG17'].sum(), '.2f'))
print(revenue_WG17)


The total revenue from the sale of the product WG17 is $0.00


Product WG20

In [192]:
df3[df3['postComment'].str.contains('WG20', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_RG,revenue_RS,revenue_SAH,revenue_SAL,revenue_SB,revenue_SP,revenue_ST,revenue_TP,revenue_TUNA,revenue_WG17
426,"Wild grouper $17, $20 WG17, WG20",Wong Wei Yang,01:31:26,0,6,0,0,$20 WG20,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [193]:
df3['productBought_WG20'] = df3['productBought_WG20'].map(lambda x:float(20.00) if x == int(1) else 0)

In [194]:
df3['revenue_WG20'] = np.multiply(df3['productBought_WG20'], df3['salesQuantity'])

In [195]:
revenue_WG20 = "The total revenue from the sale of the product {} is ${}". format ("WG20", format(df3['revenue_WG20'].sum(), '.2f'))
print(revenue_WG20)


The total revenue from the sale of the product WG20 is $20.00


In [196]:
# iterating the columns
for col in df3.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_AGL18
productBought_AK
productBought_AK20
productBought_BP
productBought_BTF
productBought_BTF10
productBought_CEN
productBought_CP
productBought_DURI
productBought_GOL
productBought_LEA
productBought_LMC
productBought_MUS
productBought_MUSSEL
productBought_PRAWN
productBought_RAT
productBought_RAT12_RAT15_RAT18
productBought_RG
productBought_RGS8
productBought_RS
productBought_SAH
productBought_SAL
productBought_SALH
productBought_SB
productBought_SP
productBought_ST
productBought_TP
productBought_TUNA
productBought_WG17
productBought_WG20
revenue_AK
revenue_BP
revenue_CEN
revenue_CP
revenue_DURI
revenue_GOL
revenue_LEA
revenue_LMC
revenue_MUS
revenue_MUSSEL
revenue_RAT
revenue_RAT12_RAT15_RAT18
revenue_RG
revenue_RS
revenue_SAH
revenue_SAL
revenue_SB
revenue_SP
revenue_ST
revenue_TP
revenue_TUNA
revenue_WG17
revenue_WG20


**Sum of total revenue from the video**

In [197]:
#total revenue from the video
total_revenue = df3.loc[:, 'revenue_AK': 'revenue_WG20'].values.sum()

#round the total revenue to 2 decimals places
total_revenue_rounded = format(total_revenue, '.2f')

print(f"The total revenue for the sale of products for the video is ${total_revenue_rounded}")


The total revenue for the sale of products for the video is $561.00


In [198]:
va['totalRevenue'] = total_revenue_rounded

In [199]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue
0,ebeveadmin/videos/914887879433670,30,905,6385,39,964,26,67,25,561.0


**New Column for the total revenue at that comment**

In [200]:
#https://stackoverflow.com/questions/42063716/pandas-sum-up-multiple-columns-into-one-column-without-last-column
df3['revenue'] = df3.loc[:, 'revenue_AK': 'revenue_WG20'].sum(axis=1)
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AK,...,revenue_SAH,revenue_SAL,revenue_SB,revenue_SP,revenue_ST,revenue_TP,revenue_TUNA,revenue_WG17,revenue_WG20,revenue
0,Hi,Amir Abdul Majid,00:00:22,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,00:01:03,0,10,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Hi Babe,き リーサン,00:01:14,0,2,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Loved n shared,Firdaus Nordin,00:01:37,0,3,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Very big n deep,Pauline Ng,00:02:42,0,4,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,LNS,Min Xuan,00:03:13,0,1,1,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Morning Tiff👋,き リーサン,00:03:15,0,2,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Lns done toilet auntie,Munchkin Nacki,00:03:46,0,4,1,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Lns,Munchkin Nacki,00:04:36,0,1,1,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Share to my friends lor,Aysha Khamarudin Al Takhi,00:04:37,0,5,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [201]:
#https://www.geeksforgeeks.org/how-to-move-a-column-to-first-position-in-pandas-dataframe/
#shift the revenue column to be the 8th column, i.e at position 7
eighth_column = df3.pop('revenue')

# insert column using insert(position,column_name,ninth_column) function
df3.insert(7, 'revenue', eighth_column)
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,productPrice,productBought_AGL18,...,revenue_RS,revenue_SAH,revenue_SAL,revenue_SB,revenue_SP,revenue_ST,revenue_TP,revenue_TUNA,revenue_WG17,revenue_WG20
0,Hi,Amir Abdul Majid,00:00:22,0,1,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,00:01:03,0,10,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Hi Babe,き リーサン,00:01:14,0,2,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Loved n shared,Firdaus Nordin,00:01:37,0,3,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Very big n deep,Pauline Ng,00:02:42,0,4,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,LNS,Min Xuan,00:03:13,0,1,1,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Morning Tiff👋,き リーサン,00:03:15,0,2,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Lns done toilet auntie,Munchkin Nacki,00:03:46,0,4,1,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Lns,Munchkin Nacki,00:04:36,0,1,1,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Share to my friends lor,Aysha Khamarudin Al Takhi,00:04:37,0,5,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


After the revenue of the sales have been calculated, all the dummified product columns and the column 'productPrice' will be dropped as they are no longer required to be checked against to identify the price of the product.

In [202]:
df3 = df3.loc[: ,'postComment':'revenue']
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue
0,Hi,Amir Abdul Majid,00:00:22,0,1,0,0,0.0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,00:01:03,0,10,0,0,0.0
2,Hi Babe,き リーサン,00:01:14,0,2,0,0,0.0
3,Loved n shared,Firdaus Nordin,00:01:37,0,3,0,0,0.0
4,Very big n deep,Pauline Ng,00:02:42,0,4,0,0,0.0
5,LNS,Min Xuan,00:03:13,0,1,1,0,0.0
6,Morning Tiff👋,き リーサン,00:03:15,0,2,0,0,0.0
7,Lns done toilet auntie,Munchkin Nacki,00:03:46,0,4,1,0,0.0
8,Lns,Munchkin Nacki,00:04:36,0,1,1,0,0.0
9,Share to my friends lor,Aysha Khamarudin Al Takhi,00:04:37,0,5,0,0,0.0


**New Column to identify the frequency of the seller's comments in the video**

In [203]:
#frequency of seller's comments
va['frequencySeller']= np.divide(va['videoLength'].iloc[0],va['numSellerComments'])
#seller's comment appears on average of every 163 seconds

In [204]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller
0,ebeveadmin/videos/914887879433670,30,905,6385,39,964,26,67,25,561.0,163.717949


**New Column to identify the seller**

In [205]:
df3['seller'] = 'ebeveadmin'

In [206]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Hi,Amir Abdul Majid,00:00:22,0,1,0,0,0.0,ebeveadmin
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,00:01:03,0,10,0,0,0.0,ebeveadmin
2,Hi Babe,き リーサン,00:01:14,0,2,0,0,0.0,ebeveadmin
3,Loved n shared,Firdaus Nordin,00:01:37,0,3,0,0,0.0,ebeveadmin
4,Very big n deep,Pauline Ng,00:02:42,0,4,0,0,0.0,ebeveadmin


**New Column for the Average Compound Score from Sentiment Analysis on raw & uncleaned comments for video**

In [207]:
# Instantiate Sentiment Intensity Analyzer
sent = SentimentIntensityAnalyzer()

In [208]:
df3['sentiment_score'] = df3['postComment'].apply(sent.polarity_scores)
df3['compound'] = [sent.polarity_scores(x)['compound'] for x in df3['postComment']]
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller,sentiment_score,compound
0,Hi,Amir Abdul Majid,00:00:22,0,1,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,00:01:03,0,10,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 0.796, 'pos': 0.204, 'compound': 0.3182}",0.3182
2,Hi Babe,き リーサン,00:01:14,0,2,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
3,Loved n shared,Firdaus Nordin,00:01:37,0,3,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.743}",0.743
4,Very big n deep,Pauline Ng,00:02:42,0,4,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0


In [209]:
#average compound scores for the video
#df3.shape[0] calculates the total number of rows in the dataframe
va['averageCompound']= (df3['compound'].sum())/(df3['compound'].sum())/df3.shape[0]
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller,averageCompound
0,ebeveadmin/videos/914887879433670,30,905,6385,39,964,26,67,25,561.0,163.717949,0.002179


In [210]:
#drop the columns with regarding to the sentiment analysis on the raw & uncleaned comments
#as we will perform sentiment analysis at the comment level on processed comments
df3 = df3.loc[: ,'postComment':'seller']
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Hi,Amir Abdul Majid,00:00:22,0,1,0,0,0.0,ebeveadmin
1,New buyers please whatsapp 88896368 your fb name and address,き リーサン,00:01:03,0,10,0,0,0.0,ebeveadmin
2,Hi Babe,き リーサン,00:01:14,0,2,0,0,0.0,ebeveadmin
3,Loved n shared,Firdaus Nordin,00:01:37,0,3,0,0,0.0,ebeveadmin
4,Very big n deep,Pauline Ng,00:02:42,0,4,0,0,0.0,ebeveadmin


### Saving the cleaned dataframes

In [211]:
# export to csv - change the name of the data file for each video
va.to_csv('../../data/cleaned_data/cleaned_va_ebeveadmin_914887879433670.csv', index=False)

In [212]:
#check for nulls
#displaying only the columns with nulls and their sum
df3[df3.columns[df3.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [213]:
# export to csv - change the name of the data file for each video
df3.to_csv('../../data/cleaned_data/cleaned_ebeveadmin_914887879433670.csv', index=False)