# Data Import & Cleaning

### Contents:

- [Data Import](#Data-Import)
- [Data Cleaning](#Data-Cleaning)
- [Feature Engineering](#Feature-Engineering)

### Import Libraries

In [1]:
#import standard libraries
import pandas as pd
import numpy as np

#import emoji
import emoji

from natsort import natsorted, index_natsorted, order_by_index

#import warnings to ignore flags when the project is complete
#import warnings
#warnings.filterwarnings('ignore')

#import pre-processing libraries for data cleaning
import string
import re
import nltk
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Data Import

**Read scrapped data for the following videos**

In [2]:
va = pd.read_csv('../../data/scrapped_data/va_ebeveadmin_2999090193679509.csv')

In [3]:
va

Unnamed: 0,video_for,totalEmojiReaction,views
0,ebeveadmin/videos/2999090193679509,29,912


In [4]:
va.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   video_for           1 non-null      object
 1   totalEmojiReaction  1 non-null      int64 
 2   views               1 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 152.0+ bytes


In [5]:
df = pd.read_csv('../../data/scrapped_data/ebeveadmin_2999090193679509.csv', encoding='utf-8')

In [6]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
0,Stingray how much..,Reen Rai,1:00:13
1,"No problem, tq",Reen Rai,1:00:39
2,Call steam,Richard Ling,1:01:08
3,Biggest cook curry vegetable,Richard Ling,1:01:30
4,Ikan pari power,Richard Ling,1:02:26
5,What else u have sis,Reen Rai,1:02:52
6,Still have batang?,Serene Rebecca Koh,1:03:10
7,Porridge fish,Richard Ling,1:03:43
8,Lns,Diana Ng,1:03:54
9,Ok Batang set +1,Serene Rebecca Koh,1:04:04


In [7]:
#https://stackoverflow.com/questions/32072076/find-the-unique-values-in-a-column-and-then-sort-them
#check if the seller uses multiple accounts to reply
postCommentAuthor_unique = df['postCommentAuthor'].unique()
print(sorted(postCommentAuthor_unique))

['Ace Tan', 'Amir Abdul Majid', 'Ann Chia', 'Catherine Koo', 'Chen Jolene', 'Connie Tay', 'Diana Ng', 'E-Beve', 'Firdaus Nordin', 'Mïššy Danté', 'Norsuwali Ali', 'Pauline Ng', 'Philip Ada Lyn', 'Reen Rai', 'Richard Ling', 'SJ Huang', 'Serene Rebecca Koh', 'Shuganya Devi', 'Veon Veon', 'Xian Yang', 'き リーサン', '马小玲']


In [8]:
#find comments posted by the seller only
df.loc[df['postCommentAuthor'] == 'E-Beve']

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
13,"Ang go li 1.4kg $18 </div><div dir=""auto"" style=""text-align: start;"">AGL18+1",E-Beve,1:05:09
21,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38
22,Hi morning sis and admin lns done,E-Beve,1:38
28,"Lns <span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""✅"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/tb4/2/16/2705.png"" width=""16""/></span>",E-Beve,5:22
33,Dssb+1,E-Beve,8:16
34,"WHITE THREADFIN $9</div><div dir=""auto"" style=""text-align: start;"">WTF+1",E-Beve,10:28
38,LNS for you babe,E-Beve,11:35
43,Ok,E-Beve,13:21
49,What tenggiri is that?,E-Beve,15:59
52,Bt21+1,E-Beve,18:21


In [9]:
#check the comments to have a gauge
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

['<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 k7cz35w2 bsnbvmp4"><img alt="👍🏻" height="32" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/t10/2/32/1f44d_1f3fb.png" width="32"/></span>', 'AGL18+1', 'AGL8+1', 'AMIR CONGRATS', 'ANG GO LI 750G $8</div><div dir="auto" style="text-align: start;">AGL8+1', 'Agl8+1', 'Ang  for curry cook', 'Ang go li 1.4kg $18 </div><div dir="auto" style="text-align: start;">AGL18+1', 'BABY ANG GO LI $8</div><div dir="auto" style="text-align: start;">BBAGL+1', 'BATANG FILLET 1KG $21</div><div dir="auto" style="text-align: start;">BT21+1', 'BATANG TAIL $9</div><div dir="auto" style="text-align: start;">BTTAIL+1', 'BP+2', 'BTTail+1', 'Bbagl+1', 'Biggest cook curry vegetable', 'Bp+1', 'Bt21+1', 'Bye bye sister', 'CP+2', 'Call steam ', 'Congrats ', 'Congratulations <span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu"><img alt="

## Data Cleaning

### Convert the emojis to text for easy cleaning

We noticed that the emojis have html parsers attached to it. Hence, we will convert the images of the emojis to text first, to remove the html parsers to the emojis, while retaining the emoji's text. We will convert it back to emoji afterwards.

In [10]:
df['postComment'] = df['postComment'].apply(emoji.demojize)

In [11]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

['<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 k7cz35w2 bsnbvmp4"><img alt=":thumbs_up_light_skin_tone:" height="32" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/t10/2/32/1f44d_1f3fb.png" width="32"/></span>', 'AGL18+1', 'AGL8+1', 'AMIR CONGRATS', 'ANG GO LI 750G $8</div><div dir="auto" style="text-align: start;">AGL8+1', 'Agl8+1', 'Ang  for curry cook', 'Ang go li 1.4kg $18 </div><div dir="auto" style="text-align: start;">AGL18+1', 'BABY ANG GO LI $8</div><div dir="auto" style="text-align: start;">BBAGL+1', 'BATANG FILLET 1KG $21</div><div dir="auto" style="text-align: start;">BT21+1', 'BATANG TAIL $9</div><div dir="auto" style="text-align: start;">BTTAIL+1', 'BP+2', 'BTTail+1', 'Bbagl+1', 'Biggest cook curry vegetable', 'Bp+1', 'Bt21+1', 'Bye bye sister', 'CP+2', 'Call steam ', 'Congrats ', 'Congratulations <span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3l

### Clean the comments using Regex

From the above unique values of the comments, there are several cleaning issues that needs to be addressed.

    1. Removal of links or URLs. 
        - URLs are present when the Facebook users manually type a link in the comment. 
        - Additionally, when users are tagged, their Facebook profile URL is printed as a result.
        - Similarly, there is a unique URL linked to each emoji as well. 
        
    2. Removal of HTML special entities
        - Examples of HTML special entities are '&amp' and '&gt'. 
        
    3. Removal of other HTML special terms
        - Examples of other HTML special entities are whitespace HTML special entities like '#x200B' and '#xa0'.
        
    4. Removal of HTML Parsers to the Emojis
        - Previously, the emojis have been demojized to text already. However, the HTML parsers to the mojis remain. Hence, they are required to be removed as well.
        
    5. Removal of HTML Parcers to Whitespaces
        - When a next line is entered in the same comment, there will be a HTML parser to the this whitespace. Hence, this are to be removed as well.
        
    6. Removal of HTML Parsers to tagged names
        - When users are tagged in the comments, in addition to their Facebook profile URL, there will be HTML parsers to the tagged names as well. Hence, this are to be removed as well.
        
    7. Removsl of other HTML Parsers   

In [12]:
def clean(row):
    
    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*\/\w*', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove HTML special entities (e.g.. &amp, &gt;)
    row['postComment'] = re.sub(
        pattern=r'\&\w*;',
        repl='',
        string=row['postComment'],
        flags=re.M) 
    
    # Remove emoji html parsers
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'<span\sclass=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\">',
        repl='', 
        string=row['postComment'],
        flags=re.M)
 
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern = r'<img\salt=\"(\:\w*)(\-?)(\w*\:)\"\s[a-z]{6}=\"\d\d\"\s[a-z]{14}=.{26}\s[a-z]{3}=\">',
        repl=r'\1\2\3',
        string=row['postComment'],
        flags=re.M)
    
    # Remove whitespaces
    row['postComment'] = re.sub(
        pattern=r'<\/div><div\s.*\s.*>',
        repl=' ',
        string=row['postComment'],
        flags=re.M)

    # Remove whitespaces
    row['postComment'] = re.sub(
        pattern=r'<div\sdir=\"auto\"\s.*\s.*>',
        repl=' ',
        string=row['postComment'],
        flags=re.M)

    
    # Remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<a\sclass=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\"\shref=\">',
        repl='',
        string=row['postComment'],
        flags=re.M)
    

    # Remove consecutive non-ASCII characters
    # This will remove the chinese comments
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'[^\x00-\x7F]+', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)
    
    return row

In [13]:
df2 = df.apply(clean, axis=1)

In [14]:
postComment_unique2 = df2['postComment'].unique()
print(sorted(postComment_unique2))

[':thumbs_up_light_skin_tone:', 'AGL18+1', 'AGL8+1', 'AMIR CONGRATS', 'ANG GO LI 750G $8 AGL8+1', 'Agl8+1', 'Ang  for curry cook', 'Ang go li 1.4kg $18  AGL18+1', 'BABY ANG GO LI $8 BBAGL+1', 'BATANG FILLET 1KG $21 BT21+1', 'BATANG TAIL $9 BTTAIL+1', 'BP+2', 'BTTail+1', 'Bbagl+1', 'Biggest cook curry vegetable', 'Bp+1', 'Bt21+1', 'Bye bye sister', 'CP+2', 'Call steam ', 'Congrats ', 'Congratulations :clapping_hands:', 'Congratulations:red_heart:', 'Cookles', 'Cp+1', 'DEEP SEA SEABASS $10 DSSB+1', 'DSSB+1', 'Dear,  any fish head?', 'Deep fry sambal chili sauce', 'Dssb+1', 'FLL+1', 'FLOWER CLAMS 500G $3.50 FLL+1', 'FLOWER GROUPER 550G $6 Fg+1', 'GOL6+2', 'GOLDEN POMFRET 450G $6 GOL6+1', 'GOLDEN POMFRET 550G $7 GOL7+1', 'GP+1', 'Gd morning!! ', 'Gh+1', 'Give me the part got belly. Thanks.', 'Gol+1', 'Gol7+1', 'Good morning.', 'Guess the numbers of clams??', 'Hello ', 'Heng Heng', 'Hi morning sis and admin lns done', 'Hi, delivery fees is ? ', 'Hi..any hardtail', 'How to sharer tag ah?  I 

**Convert encoded emoji text back to emojis**

In [15]:
df2['postComment'] = df2['postComment'].apply(emoji.emojize)

In [16]:
postComment_unique2 = df2['postComment'].unique()
print(sorted(postComment_unique2))

['AGL18+1', 'AGL8+1', 'AMIR CONGRATS', 'ANG GO LI 750G $8 AGL8+1', 'Agl8+1', 'Ang  for curry cook', 'Ang go li 1.4kg $18  AGL18+1', 'BABY ANG GO LI $8 BBAGL+1', 'BATANG FILLET 1KG $21 BT21+1', 'BATANG TAIL $9 BTTAIL+1', 'BP+2', 'BTTail+1', 'Bbagl+1', 'Biggest cook curry vegetable', 'Bp+1', 'Bt21+1', 'Bye bye sister', 'CP+2', 'Call steam ', 'Congrats ', 'Congratulations 👏', 'Congratulations❤️', 'Cookles', 'Cp+1', 'DEEP SEA SEABASS $10 DSSB+1', 'DSSB+1', 'Dear,  any fish head?', 'Deep fry sambal chili sauce', 'Dssb+1', 'FLL+1', 'FLOWER CLAMS 500G $3.50 FLL+1', 'FLOWER GROUPER 550G $6 Fg+1', 'GOL6+2', 'GOLDEN POMFRET 450G $6 GOL6+1', 'GOLDEN POMFRET 550G $7 GOL7+1', 'GP+1', 'Gd morning!! ', 'Gh+1', 'Give me the part got belly. Thanks.', 'Gol+1', 'Gol7+1', 'Good morning.', 'Guess the numbers of clams??', 'Hello ', 'Heng Heng', 'Hi morning sis and admin lns done', 'Hi, delivery fees is ? ', 'Hi..any hardtail', 'How to sharer tag ah?  I forgot', "I'll pass..", 'Ikan pari power', 'Ill wait fo

### Reindexing the dataframe 
**New Column to reindex the dataframe in accordance to time**

From the data, we can tell that there is an inconsistent timestamp being used in the column 'postCommentTime'. For example, there are times like '0:57' and '1:00:14' which indicates 0 hour 0 mins 57 secs, and 1 hour 0 mins 14 secs. 

Hence, a new column 'postCommentTime_final' is created to ensure that a timestamp of HH:MM:SS is being used consistently throughout. However, we note that the number of days is being included for TimeDeltaIndex.

As a result, we will read the timestamp, by excluding the number of days.  

In [17]:
#TimedeltaIndex
#https://stackoverflow.com/questions/54877467/pandas-convert-hhmm-and-hhmmss-to-standard-hhmmss-in-python
# for example, time of '0:57' will then be 00:00:57; 0 hours 0 mins 57 secs
df2['postCommentTime_final'] = pd.to_timedelta(np.where(df2['postCommentTime'].str.count(':') == 1, '00:' + df2['postCommentTime'], df2['postCommentTime']))

In [18]:
df2.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Stingray how much..,Reen Rai,1:00:13,0 days 01:00:13
1,"No problem, tq",Reen Rai,1:00:39,0 days 01:00:39
2,Call steam,Richard Ling,1:01:08,0 days 01:01:08
3,Biggest cook curry vegetable,Richard Ling,1:01:30,0 days 01:01:30
4,Ikan pari power,Richard Ling,1:02:26,0 days 01:02:26


In [19]:
df2['postCommentTime_final'] = df2['postCommentTime_final'].astype(str).map(lambda x: x[7:])

In [20]:
df2

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Stingray how much..,Reen Rai,1:00:13,01:00:13
1,"No problem, tq",Reen Rai,1:00:39,01:00:39
2,Call steam,Richard Ling,1:01:08,01:01:08
3,Biggest cook curry vegetable,Richard Ling,1:01:30,01:01:30
4,Ikan pari power,Richard Ling,1:02:26,01:02:26
5,What else u have sis,Reen Rai,1:02:52,01:02:52
6,Still have batang?,Serene Rebecca Koh,1:03:10,01:03:10
7,Porridge fish,Richard Ling,1:03:43,01:03:43
8,Lns,Diana Ng,1:03:54,01:03:54
9,Ok Batang set +1,Serene Rebecca Koh,1:04:04,01:04:04


In [21]:
#reindex according to postCommentTime_final
#previous natsort in data collection didnt take into account the different timestamp format
df3 = df2.reindex(index=order_by_index(df2.index, index_natsorted(df2.postCommentTime_final)))

In [22]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
20,Good morning.,Firdaus Nordin,1:17,00:01:17
21,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38
22,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38
23,LNS,Mïššy Danté,1:55,00:01:55
24,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,2:18,00:02:18
25,Lns done ✅,Shuganya Devi,4:13,00:04:13
26,LnS done😊,Shuganya Devi,4:22,00:04:22
27,Hello,Xian Yang,4:40,00:04:40
28,Lns ✅,E-Beve,5:22,00:05:22
29,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,5:29,00:05:29


In [23]:
#reset the index for the dataframe
#https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-dataframe

df3 = df3.reset_index(drop=True)

In [24]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Good morning.,Firdaus Nordin,1:17,00:01:17
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38
2,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38
3,LNS,Mïššy Danté,1:55,00:01:55
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,2:18,00:02:18
5,Lns done ✅,Shuganya Devi,4:13,00:04:13
6,LnS done😊,Shuganya Devi,4:22,00:04:22
7,Hello,Xian Yang,4:40,00:04:40
8,Lns ✅,E-Beve,5:22,00:05:22
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,5:29,00:05:29


**Obtain the length of the video**

Assuming that the last comment of the video is the total length of the video, we will input the length of the video from the comments dataframe into the video attributes dataframe.

In [25]:
#retrieve last comment to obtain the length of the video
df3['postCommentTime_final'].iloc[-1]

'01:07:47'

In [26]:
#https://stackoverflow.com/questions/6402812/how-to-convert-an-hmmss-time-string-to-seconds-in-python
def get_sec(time_str):
    """Get Seconds from time."""
    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

In [27]:
get_sec(df3['postCommentTime_final'].iloc[-1])

4067

In [28]:
#retrieve last comment to obtain the length of the video in seconds
va['videoLength']= get_sec(df3['postCommentTime_final'].iloc[-1])

In [29]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength
0,ebeveadmin/videos/2999090193679509,29,912,4067


## Feature Engineering

### Comments made by the Seller

**New Column to identify the number of comments made by the seller in the video**

From the total sum of comments made by the seller, we will input it into the video attributes dataframe.

In [30]:
(df3['postCommentAuthor']=='E-Beve').sum()

28

In [31]:
va['numSellerComments'] = (df3['postCommentAuthor']=='E-Beve').sum()

In [32]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments
0,ebeveadmin/videos/2999090193679509,29,912,4067,28


**New Column to identify if the comment is made by the Seller or not**

In [33]:
#create a new column to show if the comment is made by the seller or not
df3['isSeller'] = df3['postCommentAuthor'].map(lambda x:1 if x =='E-Beve' else 0)

In [34]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller
0,Good morning.,Firdaus Nordin,1:17,00:01:17,0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38,1
2,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38,1
3,LNS,Mïššy Danté,1:55,00:01:55,0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,2:18,00:02:18,0


In [35]:
df3['isSeller'].value_counts()

0    90
1    28
Name: isSeller, dtype: int64

In [36]:
#show all the seller's comments
df3.loc[df3['isSeller'] == 1]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38,1
2,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38,1
8,Lns ✅,E-Beve,5:22,00:05:22,1
13,Dssb+1,E-Beve,8:16,00:08:16,1
14,WHITE THREADFIN $9 WTF+1,E-Beve,10:28,00:10:28,1
18,LNS for you babe,E-Beve,11:35,00:11:35,1
23,Ok,E-Beve,13:21,00:13:21,1
29,What tenggiri is that?,E-Beve,15:59,00:15:59,1
32,Bt21+1,E-Beve,18:21,00:18:21,1
35,BP+2,E-Beve,20:05,00:20:05,1


### Length of comments

**New Column to identify the length of each comment**

From the comments, we take the length of each comment as the total number of words each comment has.

In [37]:
#length of each comment
#https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe
df3['postCommentLength'] = df3['postComment'].str.split().str.len()

In [38]:
df3.head(10)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength
0,Good morning.,Firdaus Nordin,1:17,00:01:17,0,2
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38,1,5
2,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38,1,7
3,LNS,Mïššy Danté,1:55,00:01:55,0,1
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,2:18,00:02:18,0,5
5,Lns done ✅,Shuganya Devi,4:13,00:04:13,0,3
6,LnS done😊,Shuganya Devi,4:22,00:04:22,0,2
7,Hello,Xian Yang,4:40,00:04:40,0,1
8,Lns ✅,E-Beve,5:22,00:05:22,1,2
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,5:29,00:05:29,0,5


**New Column to identify the total number of comments in the video**

From the total number of comments in the video, we will input it into the video attributes dataframe.

In [39]:
#total number of comments
df3['postCommentLength'].sum()

343

In [40]:
va['numComments'] = df3['postCommentLength'].sum()

In [41]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments
0,ebeveadmin/videos/2999090193679509,29,912,4067,28,343


### LNS

LNS is an acronym that stands for 'like and share'. It is a form of customer engagement as it indicates by the customers to the sellers that they have liked and shared the video on their Facebook wall. 

**New Column to identify if Customers are engaging in liking and sharing the video**

In [42]:
#if the customer has commented 'lns' or 'ls' which stands for 'like & shared' & 'like shared' respectively
def lns(comment):
    if re.search(r'(l)(n?)(s)', comment, re.IGNORECASE):
        return int(1)
    else:
        return int(0)

In [43]:
df3['lns'] = df3['postComment'].map(lambda x:lns(x))

In [44]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns
0,Good morning.,Firdaus Nordin,1:17,00:01:17,0,2,0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38,1,5,0
2,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38,1,7,1
3,LNS,Mïššy Danté,1:55,00:01:55,0,1,1
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,2:18,00:02:18,0,5,0


**New Column to identify if the number of Customers who explicitly inform the sellers that they are engaging in liking and sharing the video**

In [45]:
#range of customer's engagement for LNS
df3['lns'].value_counts()

0    102
1     16
Name: lns, dtype: int64

In [46]:
(df3['lns']==1).sum()

16

In [47]:
va['lnsQuantity'] = (df3['lns']==1).sum()

In [48]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity
0,ebeveadmin/videos/2999090193679509,29,912,4067,28,343,16


## Sales Quantity

**New Columns to identify the quantity of sales made**

In [49]:
#overview of the sales
df3[df3['postComment'].str.contains('(\w*)(\s)*(\+)(\s)*(\d*)', regex=True)]

  df3[df3['postComment'].str.contains('(\w*)(\s)*(\+)(\s)*(\d*)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,5:29,00:05:29,0,5,0
10,DSSB+1,SJ Huang,5:51,00:05:51,0,1,0
13,Dssb+1,E-Beve,8:16,00:08:16,1,1,0
14,WHITE THREADFIN $9 WTF+1,E-Beve,10:28,00:10:28,1,4,0
15,ANG GO LI 750G $8 AGL8+1,Veon Veon,10:30,00:10:30,0,6,0
16,Agl8+1,Shuganya Devi,10:49,00:10:49,0,1,0
17,AGL8+1,き リーサン,10:51,00:10:51,0,1,0
19,GOLDEN POMFRET 450G $6 GOL6+1,SJ Huang,11:52,00:11:52,0,5,0
20,GOL6+2,Ann Chia,12:23,00:12:23,0,1,0
21,Gol+1,Connie Tay,12:43,00:12:43,0,1,0


In [50]:
def sale(comment):
    if re.findall(r'(\+)(\s)?(\d)', comment):
        results = re.findall(r'\+\s?\d', comment)
        total = 0
        for r in results:
            total += int(r[-1])
        return total
    else:
        return int(0)

In [51]:
df3['sales'] = df3['postComment'].apply(lambda x:sale(x))

In [52]:
df3.head(10)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales
0,Good morning.,Firdaus Nordin,1:17,00:01:17,0,2,0,0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38,1,5,0,0
2,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38,1,7,1,0
3,LNS,Mïššy Danté,1:55,00:01:55,0,1,1,0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,2:18,00:02:18,0,5,0,0
5,Lns done ✅,Shuganya Devi,4:13,00:04:13,0,3,1,0
6,LnS done😊,Shuganya Devi,4:22,00:04:22,0,2,1,0
7,Hello,Xian Yang,4:40,00:04:40,0,1,0,0
8,Lns ✅,E-Beve,5:22,00:05:22,1,2,1,0
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,5:29,00:05:29,0,5,0,1


In [53]:
#if the comments consist the sale information for the product, we will indicate it as '0', otherwise '1'
def no_sale_info(comment):
    if re.search(r'(\$)(\s)?(.*)', comment, re.IGNORECASE):
        return int(0)
    else:
        return int(1)

In [54]:
df3['no_sale_info'] = df3['postComment'].map(lambda x:no_sale_info(x))

In [55]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info
0,Good morning.,Firdaus Nordin,1:17,00:01:17,0,2,0,0,1
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38,1,5,0,0,1
2,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38,1,7,1,0,1
3,LNS,Mïššy Danté,1:55,00:01:55,0,1,1,0,1
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,2:18,00:02:18,0,5,0,0,1
5,Lns done ✅,Shuganya Devi,4:13,00:04:13,0,3,1,0,1
6,LnS done😊,Shuganya Devi,4:22,00:04:22,0,2,1,0,1
7,Hello,Xian Yang,4:40,00:04:40,0,1,0,0,1
8,Lns ✅,E-Beve,5:22,00:05:22,1,2,1,0,1
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,5:29,00:05:29,0,5,0,1,0


In [56]:
#sales made by the customers; exclude the seller's comments on the codes for the sales
#https://stackoverflow.com/questions/19914937/applying-function-with-multiple-arguments-to-create-a-new-pandas-column
df3['salesQuantity'] = np.multiply(df3['no_sale_info'], df3['sales'])

In [57]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity
0,Good morning.,Firdaus Nordin,1:17,00:01:17,0,2,0,0,1,0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38,1,5,0,0,1,0
2,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38,1,7,1,0,1,0
3,LNS,Mïššy Danté,1:55,00:01:55,0,1,1,0,1,0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,2:18,00:02:18,0,5,0,0,1,0
5,Lns done ✅,Shuganya Devi,4:13,00:04:13,0,3,1,0,1,0
6,LnS done😊,Shuganya Devi,4:22,00:04:22,0,2,1,0,1,0
7,Hello,Xian Yang,4:40,00:04:40,0,1,0,0,1,0
8,Lns ✅,E-Beve,5:22,00:05:22,1,2,1,0,1,0
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,5:29,00:05:29,0,5,0,1,0,0


In [58]:
#range of sales quantity
df3['salesQuantity'].value_counts()

0    89
1    23
2     6
Name: salesQuantity, dtype: int64

In [59]:
#total number of orders made
df3['salesQuantity'].sum()

35

In [60]:
va['salesQuantity'] = df3['salesQuantity'].sum()

In [61]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity
0,ebeveadmin/videos/2999090193679509,29,912,4067,28,343,16,35


## Products 

The seller will comment and post the unique product codes for each product. Thereafter, customers who are keen on purchasing the products will explicitly comment out the specific & unique product codes, in addition to the quanityt of each product that they wish to purchase.

**New Columns to identify the products purchased by the Customers**

Regex is used to identify the products being offered and the products being purchased as well.

In [62]:
#function to identify the code of the product bought
def sale2(comment):
    if re.search(r'(\w*)(\s?)[\-]?\+(\s?)(\d)', comment):
        return str(re.search(r'(\w*)(\s?)[\-]?\+(\s?)(\d)', comment).group(0)[:-2])
    else:
        return int(0)

In [63]:
#identifies all comments that have the codes of the products purchased by the Customers
#this column will be dropped afterwards.
df3['productBought'] = df3['postComment'].apply(lambda x:sale2(x))

In [64]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought
0,Good morning.,Firdaus Nordin,1:17,00:01:17,0,2,0,0,1,0,0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38,1,5,0,0,1,0,0
2,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38,1,7,1,0,1,0,0
3,LNS,Mïššy Danté,1:55,00:01:55,0,1,1,0,1,0,0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,2:18,00:02:18,0,5,0,0,1,0,0
5,Lns done ✅,Shuganya Devi,4:13,00:04:13,0,3,1,0,1,0,0
6,LnS done😊,Shuganya Devi,4:22,00:04:22,0,2,1,0,1,0,0
7,Hello,Xian Yang,4:40,00:04:40,0,1,0,0,1,0,0
8,Lns ✅,E-Beve,5:22,00:05:22,1,2,1,0,1,0,0
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,5:29,00:05:29,0,5,0,1,0,0,DSSB


In [65]:
df3['productBought'].unique()

array([0, 'DSSB', 'Dssb', 'WTF', 'AGL8', 'Agl8', 'GOL6', 'Gol', 'GOL7',
       'Gol7', 'BT21', 'Bt21', 'BP', 'Bp', 'ST', 'St', 'FLL', 'RS', 'Rs',
       'KingSAL', 'KingSal', 'BBAGL', 'Bbagl', 'GH', 'Gh', 'Fg', 'CP',
       'Cp', 'SB', 'STR12', 'Str12', 'STR7', 'BTTAIL', 'BTTail', 'KUN',
       'KUN-', 'Kun', 'RE20', 'KingSal ', 'MANC', 'GP', 'set ', 'AGL18'],
      dtype=object)

**Change the produce codes to be uppercase for consistency**

In [66]:
#change the produce codes to be uppercase for consistency, and since python is case sensitive.
#https://stackoverflow.com/questions/39512002/convert-whole-dataframe-from-lower-case-to-upper-case-with-pandas
df3['productBought'] = df3['productBought'].astype(str).str.upper()

In [67]:
df3['productBought'].unique()

array(['0', 'DSSB', 'WTF', 'AGL8', 'GOL6', 'GOL', 'GOL7', 'BT21', 'BP',
       'ST', 'FLL', 'RS', 'KINGSAL', 'BBAGL', 'GH', 'FG', 'CP', 'SB',
       'STR12', 'STR7', 'BTTAIL', 'KUN', 'KUN-', 'RE20', 'KINGSAL ',
       'MANC', 'GP', 'SET ', 'AGL18'], dtype=object)

Remove whitespaces at the end of the string

In [68]:
df3['productBought'] = df3['productBought'].str.rstrip()

In [69]:
df3['productBought'].unique()

array(['0', 'DSSB', 'WTF', 'AGL8', 'GOL6', 'GOL', 'GOL7', 'BT21', 'BP',
       'ST', 'FLL', 'RS', 'KINGSAL', 'BBAGL', 'GH', 'FG', 'CP', 'SB',
       'STR12', 'STR7', 'BTTAIL', 'KUN', 'KUN-', 'RE20', 'MANC', 'GP',
       'SET', 'AGL18'], dtype=object)

We noticed that the cell 63 did not follow the exact product code. Hence, we will remove the additional '-' symbol.

In [70]:
df3['productBought'] = df3['productBought'].str.replace(pat='-', repl='', regex=True)

In [71]:
df3['productBought'].unique()

array(['0', 'DSSB', 'WTF', 'AGL8', 'GOL6', 'GOL', 'GOL7', 'BT21', 'BP',
       'ST', 'FLL', 'RS', 'KINGSAL', 'BBAGL', 'GH', 'FG', 'CP', 'SB',
       'STR12', 'STR7', 'BTTAIL', 'KUN', 'RE20', 'MANC', 'GP', 'SET',
       'AGL18'], dtype=object)

In [72]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought
0,Good morning.,Firdaus Nordin,1:17,00:01:17,0,2,0,0,1,0,0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38,1,5,0,0,1,0,0
2,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38,1,7,1,0,1,0,0
3,LNS,Mïššy Danté,1:55,00:01:55,0,1,1,0,1,0,0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,2:18,00:02:18,0,5,0,0,1,0,0
5,Lns done ✅,Shuganya Devi,4:13,00:04:13,0,3,1,0,1,0,0
6,LnS done😊,Shuganya Devi,4:22,00:04:22,0,2,1,0,1,0,0
7,Hello,Xian Yang,4:40,00:04:40,0,1,0,0,1,0,0
8,Lns ✅,E-Beve,5:22,00:05:22,1,2,1,0,1,0,0
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,5:29,00:05:29,0,5,0,1,0,0,DSSB


### Price of Products

**New Column to identify the price of the products**

This new column is created to identify the regex of the unique product codes and their corresponding prices, as adviced by the seller.

In [73]:
#products offered by the seller
df3[df3['postComment'].str.contains('(\$)(\s)?(.*)', regex=True)]

  df3[df3['postComment'].str.contains('(\$)(\s)?(.*)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,5:29,00:05:29,0,5,0,1,0,0,DSSB
14,WHITE THREADFIN $9 WTF+1,E-Beve,10:28,00:10:28,1,4,0,1,0,0,WTF
15,ANG GO LI 750G $8 AGL8+1,Veon Veon,10:30,00:10:30,0,6,0,1,0,0,AGL8
19,GOLDEN POMFRET 450G $6 GOL6+1,SJ Huang,11:52,00:11:52,0,5,0,1,0,0,GOL6
24,GOLDEN POMFRET 550G $7 GOL7+1,Veon Veon,13:36,00:13:36,0,5,0,1,0,0,GOL7
30,BATANG FILLET 1KG $21 BT21+1,Veon Veon,16:23,00:16:23,0,5,0,1,0,0,BT21
33,WILD BLACK POMFRET 450G $6.50 BP+1,Veon Veon,18:23,00:18:23,0,6,0,1,0,0,BP
36,SOTONG $17 ST+1,Shuganya Devi,20:26,00:20:26,0,3,0,1,0,0,ST
38,FLOWER CLAMS 500G $3.50 FLL+1,Ann Chia,23:11,00:23:11,0,5,0,1,0,0,FLL
40,WILD RED SNAPPER 750G $12 RS+1,Veon Veon,25:51,00:25:51,0,6,0,1,0,0,RS


In [74]:
def price(comment):
    if re.search(r'(\$)(\s)?(.*)', comment):
        return str(re.search(r'(\$)(\s)?(.*)', comment).group(0))
    else:
        return int(0)

In [75]:
df3['productPrice'] = df3['postComment'].apply(lambda x:price(x))

In [76]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought,productPrice
0,Good morning.,Firdaus Nordin,1:17,00:01:17,0,2,0,0,1,0,0,0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,1:38,00:01:38,1,5,0,0,1,0,0,0
2,Hi morning sis and admin lns done,E-Beve,1:38,00:01:38,1,7,1,0,1,0,0,0
3,LNS,Mïššy Danté,1:55,00:01:55,0,1,1,0,1,0,0,0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,2:18,00:02:18,0,5,0,0,1,0,0,0
5,Lns done ✅,Shuganya Devi,4:13,00:04:13,0,3,1,0,1,0,0,0
6,LnS done😊,Shuganya Devi,4:22,00:04:22,0,2,1,0,1,0,0,0
7,Hello,Xian Yang,4:40,00:04:40,0,1,0,0,1,0,0,0
8,Lns ✅,E-Beve,5:22,00:05:22,1,2,1,0,1,0,0,0
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,5:29,00:05:29,0,5,0,1,0,0,DSSB,$10 DSSB+1


In [77]:
df3['productPrice'].unique()

array([0, '$10 DSSB+1', '$9 WTF+1', '$8 AGL8+1', '$6 GOL6+1', '$7 GOL7+1',
       '$21 BT21+1', '$6.50 BP+1', '$17 ST+1', '$3.50 FLL+1', '$12 RS+1',
       '$18.50 KingSAL+1', '$8 BBAGL+1', '$7 GH+1', '$6 Fg+1', '$10 CP+1',
       '$5 SB+1', '$12 STR12+1', '$7 STR7+1', '$9 BTTAIL+1', '$8 KUN+1',
       '$20 RE20+1', '$8 MANC+1', '$18  AGL18+1'], dtype=object)

Excluding the comments where there was no product codes as adviced by the seller, we find the number of unique products offered by the seller.

In [78]:
#number of unique products offered by the seller
int(df3['productPrice'].nunique()) - int(1)

23

In [79]:
#total number of products offered
va['numProducts'] = int(df3['productPrice'].nunique()) - int(1)

In [80]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts
0,ebeveadmin/videos/2999090193679509,29,912,4067,28,343,16,35,23


**Drop irrelevant columns**

The following column was dropped for the following reasons:

1. 'postCommentTime'
- Since a new column 'postCommentTime_final' was created to ensure that a consistent timestamp of HH:MM:SS is used consistently throughout the dataframe, and the dataframe has been thereafter reindex and sorted in accordance to time in ascending order, we dropped the original inconsistent time column 'postCommentTime' as it had varying timestamp formats of HH:MM:SS, MM:SS and M:SS.

2. 'no_sale_info'
- This column 'notSeller' was solely created to calculate the quantity of sales made, and the products purchased by the customers. Hence, we are able to delete it after the quantity of sales have been calculated and the identification of the products purchased by the customers have been identified.

3. 'sales'
- This column identifies the quantity of products to the unique & specific product codes. However, it includes the comments which advises for the product sale information as well. Hence, this column was solely created to be multiplied against the column 'no_sale_info' to calculate the true quantity of sales made by customers only. Hence, we are able to delete it after the quantity of sales have been calculated.

In [81]:
#drop unwanted columns
df3.drop(['postCommentTime', 'no_sale_info', 'sales'], axis=1, inplace=True)

In [82]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought,productPrice
0,Good morning.,Firdaus Nordin,00:01:17,0,2,0,0,0,0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,00:01:38,1,5,0,0,0,0
2,Hi morning sis and admin lns done,E-Beve,00:01:38,1,7,1,0,0,0
3,LNS,Mïššy Danté,00:01:55,0,1,1,0,0,0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,00:02:18,0,5,0,0,0,0


### Revenue from the sale of the products

**Dummify the products bought to find the revenue**

In [83]:
#getdummies the products bought
df3 = pd.get_dummies(df3, columns = ['productBought'], drop_first = True)

In [84]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_KUN,productBought_MANC,productBought_RE20,productBought_RS,productBought_SB,productBought_SET,productBought_ST,productBought_STR12,productBought_STR7,productBought_WTF
0,Good morning.,Firdaus Nordin,00:01:17,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,00:01:38,1,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Hi morning sis and admin lns done,E-Beve,00:01:38,1,7,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,LNS,Mïššy Danté,00:01:55,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,00:02:18,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
# iterating the columns
for col in df3.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_AGL18
productBought_AGL8
productBought_BBAGL
productBought_BP
productBought_BT21
productBought_BTTAIL
productBought_CP
productBought_DSSB
productBought_FG
productBought_FLL
productBought_GH
productBought_GOL
productBought_GOL6
productBought_GOL7
productBought_GP
productBought_KINGSAL
productBought_KUN
productBought_MANC
productBought_RE20
productBought_RS
productBought_SB
productBought_SET
productBought_ST
productBought_STR12
productBought_STR7
productBought_WTF


After the column 'productBought' has been dummified, the cells will return a '1' if that particular item is bought, and a '0' if it is not. Hence, we will replace all the '1' with the price of the product.

Then, we will create a new revenue column which is a multiplication of the column 'salesQuantity' against the price of the product (i.e. the dummified 'productBought' columnns).

Product AGL18

In [86]:
df3[df3['postComment'].str.contains('AGL18', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_KUN,productBought_MANC,productBought_RE20,productBought_RS,productBought_SB,productBought_SET,productBought_ST,productBought_STR12,productBought_STR7,productBought_WTF
111,Ang go li 1.4kg $18 AGL18+1,E-Beve,01:05:09,1,6,0,0,$18 AGL18+1,1,0,...,0,0,0,0,0,0,0,0,0,0
115,AGL18+1,Serene Rebecca Koh,01:07:09,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
df3['productBought_AGL18'] = df3['productBought_AGL18'].map(lambda x:float(18.00) if x == int(1) else 0)

In [88]:
df3['revenue_AGL18'] = np.multiply(df3['productBought_AGL18'], df3['salesQuantity'])

In [89]:
revenue_AGL18 = "The total revenue from the sale of the product {} is ${}". format ("AGL18", format(df3['revenue_AGL18'].sum(), '.2f'))
print(revenue_AGL18)


The total revenue from the sale of the product AGL18 is $18.00


Product AGL8

In [90]:
df3[df3['postComment'].str.contains('AGL8', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_MANC,productBought_RE20,productBought_RS,productBought_SB,productBought_SET,productBought_ST,productBought_STR12,productBought_STR7,productBought_WTF,revenue_AGL18
15,ANG GO LI 750G $8 AGL8+1,Veon Veon,00:10:30,0,6,0,0,$8 AGL8+1,0.0,1,...,0,0,0,0,0,0,0,0,0,0.0
17,AGL8+1,き リーサン,00:10:51,0,1,0,1,0,0.0,1,...,0,0,0,0,0,0,0,0,0,0.0


In [91]:
df3['productBought_AGL8'] = df3['productBought_AGL8'].map(lambda x:float(8.00) if x == int(1) else 0)

In [92]:
df3['revenue_AGL8'] = np.multiply(df3['productBought_AGL8'], df3['salesQuantity'])

In [93]:
revenue_AGL8 = "The total revenue from the sale of the product {} is ${}". format ("AGL8", format(df3['revenue_AGL8'].sum(), '.2f'))
print(revenue_AGL8)


The total revenue from the sale of the product AGL8 is $16.00


Product BBAGL

In [94]:
df3[df3['postComment'].str.contains('BBAGL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_RE20,productBought_RS,productBought_SB,productBought_SET,productBought_ST,productBought_STR12,productBought_STR7,productBought_WTF,revenue_AGL18,revenue_AGL8
44,BABY ANG GO LI $8 BBAGL+1,Shuganya Devi,00:30:28,0,6,0,0,$8 BBAGL+1,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [95]:
df3['productBought_BBAGL'] = df3['productBought_BBAGL'].map(lambda x:float(8.00) if x == int(1) else 0)

In [96]:
df3['revenue_BBAGL'] = np.multiply(df3['productBought_BBAGL'], df3['salesQuantity'])

In [97]:
revenue_BBAGL = "The total revenue from the sale of the product {} is ${}". format ("BBAGL", format(df3['revenue_BBAGL'].sum(), '.2f'))
print(revenue_BBAGL)


The total revenue from the sale of the product BBAGL is $8.00


Product BP

In [98]:
df3[df3['postComment'].str.contains('BP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_RS,productBought_SB,productBought_SET,productBought_ST,productBought_STR12,productBought_STR7,productBought_WTF,revenue_AGL18,revenue_AGL8,revenue_BBAGL
33,WILD BLACK POMFRET 450G $6.50 BP+1,Veon Veon,00:18:23,0,6,0,0,$6.50 BP+1,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
35,BP+2,E-Beve,00:20:05,1,1,0,2,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [99]:
df3['productBought_BP'] = df3['productBought_BP'].map(lambda x:float(6.50) if x == int(1) else 0)

In [100]:
df3['revenue_BP'] = np.multiply(df3['productBought_BP'], df3['salesQuantity'])

In [101]:
revenue_BP = "The total revenue from the sale of the product {} is ${}". format ("BP", format(df3['revenue_BP'].sum(), '.2f'))
print(revenue_BP)


The total revenue from the sale of the product BP is $19.50


Product BT21

In [102]:
df3[df3['postComment'].str.contains('BT21', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_SB,productBought_SET,productBought_ST,productBought_STR12,productBought_STR7,productBought_WTF,revenue_AGL18,revenue_AGL8,revenue_BBAGL,revenue_BP
30,BATANG FILLET 1KG $21 BT21+1,Veon Veon,00:16:23,0,5,0,0,$21 BT21+1,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [103]:
df3['productBought_BT21'] = df3['productBought_BT21'].map(lambda x:float(21.00) if x == int(1) else 0)

In [104]:
df3['revenue_BT21'] = np.multiply(df3['productBought_BT21'], df3['salesQuantity'])

In [105]:
revenue_BT21 = "The total revenue from the sale of the product {} is ${}". format ("BT21", format(df3['revenue_BT21'].sum(), '.2f'))
print(revenue_BT21)


The total revenue from the sale of the product BT21 is $42.00


Product BTTAIL

In [106]:
df3[df3['postComment'].str.contains('BTTAIL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_SET,productBought_ST,productBought_STR12,productBought_STR7,productBought_WTF,revenue_AGL18,revenue_AGL8,revenue_BBAGL,revenue_BP,revenue_BT21
60,BATANG TAIL $9 BTTAIL+1,Serene Rebecca Koh,00:46:09,0,4,0,0,$9 BTTAIL+1,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [107]:
df3['productBought_BTTAIL'] = df3['productBought_BTTAIL'].map(lambda x:float(9.00) if x == int(1) else 0)

In [108]:
df3['revenue_BTTAIL'] = np.multiply(df3['productBought_BTTAIL'], df3['salesQuantity'])

In [109]:
revenue_BTTAIL = "The total revenue from the sale of the product {} is ${}". format ("BTTAIL", format(df3['revenue_BTTAIL'].sum(), '.2f'))
print(revenue_BTTAIL)


The total revenue from the sale of the product BTTAIL is $9.00


Product CP

In [110]:
df3[df3['postComment'].str.contains('CP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_ST,productBought_STR12,productBought_STR7,productBought_WTF,revenue_AGL18,revenue_AGL8,revenue_BBAGL,revenue_BP,revenue_BT21,revenue_BTTAIL
50,WILD CHINESE POMFRET 400G $10 CP+1,SJ Huang,00:36:59,0,6,0,0,$10 CP+1,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
51,CP+2,Veon Veon,00:37:28,0,1,0,2,0,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
df3['productBought_CP'] = df3['productBought_CP'].map(lambda x:float(10.00) if x == int(1) else 0)

In [112]:
df3['revenue_CP'] = np.multiply(df3['productBought_CP'], df3['salesQuantity'])

In [113]:
revenue_CP = "The total revenue from the sale of the product {} is ${}". format ("CP", format(df3['revenue_CP'].sum(), '.2f'))
print(revenue_CP)


The total revenue from the sale of the product CP is $30.00


Product DSSB

In [114]:
df3[df3['postComment'].str.contains('DSSB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_STR12,productBought_STR7,productBought_WTF,revenue_AGL18,revenue_AGL8,revenue_BBAGL,revenue_BP,revenue_BT21,revenue_BTTAIL,revenue_CP
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,00:05:29,0,5,0,0,$10 DSSB+1,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,DSSB+1,SJ Huang,00:05:51,0,1,0,1,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
df3['productBought_DSSB'] = df3['productBought_DSSB'].map(lambda x:float(10.00) if x == int(1) else 0)

In [116]:
df3['revenue_DSSB'] = np.multiply(df3['productBought_DSSB'], df3['salesQuantity'])

In [117]:
revenue_DSSB = "The total revenue from the sale of the product {} is ${}". format ("DSSB", format(df3['revenue_DSSB'].sum(), '.2f'))
print(revenue_DSSB)


The total revenue from the sale of the product DSSB is $20.00


Product FG

In [118]:
df3[df3['postComment'].str.contains('FG', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_STR7,productBought_WTF,revenue_AGL18,revenue_AGL8,revenue_BBAGL,revenue_BP,revenue_BT21,revenue_BTTAIL,revenue_CP,revenue_DSSB


In [119]:
df3[df3['postComment'].str.contains('Fg', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_STR7,productBought_WTF,revenue_AGL18,revenue_AGL8,revenue_BBAGL,revenue_BP,revenue_BT21,revenue_BTTAIL,revenue_CP,revenue_DSSB
48,FLOWER GROUPER 550G $6 Fg+1,Pauline Ng,00:34:49,0,5,0,0,$6 Fg+1,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
df3['productBought_FG'] = df3['productBought_FG'].map(lambda x:float(6.00) if x == int(1) else 0)

In [121]:
df3['revenue_FG'] = np.multiply(df3['productBought_FG'], df3['salesQuantity'])

In [122]:
revenue_FG = "The total revenue from the sale of the product {} is ${}". format ("FG", format(df3['revenue_FG'].sum(), '.2f'))
print(revenue_FG)


The total revenue from the sale of the product FG is $0.00


Product FLL

In [123]:
df3[df3['postComment'].str.contains('FLL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,productBought_WTF,revenue_AGL18,revenue_AGL8,revenue_BBAGL,revenue_BP,revenue_BT21,revenue_BTTAIL,revenue_CP,revenue_DSSB,revenue_FG
38,FLOWER CLAMS 500G $3.50 FLL+1,Ann Chia,00:23:11,0,5,0,0,$3.50 FLL+1,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39,FLL+1,E-Beve,00:25:09,1,1,0,1,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [124]:
df3['productBought_FLL'] = df3['productBought_FLL'].map(lambda x:float(3.50) if x == int(1) else 0)

In [125]:
df3['revenue_FLL'] = np.multiply(df3['productBought_FLL'], df3['salesQuantity'])

In [126]:
revenue_FLL = "The total revenue from the sale of the product {} is ${}". format ("FLL", format(df3['revenue_FLL'].sum(), '.2f'))
print(revenue_FLL)


The total revenue from the sale of the product FLL is $3.50


Product GH

In [127]:
df3[df3['postComment'].str.contains('GH', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_AGL18,revenue_AGL8,revenue_BBAGL,revenue_BP,revenue_BT21,revenue_BTTAIL,revenue_CP,revenue_DSSB,revenue_FG,revenue_FLL
46,WILD GROUPER FISH HEAD HALF $7 GH+1,Veon Veon,00:32:25,0,7,0,0,$7 GH+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
df3['productBought_GH'] = df3['productBought_GH'].map(lambda x:float(7.00) if x == int(1) else 0)

In [129]:
df3['revenue_GH'] = np.multiply(df3['productBought_GH'], df3['salesQuantity'])

In [130]:
revenue_GH = "The total revenue from the sale of the product {} is ${}". format ("GH", format(df3['revenue_GH'].sum(), '.2f'))
print(revenue_GH)


The total revenue from the sale of the product GH is $7.00


Product GOL

In [131]:
df3[df3['postComment'].str.contains('GOL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_AGL8,revenue_BBAGL,revenue_BP,revenue_BT21,revenue_BTTAIL,revenue_CP,revenue_DSSB,revenue_FG,revenue_FLL,revenue_GH
19,GOLDEN POMFRET 450G $6 GOL6+1,SJ Huang,00:11:52,0,5,0,0,$6 GOL6+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,GOL6+2,Ann Chia,00:12:23,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,GOLDEN POMFRET 550G $7 GOL7+1,Veon Veon,00:13:36,0,5,0,0,$7 GOL7+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We noticed that there is no sale information with regards to the product code 'GOL'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'GOL'.

Product GOL6

In [132]:
df3[df3['postComment'].str.contains('GOL6', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_AGL8,revenue_BBAGL,revenue_BP,revenue_BT21,revenue_BTTAIL,revenue_CP,revenue_DSSB,revenue_FG,revenue_FLL,revenue_GH
19,GOLDEN POMFRET 450G $6 GOL6+1,SJ Huang,00:11:52,0,5,0,0,$6 GOL6+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,GOL6+2,Ann Chia,00:12:23,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
df3['productBought_GOL6'] = df3['productBought_GOL6'].map(lambda x:float(6.00) if x == int(1) else 0)

In [134]:
df3['revenue_GOL6'] = np.multiply(df3['productBought_GOL6'], df3['salesQuantity'])

In [135]:
revenue_GOL6 = "The total revenue from the sale of the product {} is ${}". format ("GOL6", format(df3['revenue_GOL6'].sum(), '.2f'))
print(revenue_GOL6)


The total revenue from the sale of the product GOL6 is $12.00


Product GOL7

In [136]:
df3[df3['postComment'].str.contains('GOL7', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_BBAGL,revenue_BP,revenue_BT21,revenue_BTTAIL,revenue_CP,revenue_DSSB,revenue_FG,revenue_FLL,revenue_GH,revenue_GOL6
24,GOLDEN POMFRET 550G $7 GOL7+1,Veon Veon,00:13:36,0,5,0,0,$7 GOL7+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [137]:
df3['productBought_GOL7'] = df3['productBought_GOL7'].map(lambda x:float(7.00) if x == int(1) else 0)

In [138]:
df3['revenue_GOL7'] = np.multiply(df3['productBought_GOL7'], df3['salesQuantity'])

In [139]:
revenue_GOL7 = "The total revenue from the sale of the product {} is ${}". format ("GOL7", format(df3['revenue_GOL7'].sum(), '.2f'))
print(revenue_GOL7)


The total revenue from the sale of the product GOL7 is $14.00


Product GP

In [140]:
df3[df3['postComment'].str.contains('GP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_BP,revenue_BT21,revenue_BTTAIL,revenue_CP,revenue_DSSB,revenue_FG,revenue_FLL,revenue_GH,revenue_GOL6,revenue_GOL7
97,GP+1,Serene Rebecca Koh,00:59:46,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We noticed that there is no sale information with regards to the product code 'GP'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'GP'.

Product KINGSAL

In [141]:
df3[df3['postComment'].str.contains('KingSAL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_BP,revenue_BT21,revenue_BTTAIL,revenue_CP,revenue_DSSB,revenue_FG,revenue_FLL,revenue_GH,revenue_GOL6,revenue_GOL7
42,WILD NEW ZEALAND SALMON $18.50 KingSAL+1,SJ Huang,00:28:24,0,6,0,0,$18.50 KingSAL+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
df3['productBought_KINGSAL'] = df3['productBought_KINGSAL'].map(lambda x:float(18.50) if x == int(1) else 0)

In [143]:
df3['revenue_KINGSAL'] = np.multiply(df3['productBought_KINGSAL'], df3['salesQuantity'])

In [144]:
revenue_KINGSAL = "The total revenue from the sale of the product {} is ${}". format ("KINGSAL", format(df3['revenue_KINGSAL'].sum(), '.2f'))
print(revenue_KINGSAL)


The total revenue from the sale of the product KINGSAL is $74.00


Product KUN

In [145]:
df3[df3['postComment'].str.contains('KUN', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_BT21,revenue_BTTAIL,revenue_CP,revenue_DSSB,revenue_FG,revenue_FLL,revenue_GH,revenue_GOL6,revenue_GOL7,revenue_KINGSAL
62,KUNNING $8 KUN+1,Shuganya Devi,00:47:11,0,3,0,0,$8 KUN+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63,KUN-+1,Shuganya Devi,00:47:16,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
df3['productBought_KUN'] = df3['productBought_KUN'].map(lambda x:float(8.00) if x == int(1) else 0)

In [147]:
df3['revenue_KUN'] = np.multiply(df3['productBought_KUN'], df3['salesQuantity'])

In [148]:
revenue_KUN = "The total revenue from the sale of the product {} is ${}". format ("KUN", format(df3['revenue_KUN'].sum(), '.2f'))
print(revenue_KUN)


The total revenue from the sale of the product KUN is $16.00


Product MANC

In [149]:
df3[df3['postComment'].str.contains('MANC', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_BTTAIL,revenue_CP,revenue_DSSB,revenue_FG,revenue_FLL,revenue_GH,revenue_GOL6,revenue_GOL7,revenue_KINGSAL,revenue_KUN
89,MANGO CLAMS $8 MANC+1,E-Beve,00:57:12,1,4,0,0,$8 MANC+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [150]:
df3['productBought_MANC'] = df3['productBought_MANC'].map(lambda x:float(8.00) if x == int(1) else 0)

In [151]:
df3['revenue_MANC'] = np.multiply(df3['productBought_MANC'], df3['salesQuantity'])

In [152]:
revenue_MANC = "The total revenue from the sale of the product {} is ${}". format ("MANC", format(df3['revenue_MANC'].sum(), '.2f'))
print(revenue_MANC)


The total revenue from the sale of the product MANC is $0.00


Product RE20

In [153]:
df3[df3['postComment'].str.contains('RE20', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_CP,revenue_DSSB,revenue_FG,revenue_FLL,revenue_GH,revenue_GOL6,revenue_GOL7,revenue_KINGSAL,revenue_KUN,revenue_MANC
65,RED EMPEROR 1.1KG $20 RE20+1,Ace Tan,00:51:01,0,5,0,0,$20 RE20+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
df3['productBought_RE20'] = df3['productBought_RE20'].map(lambda x:float(20.00) if x == int(1) else 0)

In [155]:
df3['revenue_RE20'] = np.multiply(df3['productBought_RE20'], df3['salesQuantity'])

In [156]:
revenue_RE20 = "The total revenue from the sale of the product {} is ${}". format ("RE20", format(df3['revenue_RE20'].sum(), '.2f'))
print(revenue_RE20)


The total revenue from the sale of the product RE20 is $0.00


Product RS

In [157]:
df3[df3['postComment'].str.contains('RS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_DSSB,revenue_FG,revenue_FLL,revenue_GH,revenue_GOL6,revenue_GOL7,revenue_KINGSAL,revenue_KUN,revenue_MANC,revenue_RE20
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,00:01:38,1,5,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,00:02:18,0,5,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,WILD RED SNAPPER 750G $12 RS+1,Veon Veon,00:25:51,0,6,0,0,$12 RS+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [158]:
df3['productBought_RS'] = df3['productBought_RS'].map(lambda x:float(12.00) if x == int(1) else 0)

In [159]:
df3['revenue_RS'] = np.multiply(df3['productBought_RS'], df3['salesQuantity'])

In [160]:
revenue_RS = "The total revenue from the sale of the product {} is ${}". format ("RS", format(df3['revenue_RS'].sum(), '.2f'))
print(revenue_RS)


The total revenue from the sale of the product RS is $12.00


Product SB

In [161]:
df3[df3['postComment'].str.contains('SB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_FG,revenue_FLL,revenue_GH,revenue_GOL6,revenue_GOL7,revenue_KINGSAL,revenue_KUN,revenue_MANC,revenue_RE20,revenue_RS
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,00:05:29,0,5,0,0,$10 DSSB+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,DSSB+1,SJ Huang,00:05:51,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,KUKUP SEABASS $5 SB+1,Serene Rebecca Koh,00:39:54,0,4,0,0,$5 SB+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55,SB+2,E-Beve,00:40:26,1,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [162]:
df3['productBought_SB'] = df3['productBought_SB'].map(lambda x:float(10.00) if x == int(1) else 0)

In [163]:
df3['revenue_SB'] = np.multiply(df3['productBought_SB'], df3['salesQuantity'])

In [164]:
revenue_SB = "The total revenue from the sale of the product {} is ${}". format ("SB", format(df3['revenue_SB'].sum(), '.2f'))
print(revenue_SB)


The total revenue from the sale of the product SB is $20.00


Product SET

In [165]:
df3[df3['postComment'].str.contains('SET', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_FLL,revenue_GH,revenue_GOL6,revenue_GOL7,revenue_KINGSAL,revenue_KUN,revenue_MANC,revenue_RE20,revenue_RS,revenue_SB
67,LNS GIVEAWAY DURI SET,Amir Abdul Majid,00:52:23,0,4,1,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [166]:
df3[df3['postComment'].str.contains('set', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_FLL,revenue_GH,revenue_GOL6,revenue_GOL7,revenue_KINGSAL,revenue_KUN,revenue_MANC,revenue_RE20,revenue_RS,revenue_SB
107,Ok Batang set +1,Serene Rebecca Koh,01:04:04,0,4,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We noticed that there is no sale information with regards to the product code 'SET'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'SET'.

Product ST

In [167]:
df3[df3['postComment'].str.contains('ST', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_FLL,revenue_GH,revenue_GOL6,revenue_GOL7,revenue_KINGSAL,revenue_KUN,revenue_MANC,revenue_RE20,revenue_RS,revenue_SB
36,SOTONG $17 ST+1,Shuganya Devi,00:20:26,0,3,0,0,$17 ST+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57,STINGRAY 1KG $12 STR12+1,Shuganya Devi,00:42:06,0,4,0,0,$12 STR12+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59,STINRGRAY 650G $7 STR7+1,E-Beve,00:45:57,1,4,0,0,$7 STR7+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
df3['productBought_ST'] = df3['productBought_ST'].map(lambda x:float(17.00) if x == int(1) else 0)

In [169]:
df3['revenue_ST'] = np.multiply(df3['productBought_ST'], df3['salesQuantity'])

In [170]:
revenue_ST = "The total revenue from the sale of the product {} is ${}". format ("ST", format(df3['revenue_ST'].sum(), '.2f'))
print(revenue_ST)


The total revenue from the sale of the product ST is $17.00


Product STR12

In [171]:
df3[df3['postComment'].str.contains('STR12', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_GH,revenue_GOL6,revenue_GOL7,revenue_KINGSAL,revenue_KUN,revenue_MANC,revenue_RE20,revenue_RS,revenue_SB,revenue_ST
57,STINGRAY 1KG $12 STR12+1,Shuganya Devi,00:42:06,0,4,0,0,$12 STR12+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
df3['productBought_STR12'] = df3['productBought_STR12'].map(lambda x:float(12.00) if x == int(1) else 0)

In [173]:
df3['revenue_STR12'] = np.multiply(df3['productBought_STR12'], df3['salesQuantity'])

In [174]:
revenue_STR12 = "The total revenue from the sale of the product {} is ${}". format ("STR12", format(df3['revenue_STR12'].sum(), '.2f'))
print(revenue_STR12)


The total revenue from the sale of the product STR12 is $12.00


Product STR7

In [175]:
df3[df3['postComment'].str.contains('STR7', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_GOL6,revenue_GOL7,revenue_KINGSAL,revenue_KUN,revenue_MANC,revenue_RE20,revenue_RS,revenue_SB,revenue_ST,revenue_STR12
59,STINRGRAY 650G $7 STR7+1,E-Beve,00:45:57,1,4,0,0,$7 STR7+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [176]:
df3['productBought_STR7'] = df3['productBought_STR7'].map(lambda x:float(7.00) if x == int(1) else 0)

In [177]:
df3['revenue_STR7'] = np.multiply(df3['productBought_STR7'], df3['salesQuantity'])

In [178]:
revenue_STR7 = "The total revenue from the sale of the product {} is ${}". format ("STR7", format(df3['revenue_STR7'].sum(), '.2f'))
print(revenue_STR7)


The total revenue from the sale of the product STR7 is $0.00


Product WTF

In [179]:
df3[df3['postComment'].str.contains('WTF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_GOL7,revenue_KINGSAL,revenue_KUN,revenue_MANC,revenue_RE20,revenue_RS,revenue_SB,revenue_ST,revenue_STR12,revenue_STR7
14,WHITE THREADFIN $9 WTF+1,E-Beve,00:10:28,1,4,0,0,$9 WTF+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [180]:
df3['productBought_WTF'] = df3['productBought_WTF'].map(lambda x:float(9.00) if x == int(1) else 0)

In [181]:
df3['revenue_WTF'] = np.multiply(df3['productBought_WTF'], df3['salesQuantity'])

In [182]:
revenue_WTF = "The total revenue from the sale of the product {} is ${}". format ("WTF", format(df3['revenue_WTF'].sum(), '.2f'))
print(revenue_WTF)


The total revenue from the sale of the product WTF is $0.00


In [183]:
# iterating the columns
for col in df3.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_AGL18
productBought_AGL8
productBought_BBAGL
productBought_BP
productBought_BT21
productBought_BTTAIL
productBought_CP
productBought_DSSB
productBought_FG
productBought_FLL
productBought_GH
productBought_GOL
productBought_GOL6
productBought_GOL7
productBought_GP
productBought_KINGSAL
productBought_KUN
productBought_MANC
productBought_RE20
productBought_RS
productBought_SB
productBought_SET
productBought_ST
productBought_STR12
productBought_STR7
productBought_WTF
revenue_AGL18
revenue_AGL8
revenue_BBAGL
revenue_BP
revenue_BT21
revenue_BTTAIL
revenue_CP
revenue_DSSB
revenue_FG
revenue_FLL
revenue_GH
revenue_GOL6
revenue_GOL7
revenue_KINGSAL
revenue_KUN
revenue_MANC
revenue_RE20
revenue_RS
revenue_SB
revenue_ST
revenue_STR12
revenue_STR7
revenue_WTF


**Sum of total revenue from the video**

In [184]:
#total revenue from the video
total_revenue = df3.loc[:, 'revenue_AGL18': 'revenue_WTF'].values.sum()

#round the total revenue to 2 decimals places
total_revenue_rounded = format(total_revenue, '.2f')

print(f"The total revenue for the sale of products for the video is ${total_revenue_rounded}")


The total revenue for the sale of products for the video is $350.00


In [185]:
va['totalRevenue'] = total_revenue_rounded
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue
0,ebeveadmin/videos/2999090193679509,29,912,4067,28,343,16,35,23,350.0


**New Column for the total revenue at that comment**

In [186]:
#https://stackoverflow.com/questions/42063716/pandas-sum-up-multiple-columns-into-one-column-without-last-column
df3['revenue'] = df3.loc[:, 'revenue_AGL18': 'revenue_WTF'].sum(axis=1)
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AGL18,productBought_AGL8,...,revenue_KUN,revenue_MANC,revenue_RE20,revenue_RS,revenue_SB,revenue_ST,revenue_STR12,revenue_STR7,revenue_WTF,revenue
0,Good morning.,Firdaus Nordin,00:01:17,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,00:01:38,1,5,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Hi morning sis and admin lns done,E-Beve,00:01:38,1,7,1,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,LNS,Mïššy Danté,00:01:55,0,1,1,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,00:02:18,0,5,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Lns done ✅,Shuganya Devi,00:04:13,0,3,1,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,LnS done😊,Shuganya Devi,00:04:22,0,2,1,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hello,Xian Yang,00:04:40,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Lns ✅,E-Beve,00:05:22,1,2,1,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,00:05:29,0,5,0,0,$10 DSSB+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [187]:
#https://www.geeksforgeeks.org/how-to-move-a-column-to-first-position-in-pandas-dataframe/
#shift the revenue column to be the 8th column, i.e at position 7
eighth_column = df3.pop('revenue')

# insert column using insert(position,column_name,ninth_column) function
df3.insert(7, 'revenue', eighth_column)
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,productPrice,productBought_AGL18,...,revenue_KINGSAL,revenue_KUN,revenue_MANC,revenue_RE20,revenue_RS,revenue_SB,revenue_ST,revenue_STR12,revenue_STR7,revenue_WTF
0,Good morning.,Firdaus Nordin,00:01:17,0,2,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,00:01:38,1,5,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Hi morning sis and admin lns done,E-Beve,00:01:38,1,7,1,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,LNS,Mïššy Danté,00:01:55,0,1,1,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,00:02:18,0,5,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Lns done ✅,Shuganya Devi,00:04:13,0,3,1,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,LnS done😊,Shuganya Devi,00:04:22,0,2,1,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hello,Xian Yang,00:04:40,0,1,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Lns ✅,E-Beve,00:05:22,1,2,1,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,00:05:29,0,5,0,0,0.0,$10 DSSB+1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


After the revenue of the sales have been calculated, all the dummified product columns and the column 'productPrice' will be dropped as they are no longer required to be checked against to identify the price of the product.

In [188]:
df3 = df3.loc[: ,'postComment':'revenue']
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue
0,Good morning.,Firdaus Nordin,00:01:17,0,2,0,0,0.0
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,00:01:38,1,5,0,0,0.0
2,Hi morning sis and admin lns done,E-Beve,00:01:38,1,7,1,0,0.0
3,LNS,Mïššy Danté,00:01:55,0,1,1,0,0.0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,00:02:18,0,5,0,0,0.0
5,Lns done ✅,Shuganya Devi,00:04:13,0,3,1,0,0.0
6,LnS done😊,Shuganya Devi,00:04:22,0,2,1,0,0.0
7,Hello,Xian Yang,00:04:40,0,1,0,0,0.0
8,Lns ✅,E-Beve,00:05:22,1,2,1,0,0.0
9,DEEP SEA SEABASS $10 DSSB+1,SJ Huang,00:05:29,0,5,0,0,0.0


**New Column to identify the frequency of the seller's comments in the video**

In [189]:
#frequency of seller's comments
va['frequencySeller']= np.divide((va['videoLength'].iloc[0]),va['numSellerComments'])
#seller's comment appears on average of every 145 seconds

In [190]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller
0,ebeveadmin/videos/2999090193679509,29,912,4067,28,343,16,35,23,350.0,145.25


**New Column to identify the seller**

In [191]:
df3['seller'] = 'ebeveadmin'

In [192]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Good morning.,Firdaus Nordin,00:01:17,0,2,0,0,0.0,ebeveadmin
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,00:01:38,1,5,0,0,0.0,ebeveadmin
2,Hi morning sis and admin lns done,E-Beve,00:01:38,1,7,1,0,0.0,ebeveadmin
3,LNS,Mïššy Danté,00:01:55,0,1,1,0,0.0,ebeveadmin
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,00:02:18,0,5,0,0,0.0,ebeveadmin


**New Column for the Average Compound Score from Sentiment Analysis on raw & uncleaned comments for video**

In [193]:
# Instantiate Sentiment Intensity Analyzer
sent = SentimentIntensityAnalyzer()

In [194]:
df3['sentiment_score'] = df3['postComment'].apply(sent.polarity_scores)
df3['compound'] = [sent.polarity_scores(x)['compound'] for x in df3['postComment']]
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller,sentiment_score,compound
0,Good morning.,Firdaus Nordin,00:01:17,0,2,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 0.256, 'pos': 0.744, 'compound': 0.4404}",0.4404
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,00:01:38,1,5,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
2,Hi morning sis and admin lns done,E-Beve,00:01:38,1,7,1,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
3,LNS,Mïššy Danté,00:01:55,0,1,1,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,00:02:18,0,5,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0


In [195]:
#average compound scores for the video
#df3.shape[0] calculates the total number of rows in the dataframe
va['averageCompound']= (df3['compound'].sum())/(df3['compound'].sum())/df3.shape[0]
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller,averageCompound
0,ebeveadmin/videos/2999090193679509,29,912,4067,28,343,16,35,23,350.0,145.25,0.008475


In [196]:
#drop the columns with regarding to the sentiment analysis on the raw & uncleaned comments
#as we will perform sentiment analysis at the comment level on processed comments
df3 = df3.loc[: ,'postComment':'seller']
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Good morning.,Firdaus Nordin,00:01:17,0,2,0,0,0.0,ebeveadmin
1,WHATSAPP 88896368 FOR NEW BUYERS,E-Beve,00:01:38,1,5,0,0,0.0,ebeveadmin
2,Hi morning sis and admin lns done,E-Beve,00:01:38,1,7,1,0,0.0,ebeveadmin
3,LNS,Mïššy Danté,00:01:55,0,1,1,0,0.0,ebeveadmin
4,WHATSAPP 88896368 FOR NEW BUYERS,Norsuwali Ali,00:02:18,0,5,0,0,0.0,ebeveadmin


### Saving the cleaned dataframes

In [197]:
# export to csv - change the name of the data file for each video
va.to_csv('../../data/cleaned_data/cleaned_va_ebeveadmin_2999090193679509.csv', index=False)

In [198]:
#check for nulls
#displaying only the columns with nulls and their sum
df3[df3.columns[df3.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [199]:
# export to csv - change the name of the data file for each video
df3.to_csv('../../data/cleaned_data/cleaned_ebeveadmin_2999090193679509.csv', index=False)