# Data Import & Cleaning

### Contents:

- [Data Import](#Data-Import)
- [Data Cleaning](#Data-Cleaning)
- [Feature Engineering](#Feature-Engineering)

### Import Libraries

In [1]:
#import standard libraries
import pandas as pd
import numpy as np

#import emoji
import emoji

from natsort import natsorted, index_natsorted, order_by_index

#import warnings to ignore flags when the project is complete
#import warnings
#warnings.filterwarnings('ignore')

#import pre-processing libraries for data cleaning
import string
import re
import nltk
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Data Import

**Read scrapped data for the following videos**

In [2]:
va = pd.read_csv('../../data/scrapped_data/va_OCEANSTARLIVE_869455797269649.csv')

In [3]:
va

Unnamed: 0,video_for,totalEmojiReaction,views
0,OCEANSTARLIVE/videos/869455797269649,17,439


In [4]:
df = pd.read_csv('../../data/scrapped_data/OCEANSTARLIVE_869455797269649.csv', encoding='utf-8')

In [5]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
0,"Bye <span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""👋"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/t99/2/16/1f44b.png"" width=""16""/></span>",Jane Wong,0:00
1,Oss,Eve Kang,0:40
2,"BIG FISHES!!!</div><div dir=""auto"" style=""text-align: start;"">WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90</div><div dir=""auto"" style=""text-align: start;"">Comment「EMP399+1」below to join the Sale",OceanStar Seafood,0:53
3,No more shells for this mala clams.,Jane Wong,1:00:01
4,Btt399+1,Jennifer Quek,1:00:03
5,"Joy+2</div><div dir=""auto"" style=""text-align: start;"">Jay+2</div><div dir=""auto"" style=""text-align: start;"">Sm+2</div><div dir=""auto"" style=""text-align: start;"">Jom+8</div><div dir=""auto"" style=""text-align: start;"">VNL+1</div><div dir=""auto"" style=""text-align: start;"">Ccl+1</div><div dir=""auto"" style=""text-align: start;"">Jf+1",Winnie Wu,1:00:04
6,Hello I m bk!,Stella Lim,1:00:14
7,Jf+5,Jennifer Quek,1:00:23
8,BTF+1,Jasmine Chua,1:00:26
9,Tmp +1,Jennifer Quek,1:00:36


In [6]:
#https://stackoverflow.com/questions/32072076/find-the-unique-values-in-a-column-and-then-sort-them
#check if the seller uses multiple accounts to reply
postCommentAuthor_unique = df['postCommentAuthor'].unique()
print(sorted(postCommentAuthor_unique))

['Catherine Gan-Chua', 'Catherine Phua', 'Cjwee Surin', 'Clarice Goh', 'Eve Kang', 'Flora Chew', 'Ivy Ng Lee Cheng', 'Ja Ja', 'Jack Daniel', 'Jacqueline Hoe', 'Jane Wong', 'Jasmine Chua', 'Jennie Gan', 'Jennifer Quek', 'Jmie Jmie', 'Judy Tan', 'Juniyati Soh', 'Kelley Lee', 'Khym Hoon Sung', 'Lyn Leong', 'OceanStar Seafood', 'Pat Wong', 'Patrick Ong', 'Sharon Dias-Desmond', 'Sharon Lau', 'Stacey Ho', 'Stella Lim', 'Susan Chee', 'Teddy Teddy', 'Vic Swi', 'Winnie Wu', 'き リーサン', '黄丽娟']


In [7]:
#find comments posted by the seller only
df.loc[df['postCommentAuthor'] == 'OceanStar Seafood']

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
2,"BIG FISHES!!!</div><div dir=""auto"" style=""text-align: start;"">WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90</div><div dir=""auto"" style=""text-align: start;"">Comment「EMP399+1」below to join the Sale",OceanStar Seafood,0:53
10,"2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11</div><div dir=""auto"" style=""text-align: start;"">Comment「TMP+1」below to join the Sale.",OceanStar Seafood,1:01:07
11,"2 X RAW IKAN BILIS 400G/ PKT @ $8.90</div><div dir=""auto"" style=""text-align: start;"">Comment「FIB+1」below to join the Sale",OceanStar Seafood,1:01:31
12,"JAPAN APPLE JUICE 1L @$12.90</div><div dir=""auto"" style=""text-align: start;"">Comment「JA+1」below to join the Sale.",OceanStar Seafood,1:01:42
15,"ORGANIC COOKED SHRIMP 31-40PCS/ PKT @$ 18.80</div><div dir=""auto"" style=""text-align: start;"">Comment「SCP+1」below to join the Sale.",OceanStar Seafood,1:02:30
17,"ORGANIC COOKED SHRIMP 31-40PCS/ PKT @$ 18.80</div><div dir=""auto"" style=""text-align: start;"">Comment「SCP+1」below to join the Sale.",OceanStar Seafood,1:03:18
24,"CHINESE POMFRET 1.85-1.9KG/ PCS @$99.00</div><div dir=""auto"" style=""text-align: start;"">Comment「CP99+1」below to join the Sale",OceanStar Seafood,1:05:16
27,"LAST SET!!!</div><div dir=""auto"" style=""text-align: start;"">LONG DAN HEAD PORTION 700G/ PKT @$19.90</div><div dir=""auto"" style=""text-align: start;"">Comment「LDH+1」below to join the Sale.",OceanStar Seafood,1:05:59
30,"LAST 3 SET!!!</div><div dir=""auto"" style=""text-align: start;"">2 X WILD BARRAMUNDI 450-600G/ PCS @$11.11</div><div dir=""auto"" style=""text-align: start;"">Comment「WB11+1」below to join the Sale",OceanStar Seafood,1:06:25
32,"3 X MALE MUD CRAB 350-450G/ PCS @$29.90</div><div dir=""auto"" style=""text-align: start;"">Comment「MC299+1」below to join the Sale",OceanStar Seafood,1:07:09


In [8]:
#check the comments to have a gauge
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

['$1的波罗蜜可以买几包', '1123 black friday', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「TMP+1」below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「BCP+1」below to join the Sale.', '2 X BABY THREADFIN 300-400G / PCS @ $12.90</div><div dir="auto" style="text-align: start;">Comment「BBT+1」below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90</div><div dir="auto" style="text-align: start;">Comment「BST+1」below to join the Sale', '2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90</div><div dir="auto" style="text-align: start;">Comment「BTT399+1」below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90</div><div dir="auto" style="text-align: start;">Comment「BP+1」below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11</div><div dir="auto" style="text-align: start;">Comment「PCT+1」below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$

## Data Cleaning

### Convert the emojis to text for easy cleaning

We noticed that the emojis have html parsers attached to it. Hence, we will convert the images of the emojis to text first, to remove the html parsers to the emojis, while retaining the emoji's text. We will convert it back to emoji afterwards.

In [9]:
df['postComment'] = df['postComment'].apply(emoji.demojize)

In [10]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

['$1的波罗蜜可以买几包', '1123 black friday', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「TMP+1」below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「BCP+1」below to join the Sale.', '2 X BABY THREADFIN 300-400G / PCS @ $12.90</div><div dir="auto" style="text-align: start;">Comment「BBT+1」below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90</div><div dir="auto" style="text-align: start;">Comment「BST+1」below to join the Sale', '2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90</div><div dir="auto" style="text-align: start;">Comment「BTT399+1」below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90</div><div dir="auto" style="text-align: start;">Comment「BP+1」below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11</div><div dir="auto" style="text-align: start;">Comment「PCT+1」below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$

### Clean the comments using Regex

From the above unique values of the comments, there are several cleaning issues that needs to be addressed.

    1. Removal of links or URLs. 
        - URLs are present when the Facebook users manually type a link in the comment. 
        - Additionally, when users are tagged, their Facebook profile URL is printed as a result.
        - Similarly, there is a unique URL linked to each emoji as well. 
        
    2. Removal of HTML special entities
        - Examples of HTML special entities are '&amp' and '&gt'. 
        
    3. Removal of other HTML special terms
        - Examples of other HTML special entities are whitespace HTML special entities like '#x200B' and '#xa0'.
        
    4. Removal of HTML Parsers to the Emojis
        - Previously, the emojis have been demojized to text already. However, the HTML parsers to the mojis remain. Hence, they are required to be removed as well.
        
    5. Removal of HTML Parcers to Whitespaces
        - When a next line is entered in the same comment, there will be a HTML parser to the this whitespace. Hence, this are to be removed as well.
        
    6. Removal of HTML Parsers to tagged names
        - When users are tagged in the comments, in addition to their Facebook profile URL, there will be HTML parsers to the tagged names as well. Hence, this are to be removed as well.
        
    7. Removsl of other HTML Parsers   

In [11]:
def clean(row):
    
    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*\.png', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove HTML special entities (e.g.. &amp, &gt;)
    row['postComment'] = re.sub(
        pattern=r'\&\w*;',
        repl='',
        string=row['postComment'])

    # Remove emoji html parsers
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'<img\salt=\"(\:\w*)(\-?)(\w*\:)\"\s[a-z]{6}\=\"\d\d\"\s[a-z]{14}\=.{26}\s[a-z]{3}\=\"\"\s[a-z]{5}\=\"\d\d\"\/>(<\/span>)?',
        repl=r'\1\2\3', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove whitespaces
    row['postComment'] = re.sub(
        pattern=r'<\/div><div\sdir\=\"auto\"\sstyle\=\"text\-align\:\sstart\;\">',
        repl=' ',
        string=row['postComment'],
        flags=re.M)
    
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern = r'<span\sclass\=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\">',
        repl=r' ',
        string=row['postComment'],
        flags=re.M)

    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern = r'<div\sclass\=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\".*<\/div>',
        repl=r' ',
        string=row['postComment'],
        flags=re.M)
    
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern=r'<\/span>', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)
 
    # Remove consecutive non-ASCII characters
    # This will remove the chinese comments
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'[^\x00-\x7F]+', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)

    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*tabindex\=\"\d\"', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<a class="oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 nc684nl6 p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl oo9gr5id gpro0wi8 lrazzd5p" href=">',
        repl='',
        string=row['postComment'],
        flags=re.M)    
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<span\sclass\=\"([a-z0-9]{8})\">',
        repl='',
        string=row['postComment'],
        flags=re.M)
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<span.*<\/a>',
        repl='',
        string=row['postComment'],
        flags=re.M)
    
    return row
    

In [12]:
df = df.apply(clean, axis=1)

In [13]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' ', '  :grinning_face_with_big_eyes:', '  :grinning_squinting_face:', '  :loudly_crying_face:', '  :waving_hand:', ' 5 needs 2 pkts', ' :face_screaming_in_fear:', ' :grinning_squinting_face:', ' :rolling_on_the_floor_laughing:', ' QM ', ' Qm', ' is jackfruit ', '$1 ', '1123 black friday', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.', '2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale', '2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90 Comment BTT399+1 below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 below to join the Sale.', '2 X IQF CHICKEN MID JOINT 450G+-/ 

**Convert encoded emoji text back to emojis**

In [14]:
df['postComment'] = df['postComment'].apply(emoji.emojize)

In [15]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' ', '  👋', '  😃', '  😆', '  😭', ' 5 needs 2 pkts', ' QM ', ' Qm', ' is jackfruit ', ' 😆', ' 😱', ' 🤣', '$1 ', '1123 black friday', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.', '2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale', '2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90 Comment BTT399+1 below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 below to join the Sale.', '2 X IQF CHICKEN MID JOINT 450G+-/ P@ $9.90 Comment MJ+1 below to join the Sale', '2 X KASU RABBIT FISH 500G+-/PKT @ $18.00 Comment RB18+1 below to join the Sale', '2 X KUNNING 500G/ PKT @$11.90 

### Drop Empty Posts

Blank comments are dropped to ensure that unique comments are obtained.

In [16]:
#drop empty posts
df = df.loc[((df['postComment'] != ' ')),:]

In [17]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

['  👋', '  😃', '  😆', '  😭', ' 5 needs 2 pkts', ' QM ', ' Qm', ' is jackfruit ', ' 😆', ' 😱', ' 🤣', '$1 ', '1123 black friday', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.', '2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale', '2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90 Comment BTT399+1 below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 below to join the Sale.', '2 X IQF CHICKEN MID JOINT 450G+-/ P@ $9.90 Comment MJ+1 below to join the Sale', '2 X KASU RABBIT FISH 500G+-/PKT @ $18.00 Comment RB18+1 below to join the Sale', '2 X KUNNING 500G/ PKT @$11.90 Comme

### Reindexing the dataframe 
**New Column to reindex the dataframe in accordance to time**

From the data, we can tell that there is an inconsistent timestamp being used in the column 'postCommentTime'. For example, there are times like '0:57' and '1:00:14' which indicates 0 hour 0 mins 57 secs, and 1 hour 0 mins 14 secs. 

Hence, a new column 'postCommentTime_final' is created to ensure that a timestamp of HH:MM:SS is being used consistently throughout. However, we note that the number of days is being included for TimeDeltaIndex.

As a result, we will read the timestamp, by excluding the number of days.  

In [18]:
#TimedeltaIndex
#https://stackoverflow.com/questions/54877467/pandas-convert-hhmm-and-hhmmss-to-standard-hhmmss-in-python
# for example, time of '0:57' will then be 00:00:57; 0 hours 0 mins 57 secs
df['postCommentTime_final'] = pd.to_timedelta(np.where(df['postCommentTime'].str.count(':') == 1, '00:' + df['postCommentTime'], df['postCommentTime']))

In [19]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Bye 👋,Jane Wong,0:00,0 days 00:00:00
1,Oss,Eve Kang,0:40,0 days 00:00:40
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,0 days 00:00:53
3,No more shells for this mala clams.,Jane Wong,1:00:01,0 days 01:00:01
4,Btt399+1,Jennifer Quek,1:00:03,0 days 01:00:03


In [20]:
df['postCommentTime_final'] = df['postCommentTime_final'].astype(str).map(lambda x: x[7:])

In [21]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Bye 👋,Jane Wong,0:00,00:00:00
1,Oss,Eve Kang,0:40,00:00:40
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53
3,No more shells for this mala clams.,Jane Wong,1:00:01,01:00:01
4,Btt399+1,Jennifer Quek,1:00:03,01:00:03
5,Joy+2 Jay+2 Sm+2 Jom+8 VNL+1 Ccl+1 Jf+1,Winnie Wu,1:00:04,01:00:04
6,Hello I m bk!,Stella Lim,1:00:14,01:00:14
7,Jf+5,Jennifer Quek,1:00:23,01:00:23
8,BTF+1,Jasmine Chua,1:00:26,01:00:26
9,Tmp +1,Jennifer Quek,1:00:36,01:00:36


In [22]:
#reindex according to postCommentTime_final
#previous natsort in data collection didnt take into account the different timestamp format
df = df.reindex(index=order_by_index(df.index, index_natsorted(df.postCommentTime_final)))

In [23]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Bye 👋,Jane Wong,0:00,00:00:00
1,Oss,Eve Kang,0:40,00:00:40
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53
28,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,1:06,00:01:06
42,OSS,Jack Daniel,1:10,00:01:10
46,Good morning OSS,Khym Hoon Sung,1:11,00:01:11
65,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:15,00:01:15
84,Lns,Teddy Teddy,1:20,00:01:20
95,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,1:33,00:01:33
96,Morning,Stacey Ho,1:37,00:01:37


In [24]:
#reset the index for the dataframe
#https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-dataframe

df = df.reset_index(drop=True)

In [25]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Bye 👋,Jane Wong,0:00,00:00:00
1,Oss,Eve Kang,0:40,00:00:40
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,1:06,00:01:06
4,OSS,Jack Daniel,1:10,00:01:10
5,Good morning OSS,Khym Hoon Sung,1:11,00:01:11
6,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:15,00:01:15
7,Lns,Teddy Teddy,1:20,00:01:20
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,1:33,00:01:33
9,Morning,Stacey Ho,1:37,00:01:37


**Obtain the length of the video**

Assuming that the last comment of the video is the total length of the video, we will input the length of the video from the comments dataframe into the video attributes dataframe.

In [26]:
#retrieve last comment to obtain the length of the video
df['postCommentTime_final'].iloc[-1]

'01:23:36'

In [27]:
#https://stackoverflow.com/questions/6402812/how-to-convert-an-hmmss-time-string-to-seconds-in-python
def get_sec(time_str):
    """Get Seconds from time."""
    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

In [28]:
#retrieve last comment to obtain the length of the video in seconds
va['videoLength']= get_sec(df['postCommentTime_final'].iloc[-1])

In [29]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength
0,OCEANSTARLIVE/videos/869455797269649,17,439,5016


## Feature Engineering

### Comments made by the Seller

**New Column to identify the number of comments made by the seller in the video**

From the total sum of comments made by the seller, we will input it into the video attributes dataframe.

In [30]:
(df['postCommentAuthor']=='OceanStar Seafood').sum()

119

In [31]:
va['numSellerComments'] = (df['postCommentAuthor']=='OceanStar Seafood').sum()

In [32]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments
0,OCEANSTARLIVE/videos/869455797269649,17,439,5016,119


**New Column to identify if the comment is made by the Seller or not**

In [33]:
#to delete column 'notSeller' in due course
df['notSeller'] = df['postCommentAuthor'].map(lambda x:1 if x !='OceanStar Seafood' else 0)

In [34]:
#create a new column to show if the comment is made by the seller or not
df['isSeller'] = df['postCommentAuthor'].map(lambda x:1 if x =='OceanStar Seafood' else 0)

In [35]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller
0,Bye 👋,Jane Wong,0:00,00:00:00,1,0
1,Oss,Eve Kang,0:40,00:00:40,1,0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53,0,1
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,1:06,00:01:06,1,0
4,OSS,Jack Daniel,1:10,00:01:10,1,0


In [36]:
df['isSeller'].value_counts()

0    182
1    119
Name: isSeller, dtype: int64

In [37]:
#show all the seller's comments
df.loc[df['isSeller'] == 1]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53,0,1
6,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:15,00:01:15,0,1
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,1:33,00:01:33,0,1
10,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,1:43,00:01:43,0,1
11,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,1:51,00:01:51,0,1
12,PONTIAN WHITE THREADFIN 1.8-2.0 KG/ PCS @$29.90 Comment PWT299+1 below to join the Sale,OceanStar Seafood,2:00,00:02:00,0,1
14,LAST 5 PCS!!! MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,2:13,00:02:13,0,1
16,2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale,OceanStar Seafood,2:22,00:02:22,0,1
17,SALMON TROUT WHOLE 4.0-4.3 KG/ PCS @$108.00 Comment STW+1 below to join the Sale,OceanStar Seafood,2:30,00:02:30,0,1
18,WILD BARRAMUNDI WHOLE 3.5-3.9 KG/ PCS @ $49.90 Comment WB+1 below to join the Sale,OceanStar Seafood,2:38,00:02:38,0,1


### Length of comments

**New Column to identify the length of each comment**

From the comments, we take the length of each comment as the total number of words each comment has.

In [38]:
#length of each comment
#https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe
df['postCommentLength'] = df['postComment'].str.split().str.len()

In [39]:
df.head(10)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength
0,Bye 👋,Jane Wong,0:00,00:00:00,1,0,2
1,Oss,Eve Kang,0:40,00:00:40,1,0,1
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53,0,1,15
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,1:06,00:01:06,1,0,6
4,OSS,Jack Daniel,1:10,00:01:10,1,0,1
5,Good morning OSS,Khym Hoon Sung,1:11,00:01:11,1,0,3
6,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:15,00:01:15,0,1,13
7,Lns,Teddy Teddy,1:20,00:01:20,1,0,1
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,1:33,00:01:33,0,1,17
9,Morning,Stacey Ho,1:37,00:01:37,1,0,1


**New Column to identify the total number of comments in the video**

From the total number of comments in the video, we will input it into the video attributes dataframe.

In [40]:
#total number of comments
df['postCommentLength'].sum()

2508

In [41]:
va['numComments'] = df['postCommentLength'].sum()

In [42]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments
0,OCEANSTARLIVE/videos/869455797269649,17,439,5016,119,2508


### LNS

LNS is an acronym that stands for 'like and share'. It is a form of customer engagement as it indicates by the customers to the sellers that they have liked and shared the video on their Facebook wall. 

**New Column to identify if Customers are engaging in liking and sharing the video**

In [43]:
#if the customer has commented 'lns' or 'ls' which stands for 'like & shared' & 'like shared' respectively
def lns(comment):
    if re.search(r'(l)(n?)(s)', comment, re.IGNORECASE):
        return int(1)
    else:
        return int(0)

In [44]:
df['lns'] = df['postComment'].map(lambda x:lns(x))

In [45]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns
0,Bye 👋,Jane Wong,0:00,00:00:00,1,0,2,0
1,Oss,Eve Kang,0:40,00:00:40,1,0,1,0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53,0,1,15,0
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,1:06,00:01:06,1,0,6,0
4,OSS,Jack Daniel,1:10,00:01:10,1,0,1,0


**New Column to identify if the number of Customers who explicitly inform the sellers that they are engaging in liking and sharing the video**

In [46]:
#range of customer's engagement for LNS
df['lns'].value_counts()

0    291
1     10
Name: lns, dtype: int64

In [47]:
(df['lns']==1).sum()

10

In [48]:
va['lnsQuantity'] = (df['lns']==1).sum()

In [49]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity
0,OCEANSTARLIVE/videos/869455797269649,17,439,5016,119,2508,10


## Sales Quantity

**New Columns to identify the quantity of sales made**

From the comments, using regex, we first see an overview of the comments that are related to the sale of the products. 

In [50]:
#products offered by the seller
df[df['postComment'].str.contains('(\w*\+\d)', regex=True)]

  df[df['postComment'].str.contains('(\w*\+\d)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53,0,1,15,0
6,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:15,00:01:15,0,1,13,0
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,1:33,00:01:33,0,1,17,0
10,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,1:43,00:01:43,0,1,13,0
11,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,1:51,00:01:51,0,1,14,0
12,PONTIAN WHITE THREADFIN 1.8-2.0 KG/ PCS @$29.90 Comment PWT299+1 below to join the Sale,OceanStar Seafood,2:00,00:02:00,0,1,14,0
14,LAST 5 PCS!!! MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,2:13,00:02:13,0,1,15,0
16,2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale,OceanStar Seafood,2:22,00:02:22,0,1,19,0
17,SALMON TROUT WHOLE 4.0-4.3 KG/ PCS @$108.00 Comment STW+1 below to join the Sale,OceanStar Seafood,2:30,00:02:30,0,1,14,0
18,WILD BARRAMUNDI WHOLE 3.5-3.9 KG/ PCS @ $49.90 Comment WB+1 below to join the Sale,OceanStar Seafood,2:38,00:02:38,0,1,15,0


In [51]:
def sale(comment):
    if re.search(r'(\w*\+)(\d)', comment):
        return int(re.search(r'\w*\+\d', comment).group(0)[-1])
    else:
        return int(0)

In [52]:
df['sales'] = df['postComment'].apply(lambda x:sale(x))

In [53]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales
0,Bye 👋,Jane Wong,0:00,00:00:00,1,0,2,0,0
1,Oss,Eve Kang,0:40,00:00:40,1,0,1,0,0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53,0,1,15,0,1
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,1:06,00:01:06,1,0,6,0,0
4,OSS,Jack Daniel,1:10,00:01:10,1,0,1,0,0


In [54]:
#sales made by the customers; exclude the seller's comments on the codes for the sales
#https://stackoverflow.com/questions/19914937/applying-function-with-multiple-arguments-to-create-a-new-pandas-column
df['salesQuantity'] = np.multiply(df['notSeller'], df['sales'])

In [55]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity
0,Bye 👋,Jane Wong,0:00,00:00:00,1,0,2,0,0,0
1,Oss,Eve Kang,0:40,00:00:40,1,0,1,0,0,0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53,0,1,15,0,1,0
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,1:06,00:01:06,1,0,6,0,0,0
4,OSS,Jack Daniel,1:10,00:01:10,1,0,1,0,0,0


In [56]:
#range of sales quantity
df['salesQuantity'].value_counts()

0    252
1     35
5      7
2      5
3      2
Name: salesQuantity, dtype: int64

In [57]:
#total number of orders made
df['salesQuantity'].sum()

86

In [58]:
va['salesQuantity'] = df['salesQuantity'].sum()

In [59]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity
0,OCEANSTARLIVE/videos/869455797269649,17,439,5016,119,2508,10,86


## Products 

The seller will comment and post the unique product codes for each product. Thereafter, customers who are keen on purchasing the products will explicitly comment out the specific & unique product codes, in addition to the quanityt of each product that they wish to purchase.

**New Columns to identify the products purchased by the Customers**

Regex is used to identify the products being offered and the products being purchased as well.

In [60]:
#function to identify the code of the product bought
def sale2(comment):
    if re.search(r'[a-zA-Z]*\s?(\d*)?\s?\+\s?\d', comment):
        return str(re.search(r'[a-zA-Z]*\s?(\d*)?\s?\+\s?\d', comment).group(0)[:-2])
    else:
        return int(0)

In [61]:
#identifies all comments that have the codes of the products, including the seller's comments.
#this column will be dropped afterwards.
df['product'] = df['postComment'].apply(lambda x:sale2(x))

In [62]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product
0,Bye 👋,Jane Wong,0:00,00:00:00,1,0,2,0,0,0,0
1,Oss,Eve Kang,0:40,00:00:40,1,0,1,0,0,0,0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53,0,1,15,0,1,0,EMP399
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,1:06,00:01:06,1,0,6,0,0,0,0
4,OSS,Jack Daniel,1:10,00:01:10,1,0,1,0,0,0,0
5,Good morning OSS,Khym Hoon Sung,1:11,00:01:11,1,0,3,0,0,0,0
6,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:15,00:01:15,0,1,13,0,1,0,EMP399
7,Lns,Teddy Teddy,1:20,00:01:20,1,0,1,1,0,0,0
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,1:33,00:01:33,0,1,17,0,1,0,RES249
9,Morning,Stacey Ho,1:37,00:01:37,1,0,1,0,0,0,0


In [63]:
#products bought by Customers; exclude the seller's comments on the product details 
df['productBought'] = np.multiply(df['notSeller'], df['product'])

In [64]:
df['productBought'].unique()

array([0, '', 'BTT399', 'JF', 'PR', 'BSS', 'QM', 'KBP', 'Qm ', 'RES249',
       'QM +', 'WAK', 'BLJ', 'Bbt', 'Mca', 'Joy', 'Sm', 'SM', 'TMP',
       'Amb', 'JAY', 'JOY', 'MFC', 'Btt399', 'Jf', 'BTF', 'Tmp ', 'Mfc',
       'Scp', 'Bcp', 'Sp', 'WAK ', 'AMB', 'Mc299', 'Jt '], dtype=object)

In [65]:
#https://stackoverflow.com/questions/13445241/replacing-blank-values-white-space-with-nan-in-pandas
df['productBought'] = df['productBought'].replace(r'^\s*$', int(0), regex=True)

In [66]:
df['productBought'].unique()

array([0, 'BTT399', 'JF', 'PR', 'BSS', 'QM', 'KBP', 'Qm ', 'RES249',
       'QM +', 'WAK', 'BLJ', 'Bbt', 'Mca', 'Joy', 'Sm', 'SM', 'TMP',
       'Amb', 'JAY', 'JOY', 'MFC', 'Btt399', 'Jf', 'BTF', 'Tmp ', 'Mfc',
       'Scp', 'Bcp', 'Sp', 'WAK ', 'AMB', 'Mc299', 'Jt '], dtype=object)

In [67]:
#remove whitespaces
df['productBought'] = df['productBought'].str.replace(" ","")

In [68]:
df['productBought'].unique()

array([nan, 'BTT399', 'JF', 'PR', 'BSS', 'QM', 'KBP', 'Qm', 'RES249',
       'QM+', 'WAK', 'BLJ', 'Bbt', 'Mca', 'Joy', 'Sm', 'SM', 'TMP', 'Amb',
       'JAY', 'JOY', 'MFC', 'Btt399', 'Jf', 'BTF', 'Tmp', 'Mfc', 'Scp',
       'Bcp', 'Sp', 'AMB', 'Mc299', 'Jt'], dtype=object)

In [69]:
df.update(df[['productBought']].fillna(0))

In [70]:
df['productBought'].unique()

array([0, 'BTT399', 'JF', 'PR', 'BSS', 'QM', 'KBP', 'Qm', 'RES249', 'QM+',
       'WAK', 'BLJ', 'Bbt', 'Mca', 'Joy', 'Sm', 'SM', 'TMP', 'Amb', 'JAY',
       'JOY', 'MFC', 'Btt399', 'Jf', 'BTF', 'Tmp', 'Mfc', 'Scp', 'Bcp',
       'Sp', 'AMB', 'Mc299', 'Jt'], dtype=object)

In [71]:
#change the produce codes to be uppercase for consistency
#https://stackoverflow.com/questions/39512002/convert-whole-dataframe-from-lower-case-to-upper-case-with-pandas
df['productBought'] = df['productBought'].astype(str).str.upper()

In [72]:
df['productBought'].unique()

array(['0', 'BTT399', 'JF', 'PR', 'BSS', 'QM', 'KBP', 'RES249', 'QM+',
       'WAK', 'BLJ', 'BBT', 'MCA', 'JOY', 'SM', 'TMP', 'AMB', 'JAY',
       'MFC', 'BTF', 'SCP', 'BCP', 'SP', 'MC299', 'JT'], dtype=object)

In [73]:
df['productBought'] = df['productBought'].map(lambda x: re.sub(r'\W+', '', x))

In [74]:
df['productBought'].unique()

array(['0', 'BTT399', 'JF', 'PR', 'BSS', 'QM', 'KBP', 'RES249', 'WAK',
       'BLJ', 'BBT', 'MCA', 'JOY', 'SM', 'TMP', 'AMB', 'JAY', 'MFC',
       'BTF', 'SCP', 'BCP', 'SP', 'MC299', 'JT'], dtype=object)

In [75]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought
0,Bye 👋,Jane Wong,0:00,00:00:00,1,0,2,0,0,0,0,0
1,Oss,Eve Kang,0:40,00:00:40,1,0,1,0,0,0,0,0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53,0,1,15,0,1,0,EMP399,0
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,1:06,00:01:06,1,0,6,0,0,0,0,0
4,OSS,Jack Daniel,1:10,00:01:10,1,0,1,0,0,0,0,0
5,Good morning OSS,Khym Hoon Sung,1:11,00:01:11,1,0,3,0,0,0,0,0
6,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:15,00:01:15,0,1,13,0,1,0,EMP399,0
7,Lns,Teddy Teddy,1:20,00:01:20,1,0,1,1,0,0,0,0
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,1:33,00:01:33,0,1,17,0,1,0,RES249,0
9,Morning,Stacey Ho,1:37,00:01:37,1,0,1,0,0,0,0,0


### Price of Products

**New Column to identify the price of the products**

This new column is created to identify the regex of the unique product codes and their corresponding prices, as adviced by the seller.

In [76]:
def price(comment):
    if re.search(r'(\@)(\$)( ?)(.*)', comment):
        return str(re.search(r'(\$)( ?)(.*)', comment).group(0)[:-23])
    else:
        return int(0)

In [77]:
df['productPrice'] = df['postComment'].apply(lambda x:price(x))

In [78]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought,productPrice
0,Bye 👋,Jane Wong,0:00,00:00:00,1,0,2,0,0,0,0,0,0
1,Oss,Eve Kang,0:40,00:00:40,1,0,1,0,0,0,0,0,0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53,0,1,15,0,1,0,EMP399,0,$39.90 Comment EMP399+1
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,1:06,00:01:06,1,0,6,0,0,0,0,0,0
4,OSS,Jack Daniel,1:10,00:01:10,1,0,1,0,0,0,0,0,0
5,Good morning OSS,Khym Hoon Sung,1:11,00:01:11,1,0,3,0,0,0,0,0,0
6,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:15,00:01:15,0,1,13,0,1,0,EMP399,0,$39.90 Comment EMP399+1
7,Lns,Teddy Teddy,1:20,00:01:20,1,0,1,1,0,0,0,0,0
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,1:33,00:01:33,0,1,17,0,1,0,RES249,0,$24.90 Comment RES249+1
9,Morning,Stacey Ho,1:37,00:01:37,1,0,1,0,0,0,0,0,0


We noticed that each comment has a word 'Comment' in the middle of the extracted string of comments for the column 'productPrice'. Hence, we will remove the mentioned word.

In [79]:
df['productPrice'] = df['productPrice'].replace(value='', regex=r'Comment')

In [80]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought,productPrice
0,Bye 👋,Jane Wong,0:00,00:00:00,1,0,2,0,0,0,0,0,0
1,Oss,Eve Kang,0:40,00:00:40,1,0,1,0,0,0,0,0,0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:53,00:00:53,0,1,15,0,1,0,EMP399,0,$39.90 EMP399+1
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,1:06,00:01:06,1,0,6,0,0,0,0,0,0
4,OSS,Jack Daniel,1:10,00:01:10,1,0,1,0,0,0,0,0,0
5,Good morning OSS,Khym Hoon Sung,1:11,00:01:11,1,0,3,0,0,0,0,0,0
6,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:15,00:01:15,0,1,13,0,1,0,EMP399,0,$39.90 EMP399+1
7,Lns,Teddy Teddy,1:20,00:01:20,1,0,1,1,0,0,0,0,0
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,1:33,00:01:33,0,1,17,0,1,0,RES249,0,$24.90 RES249+1
9,Morning,Stacey Ho,1:37,00:01:37,1,0,1,0,0,0,0,0,0


In [81]:
df['productPrice'].unique()

array([0, '$39.90  EMP399+1', '$24.90  RES249+1', '$78.00  BTW78+1',
       '$168.00  BTW168+1', '$29.90  PWT299+1', '$9.90  MAC99+1',
       '$108.00  STW+1', '$39.90  WG399+1', '$35.00  RG35+1',
       '$58.00  RG58+1', '$88.00  RG88+1', '$19.90  LDH+1 ',
       '$22.90  RST+1', '$24.90  WAK+1 ', '$ 18.80  SCP+1 ',
       '$13.90  CCL+1 ', '$13.90  SBY+1 ', '$13.90  VNL+1 ',
       '$13.90  MCA+1', '$11.11  TMP+1 ', '$29.90  MC299+1',
       '$19.90  SLD+1 ', '$39.90  BTT399+1', '$10.00  QM+1 ',
       '$24.00  BSS+1', '$12.90 ( PWP )  BTF+1 ', '$ 5.90  KBP+1',
       '$99.00  CP99+1', '$88.00  CP88+1', '$77.00  CP77+1',
       '$22.00    CP22+1', '$11.11  BCP+1 ', '$12.90  WGH+1',
       '$19.90  WGF+1', '$12.00  SK+1', '$13.90  MCA+1 ', '$2.40  JAY+1 ',
       '$2.40  JOY+1 ', '$15.90  BP+1', '$11.11  AMB+1', '$13.90  BST+1',
       '$12.90  JA+1 ', '$11.11  WB11+1', '$11.90  KN119+1 ',
       '$9.90  FG+1', '$36.00  STF+1 ', '$59.90  FCF+1 '], dtype=object)

Excluding the comments where there was no product codes as adviced by the seller, we find the number of unique products offered by the seller.

In [82]:
#number of unique products offered by the seller
int(df['productPrice'].nunique()) - int(1)

47

In [83]:
#total number of products offered
va['numProducts'] = int(df['productPrice'].nunique()) - int(1)

In [84]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts
0,OCEANSTARLIVE/videos/869455797269649,17,439,5016,119,2508,10,86,47


**Drop irrelevant columns**

The following column was dropped for the following reasons:

2. 'notSeller'
- This column 'notSeller' was solely created to calculate the quantity of sales made, and the products purchased by the customer. Hence, we are able to delete it after the quantity of sales have been calculated and the identification of the products purchased by the customers have been identified.
- Notwithstanding the above, a new column 'isSeller' has been feature engineered out as well, which will tell us the same results on whether the comment is posted & written by a seller or not. 

3. 'sales'
- This column identifies the quantity of products to the unique & specific product codes. However, it includes the product codes posted by the sellers as well. Hence, this column was solely created to be multiplied against the column 'notSeller' to calculate the true quantity of sales made by customers only. Hence, we are able to delete it after the quantity of sales have been calculated.

4. 'product'
- This column 'product' was solely created to identify the products purchased by the customers. Hence, we are able to delete it after the products purchased by the customers have been identified - especially since not all products offered by the seller is being bought by the customers.

In [85]:
#drop unwanted columns
df.drop(['postCommentTime', 'notSeller', 'sales', 'product'], axis=1, inplace=True)

In [86]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought,productPrice
0,Bye 👋,Jane Wong,00:00:00,0,2,0,0,0,0
1,Oss,Eve Kang,00:00:40,0,1,0,0,0,0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:53,1,15,0,0,0,$39.90 EMP399+1
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,00:01:06,0,6,0,0,0,0
4,OSS,Jack Daniel,00:01:10,0,1,0,0,0,0


### Revenue from the sale of the products

**Dummify the products bought to find the revenue**

In [87]:
#getdummies the products bought
df = pd.get_dummies(df, columns = ['productBought'], drop_first = True)

In [88]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_MCA,productBought_MFC,productBought_PR,productBought_QM,productBought_RES249,productBought_SCP,productBought_SM,productBought_SP,productBought_TMP,productBought_WAK
0,Bye 👋,Jane Wong,00:00:00,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Oss,Eve Kang,00:00:40,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:53,1,15,0,0,$39.90 EMP399+1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,00:01:06,0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,OSS,Jack Daniel,00:01:10,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
# iterating the columns
for col in df.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_AMB
productBought_BBT
productBought_BCP
productBought_BLJ
productBought_BSS
productBought_BTF
productBought_BTT399
productBought_JAY
productBought_JF
productBought_JOY
productBought_JT
productBought_KBP
productBought_MC299
productBought_MCA
productBought_MFC
productBought_PR
productBought_QM
productBought_RES249
productBought_SCP
productBought_SM
productBought_SP
productBought_TMP
productBought_WAK


After the column 'productBought' has been dummified, the cells will return a '1' if that particular item is bought, and a '0' if it is not. Hence, we will replace all the '1' with the price of the product.

Then, we will create a new revenue column which is a multiplication of the column 'salesQuantity' against the price of the product (i.e. the dummified 'productBought' columnns).

Product AMB

In [90]:
df[df['postComment'].str.contains('AMB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_MCA,productBought_MFC,productBought_PR,productBought_QM,productBought_RES249,productBought_SCP,productBought_SM,productBought_SP,productBought_TMP,productBought_WAK
200,AUS MELTIQUE BEEF STEAK 200G+-/ PCS @$11.11 Comment AMB+1 below to join the Sale,OceanStar Seafood,00:56:20,1,14,0,0,$11.11 AMB+1,0,0,...,0,0,0,0,0,0,0,0,0,0
244,AMB+2,Jane Wong,01:07:11,0,1,0,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
df['productBought_AMB'] = df['productBought_AMB'].map(lambda x:float(11.11) if x == int(1) else 0)

In [92]:
df['revenue_AMB'] = np.multiply(df['productBought_AMB'], df['salesQuantity'])

In [93]:
revenue_AMB = "The total revenue from the sale of the product {} is ${}". format ("AMB", format(df['revenue_AMB'].sum(), '.2f'))
print(revenue_AMB)

The total revenue from the sale of the product AMB is $44.44


Product BBT

In [94]:
df[df['postComment'].str.contains('BBT', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_MFC,productBought_PR,productBought_QM,productBought_RES249,productBought_SCP,productBought_SM,productBought_SP,productBought_TMP,productBought_WAK,revenue_AMB
179,2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale,OceanStar Seafood,00:51:27,1,16,0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
275,2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale,OceanStar Seafood,01:15:39,1,16,0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [95]:
df['productBought_BBT'] = df['productBought_BBT'].map(lambda x:float(12.90) if x == int(1) else 0)

In [96]:
df['revenue_BBT'] = np.multiply(df['productBought_BBT'], df['salesQuantity'])

In [97]:
revenue_BBT = "The total revenue from the sale of the product {} is ${}". format ("BBT", format(df['revenue_BBT'].sum(), '.2f'))
print(revenue_BBT)


The total revenue from the sale of the product BBT is $12.90


Product BCP

In [98]:
df[df['postComment'].str.contains('BCP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_PR,productBought_QM,productBought_RES249,productBought_SCP,productBought_SM,productBought_SP,productBought_TMP,productBought_WAK,revenue_AMB,revenue_BBT
135,2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.,OceanStar Seafood,00:35:18,1,15,0,0,$11.11 BCP+1,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
154,2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.,OceanStar Seafood,00:41:54,1,15,0,0,$11.11 BCP+1,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [99]:
df['productBought_BCP'] = df['productBought_BCP'].map(lambda x:float(11.11) if x == int(1) else 0)

In [100]:
df['revenue_BCP'] = np.multiply(df['productBought_BCP'], df['salesQuantity'])

In [101]:
revenue_BCP = "The total revenue from the sale of the product {} is ${}". format ("BCP", format(df['revenue_BCP'].sum(), '.2f'))
print(revenue_BCP)


The total revenue from the sale of the product BCP is $11.11


Product BLJ

In [102]:
df[df['postComment'].str.contains('BLJ', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_QM,productBought_RES249,productBought_SCP,productBought_SM,productBought_SP,productBought_TMP,productBought_WAK,revenue_AMB,revenue_BBT,revenue_BCP
160,BLJ+1,Ja Ja,00:44:21,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


Product BSS

In [103]:
df[df['postComment'].str.contains('BSS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_QM,productBought_RES249,productBought_SCP,productBought_SM,productBought_SP,productBought_TMP,productBought_WAK,revenue_AMB,revenue_BBT,revenue_BCP
120,3 X BATANG SOUP SLICED 300G/ PKT @$24.00 Comment BSS+1 below to join the Sale,OceanStar Seafood,00:31:09,1,15,0,0,$24.00 BSS+1,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
122,MUST GRAB!!! 3 X BATANG SOUP SLICED 300G/ PKT @$24.00 Comment BSS+1 below to join the Sale,OceanStar Seafood,00:31:24,1,17,0,0,$24.00 BSS+1,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
123,BSS+1,Susan Chee,00:31:46,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
128,MUST GRAB!!! 3 X BATANG SOUP SLICED 300G/ PKT @$24.00 Comment BSS+1 below to join the Sale,OceanStar Seafood,00:33:51,1,17,0,0,$24.00 BSS+1,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [104]:
df['productBought_BSS'] = df['productBought_BSS'].map(lambda x:float(24.00) if x == int(1) else 0)

In [105]:
df['revenue_BSS'] = np.multiply(df['productBought_BSS'], df['salesQuantity'])

In [106]:
revenue_BSS = "The total revenue from the sale of the product {} is ${}". format ("BSS", format(df['revenue_BSS'].sum(), '.2f'))
print(revenue_BSS)


The total revenue from the sale of the product BSS is $24.00


Product BTF

In [107]:
df[df['postComment'].str.contains('BTF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_RES249,productBought_SCP,productBought_SM,productBought_SP,productBought_TMP,productBought_WAK,revenue_AMB,revenue_BBT,revenue_BCP,revenue_BSS
124,BALAI THREADFIN FILLET 300-350G/PKT @$12.90 ( PWP ) Comment BTF+1 below to join the Sale.,OceanStar Seafood,00:32:02,1,15,0,0,$12.90 ( PWP ) BTF+1,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
211,BALAI THREADFIN FILLET 300-350G/PKT @$12.90 ( PWP ) Comment BTF+1 below to join the Sale.,OceanStar Seafood,00:59:24,1,15,0,0,$12.90 ( PWP ) BTF+1,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
220,BTF+1,Jasmine Chua,01:00:26,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [108]:
df['productBought_BTF'] = df['productBought_BTF'].map(lambda x:float(12.90) if x == int(1) else 0)

In [109]:
df['revenue_BTF'] = np.multiply(df['productBought_BTF'], df['salesQuantity'])

In [110]:
revenue_BTF = "The total revenue from the sale of the product {} is ${}". format ("BTF", format(df['revenue_BTF'].sum(), '.2f'))
print(revenue_BTF)


The total revenue from the sale of the product BTF is $12.90


Product BTT399

In [111]:
df[df['postComment'].str.contains('BTT399', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_SCP,productBought_SM,productBought_SP,productBought_TMP,productBought_WAK,revenue_AMB,revenue_BBT,revenue_BCP,revenue_BSS,revenue_BTF
99,2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90 Comment BTT399+1 below to join the Sale,OceanStar Seafood,00:25:10,1,15,0,0,$39.90 BTT399+1,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
100,BTT399+1,Jane Wong,00:25:42,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
146,BTT399+1,Jane Wong,00:39:22,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
204,2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90 Comment BTT399+1 below to join the Sale,OceanStar Seafood,00:57:00,1,15,0,0,$39.90 BTT399+1,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
233,BTT399+1,Ivy Ng Lee Cheng,01:04:44,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [112]:
df['productBought_BTT399'] = df['productBought_BTT399'].map(lambda x:float(39.90) if x == int(1) else 0)

In [113]:
df['revenue_BTT399'] = np.multiply(df['productBought_BTT399'], df['salesQuantity'])

In [114]:
revenue_BTT399 = "The total revenue from the sale of the product {} is ${}". format ("BTT399", format(df['revenue_BTT399'].sum(), '.2f'))
print(revenue_BTT399)


The total revenue from the sale of the product BTT399 is $159.60


Product JAY

In [115]:
df[df['postComment'].str.contains('JAY', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_SM,productBought_SP,productBought_TMP,productBought_WAK,revenue_AMB,revenue_BBT,revenue_BCP,revenue_BSS,revenue_BTF,revenue_BTT399
188,Japan Hokkaido Apple Yoghurt 200ml @$2.40 Comment JAY+1 below to join the Sale.,OceanStar Seafood,00:52:47,1,13,0,0,$2.40 JAY+1,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
202,JAY+5,Jane Wong,00:56:45,0,1,0,5,0,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
df['productBought_JAY'] = df['productBought_JAY'].map(lambda x:float(2.40) if x == int(1) else 0)

In [117]:
df['revenue_JAY'] = np.multiply(df['productBought_JAY'], df['salesQuantity'])

In [118]:
revenue_JAY = "The total revenue from the sale of the product {} is ${}". format ("JAY", format(df['revenue_JAY'].sum(), '.2f'))
print(revenue_JAY)


The total revenue from the sale of the product JAY is $12.00


Product JF

In [119]:
df[df['postComment'].str.contains('JF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_SP,productBought_TMP,productBought_WAK,revenue_AMB,revenue_BBT,revenue_BCP,revenue_BSS,revenue_BTF,revenue_BTT399,revenue_JAY
68,MAX 5 PKT / ACC JACKFRUIT 500G/ PKT @ $1.00 ( PWP ) Comment JF+1 below to join the Sale.,OceanStar Seafood,00:17:19,1,20,0,0,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73,CRAZY OFFER!!!! MAX 5 PKT / ACC JACKFRUIT 500G/ PKT @ $1.00 ( PWP ) Comment JF+1 below to join the Sale.,OceanStar Seafood,00:17:58,1,22,0,0,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85,CRAZY OFFER!!!! MAX 5 PKT / ACC JACKFRUIT 500G/ PKT @ $1.00 ( PWP ) Comment JF+1 below to join the Sale.,OceanStar Seafood,00:20:41,1,22,0,0,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90,CRAZY OFFER!!!! MAX 5 PKT / ACC JACKFRUIT 500G/ PKT @ $1.00 ( PWP ) Comment JF+1 below to join the Sale.,OceanStar Seafood,00:21:37,1,22,0,0,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,CRAZY OFFER!!!! MAX 5 PKT / ACC JACKFRUIT 500G/ PKT @ $1.00 ( PWP ) Comment JF+1 below to join the Sale.,OceanStar Seafood,00:26:11,1,22,0,0,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104,JF+1,Jane Wong,00:27:09,0,1,0,1,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
107,JF+5,Catherine Gan-Chua,00:27:44,0,1,0,5,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
143,CRAZY DEAL!!! MAX 5 PKT / ACC JACKFRUIT 500G/ PKT @ $1.00 ( PWP ) Comment JF+1 below to join the Sale.,OceanStar Seafood,00:38:30,1,22,0,0,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
174,CRAZY DEAL!!! MAX 5 PKT / ACC JACKFRUIT 500G/ PKT @ $1.00 ( PWP ) Comment JF+1 below to join the Sale.,OceanStar Seafood,00:50:19,1,22,0,0,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
175,JF+2,Ja Ja,00:50:23,0,1,0,2,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
df['productBought_JF'] = df['productBought_JF'].map(lambda x:float(1.00) if x == int(1) else 0)

In [121]:
df['revenue_JF'] = np.multiply(df['productBought_JF'], df['salesQuantity'])

In [122]:
revenue_JF = "The total revenue from the sale of the product {} is ${}". format ("JF", format(df['revenue_JF'].sum(), '.2f'))
print(revenue_JF)


The total revenue from the sale of the product JF is $29.00


Product JOY

In [123]:
df[df['postComment'].str.contains('JOY', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_TMP,productBought_WAK,revenue_AMB,revenue_BBT,revenue_BCP,revenue_BSS,revenue_BTF,revenue_BTT399,revenue_JAY,revenue_JF
189,Japan Hokkaido Original Yoghurt 200ml @$2.40 Comment JOY+1 below to join the Sale.,OceanStar Seafood,00:52:55,1,13,0,0,$2.40 JOY+1,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
206,JOY+1,Jane Wong,00:57:55,0,1,0,1,0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [124]:
df['productBought_JOY'] = df['productBought_JOY'].map(lambda x:float(2.40) if x == int(1) else 0)

In [125]:
df['revenue_JOY'] = np.multiply(df['productBought_JOY'], df['salesQuantity'])

In [126]:
revenue_JOY = "The total revenue from the sale of the product {} is ${}". format ("JOY", format(df['revenue_JOY'].sum(), '.2f'))
print(revenue_JOY)


The total revenue from the sale of the product JOY is $9.60


Product JT

In [127]:
df[df['postComment'].str.contains('JT', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_WAK,revenue_AMB,revenue_BBT,revenue_BCP,revenue_BSS,revenue_BTF,revenue_BTT399,revenue_JAY,revenue_JF,revenue_JOY


Product KBP

In [128]:
df[df['postComment'].str.contains('KBP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_WAK,revenue_AMB,revenue_BBT,revenue_BCP,revenue_BSS,revenue_BTF,revenue_BTT399,revenue_JAY,revenue_JF,revenue_JOY
127,LAST 6 PKT!!!! KONG BAK PAO 4 PCS / PKT @$ 5.90 Comment KBP+1 below to join the Sale,OceanStar Seafood,00:33:01,1,19,0,0,$ 5.90 KBP+1,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
129,KBP+1,Susan Chee,00:34:10,0,1,0,1,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [129]:
df['productBought_KBP'] = df['productBought_KBP'].map(lambda x:float(5.90) if x == int(1) else 0)

In [130]:
df['revenue_KBP'] = np.multiply(df['productBought_KBP'], df['salesQuantity'])

In [131]:
revenue_KBP = "The total revenue from the sale of the product {} is ${}". format ("KBP", format(df['revenue_KBP'].sum(), '.2f'))
print(revenue_KBP)


The total revenue from the sale of the product KBP is $5.90


Product MC299

In [132]:
df[df['postComment'].str.contains('MC299', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_AMB,revenue_BBT,revenue_BCP,revenue_BSS,revenue_BTF,revenue_BTT399,revenue_JAY,revenue_JF,revenue_JOY,revenue_KBP
88,3 X MALE MUD CRAB 350-450G/ PCS @$29.90 Comment MC299+1 below to join the Sale,OceanStar Seafood,00:21:08,1,15,0,0,$29.90 MC299+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92,3 X MALE MUD CRAB 350-450G/ PCS @$29.90 Comment MC299+1 below to join the Sale,OceanStar Seafood,00:22:23,1,15,0,0,$29.90 MC299+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,3 X MALE MUD CRAB 350-450G/ PCS @$29.90 Comment MC299+1 below to join the Sale,OceanStar Seafood,01:07:09,1,15,0,0,$29.90 MC299+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
df['productBought_MC299'] = df['productBought_MC299'].map(lambda x:float(29.90) if x == int(1) else 0)

In [134]:
df['revenue_MC299'] = np.multiply(df['productBought_MC299'], df['salesQuantity'])

In [135]:
revenue_MC299 = "The total revenue from the sale of the product {} is ${}". format ("MC299", format(df['revenue_MC299'].sum(), '.2f'))
print(revenue_MC299)


The total revenue from the sale of the product MC299 is $29.90


Product MCA

In [136]:
df[df['postComment'].str.contains('MCA', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_BBT,revenue_BCP,revenue_BSS,revenue_BTF,revenue_BTT399,revenue_JAY,revenue_JF,revenue_JOY,revenue_KBP,revenue_MC299
61,MATCHA JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment MCA+1 below to join the Sale,OceanStar Seafood,00:14:28,1,16,0,0,$13.90 MCA+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,MATCHA JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment MCA+1 below to join the Sale.,OceanStar Seafood,00:51:53,1,16,0,0,$13.90 MCA+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [137]:
df['productBought_MCA'] = df['productBought_MCA'].map(lambda x:float(13.90) if x == int(1) else 0)

In [138]:
df['revenue_MCA'] = np.multiply(df['productBought_MCA'], df['salesQuantity'])

In [139]:
revenue_MCA = "The total revenue from the sale of the product {} is ${}". format ("MCA", format(df['revenue_MCA'].sum(), '.2f'))
print(revenue_MCA)


The total revenue from the sale of the product MCA is $13.90


Product MJ

In [140]:
df[df['postComment'].str.contains('MCA', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_BCP,revenue_BSS,revenue_BTF,revenue_BTT399,revenue_JAY,revenue_JF,revenue_JOY,revenue_KBP,revenue_MC299,revenue_MCA
61,MATCHA JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment MCA+1 below to join the Sale,OceanStar Seafood,00:14:28,1,16,0,0,$13.90 MCA+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,MATCHA JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment MCA+1 below to join the Sale.,OceanStar Seafood,00:51:53,1,16,0,0,$13.90 MCA+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:
df['productBought_MCA'] = df['productBought_MCA'].map(lambda x:float(13.90) if x == int(1) else 0)

In [142]:
df['revenue_MCA'] = np.multiply(df['productBought_MCA'], df['salesQuantity'])

In [143]:
revenue_MCA = "The total revenue from the sale of the product {} is ${}". format ("MCA", format(df['revenue_MCA'].sum(), '.2f'))
print(revenue_MCA)


The total revenue from the sale of the product MCA is $0.00


Product MFC

In [144]:
df[df['postComment'].str.contains('MFC', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_BCP,revenue_BSS,revenue_BTF,revenue_BTT399,revenue_JAY,revenue_JF,revenue_JOY,revenue_KBP,revenue_MC299,revenue_MCA
205,2 X MALA FLOWER CLAM 500G/ PKT @ $ 9.90 Comment MFC+1 below to join the Sale,OceanStar Seafood,00:57:53,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
209,MFC+1,Jane Wong,00:58:59,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [145]:
df['productBought_MFC'] = df['productBought_MFC'].map(lambda x:float(9.90) if x == int(1) else 0)

In [146]:
df['revenue_MFC'] = np.multiply(df['productBought_MFC'], df['salesQuantity'])

In [147]:
revenue_MFC = "The total revenue from the sale of the product {} is ${}". format ("MFC", format(df['revenue_MFC'].sum(), '.2f'))
print(revenue_MFC)


The total revenue from the sale of the product MFC is $19.80


Product PR

In [148]:
df[df['postComment'].str.contains('PR+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_BSS,revenue_BTF,revenue_BTT399,revenue_JAY,revenue_JF,revenue_JOY,revenue_KBP,revenue_MC299,revenue_MCA,revenue_MFC
110,3 X IQF PRIME RIBS 500G+- / P @ $21.90 Comment PR+1 below to join the Sale,OceanStar Seafood,00:28:47,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
111,PR+1,Susan Chee,00:29:05,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
140,PR+1,Jane Wong,00:37:42,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
285,3 X IQF PRIME RIBS 500G+- / P @ $21.90 Comment PR+1 below to join the Sale,OceanStar Seafood,01:18:23,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0


In [149]:
df['productBought_PR'] = df['productBought_PR'].map(lambda x:float(21.90) if x == int(1) else 0)

In [150]:
df['revenue_PR'] = np.multiply(df['productBought_PR'], df['salesQuantity'])

In [151]:
revenue_PR = "The total revenue from the sale of the product {} is ${}". format ("PR", format(df['revenue_PR'].sum(), '.2f'))
print(revenue_PR)


The total revenue from the sale of the product PR is $43.80


Product QM

In [152]:
df[df['postComment'].str.contains('QM+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_BTF,revenue_BTT399,revenue_JAY,revenue_JF,revenue_JOY,revenue_KBP,revenue_MC299,revenue_MCA,revenue_MFC,revenue_PR
114,QING MU CUTTLEFISH 500-600G/ PCS @$10.00 Comment QM+1 below to join the Sale.,OceanStar Seafood,00:29:34,1,13,0,0,$10.00 QM+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
126,QM+1,Jane Wong,00:32:48,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
137,QING MU CUTTLEFISH 500-600G/ PCS @$10.00 Comment QM+1 below to join the Sale.,OceanStar Seafood,00:35:47,1,13,0,0,$10.00 QM+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
152,QM+3,Ja Ja,00:41:43,0,1,0,3,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
158,QM+2,Vic Swi,00:42:42,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
255,QM+2,Ivy Ng Lee Cheng,01:10:53,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
281,QM+1,Ivy Ng Lee Cheng,01:16:47,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0


In [153]:
df['productBought_QM'] = df['productBought_QM'].map(lambda x:float(10.00) if x == int(1) else 0)

In [154]:
df['revenue_QM'] = np.multiply(df['productBought_QM'], df['salesQuantity'])

In [155]:
revenue_QM = "The total revenue from the sale of the product {} is ${}". format ("QM", format(df['revenue_QM'].sum(), '.2f'))
print(revenue_QM)


The total revenue from the sale of the product QM is $130.00


Product RES249

In [156]:
df[df['postComment'].str.contains('RES249+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_BTT399,revenue_JAY,revenue_JF,revenue_JOY,revenue_KBP,revenue_MC299,revenue_MCA,revenue_MFC,revenue_PR,revenue_QM
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,00:01:33,1,17,0,0,$24.90 RES249+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
148,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,00:40:18,1,17,0,0,$24.90 RES249+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
149,RES249+1,Ja Ja,00:41:05,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0


In [157]:
df['productBought_RES249'] = df['productBought_RES249'].map(lambda x:float(24.90) if x == int(1) else 0)

In [158]:
df['revenue_RES249'] = np.multiply(df['productBought_RES249'], df['salesQuantity'])

In [159]:
revenue_RES249 = "The total revenue from the sale of the product {} is ${}". format ("RES249", format(df['revenue_RES249'].sum(), '.2f'))
print(revenue_RES249)


The total revenue from the sale of the product RES249 is $24.90


Product SCP

In [160]:
df[df['postComment'].str.contains('SCP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_JAY,revenue_JF,revenue_JOY,revenue_KBP,revenue_MC299,revenue_MCA,revenue_MFC,revenue_PR,revenue_QM,revenue_RES249
50,ORGANIC COOKED SHRIMP 31-40PCS/ PKT @$ 18.80 Comment SCP+1 below to join the Sale.,OceanStar Seafood,00:10:25,1,14,0,0,$ 18.80 SCP+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
227,ORGANIC COOKED SHRIMP 31-40PCS/ PKT @$ 18.80 Comment SCP+1 below to join the Sale.,OceanStar Seafood,01:02:30,1,14,0,0,$ 18.80 SCP+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
229,ORGANIC COOKED SHRIMP 31-40PCS/ PKT @$ 18.80 Comment SCP+1 below to join the Sale.,OceanStar Seafood,01:03:18,1,14,0,0,$ 18.80 SCP+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0


In [161]:
df['productBought_SCP'] = df['productBought_SCP'].map(lambda x:float(18.80) if x == int(1) else 0)

In [162]:
df['revenue_SCP'] = np.multiply(df['productBought_SCP'], df['salesQuantity'])

In [163]:
revenue_SCP = "The total revenue from the sale of the product {} is ${}". format ("SCP", format(df['revenue_SCP'].sum(), '.2f'))
print(revenue_SCP)


The total revenue from the sale of the product SCP is $18.80


Product SM

In [164]:
df[df['postComment'].str.contains('SM+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_JF,revenue_JOY,revenue_KBP,revenue_MC299,revenue_MCA,revenue_MFC,revenue_PR,revenue_QM,revenue_RES249,revenue_SCP
191,Japan Hokkaido Strawberry Milk Drink 200ml @ $2.40 Comment SM+1 below to join the Sale.,OceanStar Seafood,00:53:36,1,15,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
196,SM+1,Jane Wong,00:55:11,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0


In [165]:
df['productBought_SM'] = df['productBought_SM'].map(lambda x:float(2.40) if x == int(1) else 0)

In [166]:
df['revenue_SM'] = np.multiply(df['productBought_SM'], df['salesQuantity'])

In [167]:
revenue_SM = "The total revenue from the sale of the product {} is ${}". format ("SM", format(df['revenue_SM'].sum(), '.2f'))
print(revenue_SM)


The total revenue from the sale of the product SM is $4.80


Product SP

In [168]:
df[df['postComment'].str.contains('SP+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_JOY,revenue_KBP,revenue_MC299,revenue_MCA,revenue_MFC,revenue_PR,revenue_QM,revenue_RES249,revenue_SCP,revenue_SM
103,MUST GRAB!!! 3 X SILVER POMFRET 350-450G/PCS@ $12.90 Comment SP+1 below to join the Sale,OceanStar Seafood,00:26:48,1,15,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
142,MUST GRAB!!! 3 X SILVER POMFRET 350-450G/PCS@ $12.90 Comment SP+1 below to join the Sale,OceanStar Seafood,00:37:57,1,15,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
274,3 X SILVER POMFRET 350-450G/PCS@ $12.90 Comment SP+1 below to join the Sale,OceanStar Seafood,01:15:27,1,13,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [169]:
df['productBought_SP'] = df['productBought_SP'].map(lambda x:float(12.90) if x == int(1) else 0)

In [170]:
df['revenue_SP'] = np.multiply(df['productBought_SP'], df['salesQuantity'])

In [171]:
revenue_SP = "The total revenue from the sale of the product {} is ${}". format ("SP", format(df['revenue_SP'].sum(), '.2f'))
print(revenue_SP)


The total revenue from the sale of the product SP is $12.90


Product TMP

In [172]:
df[df['postComment'].str.contains('TMP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_KBP,revenue_MC299,revenue_MCA,revenue_MFC,revenue_PR,revenue_QM,revenue_RES249,revenue_SCP,revenue_SM,revenue_SP
62,2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.,OceanStar Seafood,00:14:36,1,18,0,0,$11.11 TMP+1,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197,2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.,OceanStar Seafood,00:55:26,1,18,0,0,$11.11 TMP+1,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199,TMP+1,Susan Chee,00:56:08,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214,TMP+1,Vic Swi,00:59:47,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
222,2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.,OceanStar Seafood,01:01:07,1,18,0,0,$11.11 TMP+1,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [173]:
df['productBought_TMP'] = df['productBought_TMP'].map(lambda x:float(11.11) if x == int(1) else 0)

In [174]:
df['revenue_TMP'] = np.multiply(df['productBought_TMP'], df['salesQuantity'])

In [175]:
revenue_TMP = "The total revenue from the sale of the product {} is ${}". format ("TMP", format(df['revenue_TMP'].sum(), '.2f'))
print(revenue_TMP)


The total revenue from the sale of the product TMP is $33.33


Product WAK

In [176]:
df[df['postComment'].str.contains('WAK', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_MC299,revenue_MCA,revenue_MFC,revenue_PR,revenue_QM,revenue_RES249,revenue_SCP,revenue_SM,revenue_SP,revenue_TMP
49,2 X WILD CAUGHT ANG KA 500G/ POR @$24.90 Comment WAK+1 below to join the Sale.,OceanStar Seafood,00:10:15,1,16,0,0,$24.90 WAK+1,0.0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156,2 X WILD CAUGHT ANG KA 500G/ POR @$24.90 Comment WAK+1 below to join the Sale.,OceanStar Seafood,00:42:08,1,16,0,0,$24.90 WAK+1,0.0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
157,WAK+1,Vic Swi,00:42:31,0,1,0,1,0,0.0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
240,WAK +1,黄丽娟,01:06:12,0,2,0,1,0,0.0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [177]:
df['productBought_WAK'] = df['productBought_WAK'].map(lambda x:float(24.90) if x == int(1) else 0)

In [178]:
df['revenue_WAK'] = np.multiply(df['productBought_WAK'], df['salesQuantity'])

In [179]:
revenue_WAK = "The total revenue from the sale of the product {} is ${}". format ("WAK", format(df['revenue_WAK'].sum(), '.2f'))
print(revenue_WAK)


The total revenue from the sale of the product WAK is $49.80


In [180]:
# iterating the columns
for col in df.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_AMB
productBought_BBT
productBought_BCP
productBought_BLJ
productBought_BSS
productBought_BTF
productBought_BTT399
productBought_JAY
productBought_JF
productBought_JOY
productBought_JT
productBought_KBP
productBought_MC299
productBought_MCA
productBought_MFC
productBought_PR
productBought_QM
productBought_RES249
productBought_SCP
productBought_SM
productBought_SP
productBought_TMP
productBought_WAK
revenue_AMB
revenue_BBT
revenue_BCP
revenue_BSS
revenue_BTF
revenue_BTT399
revenue_JAY
revenue_JF
revenue_JOY
revenue_KBP
revenue_MC299
revenue_MCA
revenue_MFC
revenue_PR
revenue_QM
revenue_RES249
revenue_SCP
revenue_SM
revenue_SP
revenue_TMP
revenue_WAK


**Sum of total revenue from the video**

In [181]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_MCA,revenue_MFC,revenue_PR,revenue_QM,revenue_RES249,revenue_SCP,revenue_SM,revenue_SP,revenue_TMP,revenue_WAK
0,Bye 👋,Jane Wong,00:00:00,0,2,0,0,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Oss,Eve Kang,00:00:40,0,1,0,0,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:53,1,15,0,0,$39.90 EMP399+1,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,00:01:06,0,6,0,0,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OSS,Jack Daniel,00:01:10,0,1,0,0,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [182]:
#total revenue from the video
total_revenue = df.loc[:, 'revenue_AMB': 'revenue_WAK'].values.sum()

#round the total revenue to 2 decimals places
total_revenue_rounded = format(total_revenue, '.2f')

print(f"The total revenue for the sale of products for the video is ${total_revenue_rounded}")


The total revenue for the sale of products for the video is $689.48


In [183]:
va['totalRevenue'] = total_revenue_rounded
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue
0,OCEANSTARLIVE/videos/869455797269649,17,439,5016,119,2508,10,86,47,689.48


**New Column for the total revenue at that comment**

In [184]:
#https://stackoverflow.com/questions/42063716/pandas-sum-up-multiple-columns-into-one-column-without-last-column
df['revenue'] = df.loc[:, 'revenue_AMB': 'revenue_WAK'].sum(axis=1)
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_MFC,revenue_PR,revenue_QM,revenue_RES249,revenue_SCP,revenue_SM,revenue_SP,revenue_TMP,revenue_WAK,revenue
0,Bye 👋,Jane Wong,00:00:00,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Oss,Eve Kang,00:00:40,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:53,1,15,0,0,$39.90 EMP399+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,00:01:06,0,6,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OSS,Jack Daniel,00:01:10,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Good morning OSS,Khym Hoon Sung,00:01:11,0,3,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:15,1,13,0,0,$39.90 EMP399+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Lns,Teddy Teddy,00:01:20,0,1,1,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,00:01:33,1,17,0,0,$24.90 RES249+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Morning,Stacey Ho,00:01:37,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [185]:
#https://www.geeksforgeeks.org/how-to-move-a-column-to-first-position-in-pandas-dataframe/
#shift the revenue column to be the 8th column, i.e at position 7
eighth_column = df.pop('revenue')

# insert column using insert(position,column_name,ninth_column) function
df.insert(7, 'revenue', eighth_column)
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,productPrice,productBought_AMB,...,revenue_MCA,revenue_MFC,revenue_PR,revenue_QM,revenue_RES249,revenue_SCP,revenue_SM,revenue_SP,revenue_TMP,revenue_WAK
0,Bye 👋,Jane Wong,00:00:00,0,2,0,0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Oss,Eve Kang,00:00:40,0,1,0,0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:53,1,15,0,0,0.0,$39.90 EMP399+1,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,00:01:06,0,6,0,0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OSS,Jack Daniel,00:01:10,0,1,0,0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Good morning OSS,Khym Hoon Sung,00:01:11,0,3,0,0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:15,1,13,0,0,0.0,$39.90 EMP399+1,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Lns,Teddy Teddy,00:01:20,0,1,1,0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,00:01:33,1,17,0,0,0.0,$24.90 RES249+1,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Morning,Stacey Ho,00:01:37,0,1,0,0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


After the revenue of the sales have been calculated, all the dummified product columns and the column 'productPrice' will be dropped as they are no longer required to be checked against to identify the price of the product.

In [186]:
df = df.loc[: ,'postComment':'revenue']
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue
0,Bye 👋,Jane Wong,00:00:00,0,2,0,0,0.0
1,Oss,Eve Kang,00:00:40,0,1,0,0,0.0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:53,1,15,0,0,0.0
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,00:01:06,0,6,0,0,0.0
4,OSS,Jack Daniel,00:01:10,0,1,0,0,0.0
5,Good morning OSS,Khym Hoon Sung,00:01:11,0,3,0,0,0.0
6,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:15,1,13,0,0,0.0
7,Lns,Teddy Teddy,00:01:20,0,1,1,0,0.0
8,LAST 2 PCS!!! RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,00:01:33,1,17,0,0,0.0
9,Morning,Stacey Ho,00:01:37,0,1,0,0,0.0


**New Column to identify the frequency of the seller's comments in the video**

In [187]:
#frequency of seller's comments
va['frequencySeller']= np.divide(va['videoLength'].iloc[0],va['numSellerComments'])
#seller's comment appears on average of every 42 seconds

In [188]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller
0,OCEANSTARLIVE/videos/869455797269649,17,439,5016,119,2508,10,86,47,689.48,42.151261


**New Column to identify the seller**

In [189]:
df['seller'] = 'OCEANSTARLIVE'

In [190]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Bye 👋,Jane Wong,00:00:00,0,2,0,0,0.0,OCEANSTARLIVE
1,Oss,Eve Kang,00:00:40,0,1,0,0,0.0,OCEANSTARLIVE
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:53,1,15,0,0,0.0,OCEANSTARLIVE
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,00:01:06,0,6,0,0,0.0,OCEANSTARLIVE
4,OSS,Jack Daniel,00:01:10,0,1,0,0,0.0,OCEANSTARLIVE


**New Column for the Average Compound Score from Sentiment Analysis on raw & uncleaned comments for video**

In [191]:
# Instantiate Sentiment Intensity Analyzer
sent = SentimentIntensityAnalyzer()

In [192]:
df['sentiment_score'] = df['postComment'].apply(sent.polarity_scores)
df['compound'] = [sent.polarity_scores(x)['compound'] for x in df['postComment']]
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller,sentiment_score,compound
0,Bye 👋,Jane Wong,00:00:00,0,2,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
1,Oss,Eve Kang,00:00:40,0,1,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:53,1,15,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 0.82, 'pos': 0.18, 'compound': 0.4724}",0.4724
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,00:01:06,0,6,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 0.61, 'pos': 0.39, 'compound': 0.4926}",0.4926
4,OSS,Jack Daniel,00:01:10,0,1,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0


In [193]:
#average compound scores for the video
#df.shape[0] calculates the total number of rows in the dataframe
va['averageCompound']= (df['compound'].sum())/(df['compound'].sum())/df.shape[0]
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller,averageCompound
0,OCEANSTARLIVE/videos/869455797269649,17,439,5016,119,2508,10,86,47,689.48,42.151261,0.003322


In [194]:
#drop the columns with regarding to the sentiment analysis on the raw & uncleaned comments
#as we will perform sentiment analysis at the comment level on processed comments
df = df.loc[: ,'postComment':'seller']
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Bye 👋,Jane Wong,00:00:00,0,2,0,0,0.0,OCEANSTARLIVE
1,Oss,Eve Kang,00:00:40,0,1,0,0,0.0,OCEANSTARLIVE
2,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:53,1,15,0,0,0.0,OCEANSTARLIVE
3,"Good Morning Miko, OSS Team Everyone!",Catherine Gan-Chua,00:01:06,0,6,0,0,0.0,OCEANSTARLIVE
4,OSS,Jack Daniel,00:01:10,0,1,0,0,0.0,OCEANSTARLIVE


### Saving the cleaned dataframes

In [195]:
# export to csv - change the name of the data file for each video
va.to_csv('../../data/cleaned_data/cleaned_va_OCEANSTARLIVE_869455797269649.csv', index=False)

In [196]:
#check for nulls
#displaying only the columns with nulls and their sum
df[df.columns[df.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [197]:
# export to csv - change the name of the data file for each video
df.to_csv('../../data/cleaned_data/cleaned_OCEANSTARLIVE_869455797269649.csv', index=False)