# Data Import & Cleaning

### Contents:

- [Data Import](#Data-Import)
- [Data Cleaning](#Data-Cleaning)
- [Feature Engineering](#Feature-Engineering)

### Import Libraries

In [1]:
#import standard libraries
import pandas as pd
import numpy as np

#import emoji
import emoji

from natsort import natsorted, index_natsorted, order_by_index

#import warnings to ignore flags when the project is complete
#import warnings
#warnings.filterwarnings('ignore')

#import pre-processing libraries for data cleaning
import string
import re
import nltk
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Data Import

**Read scrapped data for the following videos**

In [2]:
va = pd.read_csv('../../data/scrapped_data/va_OCEANSTARLIVE_581200849664350.csv')

In [3]:
va

Unnamed: 0,video_for,totalEmojiReaction,views
0,OCEANSTARLIVE/videos/581200849664350,16,633


In [4]:
df = pd.read_csv('../../data/scrapped_data/OCEANSTARLIVE_581200849664350.csv', encoding='utf-8')

In [5]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
0,Hello Miko &amp; Team!!,Yvonne Leong,0:44
1,"BIG FISHES!!!</div><div dir=""auto"" style=""text-align: start;"">WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90</div><div dir=""auto"" style=""text-align: start;"">Comment「EMP399+1」below to join the Sale",OceanStar Seafood,0:55
2,"OSS 早安 奶美美 早安 <span class=""pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu""><img alt=""✌️"" height=""16"" referrerpolicy=""origin-when-cross-origin"" src=""https://static.xx.fbcdn.net/images/emoji.php/v9/te2/2/16/270c.png"" width=""16""/></span>",Jack Daniel,0:59
3,PN+1,Eva Sek,1:00:44
4,"LAST 2 SET!!!!</div><div dir=""auto"" style=""text-align: start;"">2 X WILD CAUGHT BLUE TAIL PRAWN 500G/ PORTION @ $18.80</div><div dir=""auto"" style=""text-align: start;"">Comment「BTP188+1」below to join the Sale",OceanStar Seafood,1:00:56
5,"2 X WILD CAUGHT ANG KA 500G/ POR @$24.90</div><div dir=""auto"" style=""text-align: start;"">Comment「WAK+1」below to join the Sale.",OceanStar Seafood,1:01:12
6,"2 X TIGER PRAWN 500G/ PORTION @ $19.90</div><div dir=""auto"" style=""text-align: start;"">Comment「TP199+1」below to join the Sale.",OceanStar Seafood,1:01:23
7,u ask lisan to come in 11am leh hahaha,Winnie Wu,1:01:29
8,"ORGANIC COOKED SHRIMP 31-40PCS/ PKT @$ 18.80</div><div dir=""auto"" style=""text-align: start;"">Comment「SCP+1」below to join the Sale",OceanStar Seafood,1:02:33
9,Her fb name difficult to tag her hahaha,Winnie Wu,1:02:57


In [6]:
#https://stackoverflow.com/questions/32072076/find-the-unique-values-in-a-column-and-then-sort-them
#check if the seller uses multiple accounts to reply
postCommentAuthor_unique = df['postCommentAuthor'].unique()
print(sorted(postCommentAuthor_unique))

['Angela Tay', 'Bee Thin', 'Catherine Gan-Chua', 'Doris Pay', 'Erika Poh', 'Eva Sek', 'Fabian Low', 'Jack Daniel', 'Jacqueline Hoe', 'Jane Wong', 'Jasmine Khoo', 'Jennifer Quek', 'Joanne Koh', 'Nat Aneles', 'OceanStar Seafood', 'Patrick Ong', 'Sabrina Chu', 'Serene Chia', 'Stella Lim', 'Sunny Tan', 'Teik Hang', 'Violet Yap', 'Winnie Wu', 'Yong Liang', 'Yvonne Leong', 'き リーサン']


In [7]:
#find comments posted by the seller only
df.loc[df['postCommentAuthor'] == 'OceanStar Seafood']

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
1,"BIG FISHES!!!</div><div dir=""auto"" style=""text-align: start;"">WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90</div><div dir=""auto"" style=""text-align: start;"">Comment「EMP399+1」below to join the Sale",OceanStar Seafood,0:55
4,"LAST 2 SET!!!!</div><div dir=""auto"" style=""text-align: start;"">2 X WILD CAUGHT BLUE TAIL PRAWN 500G/ PORTION @ $18.80</div><div dir=""auto"" style=""text-align: start;"">Comment「BTP188+1」below to join the Sale",OceanStar Seafood,1:00:56
5,"2 X WILD CAUGHT ANG KA 500G/ POR @$24.90</div><div dir=""auto"" style=""text-align: start;"">Comment「WAK+1」below to join the Sale.",OceanStar Seafood,1:01:12
6,"2 X TIGER PRAWN 500G/ PORTION @ $19.90</div><div dir=""auto"" style=""text-align: start;"">Comment「TP199+1」below to join the Sale.",OceanStar Seafood,1:01:23
8,"ORGANIC COOKED SHRIMP 31-40PCS/ PKT @$ 18.80</div><div dir=""auto"" style=""text-align: start;"">Comment「SCP+1」below to join the Sale",OceanStar Seafood,1:02:33
12,"AUS MELTIQUE BEEF STEAK 200G+-/ PCS @$11.11</div><div dir=""auto"" style=""text-align: start;"">Comment「AMB+1」below to join the Sale",OceanStar Seafood,1:04:10
13,"BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00</div><div dir=""auto"" style=""text-align: start;"">Comment「BTW78+1」below to join the Sale",OceanStar Seafood,1:04:30
14,"BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00</div><div dir=""auto"" style=""text-align: start;"">Comment「BTW168+1」below to join the Sale",OceanStar Seafood,1:04:45
16,"MACKEREL WHOLE 500-600G/ PCS @$9.90</div><div dir=""auto"" style=""text-align: start;"">Comment「MAC99+1」below to join the Sale",OceanStar Seafood,1:05:05
17,"2 X YELLOW CROAKER 400-450G/ PCS @$22.00</div><div dir=""auto"" style=""text-align: start;"">Comment「YC+1」below to join the Sale",OceanStar Seafood,1:05:21


In [8]:
#check the comments to have a gauge
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' JA+1', '1st day of work', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「TMP+1」below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「BCP+1」below to join the Sale.', '2 X BABY THREADFIN 300-400G / PCS @ $12.90</div><div dir="auto" style="text-align: start;">Comment「BBT+1」below to join the Sale', '2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11</div><div dir="auto" style="text-align: start;">Comment「BWP+1」below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90</div><div dir="auto" style="text-align: start;">Comment「BST+1」below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90</div><div dir="auto" style="text-align: start;">Comment「BP+1」below to join the Sale', '2 X CHINESE POMFRET 250-300G/ PCS @$14.00</div><div dir="auto" style="text-align: start;">Comment「CP14+1」below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.1

## Data Cleaning

### Convert the emojis to text for easy cleaning

We noticed that the emojis have html parsers attached to it. Hence, we will convert the images of the emojis to text first, to remove the html parsers to the emojis, while retaining the emoji's text. We will convert it back to emoji afterwards.

In [9]:
df['postComment'] = df['postComment'].apply(emoji.demojize)

In [10]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' JA+1', '1st day of work', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「TMP+1」below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「BCP+1」below to join the Sale.', '2 X BABY THREADFIN 300-400G / PCS @ $12.90</div><div dir="auto" style="text-align: start;">Comment「BBT+1」below to join the Sale', '2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11</div><div dir="auto" style="text-align: start;">Comment「BWP+1」below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90</div><div dir="auto" style="text-align: start;">Comment「BST+1」below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90</div><div dir="auto" style="text-align: start;">Comment「BP+1」below to join the Sale', '2 X CHINESE POMFRET 250-300G/ PCS @$14.00</div><div dir="auto" style="text-align: start;">Comment「CP14+1」below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.1

### Clean the comments using Regex

From the above unique values of the comments, there are several cleaning issues that needs to be addressed.

    1. Removal of links or URLs. 
        - URLs are present when the Facebook users manually type a link in the comment. 
        - Additionally, when users are tagged, their Facebook profile URL is printed as a result.
        - Similarly, there is a unique URL linked to each emoji as well. 
        
    2. Removal of HTML special entities
        - Examples of HTML special entities are '&amp' and '&gt'. 
        
    3. Removal of other HTML special terms
        - Examples of other HTML special entities are whitespace HTML special entities like '#x200B' and '#xa0'.
        
    4. Removal of HTML Parsers to the Emojis
        - Previously, the emojis have been demojized to text already. However, the HTML parsers to the mojis remain. Hence, they are required to be removed as well.
        
    5. Removal of HTML Parcers to Whitespaces
        - When a next line is entered in the same comment, there will be a HTML parser to the this whitespace. Hence, this are to be removed as well.
        
    6. Removal of HTML Parsers to tagged names
        - When users are tagged in the comments, in addition to their Facebook profile URL, there will be HTML parsers to the tagged names as well. Hence, this are to be removed as well.
        
    7. Removsl of other HTML Parsers   

In [11]:
def clean(row):
    
    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*\.png', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove HTML special entities (e.g.. &amp, &gt;)
    row['postComment'] = re.sub(
        pattern=r'\&\w*;',
        repl='',
        string=row['postComment'])

    # Remove emoji html parsers
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'<img\salt=\"(\:\w*)(\-?)(\w*\:)\"\s[a-z]{6}\=\"\d\d\"\s[a-z]{14}\=.{26}\s[a-z]{3}\=\"\"\s[a-z]{5}\=\"\d\d\"\/>(<\/span>)?',
        repl=r'\1\2\3', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove whitespaces
    row['postComment'] = re.sub(
        pattern=r'<\/div><div\sdir\=\"auto\"\sstyle\=\"text\-align\:\sstart\;\">',
        repl=' ',
        string=row['postComment'],
        flags=re.M)
    
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern = r'<span\sclass\=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\">',
        repl=r' ',
        string=row['postComment'],
        flags=re.M)

    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern = r'<div\sclass\=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\".*<\/div>',
        repl=r' ',
        string=row['postComment'],
        flags=re.M)
    
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern=r'<\/span>', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)
 
    # Remove consecutive non-ASCII characters
    # This will remove the chinese comments
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'[^\x00-\x7F]+', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)

    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*tabindex\=\"\d\"', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<a class="oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 nc684nl6 p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl oo9gr5id gpro0wi8 lrazzd5p" href=">',
        repl='',
        string=row['postComment'],
        flags=re.M)    
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<span\sclass\=\"([a-z0-9]{8})\">',
        repl='',
        string=row['postComment'],
        flags=re.M)
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<span.*<\/a>',
        repl='',
        string=row['postComment'],
        flags=re.M)
    
    return row
    

In [12]:
df = df.apply(clean, axis=1)

In [13]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' ', '  :face_with_tears_of_joy:', ' :face_with_tears_of_joy:  ', ' :grinning_face_with_sweat:', ' :red_apple: juice code ', ' :rolling_on_the_floor_laughing:ya bye ', ' :thinking_face:', ' :woman_facepalming_light_skin_tone:', ' JA+1', ' go  ', '1st day of work', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.', '2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale', '2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11 Comment BWP+1 below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X CHINESE POMFRET 250-300G/ PCS @$14.00 Comment CP14+1 below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 bel

**Convert encoded emoji text back to emojis**

In [14]:
df['postComment'] = df['postComment'].apply(emoji.emojize)

In [15]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' ', '  😂', ' JA+1', ' go  ', ' 🍎 juice code ', ' 😂  ', ' 😅', ' 🤔', ' 🤣ya bye ', ' 🤦🏻\u200d♀️', '1st day of work', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.', '2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale', '2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11 Comment BWP+1 below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X CHINESE POMFRET 250-300G/ PCS @$14.00 Comment CP14+1 below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 below to join the Sale.', '2 X IQF CHICKEN MID JOINT 450G+-/ P@ $9.90 Comment MJ+1 below to join the Sale', '2 X KASU RABBIT FISH 500G+-/PKT @ $18.00 Com

### Drop Empty Posts

Blank comments are dropped to ensure that unique comments are obtained.

In [16]:
#drop empty posts
df = df.loc[((df['postComment'] != ' ')),:]

In [17]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

['  😂', ' JA+1', ' go  ', ' 🍎 juice code ', ' 😂  ', ' 😅', ' 🤔', ' 🤣ya bye ', ' 🤦🏻\u200d♀️', '1st day of work', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.', '2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale', '2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11 Comment BWP+1 below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X CHINESE POMFRET 250-300G/ PCS @$14.00 Comment CP14+1 below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 below to join the Sale.', '2 X IQF CHICKEN MID JOINT 450G+-/ P@ $9.90 Comment MJ+1 below to join the Sale', '2 X KASU RABBIT FISH 500G+-/PKT @ $18.00 Comment 

### Reindexing the dataframe 
**New Column to reindex the dataframe in accordance to time**

From the data, we can tell that there is an inconsistent timestamp being used in the column 'postCommentTime'. For example, there are times like '0:57' and '1:00:14' which indicates 0 hour 0 mins 57 secs, and 1 hour 0 mins 14 secs. 

Hence, a new column 'postCommentTime_final' is created to ensure that a timestamp of HH:MM:SS is being used consistently throughout. However, we note that the number of days is being included for TimeDeltaIndex.

As a result, we will read the timestamp, by excluding the number of days.  

In [18]:
#TimedeltaIndex
#https://stackoverflow.com/questions/54877467/pandas-convert-hhmm-and-hhmmss-to-standard-hhmmss-in-python
# for example, time of '0:57' will then be 00:00:57; 0 hours 0 mins 57 secs
df['postCommentTime_final'] = pd.to_timedelta(np.where(df['postCommentTime'].str.count(':') == 1, '00:' + df['postCommentTime'], df['postCommentTime']))

In [19]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Hello Miko Team!!,Yvonne Leong,0:44,0 days 00:00:44
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,0 days 00:00:55
2,OSS ✌️,Jack Daniel,0:59,0 days 00:00:59
3,PN+1,Eva Sek,1:00:44,0 days 01:00:44
4,LAST 2 SET!!!! 2 X WILD CAUGHT BLUE TAIL PRAWN 500G/ PORTION @ $18.80 Comment BTP188+1 below to join the Sale,OceanStar Seafood,1:00:56,0 days 01:00:56


In [20]:
df['postCommentTime_final'] = df['postCommentTime_final'].astype(str).map(lambda x: x[7:])

In [21]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55
2,OSS ✌️,Jack Daniel,0:59,00:00:59
3,PN+1,Eva Sek,1:00:44,01:00:44
4,LAST 2 SET!!!! 2 X WILD CAUGHT BLUE TAIL PRAWN 500G/ PORTION @ $18.80 Comment BTP188+1 below to join the Sale,OceanStar Seafood,1:00:56,01:00:56
5,2 X WILD CAUGHT ANG KA 500G/ POR @$24.90 Comment WAK+1 below to join the Sale.,OceanStar Seafood,1:01:12,01:01:12
6,2 X TIGER PRAWN 500G/ PORTION @ $19.90 Comment TP199+1 below to join the Sale.,OceanStar Seafood,1:01:23,01:01:23
7,u ask lisan to come in 11am leh hahaha,Winnie Wu,1:01:29,01:01:29
8,ORGANIC COOKED SHRIMP 31-40PCS/ PKT @$ 18.80 Comment SCP+1 below to join the Sale,OceanStar Seafood,1:02:33,01:02:33
9,Her fb name difficult to tag her hahaha,Winnie Wu,1:02:57,01:02:57


In [22]:
#reindex according to postCommentTime_final
#previous natsort in data collection didnt take into account the different timestamp format
df = df.reindex(index=order_by_index(df.index, index_natsorted(df.postCommentTime_final)))

In [23]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55
2,OSS ✌️,Jack Daniel,0:59,00:00:59
21,Morning OSS,Jennifer Quek,1:07,00:01:07
27,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09
39,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:12,00:01:12
47,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,1:32,00:01:32
48,Hello,Joanne Koh,1:33,00:01:33
49,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,1:41,00:01:41
50,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,1:51,00:01:51


In [24]:
#reset the index for the dataframe
#https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-dataframe

df = df.reset_index(drop=True)

In [25]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55
2,OSS ✌️,Jack Daniel,0:59,00:00:59
3,Morning OSS,Jennifer Quek,1:07,00:01:07
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09
5,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:12,00:01:12
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,1:32,00:01:32
7,Hello,Joanne Koh,1:33,00:01:33
8,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,1:41,00:01:41
9,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,1:51,00:01:51


**Obtain the length of the video**

Assuming that the last comment of the video is the total length of the video, we will input the length of the video from the comments dataframe into the video attributes dataframe.

In [26]:
#retrieve last comment to obtain the length of the video
df['postCommentTime_final'].iloc[-1]

'01:13:52'

In [27]:
#https://stackoverflow.com/questions/6402812/how-to-convert-an-hmmss-time-string-to-seconds-in-python
def get_sec(time_str):
    """Get Seconds from time."""
    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

In [28]:
#retrieve last comment to obtain the length of the video in seconds
va['videoLength']= get_sec(df['postCommentTime_final'].iloc[-1])

In [29]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength
0,OCEANSTARLIVE/videos/581200849664350,16,633,4432


## Feature Engineering

### Comments made by the Seller

**New Column to identify the number of comments made by the seller in the video**

From the total sum of comments made by the seller, we will input it into the video attributes dataframe.

In [30]:
(df['postCommentAuthor']=='OceanStar Seafood').sum()

133

In [31]:
va['numSellerComments'] = (df['postCommentAuthor']=='OceanStar Seafood').sum()

In [32]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments
0,OCEANSTARLIVE/videos/581200849664350,16,633,4432,133


**New Column to identify if the comment is made by the Seller or not**

In [33]:
#to delete column 'notSeller' in due course
df['notSeller'] = df['postCommentAuthor'].map(lambda x:1 if x !='OceanStar Seafood' else 0)

In [34]:
#create a new column to show if the comment is made by the seller or not
df['isSeller'] = df['postCommentAuthor'].map(lambda x:1 if x =='OceanStar Seafood' else 0)

In [35]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44,1,0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55,0,1
2,OSS ✌️,Jack Daniel,0:59,00:00:59,1,0
3,Morning OSS,Jennifer Quek,1:07,00:01:07,1,0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09,0,1


In [36]:
df['isSeller'].value_counts()

0    139
1    133
Name: isSeller, dtype: int64

In [37]:
#show all the seller's comments
df.loc[df['isSeller'] == 1]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55,0,1
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09,0,1
5,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:12,00:01:12,0,1
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,1:32,00:01:32,0,1
8,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,1:41,00:01:41,0,1
9,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,1:51,00:01:51,0,1
14,MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,2:46,00:02:46,0,1
15,SALMON TROUT WHOLE 4.0-4.3 KG/ PCS @$108.00 Comment STW+1 below to join the Sale,OceanStar Seafood,3:16,00:03:16,0,1
18,RED SNAPPER WHOLE 1.7-1.9 KG/ PCS @$33.33 Comment RS33+1 below to join the Sale,OceanStar Seafood,3:28,00:03:28,0,1
19,WILD BARRAMUNDI WHOLE 3.5-3.9 KG/ PCS @ $49.90 Comment WB+1 below to join the Sale,OceanStar Seafood,3:35,00:03:35,0,1


### Length of comments

**New Column to identify the length of each comment**

From the comments, we take the length of each comment as the total number of words each comment has.

In [38]:
#length of each comment
#https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe
df['postCommentLength'] = df['postComment'].str.split().str.len()

In [39]:
df.head(10)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44,1,0,3
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55,0,1,15
2,OSS ✌️,Jack Daniel,0:59,00:00:59,1,0,2
3,Morning OSS,Jennifer Quek,1:07,00:01:07,1,0,2
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09,0,1,13
5,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:12,00:01:12,0,1,13
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,1:32,00:01:32,0,1,14
7,Hello,Joanne Koh,1:33,00:01:33,1,0,1
8,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,1:41,00:01:41,0,1,13
9,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,1:51,00:01:51,0,1,14


**New Column to identify the total number of comments in the video**

From the total number of comments in the video, we will input it into the video attributes dataframe.

In [40]:
#total number of comments
df['postCommentLength'].sum()

2486

In [41]:
va['numComments'] = df['postCommentLength'].sum()

In [42]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments
0,OCEANSTARLIVE/videos/581200849664350,16,633,4432,133,2486


### LNS

LNS is an acronym that stands for 'like and share'. It is a form of customer engagement as it indicates by the customers to the sellers that they have liked and shared the video on their Facebook wall. 

**New Column to identify if Customers are engaging in liking and sharing the video**

In [43]:
#if the customer has commented 'lns' or 'ls' which stands for 'like & shared' & 'like shared' respectively
def lns(comment):
    if re.search(r'(l)(n?)(s)', comment, re.IGNORECASE):
        return int(1)
    else:
        return int(0)

In [44]:
df['lns'] = df['postComment'].map(lambda x:lns(x))

In [45]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44,1,0,3,0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55,0,1,15,0
2,OSS ✌️,Jack Daniel,0:59,00:00:59,1,0,2,0
3,Morning OSS,Jennifer Quek,1:07,00:01:07,1,0,2,0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09,0,1,13,0


**New Column to identify if the number of Customers who explicitly inform the sellers that they are engaging in liking and sharing the video**

In [46]:
#range of customer's engagement for LNS
df['lns'].value_counts()

0    269
1      3
Name: lns, dtype: int64

In [47]:
(df['lns']==1).sum()

3

In [48]:
va['lnsQuantity'] = (df['lns']==1).sum()

In [49]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity
0,OCEANSTARLIVE/videos/581200849664350,16,633,4432,133,2486,3


## Sales Quantity

**New Columns to identify the quantity of sales made**

From the comments, using regex, we first see an overview of the comments that are related to the sale of the products. 

In [50]:
#products offered by the seller
df[df['postComment'].str.contains('(\w*\+\d)', regex=True)]

  df[df['postComment'].str.contains('(\w*\+\d)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55,0,1,15,0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09,0,1,13,0
5,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:12,00:01:12,0,1,13,0
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,1:32,00:01:32,0,1,14,0
8,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,1:41,00:01:41,0,1,13,0
9,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,1:51,00:01:51,0,1,14,0
14,MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,2:46,00:02:46,0,1,12,0
15,SALMON TROUT WHOLE 4.0-4.3 KG/ PCS @$108.00 Comment STW+1 below to join the Sale,OceanStar Seafood,3:16,00:03:16,0,1,14,0
18,RED SNAPPER WHOLE 1.7-1.9 KG/ PCS @$33.33 Comment RS33+1 below to join the Sale,OceanStar Seafood,3:28,00:03:28,0,1,14,0
19,WILD BARRAMUNDI WHOLE 3.5-3.9 KG/ PCS @ $49.90 Comment WB+1 below to join the Sale,OceanStar Seafood,3:35,00:03:35,0,1,15,0


In [51]:
def sale(comment):
    if re.search(r'(\w*\+)(\d)', comment):
        return int(re.search(r'\w*\+\d', comment).group(0)[-1])
    else:
        return int(0)

In [52]:
df['sales'] = df['postComment'].apply(lambda x:sale(x))

In [53]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44,1,0,3,0,0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55,0,1,15,0,1
2,OSS ✌️,Jack Daniel,0:59,00:00:59,1,0,2,0,0
3,Morning OSS,Jennifer Quek,1:07,00:01:07,1,0,2,0,0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09,0,1,13,0,1


In [54]:
#sales made by the customers; exclude the seller's comments on the codes for the sales
#https://stackoverflow.com/questions/19914937/applying-function-with-multiple-arguments-to-create-a-new-pandas-column
df['salesQuantity'] = np.multiply(df['notSeller'], df['sales'])

In [55]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44,1,0,3,0,0,0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55,0,1,15,0,1,0
2,OSS ✌️,Jack Daniel,0:59,00:00:59,1,0,2,0,0,0
3,Morning OSS,Jennifer Quek,1:07,00:01:07,1,0,2,0,0,0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09,0,1,13,0,1,0


In [56]:
#range of sales quantity
df['salesQuantity'].value_counts()

0    229
1     31
2      8
3      4
Name: salesQuantity, dtype: int64

In [57]:
#total number of orders made
df['salesQuantity'].sum()

59

In [58]:
va['salesQuantity'] = df['salesQuantity'].sum()

In [59]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity
0,OCEANSTARLIVE/videos/581200849664350,16,633,4432,133,2486,3,59


## Products 

The seller will comment and post the unique product codes for each product. Thereafter, customers who are keen on purchasing the products will explicitly comment out the specific & unique product codes, in addition to the quanityt of each product that they wish to purchase.

**New Columns to identify the products purchased by the Customers**

Regex is used to identify the products being offered and the products being purchased as well.

In [60]:
#function to identify the code of the product bought
def sale2(comment):
    if re.search(r'[a-zA-Z]*\s?(\d*)?\s?\+\s?\d', comment):
        return str(re.search(r'[a-zA-Z]*\s?(\d*)?\s?\+\s?\d', comment).group(0)[:-2])
    else:
        return int(0)

In [61]:
#identifies all comments that have the codes of the products, including the seller's comments.
#this column will be dropped afterwards.
df['product'] = df['postComment'].apply(lambda x:sale2(x))

In [62]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44,1,0,3,0,0,0,0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55,0,1,15,0,1,0,EMP399
2,OSS ✌️,Jack Daniel,0:59,00:00:59,1,0,2,0,0,0,0
3,Morning OSS,Jennifer Quek,1:07,00:01:07,1,0,2,0,0,0,0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09,0,1,13,0,1,0,EMP399
5,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:12,00:01:12,0,1,13,0,1,0,EMP399
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,1:32,00:01:32,0,1,14,0,1,0,SS588
7,Hello,Joanne Koh,1:33,00:01:33,1,0,1,0,0,0,0
8,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,1:41,00:01:41,0,1,13,0,1,0,BTW78
9,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,1:51,00:01:51,0,1,14,0,1,0,BTW168


In [63]:
#products bought by Customers; exclude the seller's comments on the product details 
df['productBought'] = np.multiply(df['notSeller'], df['product'])

In [64]:
df['productBought'].unique()

array([0, '', 'PC', 'MJ', 'QM', 'Cuttlefish ', 'BTF', 'CP 22 ', 'TNP',
       'TMP', 'Tmp', 'STF', 'STF ', 'Otak', 'PR', 'JA', 'SM', 'Jay',
       'MAC99', 'CST229', 'RST', 'BSQ', 'BWP', 'KN119', 'Kn119', 'RGSS',
       'CCL', 'SS588', 'friedcrabstick', 'PN', 'BBt', 'SCP', 'AMB', 'KBP',
       'Kbp', 'Mca'], dtype=object)

In [65]:
#https://stackoverflow.com/questions/13445241/replacing-blank-values-white-space-with-nan-in-pandas
df['productBought'] = df['productBought'].replace(r'^\s*$', int(0), regex=True)

In [66]:
df['productBought'].unique()

array([0, 'PC', 'MJ', 'QM', 'Cuttlefish ', 'BTF', 'CP 22 ', 'TNP', 'TMP',
       'Tmp', 'STF', 'STF ', 'Otak', 'PR', 'JA', 'SM', 'Jay', 'MAC99',
       'CST229', 'RST', 'BSQ', 'BWP', 'KN119', 'Kn119', 'RGSS', 'CCL',
       'SS588', 'friedcrabstick', 'PN', 'BBt', 'SCP', 'AMB', 'KBP', 'Kbp',
       'Mca'], dtype=object)

In [67]:
#remove whitespaces
df['productBought'] = df['productBought'].str.replace(" ","")

In [68]:
df['productBought'].unique()

array([nan, 'PC', 'MJ', 'QM', 'Cuttlefish', 'BTF', 'CP22', 'TNP', 'TMP',
       'Tmp', 'STF', 'Otak', 'PR', 'JA', 'SM', 'Jay', 'MAC99', 'CST229',
       'RST', 'BSQ', 'BWP', 'KN119', 'Kn119', 'RGSS', 'CCL', 'SS588',
       'friedcrabstick', 'PN', 'BBt', 'SCP', 'AMB', 'KBP', 'Kbp', 'Mca'],
      dtype=object)

In [69]:
df.update(df[['productBought']].fillna(0))

In [70]:
df['productBought'].unique()

array([0, 'PC', 'MJ', 'QM', 'Cuttlefish', 'BTF', 'CP22', 'TNP', 'TMP',
       'Tmp', 'STF', 'Otak', 'PR', 'JA', 'SM', 'Jay', 'MAC99', 'CST229',
       'RST', 'BSQ', 'BWP', 'KN119', 'Kn119', 'RGSS', 'CCL', 'SS588',
       'friedcrabstick', 'PN', 'BBt', 'SCP', 'AMB', 'KBP', 'Kbp', 'Mca'],
      dtype=object)

In [71]:
#change the produce codes to be uppercase for consistency
#https://stackoverflow.com/questions/39512002/convert-whole-dataframe-from-lower-case-to-upper-case-with-pandas
df['productBought'] = df['productBought'].astype(str).str.upper()

In [72]:
df['productBought'].unique()

array(['0', 'PC', 'MJ', 'QM', 'CUTTLEFISH', 'BTF', 'CP22', 'TNP', 'TMP',
       'STF', 'OTAK', 'PR', 'JA', 'SM', 'JAY', 'MAC99', 'CST229', 'RST',
       'BSQ', 'BWP', 'KN119', 'RGSS', 'CCL', 'SS588', 'FRIEDCRABSTICK',
       'PN', 'BBT', 'SCP', 'AMB', 'KBP', 'MCA'], dtype=object)

In [73]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44,1,0,3,0,0,0,0,0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55,0,1,15,0,1,0,EMP399,0
2,OSS ✌️,Jack Daniel,0:59,00:00:59,1,0,2,0,0,0,0,0
3,Morning OSS,Jennifer Quek,1:07,00:01:07,1,0,2,0,0,0,0,0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09,0,1,13,0,1,0,EMP399,0
5,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:12,00:01:12,0,1,13,0,1,0,EMP399,0
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,1:32,00:01:32,0,1,14,0,1,0,SS588,0
7,Hello,Joanne Koh,1:33,00:01:33,1,0,1,0,0,0,0,0
8,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,1:41,00:01:41,0,1,13,0,1,0,BTW78,0
9,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,1:51,00:01:51,0,1,14,0,1,0,BTW168,0


### Price of Products

**New Column to identify the price of the products**

This new column is created to identify the regex of the unique product codes and their corresponding prices, as adviced by the seller.

In [74]:
def price(comment):
    if re.search(r'(\@)(\$)( ?)(.*)', comment):
        return str(re.search(r'(\$)( ?)(.*)', comment).group(0)[:-23])
    else:
        return int(0)

In [75]:
df['productPrice'] = df['postComment'].apply(lambda x:price(x))

In [76]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought,productPrice
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44,1,0,3,0,0,0,0,0,0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55,0,1,15,0,1,0,EMP399,0,$39.90 Comment EMP399+1
2,OSS ✌️,Jack Daniel,0:59,00:00:59,1,0,2,0,0,0,0,0,0
3,Morning OSS,Jennifer Quek,1:07,00:01:07,1,0,2,0,0,0,0,0,0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09,0,1,13,0,1,0,EMP399,0,$39.90 Comment EMP399+1
5,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:12,00:01:12,0,1,13,0,1,0,EMP399,0,$39.90 Comment EMP399+1
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,1:32,00:01:32,0,1,14,0,1,0,SS588,0,$58.80 Comment SS588+1
7,Hello,Joanne Koh,1:33,00:01:33,1,0,1,0,0,0,0,0,0
8,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,1:41,00:01:41,0,1,13,0,1,0,BTW78,0,$78.00 Comment BTW78+1
9,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,1:51,00:01:51,0,1,14,0,1,0,BTW168,0,$168.00 Comment BTW168+1


We noticed that each comment has a word 'Comment' in the middle of the extracted string of comments for the column 'productPrice'. Hence, we will remove the mentioned word.

In [77]:
df['productPrice'] = df['productPrice'].replace(value='', regex=r'Comment')

In [78]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought,productPrice
0,Hello Miko Team!!,Yvonne Leong,0:44,00:00:44,1,0,3,0,0,0,0,0,0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,0:55,00:00:55,0,1,15,0,1,0,EMP399,0,$39.90 EMP399+1
2,OSS ✌️,Jack Daniel,0:59,00:00:59,1,0,2,0,0,0,0,0,0
3,Morning OSS,Jennifer Quek,1:07,00:01:07,1,0,2,0,0,0,0,0,0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:09,00:01:09,0,1,13,0,1,0,EMP399,0,$39.90 EMP399+1
5,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,1:12,00:01:12,0,1,13,0,1,0,EMP399,0,$39.90 EMP399+1
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,1:32,00:01:32,0,1,14,0,1,0,SS588,0,$58.80 SS588+1
7,Hello,Joanne Koh,1:33,00:01:33,1,0,1,0,0,0,0,0,0
8,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,1:41,00:01:41,0,1,13,0,1,0,BTW78,0,$78.00 BTW78+1
9,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,1:51,00:01:51,0,1,14,0,1,0,BTW168,0,$168.00 BTW168+1


In [79]:
df['productPrice'].unique()

array([0, '$39.90  EMP399+1', '$58.80  SS588+1', '$78.00  BTW78+1',
       '$168.00  BTW168+1', '$9.90  MAC99+1', '$108.00  STW+1',
       '$33.33  RS33+1', '$8.00  PC+1 ', '$39.90  WG399+1',
       '$58.00  RG58+1', '$88.00  RG88+1', '$55.00  CP55+1',
       '$32.00  CP32+1', '$22.00    CP22+1', '$14.00  CP14+1',
       '$11.11  BCP+1 ', '$24.00  BSS+1', '$10.00  QM+1 ',
       '$24.90  WAK+1 ', '$11.11  TMP+1 ', '$36.00  STF+1 ',
       '$11.11  OTAK+1', '$13.90  CCL+1 ', '$13.90  SBY+1 ',
       '$13.90  VNL+1 ', '$13.90  MCA+1 ', '$12.90  JA+1 ',
       '$2.40  JAY+1 ', '$2.40  JOY+1 ', '$22.90 ( PWP )  CST229+1',
       '$11.11  AMB+1', '$22.90  RST+1', '$59.90  FCF+1 ', '$15.90  BP+1',
       '$13.90  BST+1', '$11.90  KN119+1 ', '$18.00 ( PWP )  RGSS+1',
       '$11.11  WB11+1', '$9.90  WBF+1 ', '$11.11  FG+1',
       '$39.90  SL399+1 ', '$18.00  PN+1', '$ 18.80  SCP+1',
       '$22.00  YC+1', '$8.00  KFC+1', '$ 5.90  KBP+1'], dtype=object)

Excluding the comments where there was no product codes as adviced by the seller, we find the number of unique products offered by the seller.

In [80]:
#number of unique products offered by the seller
int(df['productPrice'].nunique()) - int(1)

46

In [81]:
#total number of products offered
va['numProducts'] = int(df['productPrice'].nunique()) - int(1)

In [82]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts
0,OCEANSTARLIVE/videos/581200849664350,16,633,4432,133,2486,3,59,46


**Drop irrelevant columns**

The following column was dropped for the following reasons:

2. 'notSeller'
- This column 'notSeller' was solely created to calculate the quantity of sales made, and the products purchased by the customer. Hence, we are able to delete it after the quantity of sales have been calculated and the identification of the products purchased by the customers have been identified.
- Notwithstanding the above, a new column 'isSeller' has been feature engineered out as well, which will tell us the same results on whether the comment is posted & written by a seller or not. 

3. 'sales'
- This column identifies the quantity of products to the unique & specific product codes. However, it includes the product codes posted by the sellers as well. Hence, this column was solely created to be multiplied against the column 'notSeller' to calculate the true quantity of sales made by customers only. Hence, we are able to delete it after the quantity of sales have been calculated.

4. 'product'
- This column 'product' was solely created to identify the products purchased by the customers. Hence, we are able to delete it after the products purchased by the customers have been identified - especially since not all products offered by the seller is being bought by the customers.

In [83]:
#drop unwanted columns
df.drop(['postCommentTime', 'notSeller', 'sales', 'product'], axis=1, inplace=True)

In [84]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought,productPrice
0,Hello Miko Team!!,Yvonne Leong,00:00:44,0,3,0,0,0,0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:55,1,15,0,0,0,$39.90 EMP399+1
2,OSS ✌️,Jack Daniel,00:00:59,0,2,0,0,0,0
3,Morning OSS,Jennifer Quek,00:01:07,0,2,0,0,0,0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:09,1,13,0,0,0,$39.90 EMP399+1


### Revenue from the sale of the products

**Dummify the products bought to find the revenue**

In [85]:
#getdummies the products bought
df = pd.get_dummies(df, columns = ['productBought'], drop_first = True)

In [86]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_PR,productBought_QM,productBought_RGSS,productBought_RST,productBought_SCP,productBought_SM,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP
0,Hello Miko Team!!,Yvonne Leong,00:00:44,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:55,1,15,0,0,$39.90 EMP399+1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,OSS ✌️,Jack Daniel,00:00:59,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Morning OSS,Jennifer Quek,00:01:07,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:09,1,13,0,0,$39.90 EMP399+1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
# iterating the columns
for col in df.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_AMB
productBought_BBT
productBought_BSQ
productBought_BTF
productBought_BWP
productBought_CCL
productBought_CP22
productBought_CST229
productBought_CUTTLEFISH
productBought_FRIEDCRABSTICK
productBought_JA
productBought_JAY
productBought_KBP
productBought_KN119
productBought_MAC99
productBought_MCA
productBought_MJ
productBought_OTAK
productBought_PC
productBought_PN
productBought_PR
productBought_QM
productBought_RGSS
productBought_RST
productBought_SCP
productBought_SM
productBought_SS588
productBought_STF
productBought_TMP
productBought_TNP


After the column 'productBought' has been dummified, the cells will return a '1' if that particular item is bought, and a '0' if it is not. Hence, we will replace all the '1' with the price of the product.

Then, we will create a new revenue column which is a multiplication of the column 'salesQuantity' against the price of the product (i.e. the dummified 'productBought' columnns).

Product AMB

In [88]:
df[df['postComment'].str.contains('AMB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_PR,productBought_QM,productBought_RGSS,productBought_RST,productBought_SCP,productBought_SM,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP
124,AUS MELTIQUE BEEF STEAK 200G+-/ PCS @$11.11 Comment AMB+1 below to join the Sale,OceanStar Seafood,00:29:53,1,14,0,0,$11.11 AMB+1,0,0,...,0,0,0,0,0,0,0,0,0,0
128,AUS MELTIQUE BEEF STEAK 200G+-/ PCS @$11.11 Comment AMB+1 below to join the Sale,OceanStar Seafood,00:31:11,1,14,0,0,$11.11 AMB+1,0,0,...,0,0,0,0,0,0,0,0,0,0
240,AUS MELTIQUE BEEF STEAK 200G+-/ PCS @$11.11 Comment AMB+1 below to join the Sale,OceanStar Seafood,01:04:10,1,14,0,0,$11.11 AMB+1,0,0,...,0,0,0,0,0,0,0,0,0,0
243,AMB+2,Serene Chia,01:05:05,0,1,0,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
df['productBought_AMB'] = df['productBought_AMB'].map(lambda x:float(11.11) if x == int(1) else 0)

In [90]:
df['revenue_AMB'] = np.multiply(df['productBought_AMB'], df['salesQuantity'])

In [91]:
revenue_AMB = "The total revenue from the sale of the product {} is ${}". format ("AMB", format(df['revenue_AMB'].sum(), '.2f'))
print(revenue_AMB)

The total revenue from the sale of the product AMB is $22.22


Product BBT

In [92]:
df[df['postComment'].str.contains('BBT', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_QM,productBought_RGSS,productBought_RST,productBought_SCP,productBought_SM,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB
189,2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale,OceanStar Seafood,00:48:58,1,16,0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [93]:
df['productBought_BBT'] = df['productBought_BBT'].map(lambda x:float(12.90) if x == int(1) else 0)

In [94]:
df['revenue_BBT'] = np.multiply(df['productBought_BBT'], df['salesQuantity'])

In [95]:
revenue_BBT = "The total revenue from the sale of the product {} is ${}". format ("BBT", format(df['revenue_BBT'].sum(), '.2f'))
print(revenue_BBT)


The total revenue from the sale of the product BBT is $12.90


Product BSQ

In [96]:
df[df['postComment'].str.contains('BSQ', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_RGSS,productBought_RST,productBought_SCP,productBought_SM,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT
148,BABY SQUID 500G/ PKT @ $9.90 Comment BSQ+1 below to join the Sale.,OceanStar Seafood,00:37:15,1,13,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
151,BABY SQUID 500G/ PKT @ $9.90 Comment BSQ+1 below to join the Sale.,OceanStar Seafood,00:38:26,1,13,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
154,BSQ+2,Serene Chia,00:39:11,0,1,0,2,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [97]:
df['productBought_BSQ'] = df['productBought_BSQ'].map(lambda x:float(9.90) if x == int(1) else 0)

In [98]:
df['revenue_BSQ'] = np.multiply(df['productBought_BSQ'], df['salesQuantity'])

In [99]:
revenue_BSQ = "The total revenue from the sale of the product {} is ${}". format ("BSQ", format(df['revenue_BSQ'].sum(), '.2f'))
print(revenue_BSQ)


The total revenue from the sale of the product BSQ is $19.80


Product BTF

In [100]:
df[df['postComment'].str.contains('BTF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_RST,productBought_SCP,productBought_SM,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ
59,BTF+2 CTM+1,Nat Aneles,00:12:17,0,2,0,2,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [101]:
df[df['productPrice'].str.contains('BTF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_RST,productBought_SCP,productBought_SM,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ


Product BWP

In [102]:
df[df['postComment'].str.contains('BWP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_RST,productBought_SCP,productBought_SM,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ
157,2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11 Comment BWP+1 below to join the Sale,OceanStar Seafood,00:40:07,1,16,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
160,2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11 Comment BWP+1 below to join the Sale,OceanStar Seafood,00:42:12,1,16,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
166,BWP+1,Violet Yap,00:43:27,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [103]:
df['productBought_BWP'] = df['productBought_BWP'].map(lambda x:float(11.11) if x == int(1) else 0)

In [104]:
df['revenue_BWP'] = np.multiply(df['productBought_BWP'], df['salesQuantity'])

In [105]:
revenue_BWP = "The total revenue from the sale of the product {} is ${}". format ("BWP", format(df['revenue_BWP'].sum(), '.2f'))
print(revenue_BWP)


The total revenue from the sale of the product BWP is $11.11


Product CCL

In [106]:
df[df['postComment'].str.contains('CCL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_SCP,productBought_SM,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ,revenue_BWP
90,CREPE CAKE SERIES!!! CHOCO JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment CCL+1 below to join the Sale.,OceanStar Seafood,00:20:36,1,19,0,0,$13.90 CCL+1,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
91,CHOCO JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment CCL+1 below to join the Sale.,OceanStar Seafood,00:20:53,1,16,0,0,$13.90 CCL+1,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
132,CHOCO JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment CCL+1 below to join the Sale.,OceanStar Seafood,00:33:08,1,16,0,0,$13.90 CCL+1,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
194,CCL+1,Bee Thin,00:49:59,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
259,CHOCO JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment CCL+1 below to join the Sale.,OceanStar Seafood,01:10:30,1,16,0,0,$13.90 CCL+1,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [107]:
df['productBought_CCL'] = df['productBought_CCL'].map(lambda x:float(13.90) if x == int(1) else 0)

In [108]:
df['revenue_CCL'] = np.multiply(df['productBought_CCL'], df['salesQuantity'])

In [109]:
revenue_CCL = "The total revenue from the sale of the product {} is ${}". format ("CCL", format(df['revenue_CCL'].sum(), '.2f'))
print(revenue_CCL)


The total revenue from the sale of the product CCL is $13.90


Product CP22

In [110]:
df[df['postComment'].str.contains('CP22', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_SM,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ,revenue_BWP,revenue_CCL
40,CHINESE POMFRET 500-600G/ PCS @$22.00 Comment CP22+1 below to join the Sale,OceanStar Seafood,00:06:53,1,12,0,0,$22.00 CP22+1,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
226,CHINESE POMFRET 500-600G/ PCS @$22.00 Comment CP22+1 below to join the Sale,OceanStar Seafood,00:57:38,1,12,0,0,$22.00 CP22+1,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


Product CST229

In [111]:
df['productBought_CP22'] = df['productBought_CP22'].map(lambda x:float(22.00) if x == int(1) else 0)

In [112]:
df['revenue_CP22'] = np.multiply(df['productBought_CP22'], df['salesQuantity'])

In [113]:
revenue_CP22 = "The total revenue from the sale of the product {} is ${}". format ("CP22", format(df['revenue_CP22'].sum(), '.2f'))
print(revenue_CP22)


The total revenue from the sale of the product CP22 is $44.00


Product CST299

In [114]:
df[df['postComment'].str.contains('CST299', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ,revenue_BWP,revenue_CCL,revenue_CP22


In [115]:
df[df['productPrice'].str.contains('CST299', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ,revenue_BWP,revenue_CCL,revenue_CP22


Product CUTTLEFISH

In [116]:
df[df['postComment'].str.contains('CUTTLEFISH+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ,revenue_BWP,revenue_CCL,revenue_CP22


Product FRIEDCRABSTICK

In [117]:
df[df['postComment'].str.contains('FRIEDCRABSTICK', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ,revenue_BWP,revenue_CCL,revenue_CP22


Product JA

In [118]:
df[df['postComment'].str.contains('JA+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_SS588,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ,revenue_BWP,revenue_CCL,revenue_CP22
97,JAPAN APPLE JUICE 1L @$12.90 Comment JA+1 below to join the Sale.,OceanStar Seafood,00:21:54,1,12,0,0,$12.90 JA+1,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
98,JA+1,Sabrina Chu,00:22:19,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
164,JAPAN APPLE JUICE 1L @$12.90 Comment JA+1 below to join the Sale.,OceanStar Seafood,00:43:11,1,12,0,0,$12.90 JA+1,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
215,JAPAN APPLE JUICE 1L @$12.90 Comment JA+1 below to join the Sale.,OceanStar Seafood,00:55:32,1,12,0,0,$12.90 JA+1,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
258,JAPAN APPLE JUICE 1L @$12.90 Comment JA+1 below to join the Sale.,OceanStar Seafood,01:10:19,1,12,0,0,$12.90 JA+1,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
df['productBought_JA'] = df['productBought_JA'].map(lambda x:float(12.90) if x == int(1) else 0)

In [120]:
df['revenue_JA'] = np.multiply(df['productBought_JA'], df['salesQuantity'])

In [121]:
revenue_JA = "The total revenue from the sale of the product {} is ${}". format ("JA", format(df['revenue_JA'].sum(), '.2f'))
print(revenue_JA)


The total revenue from the sale of the product JA is $12.90


Product JAY

In [122]:
df[df['postComment'].str.contains('JAY', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_STF,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ,revenue_BWP,revenue_CCL,revenue_CP22,revenue_JA
106,Japan Hokkaido Apple Yoghurt 200ml @$2.40 Comment JAY+1 below to join the Sale.,OceanStar Seafood,00:23:53,1,13,0,0,$2.40 JAY+1,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
170,Japan Hokkaido Apple Yoghurt 200ml @$2.40 Comment JAY+1 below to join the Sale.,OceanStar Seafood,00:43:41,1,13,0,0,$2.40 JAY+1,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
213,Japan Hokkaido Apple Yoghurt 200ml @$2.40 Comment JAY+1 below to join the Sale.,OceanStar Seafood,00:55:22,1,13,0,0,$2.40 JAY+1,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
df['productBought_JAY'] = df['productBought_JAY'].map(lambda x:float(2.40) if x == int(1) else 0)

In [124]:
df['revenue_JAY'] = np.multiply(df['productBought_JAY'], df['salesQuantity'])

In [125]:
revenue_JAY = "The total revenue from the sale of the product {} is ${}". format ("JAY", format(df['revenue_JAY'].sum(), '.2f'))
print(revenue_JAY)


The total revenue from the sale of the product JAY is $7.20


Product KBP

In [126]:
df[df['postComment'].str.contains('KBP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_TMP,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ,revenue_BWP,revenue_CCL,revenue_CP22,revenue_JA,revenue_JAY
255,KBP+1,Violet Yap,01:09:48,0,1,0,1,0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
256,KONG BAK PAO 4 PCS / PKT @$ 5.90 Comment KBP+1 below to join the Sale,OceanStar Seafood,01:10:04,1,16,0,0,$ 5.90 KBP+1,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [127]:
df['productBought_KBP'] = df['productBought_KBP'].map(lambda x:float(5.90) if x == int(1) else 0)

In [128]:
df['revenue_KBP'] = np.multiply(df['productBought_KBP'], df['salesQuantity'])

In [129]:
revenue_KBP = "The total revenue from the sale of the product {} is ${}". format ("KBP", format(df['revenue_KBP'].sum(), '.2f'))
print(revenue_KBP)


The total revenue from the sale of the product KBP is $17.70


Product KN119

In [130]:
df[df['postComment'].str.contains('KN119', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,productBought_TNP,revenue_AMB,revenue_BBT,revenue_BSQ,revenue_BWP,revenue_CCL,revenue_CP22,revenue_JA,revenue_JAY,revenue_KBP
175,2 X KUNNING 500G/ PKT @$11.90 Comment KN119+1 below to join the Sale.,OceanStar Seafood,00:45:20,1,13,0,0,$11.90 KN119+1,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178,KN119+1,Violet Yap,00:46:05,0,1,0,1,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [131]:
df['productBought_KN119'] = df['productBought_KN119'].map(lambda x:float(11.90) if x == int(1) else 0)

In [132]:
df['revenue_KN119'] = np.multiply(df['productBought_KN119'], df['salesQuantity'])

In [133]:
revenue_KN119 = "The total revenue from the sale of the product {} is ${}". format ("KN119", format(df['revenue_KN119'].sum(), '.2f'))
print(revenue_KN119)


The total revenue from the sale of the product KN119 is $23.80


Product MAC99

In [134]:
df[df['postComment'].str.contains('MAC99', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_AMB,revenue_BBT,revenue_BSQ,revenue_BWP,revenue_CCL,revenue_CP22,revenue_JA,revenue_JAY,revenue_KBP,revenue_KN119
14,MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,00:02:46,1,12,0,0,$9.90 MAC99+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110,MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,00:24:36,1,12,0,0,$9.90 MAC99+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
113,MAC99+1,Violet Yap,00:25:41,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117,MUST GRAB!!! MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,00:27:17,1,14,0,0,$9.90 MAC99+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123,MAC99+1,Serene Chia,00:29:21,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179,MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,00:46:19,1,12,0,0,$9.90 MAC99+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,01:05:05,1,12,0,0,$9.90 MAC99+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
df['productBought_MAC99'] = df['productBought_MAC99'].map(lambda x:float(9.90) if x == int(1) else 0)

In [136]:
df['revenue_MAC99'] = np.multiply(df['productBought_MAC99'], df['salesQuantity'])

In [137]:
revenue_MAC99 = "The total revenue from the sale of the product {} is ${}". format ("MAC99", format(df['revenue_MAC99'].sum(), '.2f'))
print(revenue_MAC99)


The total revenue from the sale of the product MAC99 is $19.80


Product MCA

In [138]:
df[df['postComment'].str.contains('MCA', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_BBT,revenue_BSQ,revenue_BWP,revenue_CCL,revenue_CP22,revenue_JA,revenue_JAY,revenue_KBP,revenue_KN119,revenue_MAC99
94,MATCHA JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment MCA+1 below to join the Sale.,OceanStar Seafood,00:21:16,1,16,0,0,$13.90 MCA+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136,MATCHA JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment MCA+1 below to join the Sale.,OceanStar Seafood,00:33:35,1,16,0,0,$13.90 MCA+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
262,MATCHA JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment MCA+1 below to join the Sale.,OceanStar Seafood,01:10:55,1,16,0,0,$13.90 MCA+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [139]:
df['productBought_MCA'] = df['productBought_MCA'].map(lambda x:float(13.90) if x == int(1) else 0)

In [140]:
df['revenue_MCA'] = np.multiply(df['productBought_MCA'], df['salesQuantity'])

In [141]:
revenue_MCA = "The total revenue from the sale of the product {} is ${}". format ("MCA", format(df['revenue_MCA'].sum(), '.2f'))
print(revenue_MCA)


The total revenue from the sale of the product MCA is $13.90


Product MJ

In [142]:
df[df['postComment'].str.contains('MJ', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_BSQ,revenue_BWP,revenue_CCL,revenue_CP22,revenue_JA,revenue_JAY,revenue_KBP,revenue_KN119,revenue_MAC99,revenue_MCA
47,MJ+3,Sabrina Chu,00:08:47,0,1,0,3,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71,2 X IQF CHICKEN MID JOINT 450G+-/ P@ $9.90 Comment MJ+1 below to join the Sale,OceanStar Seafood,00:15:42,1,16,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74,MJ+1,Violet Yap,00:16:11,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149,2 X IQF CHICKEN MID JOINT 450G+-/ P@ $9.90 Comment MJ+1 below to join the Sale,OceanStar Seafood,00:37:43,1,16,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
230,2 X IQF CHICKEN MID JOINT 450G+-/ P@ $9.90 Comment MJ+1 below to join the Sale,OceanStar Seafood,00:59:58,1,16,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [143]:
df['productBought_MJ'] = df['productBought_MJ'].map(lambda x:float(9.90) if x == int(1) else 0)

In [144]:
df['revenue_MJ'] = np.multiply(df['productBought_MJ'], df['salesQuantity'])

In [145]:
revenue_MJ = "The total revenue from the sale of the product {} is ${}". format ("MJ", format(df['revenue_MJ'].sum(), '.2f'))
print(revenue_MJ)


The total revenue from the sale of the product MJ is $39.60


Product OTAK

In [146]:
df[df['postComment'].str.contains('OTAK', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_BWP,revenue_CCL,revenue_CP22,revenue_JA,revenue_JAY,revenue_KBP,revenue_KN119,revenue_MAC99,revenue_MCA,revenue_MJ
86,3 X MUAR OTAK / PKT @$11.11 Comment OTAK+1 below to join the Sale,OceanStar Seafood,00:19:24,1,14,0,0,$11.11 OTAK+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [147]:
df['productBought_OTAK'] = df['productBought_OTAK'].map(lambda x:float(11.11) if x == int(1) else 0)

In [148]:
df['revenue_OTAK'] = np.multiply(df['productBought_OTAK'], df['salesQuantity'])

In [149]:
revenue_OTAK = "The total revenue from the sale of the product {} is ${}". format ("OTAK", format(df['revenue_OTAK'].sum(), '.2f'))
print(revenue_OTAK)


The total revenue from the sale of the product OTAK is $11.11


Product PC

In [150]:
df[df['postComment'].str.contains('PC+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_CCL,revenue_CP22,revenue_JA,revenue_JAY,revenue_KBP,revenue_KN119,revenue_MAC99,revenue_MCA,revenue_MJ,revenue_OTAK
23,PROCESS CUTTLEFISH 500G+-/ PCS@$8.00 Comment PC+1 below to join the Sale.,OceanStar Seafood,00:03:49,1,11,0,0,$8.00 PC+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,PC+1,Joanne Koh,00:04:13,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58,PC+1,Sabrina Chu,00:12:01,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61,PC+1,Violet Yap,00:12:28,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [151]:
df['productBought_PC'] = df['productBought_PC'].map(lambda x:float(8.00) if x == int(1) else 0)

In [152]:
df['revenue_PC'] = np.multiply(df['productBought_PC'], df['salesQuantity'])

In [153]:
revenue_PC = "The total revenue from the sale of the product {} is ${}". format ("PC", format(df['revenue_PC'].sum(), '.2f'))
print(revenue_PC)


The total revenue from the sale of the product PC is $24.00


Product PN

In [154]:
df[df['postComment'].str.contains('PN', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_CP22,revenue_JA,revenue_JAY,revenue_KBP,revenue_KN119,revenue_MAC99,revenue_MCA,revenue_MJ,revenue_OTAK,revenue_PC
229,2 X PRAWN NUT 400G / PKT @$18.00 Comment PN+1 below to join the Sale,OceanStar Seafood,00:59:48,1,15,0,0,$18.00 PN+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231,PN+1,Eva Sek,01:00:44,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [155]:
df['productBought_PN'] = df['productBought_PN'].map(lambda x:float(18.00) if x == int(1) else 0)

In [156]:
df['revenue_PN'] = np.multiply(df['productBought_PN'], df['salesQuantity'])

In [157]:
revenue_PN = "The total revenue from the sale of the product {} is ${}". format ("PN", format(df['revenue_PN'].sum(), '.2f'))
print(revenue_PN)


The total revenue from the sale of the product PN is $18.00


Product PR

In [158]:
df[df['postComment'].str.contains('PR+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_JA,revenue_JAY,revenue_KBP,revenue_KN119,revenue_MAC99,revenue_MCA,revenue_MJ,revenue_OTAK,revenue_PC,revenue_PN
69,3 X IQF PRIME RIBS 500G+- / P @ $21.90 Comment PR+1 below to join the Sale,OceanStar Seafood,00:15:26,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89,PR+1,Sabrina Chu,00:20:31,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
228,3 X IQF PRIME RIBS 500G+- / P @ $21.90 Comment PR+1 below to join the Sale,OceanStar Seafood,00:58:11,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [159]:
df['productBought_PR'] = df['productBought_PR'].map(lambda x:float(21.90) if x == int(1) else 0)

In [160]:
df['revenue_PR'] = np.multiply(df['productBought_PR'], df['salesQuantity'])

In [161]:
revenue_PR = "The total revenue from the sale of the product {} is ${}". format ("PR", format(df['revenue_PR'].sum(), '.2f'))
print(revenue_PR)


The total revenue from the sale of the product PR is $21.90


Product QM

In [162]:
df[df['postComment'].str.contains('QM+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_JAY,revenue_KBP,revenue_KN119,revenue_MAC99,revenue_MCA,revenue_MJ,revenue_OTAK,revenue_PC,revenue_PN,revenue_PR
45,QING MU CUTTLEFISH 500-600G/ PCS @$10.00 Comment QM+1 below to join the Sale.,OceanStar Seafood,00:08:47,1,13,0,0,$10.00 QM+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46,QING MU CUTTLEFISH 500-600G/ PCS @$10.00 Comment QM+1 below to join the Sale.,OceanStar Seafood,00:08:47,1,13,0,0,$10.00 QM+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,QM+2,Catherine Gan-Chua,00:11:14,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,QING MU CUTTLEFISH 500-600G/ PCS @$10.00 Comment QM+1 below to join the Sale.,OceanStar Seafood,00:21:32,1,13,0,0,$10.00 QM+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [163]:
df['productBought_QM'] = df['productBought_QM'].map(lambda x:float(10.00) if x == int(1) else 0)

In [164]:
df['revenue_QM'] = np.multiply(df['productBought_QM'], df['salesQuantity'])

In [165]:
revenue_QM = "The total revenue from the sale of the product {} is ${}". format ("QM", format(df['revenue_QM'].sum(), '.2f'))
print(revenue_QM)


The total revenue from the sale of the product QM is $20.00


Product RGSS

In [166]:
df[df['postComment'].str.contains('RGSS+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_KBP,revenue_KN119,revenue_MAC99,revenue_MCA,revenue_MJ,revenue_OTAK,revenue_PC,revenue_PN,revenue_PR,revenue_QM
183,2 X RED GROUPER SOUP SLICED 300G/ PKT @$18.00 ( PWP ) Comment RGSS+1 below to join the Sale,OceanStar Seafood,00:47:13,1,19,0,0,$18.00 ( PWP ) RGSS+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
185,RGSS+1,Violet Yap,00:47:41,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188,RGSS+1,Bee Thin,00:48:46,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
209,RGSS+1,Bee Thin,00:54:25,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
223,RGSS+1,Serene Chia,00:57:17,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [167]:
df['productBought_RGSS'] = df['productBought_RGSS'].map(lambda x:float(18.00) if x == int(1) else 0)

In [168]:
df['revenue_RGSS'] = np.multiply(df['productBought_RGSS'], df['salesQuantity'])

In [169]:
revenue_RGSS = "The total revenue from the sale of the product {} is ${}". format ("RGSS", format(df['revenue_RGSS'].sum(), '.2f'))
print(revenue_RGSS)


The total revenue from the sale of the product RGSS is $72.00


Product RST

In [170]:
df[df['postComment'].str.contains('RST+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_KN119,revenue_MAC99,revenue_MCA,revenue_MJ,revenue_OTAK,revenue_PC,revenue_PN,revenue_PR,revenue_QM,revenue_RGSS
145,2 X RED SNAPPER TAIL 500-600G/ PCS @$22.90 Comment RST+1 below to join the Sale,OceanStar Seafood,00:35:57,1,15,0,0,$22.90 RST+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147,RST+1,Violet Yap,00:36:47,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [171]:
df['productBought_RST'] = df['productBought_RST'].map(lambda x:float(22.90) if x == int(1) else 0)

In [172]:
df['revenue_RST'] = np.multiply(df['productBought_RST'], df['salesQuantity'])

In [173]:
revenue_RST = "The total revenue from the sale of the product {} is ${}". format ("RST", format(df['revenue_RST'].sum(), '.2f'))
print(revenue_RST)


The total revenue from the sale of the product RST is $22.90


Product SCP

In [174]:
df[df['postComment'].str.contains('SCP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_MAC99,revenue_MCA,revenue_MJ,revenue_OTAK,revenue_PC,revenue_PN,revenue_PR,revenue_QM,revenue_RGSS,revenue_RST
236,ORGANIC COOKED SHRIMP 31-40PCS/ PKT @$ 18.80 Comment SCP+1 below to join the Sale,OceanStar Seafood,01:02:33,1,14,0,0,$ 18.80 SCP+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239,SCP+1,Catherine Gan-Chua,01:04:06,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [175]:
df['productBought_SCP'] = df['productBought_SCP'].map(lambda x:float(18.80) if x == int(1) else 0)

In [176]:
df['revenue_SCP'] = np.multiply(df['productBought_SCP'], df['salesQuantity'])

In [177]:
revenue_SCP = "The total revenue from the sale of the product {} is ${}". format ("SCP", format(df['revenue_SCP'].sum(), '.2f'))
print(revenue_SCP)


The total revenue from the sale of the product SCP is $18.80


Product SM

In [178]:
df[df['postComment'].str.contains('SM+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_MCA,revenue_MJ,revenue_OTAK,revenue_PC,revenue_PN,revenue_PR,revenue_QM,revenue_RGSS,revenue_RST,revenue_SCP
100,Japan Hokkaido Straeberry Milk Drink 200ml @ $2.40 Comment SM+1 below to join the Sale.,OceanStar Seafood,00:23:03,1,15,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,Japan Hokkaido Straeberry Milk Drink 200ml @ $2.40 Comment SM+1 below to join the Sale.,OceanStar Seafood,00:23:19,1,15,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104,SM+3,Sabrina Chu,00:23:42,0,1,0,3,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
165,Japan Hokkaido Strawberry Milk Drink 200ml @ $2.40 Comment SM+1 below to join the Sale.,OceanStar Seafood,00:43:20,1,15,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
211,Japan Hokkaido Strawberry Milk Drink 200ml @ $2.40 Comment SM+1 below to join the Sale.,OceanStar Seafood,00:54:58,1,15,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [179]:
df['productBought_SM'] = df['productBought_SM'].map(lambda x:float(2.40) if x == int(1) else 0)

In [180]:
df['revenue_SM'] = np.multiply(df['productBought_SM'], df['salesQuantity'])

In [181]:
revenue_SM = "The total revenue from the sale of the product {} is ${}". format ("SM", format(df['revenue_SM'].sum(), '.2f'))
print(revenue_SM)


The total revenue from the sale of the product SM is $7.20


Product SS588

In [182]:
df[df['postComment'].str.contains('SS588+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_MJ,revenue_OTAK,revenue_PC,revenue_PN,revenue_PR,revenue_QM,revenue_RGSS,revenue_RST,revenue_SCP,revenue_SM
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,00:01:32,1,14,0,0,$58.80 SS588+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
204,LAST PCS!!! PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,00:52:04,1,16,0,0,$58.80 SS588+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
219,SS588+1,Serene Chia,00:56:18,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [183]:
df['productBought_SS588'] = df['productBought_SS588'].map(lambda x:float(58.80) if x == int(1) else 0)

In [184]:
df['revenue_SS588'] = np.multiply(df['productBought_SS588'], df['salesQuantity'])

In [185]:
revenue_SS588 = "The total revenue from the sale of the product {} is ${}". format ("SS588", format(df['revenue_SS588'].sum(), '.2f'))
print(revenue_SS588)


The total revenue from the sale of the product SS588 is $58.80


Product STF

In [186]:
df[df['postComment'].str.contains('STF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_OTAK,revenue_PC,revenue_PN,revenue_PR,revenue_QM,revenue_RGSS,revenue_RST,revenue_SCP,revenue_SM,revenue_SS588
82,3 X SALMON TROUT FILLET 300-350G/ PCS @$36.00 Comment STF+1 below to join the Sale.,OceanStar Seafood,00:17:49,1,15,0,0,$36.00 STF+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,STF+1,Bee Thin,00:18:28,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,STF +3,Sunny Tan,00:19:02,0,2,0,3,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [187]:
df['productBought_STF'] = df['productBought_STF'].map(lambda x:float(36.00) if x == int(1) else 0)

In [188]:
df['revenue_STF'] = np.multiply(df['productBought_STF'], df['salesQuantity'])

In [189]:
revenue_STF = "The total revenue from the sale of the product {} is ${}". format ("STF", format(df['revenue_STF'].sum(), '.2f'))
print(revenue_STF)


The total revenue from the sale of the product STF is $144.00


Product TMP

In [190]:
df[df['postComment'].str.contains('TMP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_PC,revenue_PN,revenue_PR,revenue_QM,revenue_RGSS,revenue_RST,revenue_SCP,revenue_SM,revenue_SS588,revenue_STF
75,2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.,OceanStar Seafood,00:16:36,1,18,0,0,$11.11 TMP+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,TMP+1,Violet Yap,00:16:54,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,TMP+2,Bee Thin,00:17:08,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
263,2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.,OceanStar Seafood,01:11:02,1,18,0,0,$11.11 TMP+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [191]:
df['productBought_TMP'] = df['productBought_TMP'].map(lambda x:float(11.11) if x == int(1) else 0)

In [192]:
df['revenue_TMP'] = np.multiply(df['productBought_TMP'], df['salesQuantity'])

In [193]:
revenue_TMP = "The total revenue from the sale of the product {} is ${}". format ("TMP", format(df['revenue_TMP'].sum(), '.2f'))
print(revenue_TMP)


The total revenue from the sale of the product TMP is $55.55


Product TNP

In [194]:
df[df['postComment'].str.contains('TNP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_PN,revenue_PR,revenue_QM,revenue_RGSS,revenue_RST,revenue_SCP,revenue_SM,revenue_SS588,revenue_STF,revenue_TMP
77,TNP+2,Bee Thin,00:16:54,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [195]:
# iterating the columns
for col in df.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_AMB
productBought_BBT
productBought_BSQ
productBought_BTF
productBought_BWP
productBought_CCL
productBought_CP22
productBought_CST229
productBought_CUTTLEFISH
productBought_FRIEDCRABSTICK
productBought_JA
productBought_JAY
productBought_KBP
productBought_KN119
productBought_MAC99
productBought_MCA
productBought_MJ
productBought_OTAK
productBought_PC
productBought_PN
productBought_PR
productBought_QM
productBought_RGSS
productBought_RST
productBought_SCP
productBought_SM
productBought_SS588
productBought_STF
productBought_TMP
productBought_TNP
revenue_AMB
revenue_BBT
revenue_BSQ
revenue_BWP
revenue_CCL
revenue_CP22
revenue_JA
revenue_JAY
revenue_KBP
revenue_KN119
revenue_MAC99
revenue_MCA
revenue_MJ
revenue_OTAK
revenue_PC
revenue_PN
revenue_PR
revenue_QM
revenue_RGSS
revenue_RST
revenue_SCP
revenue_SM
revenue_SS588
revenue_STF
revenue_TMP


**Sum of total revenue from the video**

In [196]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_PN,revenue_PR,revenue_QM,revenue_RGSS,revenue_RST,revenue_SCP,revenue_SM,revenue_SS588,revenue_STF,revenue_TMP
0,Hello Miko Team!!,Yvonne Leong,00:00:44,0,3,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:55,1,15,0,0,$39.90 EMP399+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,OSS ✌️,Jack Daniel,00:00:59,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Morning OSS,Jennifer Quek,00:01:07,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:09,1,13,0,0,$39.90 EMP399+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [197]:
#total revenue from the video
total_revenue = df.loc[:, 'revenue_AMB': 'revenue_TMP'].values.sum()

#round the total revenue to 2 decimals places
total_revenue_rounded = format(total_revenue, '.2f')

print(f"The total revenue for the sale of products for the video is ${total_revenue_rounded}")


The total revenue for the sale of products for the video is $733.09


In [198]:
va['totalRevenue'] = total_revenue_rounded
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue
0,OCEANSTARLIVE/videos/581200849664350,16,633,4432,133,2486,3,59,46,733.09


**New Column for the total revenue at that comment**

In [199]:
#https://stackoverflow.com/questions/42063716/pandas-sum-up-multiple-columns-into-one-column-without-last-column
df['revenue'] = df.loc[:, 'revenue_AMB': 'revenue_TMP'].sum(axis=1)
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AMB,productBought_BBT,...,revenue_PR,revenue_QM,revenue_RGSS,revenue_RST,revenue_SCP,revenue_SM,revenue_SS588,revenue_STF,revenue_TMP,revenue
0,Hello Miko Team!!,Yvonne Leong,00:00:44,0,3,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:55,1,15,0,0,$39.90 EMP399+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,OSS ✌️,Jack Daniel,00:00:59,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Morning OSS,Jennifer Quek,00:01:07,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:09,1,13,0,0,$39.90 EMP399+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:12,1,13,0,0,$39.90 EMP399+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,00:01:32,1,14,0,0,$58.80 SS588+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hello,Joanne Koh,00:01:33,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,00:01:41,1,13,0,0,$78.00 BTW78+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,00:01:51,1,14,0,0,$168.00 BTW168+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [200]:
#https://www.geeksforgeeks.org/how-to-move-a-column-to-first-position-in-pandas-dataframe/
#shift the revenue column to be the 8th column, i.e at position 7
eighth_column = df.pop('revenue')

# insert column using insert(position,column_name,ninth_column) function
df.insert(7, 'revenue', eighth_column)
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,productPrice,productBought_AMB,...,revenue_PN,revenue_PR,revenue_QM,revenue_RGSS,revenue_RST,revenue_SCP,revenue_SM,revenue_SS588,revenue_STF,revenue_TMP
0,Hello Miko Team!!,Yvonne Leong,00:00:44,0,3,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:55,1,15,0,0,0.0,$39.90 EMP399+1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,OSS ✌️,Jack Daniel,00:00:59,0,2,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Morning OSS,Jennifer Quek,00:01:07,0,2,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:09,1,13,0,0,0.0,$39.90 EMP399+1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:12,1,13,0,0,0.0,$39.90 EMP399+1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,00:01:32,1,14,0,0,0.0,$58.80 SS588+1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hello,Joanne Koh,00:01:33,0,1,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,00:01:41,1,13,0,0,0.0,$78.00 BTW78+1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,00:01:51,1,14,0,0,0.0,$168.00 BTW168+1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


After the revenue of the sales have been calculated, all the dummified product columns and the column 'productPrice' will be dropped as they are no longer required to be checked against to identify the price of the product.

In [201]:
df = df.loc[: ,'postComment':'revenue']
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue
0,Hello Miko Team!!,Yvonne Leong,00:00:44,0,3,0,0,0.0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:55,1,15,0,0,0.0
2,OSS ✌️,Jack Daniel,00:00:59,0,2,0,0,0.0
3,Morning OSS,Jennifer Quek,00:01:07,0,2,0,0,0.0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:09,1,13,0,0,0.0
5,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:12,1,13,0,0,0.0
6,PREMIUM GRADE SALMON 2.2-2.4 KG/ PCS @$58.80 Comment SS588+1 below to join the Sale,OceanStar Seafood,00:01:32,1,14,0,0,0.0
7,Hello,Joanne Koh,00:01:33,0,1,0,0,0.0
8,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale,OceanStar Seafood,00:01:41,1,13,0,0,0.0
9,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,00:01:51,1,14,0,0,0.0


**New Column to identify the frequency of the seller's comments in the video**

In [202]:
#frequency of seller's comments
va['frequencySeller']= np.divide(va['videoLength'].iloc[0],va['numSellerComments'])
#seller's comment appears on average of every 33 seconds

In [203]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller
0,OCEANSTARLIVE/videos/581200849664350,16,633,4432,133,2486,3,59,46,733.09,33.323308


**New Column to identify the seller**

In [204]:
df['seller'] = 'OCEANSTARLIVE'

In [205]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Hello Miko Team!!,Yvonne Leong,00:00:44,0,3,0,0,0.0,OCEANSTARLIVE
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:55,1,15,0,0,0.0,OCEANSTARLIVE
2,OSS ✌️,Jack Daniel,00:00:59,0,2,0,0,0.0,OCEANSTARLIVE
3,Morning OSS,Jennifer Quek,00:01:07,0,2,0,0,0.0,OCEANSTARLIVE
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:09,1,13,0,0,0.0,OCEANSTARLIVE


**New Column for the Average Compound Score from Sentiment Analysis on raw & uncleaned comments for video**

In [206]:
# Instantiate Sentiment Intensity Analyzer
sent = SentimentIntensityAnalyzer()

In [207]:
df['sentiment_score'] = df['postComment'].apply(sent.polarity_scores)
df['compound'] = [sent.polarity_scores(x)['compound'] for x in df['postComment']]
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller,sentiment_score,compound
0,Hello Miko Team!!,Yvonne Leong,00:00:44,0,3,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:55,1,15,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 0.82, 'pos': 0.18, 'compound': 0.4724}",0.4724
2,OSS ✌️,Jack Daniel,00:00:59,0,2,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
3,Morning OSS,Jennifer Quek,00:01:07,0,2,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:09,1,13,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 0.845, 'pos': 0.155, 'compound': 0.296}",0.296


In [208]:
#average compound scores for the video
#df.shape[0] calculates the total number of rows in the dataframe
va['averageCompound']= (df['compound'].sum())/(df['compound'].sum())/df.shape[0]
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller,averageCompound
0,OCEANSTARLIVE/videos/581200849664350,16,633,4432,133,2486,3,59,46,733.09,33.323308,0.003676


In [209]:
#drop the columns with regarding to the sentiment analysis on the raw & uncleaned comments
#as we will perform sentiment analysis at the comment level on processed comments
df = df.loc[: ,'postComment':'seller']
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Hello Miko Team!!,Yvonne Leong,00:00:44,0,3,0,0,0.0,OCEANSTARLIVE
1,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:00:55,1,15,0,0,0.0,OCEANSTARLIVE
2,OSS ✌️,Jack Daniel,00:00:59,0,2,0,0,0.0,OCEANSTARLIVE
3,Morning OSS,Jennifer Quek,00:01:07,0,2,0,0,0.0,OCEANSTARLIVE
4,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:01:09,1,13,0,0,0.0,OCEANSTARLIVE


### Saving the cleaned dataframes

In [210]:
# export to csv - change the name of the data file for each video
va.to_csv('../../data/cleaned_data/cleaned_va_OCEANSTARLIVE_581200849664350.csv', index=False)

In [211]:
#check for nulls
#displaying only the columns with nulls and their sum
df[df.columns[df.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [212]:
# export to csv - change the name of the data file for each video
df.to_csv('../../data/cleaned_data/cleaned_OCEANSTARLIVE_581200849664350.csv', index=False)