# Data Import & Cleaning

### Contents:

- [Data Import](#Data-Import)
- [Data Cleaning](#Data-Cleaning)
- [Feature Engineering](#Feature-Engineering)

### Import Libraries

In [1]:
#import standard libraries
import pandas as pd
import numpy as np

#import emoji
import emoji

from natsort import natsorted, index_natsorted, order_by_index

#import warnings to ignore flags when the project is complete
#import warnings
#warnings.filterwarnings('ignore')

#import pre-processing libraries for data cleaning
import string
import re
import nltk
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Data Import

**Read scrapped data for the following videos**

In [2]:
va = pd.read_csv('../../data/scrapped_data/va_OCEANSTARLIVE_440316487506965.csv')

In [3]:
va

Unnamed: 0,video_for,totalEmojiReaction,views
0,OCEANSTARLIVE/videos/440316487506965,20,523


In [4]:
df = pd.read_csv('../../data/scrapped_data/OCEANSTARLIVE_440316487506965.csv', encoding='utf-8')

In [5]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
0,Good morning Miko &amp; Team,Yvonne Leong,0:33
1,Oss,Eve Kang,0:43
2,Good morning everyone,Yvonne Leong,0:49
3,Oss,Eve Kang,0:58
4,Mrg Miko &amp; OSS,Samuel Goh,0:58
5,got truffle mushroom soup,Patsy Teo,1:00:07
6,"2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11</div><div dir=""auto"" style=""text-align: start;"">Comment「TMP+1」below to join the Sale.",OceanStar Seafood,1:00:45
7,tmp+1,Patsy Teo,1:01:22
8,airfry set what temp ?,Patsy Teo,1:01:49
9,"COD STEAK 450G+-/ PKT @$22.90 ( PWP )</div><div dir=""auto"" style=""text-align: start;"">Comment「CST229+1」below to join the Sale",OceanStar Seafood,1:02:20


In [6]:
#https://stackoverflow.com/questions/32072076/find-the-unique-values-in-a-column-and-then-sort-them
#check if the seller uses multiple accounts to reply
postCommentAuthor_unique = df['postCommentAuthor'].unique()
print(sorted(postCommentAuthor_unique))

['Alcina Lim', 'Catherine Phua', 'Chen Ml', 'Eve Kang', 'Geraldine Chew', 'Helena Chou', 'Helmy Tan', 'Huay Geok  Ang', 'Ja Ja', 'Jack Daniel', 'Jacky Pascua', 'Jacqueline Hoe', 'Janice Lye', 'Jennifer Quek', 'June Wang', 'Kelley Lee', 'Khym Hoon Sung', 'OceanStar Seafood', 'Patrick Ong', 'Patsy Teo', 'Sabrina Chu', 'Samuel Goh', 'Shan Goh', 'Shimret Kaur', 'Stella Lim', 'Tan Siew Kuan', 'Wendy Yong', 'Winnie Wu', 'Yvonne Leong', 'き リーサン']


In [7]:
#find comments posted by the seller only
df.loc[df['postCommentAuthor'] == 'OceanStar Seafood']

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
6,"2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11</div><div dir=""auto"" style=""text-align: start;"">Comment「TMP+1」below to join the Sale.",OceanStar Seafood,1:00:45
9,"COD STEAK 450G+-/ PKT @$22.90 ( PWP )</div><div dir=""auto"" style=""text-align: start;"">Comment「CST229+1」below to join the Sale",OceanStar Seafood,1:02:20
14,"2 X MALA FLOWER CLAM 500G/ PKT @ $ 9.90</div><div dir=""auto"" style=""text-align: start;"">Comment「MFC+1」below to join the Sale",OceanStar Seafood,1:04:30
15,"WILD BARRAMUNDI COLLAR 400-500G/ P@$9.90</div><div dir=""auto"" style=""text-align: start;"">Comment「WBC+1」below to join the Sale",OceanStar Seafood,1:05:17
17,"MUST GRAB!!!</div><div dir=""auto"" style=""text-align: start;"">3 X BATANG SOUP SLICED 300G/ PKT @$24.00</div><div dir=""auto"" style=""text-align: start;"">Comment「BSS+1」below to join the Sale",OceanStar Seafood,1:05:48
19,"2 X SQUID 500-600G/ PORTION @$21.90</div><div dir=""auto"" style=""text-align: start;"">Comment「SQ+1」below to join the Sale",OceanStar Seafood,1:06:38
20,"HOKKAIDO SCALLOP 1KG / PKT @ $28.00</div><div dir=""auto"" style=""text-align: start;"">Comment「HS+1」below to join the Sale",OceanStar Seafood,1:06:49
22,"2 X IQF CHICKEN MID JOINT 450G+-/ P@ $9.90</div><div dir=""auto"" style=""text-align: start;"">Comment「MJ+1」below to join the Sale",OceanStar Seafood,1:07:04
23,"2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11</div><div dir=""auto"" style=""text-align: start;"">Comment「PCT+1」below to join the Sale",OceanStar Seafood,1:07:28
24,"MUST GRAB!!!!</div><div dir=""auto"" style=""text-align: start;"">2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90</div><div dir=""auto"" style=""text-align: start;"">Comment「BTT399+1」below to join the Sale",OceanStar Seafood,1:08:38


In [8]:
#check the comments to have a gauge
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' BTF+1', ' OSS', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「TMP+1」below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「BCP+1」below to join the Sale.', '2 X BABY LEATHER JACKET 450G+-/ PKT @$15.90</div><div dir="auto" style="text-align: start;">Comment「BLJ+1」below to join the Sale', '2 X BABY THREADFIN 300-400G / PCS @ $12.90</div><div dir="auto" style="text-align: start;">Comment「BBT+1」below to join the Sale', '2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11</div><div dir="auto" style="text-align: start;">Comment「BWP+1」below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90</div><div dir="auto" style="text-align: start;">Comment「BST+1」below to join the Sale', '2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90</div><div dir="auto" style="text-align: start;">Comment「BTT399+1」below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90</div><div dir="a

## Data Cleaning

### Convert the emojis to text for easy cleaning

We noticed that the emojis have html parsers attached to it. Hence, we will convert the images of the emojis to text first, to remove the html parsers to the emojis, while retaining the emoji's text. We will convert it back to emoji afterwards.

In [9]:
df['postComment'] = df['postComment'].apply(emoji.demojize)

In [10]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' BTF+1', ' OSS', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「TMP+1」below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11</div><div dir="auto" style="text-align: start;">Comment「BCP+1」below to join the Sale.', '2 X BABY LEATHER JACKET 450G+-/ PKT @$15.90</div><div dir="auto" style="text-align: start;">Comment「BLJ+1」below to join the Sale', '2 X BABY THREADFIN 300-400G / PCS @ $12.90</div><div dir="auto" style="text-align: start;">Comment「BBT+1」below to join the Sale', '2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11</div><div dir="auto" style="text-align: start;">Comment「BWP+1」below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90</div><div dir="auto" style="text-align: start;">Comment「BST+1」below to join the Sale', '2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90</div><div dir="auto" style="text-align: start;">Comment「BTT399+1」below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90</div><div dir="a

### Clean the comments using Regex

From the above unique values of the comments, there are several cleaning issues that needs to be addressed.

    1. Removal of links or URLs. 
        - URLs are present when the Facebook users manually type a link in the comment. 
        - Additionally, when users are tagged, their Facebook profile URL is printed as a result.
        - Similarly, there is a unique URL linked to each emoji as well. 
        
    2. Removal of HTML special entities
        - Examples of HTML special entities are '&amp' and '&gt'. 
        
    3. Removal of other HTML special terms
        - Examples of other HTML special entities are whitespace HTML special entities like '#x200B' and '#xa0'.
        
    4. Removal of HTML Parsers to the Emojis
        - Previously, the emojis have been demojized to text already. However, the HTML parsers to the mojis remain. Hence, they are required to be removed as well.
        
    5. Removal of HTML Parcers to Whitespaces
        - When a next line is entered in the same comment, there will be a HTML parser to the this whitespace. Hence, this are to be removed as well.
        
    6. Removal of HTML Parsers to tagged names
        - When users are tagged in the comments, in addition to their Facebook profile URL, there will be HTML parsers to the tagged names as well. Hence, this are to be removed as well.
        
    7. Removsl of other HTML Parsers   

In [11]:
def clean(row):
    
    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*\.png', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove HTML special entities (e.g.. &amp, &gt;)
    row['postComment'] = re.sub(
        pattern=r'\&\w*;',
        repl='',
        string=row['postComment'])

    # Remove emoji html parsers
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'<img\salt=\"(\:\w*)(\-?)(\w*\:)\"\s[a-z]{6}\=\"\d\d\"\s[a-z]{14}\=.{26}\s[a-z]{3}\=\"\"\s[a-z]{5}\=\"\d\d\"\/>(<\/span>)?',
        repl=r'\1\2\3', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove whitespaces
    row['postComment'] = re.sub(
        pattern=r'<\/div><div\sdir\=\"auto\"\sstyle\=\"text\-align\:\sstart\;\">',
        repl=' ',
        string=row['postComment'],
        flags=re.M)
    
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern = r'<span\sclass\=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\">',
        repl=r' ',
        string=row['postComment'],
        flags=re.M)

    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern = r'<div\sclass\=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\".*<\/div>',
        repl=r' ',
        string=row['postComment'],
        flags=re.M)
    
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern=r'<\/span>', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)
 
    # Remove consecutive non-ASCII characters
    # This will remove the chinese comments
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'[^\x00-\x7F]+', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)

    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*tabindex\=\"\d\"', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<a class="oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 nc684nl6 p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl oo9gr5id gpro0wi8 lrazzd5p" href=">',
        repl='',
        string=row['postComment'],
        flags=re.M)    
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<span\sclass\=\"([a-z0-9]{8})\">',
        repl='',
        string=row['postComment'],
        flags=re.M)
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<span.*<\/a>',
        repl='',
        string=row['postComment'],
        flags=re.M)
    
    return row
    

In [12]:
df = df.apply(clean, axis=1)

In [13]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' ', '     :thinking_face: ', '   :thinking_face: ', ' :face_with_tears_of_joy: pet ohh', ' :grinning_squinting_face: ', ' :hot_face:mah ', ' :pouting_face:give u tomatoes face ', ' BTF+1', ' OSS', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.', '2 X BABY LEATHER JACKET 450G+-/ PKT @$15.90 Comment BLJ+1 below to join the Sale', '2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale', '2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11 Comment BWP+1 below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale', '2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90 Comment BTT399+1 below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X CHINESE POMFRET 250-300G/ PCS @$14.00 Comment CP14+1 below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.

**Convert encoded emoji text back to emojis**

In [14]:
df['postComment'] = df['postComment'].apply(emoji.emojize)

In [15]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' ', '     🤔 ', '   🤔 ', ' BTF+1', ' OSS', ' 😂 pet ohh', ' 😆 ', ' 😡give u tomatoes face ', ' 🥵mah ', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.', '2 X BABY LEATHER JACKET 450G+-/ PKT @$15.90 Comment BLJ+1 below to join the Sale', '2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale', '2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11 Comment BWP+1 below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale', '2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90 Comment BTT399+1 below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X CHINESE POMFRET 250-300G/ PCS @$14.00 Comment CP14+1 below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 

### Drop Empty Posts

Blank comments are dropped to ensure that unique comments are obtained.

In [16]:
#drop empty posts
df = df.loc[((df['postComment'] != ' ')),:]

In [17]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

['     🤔 ', '   🤔 ', ' BTF+1', ' OSS', ' 😂 pet ohh', ' 😆 ', ' 😡give u tomatoes face ', ' 🥵mah ', '2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.', '2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.', '2 X BABY LEATHER JACKET 450G+-/ PKT @$15.90 Comment BLJ+1 below to join the Sale', '2 X BABY THREADFIN 300-400G / PCS @ $12.90 Comment BBT+1 below to join the Sale', '2 X BABY WHITE POMFRET 450G+-/ PKT @ $11.11 Comment BWP+1 below to join the Sale', '2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale', '2 X BATANG TAIL 1.1-1.2 KG/ PCS @$39.90 Comment BTT399+1 below to join the Sale', '2 X BLACK POMFRET 400-500G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X CHINESE POMFRET 250-300G/ PCS @$14.00 Comment CP14+1 below to join the Sale', '2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 below

### Reindexing the dataframe 
**New Column to reindex the dataframe in accordance to time**

From the data, we can tell that there is an inconsistent timestamp being used in the column 'postCommentTime'. For example, there are times like '0:57' and '1:00:14' which indicates 0 hour 0 mins 57 secs, and 1 hour 0 mins 14 secs. 

Hence, a new column 'postCommentTime_final' is created to ensure that a timestamp of HH:MM:SS is being used consistently throughout. However, we note that the number of days is being included for TimeDeltaIndex.

As a result, we will read the timestamp, by excluding the number of days.  

In [18]:
#TimedeltaIndex
#https://stackoverflow.com/questions/54877467/pandas-convert-hhmm-and-hhmmss-to-standard-hhmmss-in-python
# for example, time of '0:57' will then be 00:00:57; 0 hours 0 mins 57 secs
df['postCommentTime_final'] = pd.to_timedelta(np.where(df['postCommentTime'].str.count(':') == 1, '00:' + df['postCommentTime'], df['postCommentTime']))

In [19]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Good morning Miko Team,Yvonne Leong,0:33,0 days 00:00:33
1,Oss,Eve Kang,0:43,0 days 00:00:43
2,Good morning everyone,Yvonne Leong,0:49,0 days 00:00:49
3,Oss,Eve Kang,0:58,0 days 00:00:58
4,Mrg Miko OSS,Samuel Goh,0:58,0 days 00:00:58


In [20]:
df['postCommentTime_final'] = df['postCommentTime_final'].astype(str).map(lambda x: x[7:])

In [21]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Good morning Miko Team,Yvonne Leong,0:33,00:00:33
1,Oss,Eve Kang,0:43,00:00:43
2,Good morning everyone,Yvonne Leong,0:49,00:00:49
3,Oss,Eve Kang,0:58,00:00:58
4,Mrg Miko OSS,Samuel Goh,0:58,00:00:58
5,got truffle mushroom soup,Patsy Teo,1:00:07,01:00:07
6,2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.,OceanStar Seafood,1:00:45,01:00:45
7,tmp+1,Patsy Teo,1:01:22,01:01:22
8,airfry set what temp ?,Patsy Teo,1:01:49,01:01:49
9,COD STEAK 450G+-/ PKT @$22.90 ( PWP ) Comment CST229+1 below to join the Sale,OceanStar Seafood,1:02:20,01:02:20


In [22]:
#reindex according to postCommentTime_final
#previous natsort in data collection didnt take into account the different timestamp format
df = df.reindex(index=order_by_index(df.index, index_natsorted(df.postCommentTime_final)))

In [23]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Good morning Miko Team,Yvonne Leong,0:33,00:00:33
1,Oss,Eve Kang,0:43,00:00:43
2,Good morning everyone,Yvonne Leong,0:49,00:00:49
3,Oss,Eve Kang,0:58,00:00:58
4,Mrg Miko OSS,Samuel Goh,0:58,00:00:58
11,OSS,Tan Siew Kuan,1:03,00:01:03
12,Hello,Jacqueline Hoe,1:03,00:01:03
35,Gd morning miko team,Jacqueline Hoe,1:13,00:01:13
41,OSS,Yvonne Leong,1:15,00:01:15
43,Good morning Miko! Got milkfish today?,Geraldine Chew,1:16,00:01:16


In [24]:
#reset the index for the dataframe
#https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-dataframe

df = df.reset_index(drop=True)

In [25]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Good morning Miko Team,Yvonne Leong,0:33,00:00:33
1,Oss,Eve Kang,0:43,00:00:43
2,Good morning everyone,Yvonne Leong,0:49,00:00:49
3,Oss,Eve Kang,0:58,00:00:58
4,Mrg Miko OSS,Samuel Goh,0:58,00:00:58
5,OSS,Tan Siew Kuan,1:03,00:01:03
6,Hello,Jacqueline Hoe,1:03,00:01:03
7,Gd morning miko team,Jacqueline Hoe,1:13,00:01:13
8,OSS,Yvonne Leong,1:15,00:01:15
9,Good morning Miko! Got milkfish today?,Geraldine Chew,1:16,00:01:16


**Obtain the length of the video**

Assuming that the last comment of the video is the total length of the video, we will input the length of the video from the comments dataframe into the video attributes dataframe.

In [26]:
#retrieve last comment to obtain the length of the video
df['postCommentTime_final'].iloc[-1]

'01:18:50'

In [27]:
#https://stackoverflow.com/questions/6402812/how-to-convert-an-hmmss-time-string-to-seconds-in-python
def get_sec(time_str):
    """Get Seconds from time."""
    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

In [28]:
#retrieve last comment to obtain the length of the video in seconds
va['videoLength']= get_sec(df['postCommentTime_final'].iloc[-1])

In [29]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength
0,OCEANSTARLIVE/videos/440316487506965,20,523,4730


## Feature Engineering

### Comments made by the Seller

**New Column to identify the number of comments made by the seller in the video**

From the total sum of comments made by the seller, we will input it into the video attributes dataframe.

In [30]:
(df['postCommentAuthor']=='OceanStar Seafood').sum()

120

In [31]:
va['numSellerComments'] = (df['postCommentAuthor']=='OceanStar Seafood').sum()

In [32]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments
0,OCEANSTARLIVE/videos/440316487506965,20,523,4730,120


**New Column to identify if the comment is made by the Seller or not**

In [33]:
#to delete column 'notSeller' in due course
df['notSeller'] = df['postCommentAuthor'].map(lambda x:1 if x !='OceanStar Seafood' else 0)

In [34]:
#create a new column to show if the comment is made by the seller or not
df['isSeller'] = df['postCommentAuthor'].map(lambda x:1 if x =='OceanStar Seafood' else 0)

In [35]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller
0,Good morning Miko Team,Yvonne Leong,0:33,00:00:33,1,0
1,Oss,Eve Kang,0:43,00:00:43,1,0
2,Good morning everyone,Yvonne Leong,0:49,00:00:49,1,0
3,Oss,Eve Kang,0:58,00:00:58,1,0
4,Mrg Miko OSS,Samuel Goh,0:58,00:00:58,1,0


In [36]:
df['isSeller'].value_counts()

0    165
1    120
Name: isSeller, dtype: int64

In [37]:
#show all the seller's comments
df.loc[df['isSeller'] == 1]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller
28,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,2:45,00:02:45,0,1
32,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,3:10,00:03:10,0,1
34,RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,3:18,00:03:18,0,1
36,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale ',OceanStar Seafood,3:40,00:03:40,0,1
37,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,3:42,00:03:42,0,1
38,PONTIAN WHITE THREADFIN 1.8-2.0 KG/ PCS @$29.90 Comment PWT299+1 below to join the Sale,OceanStar Seafood,4:05,00:04:05,0,1
39,MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,4:23,00:04:23,0,1
40,2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale,OceanStar Seafood,4:33,00:04:33,0,1
41,WILD GROUPER WHOLE 2.2-2.6 KG/ PCS @$39.90 Comment WG399+1 below to join the Sale,OceanStar Seafood,4:44,00:04:44,0,1
42,WILD BARRAMUNDI WHOLE 3.5-3.9 KG/ PCS @ $49.90 Comment WB+1 below to join the Sale,OceanStar Seafood,4:55,00:04:55,0,1


### Length of comments

**New Column to identify the length of each comment**

From the comments, we take the length of each comment as the total number of words each comment has.

In [38]:
#length of each comment
#https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe
df['postCommentLength'] = df['postComment'].str.split().str.len()

In [39]:
df.head(10)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength
0,Good morning Miko Team,Yvonne Leong,0:33,00:00:33,1,0,4
1,Oss,Eve Kang,0:43,00:00:43,1,0,1
2,Good morning everyone,Yvonne Leong,0:49,00:00:49,1,0,3
3,Oss,Eve Kang,0:58,00:00:58,1,0,1
4,Mrg Miko OSS,Samuel Goh,0:58,00:00:58,1,0,3
5,OSS,Tan Siew Kuan,1:03,00:01:03,1,0,1
6,Hello,Jacqueline Hoe,1:03,00:01:03,1,0,1
7,Gd morning miko team,Jacqueline Hoe,1:13,00:01:13,1,0,4
8,OSS,Yvonne Leong,1:15,00:01:15,1,0,1
9,Good morning Miko! Got milkfish today?,Geraldine Chew,1:16,00:01:16,1,0,6


**New Column to identify the total number of comments in the video**

From the total number of comments in the video, we will input it into the video attributes dataframe.

In [40]:
#total number of comments
df['postCommentLength'].sum()

2351

In [41]:
va['numComments'] = df['postCommentLength'].sum()

In [42]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments
0,OCEANSTARLIVE/videos/440316487506965,20,523,4730,120,2351


### LNS

LNS is an acronym that stands for 'like and share'. It is a form of customer engagement as it indicates by the customers to the sellers that they have liked and shared the video on their Facebook wall. 

**New Column to identify if Customers are engaging in liking and sharing the video**

In [43]:
#if the customer has commented 'lns' or 'ls' which stands for 'like & shared' & 'like shared' respectively
def lns(comment):
    if re.search(r'(l)(n?)(s)', comment, re.IGNORECASE):
        return int(1)
    else:
        return int(0)

In [44]:
df['lns'] = df['postComment'].map(lambda x:lns(x))

In [45]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns
0,Good morning Miko Team,Yvonne Leong,0:33,00:00:33,1,0,4,0
1,Oss,Eve Kang,0:43,00:00:43,1,0,1,0
2,Good morning everyone,Yvonne Leong,0:49,00:00:49,1,0,3,0
3,Oss,Eve Kang,0:58,00:00:58,1,0,1,0
4,Mrg Miko OSS,Samuel Goh,0:58,00:00:58,1,0,3,0


**New Column to identify if the number of Customers who explicitly inform the sellers that they are engaging in liking and sharing the video**

In [46]:
#range of customer's engagement for LNS
df['lns'].value_counts()

0    281
1      4
Name: lns, dtype: int64

In [47]:
(df['lns']==1).sum()

4

In [48]:
va['lnsQuantity'] = (df['lns']==1).sum()

In [49]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity
0,OCEANSTARLIVE/videos/440316487506965,20,523,4730,120,2351,4


## Sales Quantity

**New Columns to identify the quantity of sales made**

From the comments, using regex, we first see an overview of the comments that are related to the sale of the products. 

In [50]:
#products offered by the seller
df[df['postComment'].str.contains('(\w*\+\d)', regex=True)]

  df[df['postComment'].str.contains('(\w*\+\d)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns
28,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,2:45,00:02:45,0,1,15,0
32,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,3:10,00:03:10,0,1,13,0
34,RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,3:18,00:03:18,0,1,14,0
36,BALAI THREADFIN WHOL 2.4-2.8KG/ PCS @$78.00 Comment BTW78+1 below to join the Sale ',OceanStar Seafood,3:40,00:03:40,0,1,14,0
37,BALAI THREADFIN WHOLE 5.5-6.0 KG/ PCS @$168.00 Comment BTW168+1 below to join the Sale,OceanStar Seafood,3:42,00:03:42,0,1,14,0
38,PONTIAN WHITE THREADFIN 1.8-2.0 KG/ PCS @$29.90 Comment PWT299+1 below to join the Sale,OceanStar Seafood,4:05,00:04:05,0,1,14,0
39,MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,4:23,00:04:23,0,1,12,0
40,2 X COD TAIL PORTION 1.0-1.2 KG/ PCS ( ST) @ $111.11 Comment PCT+1 below to join the Sale,OceanStar Seafood,4:33,00:04:33,0,1,19,0
41,WILD GROUPER WHOLE 2.2-2.6 KG/ PCS @$39.90 Comment WG399+1 below to join the Sale,OceanStar Seafood,4:44,00:04:44,0,1,14,0
42,WILD BARRAMUNDI WHOLE 3.5-3.9 KG/ PCS @ $49.90 Comment WB+1 below to join the Sale,OceanStar Seafood,4:55,00:04:55,0,1,15,0


In [51]:
def sale(comment):
    if re.search(r'(\w*\+)(\d)', comment):
        return int(re.search(r'\w*\+\d', comment).group(0)[-1])
    else:
        return int(0)

In [52]:
df['sales'] = df['postComment'].apply(lambda x:sale(x))

In [53]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales
0,Good morning Miko Team,Yvonne Leong,0:33,00:00:33,1,0,4,0,0
1,Oss,Eve Kang,0:43,00:00:43,1,0,1,0,0
2,Good morning everyone,Yvonne Leong,0:49,00:00:49,1,0,3,0,0
3,Oss,Eve Kang,0:58,00:00:58,1,0,1,0,0
4,Mrg Miko OSS,Samuel Goh,0:58,00:00:58,1,0,3,0,0


In [54]:
#sales made by the customers; exclude the seller's comments on the codes for the sales
#https://stackoverflow.com/questions/19914937/applying-function-with-multiple-arguments-to-create-a-new-pandas-column
df['salesQuantity'] = np.multiply(df['notSeller'], df['sales'])

In [55]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity
0,Good morning Miko Team,Yvonne Leong,0:33,00:00:33,1,0,4,0,0,0
1,Oss,Eve Kang,0:43,00:00:43,1,0,1,0,0,0
2,Good morning everyone,Yvonne Leong,0:49,00:00:49,1,0,3,0,0,0
3,Oss,Eve Kang,0:58,00:00:58,1,0,1,0,0,0
4,Mrg Miko OSS,Samuel Goh,0:58,00:00:58,1,0,3,0,0,0


In [56]:
#range of sales quantity
df['salesQuantity'].value_counts()

0    231
1     52
2      2
Name: salesQuantity, dtype: int64

In [57]:
#total number of orders made
df['salesQuantity'].sum()

56

In [58]:
va['salesQuantity'] = df['salesQuantity'].sum()

In [59]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity
0,OCEANSTARLIVE/videos/440316487506965,20,523,4730,120,2351,4,56


## Products 

The seller will comment and post the unique product codes for each product. Thereafter, customers who are keen on purchasing the products will explicitly comment out the specific & unique product codes, in addition to the quanityt of each product that they wish to purchase.

**New Columns to identify the products purchased by the Customers**

Regex is used to identify the products being offered and the products being purchased as well.

In [60]:
#function to identify the code of the product bought
def sale2(comment):
    if re.search(r'(\w*\+)(\d)', comment):
        return str(re.search(r'\w*\+\d', comment).group(0)[:-2])
    else:
        return int(0)

In [61]:
#identifies all comments that have the codes of the products, including the seller's comments.
#this column will be dropped afterwards.
df['product'] = df['postComment'].apply(lambda x:sale2(x))

In [62]:
#products bought by Customers; exclude the seller's comments on the product details 
df['productBought'] = np.multiply(df['notSeller'], df['product'])

In [63]:
df['productBought'].unique()

array([0, '', 'WB', 'Mc299', 'MP', 'WGF', 'WGH', 'SLD', 'WAK', 'WST299',
       'SP', 'Sp', 'Lamb', 'CTM', 'STF', '4D', 'CCL', 'BCP', 'MAC99',
       'BSS', 'WBF', 'WAKM', 'BST', 'AMB', 'BP', 'BTT399', 'BTF', 'FG',
       'KN119', 'tmp', 'CST229', 'MJ'], dtype=object)

In [64]:
#https://stackoverflow.com/questions/13445241/replacing-blank-values-white-space-with-nan-in-pandas
df['productBought'] = df['productBought'].replace(r'^\s*$', int(0), regex=True)

In [65]:
df['productBought'].unique()

array([0, 'WB', 'Mc299', 'MP', 'WGF', 'WGH', 'SLD', 'WAK', 'WST299', 'SP',
       'Sp', 'Lamb', 'CTM', 'STF', '4D', 'CCL', 'BCP', 'MAC99', 'BSS',
       'WBF', 'WAKM', 'BST', 'AMB', 'BP', 'BTT399', 'BTF', 'FG', 'KN119',
       'tmp', 'CST229', 'MJ'], dtype=object)

In [66]:
#change the produce codes to be uppercase for consistency
#https://stackoverflow.com/questions/39512002/convert-whole-dataframe-from-lower-case-to-upper-case-with-pandas
df['productBought'] = df['productBought'].astype(str).str.upper()

In [67]:
df['productBought'].unique()

array(['0', 'WB', 'MC299', 'MP', 'WGF', 'WGH', 'SLD', 'WAK', 'WST299',
       'SP', 'LAMB', 'CTM', 'STF', '4D', 'CCL', 'BCP', 'MAC99', 'BSS',
       'WBF', 'WAKM', 'BST', 'AMB', 'BP', 'BTT399', 'BTF', 'FG', 'KN119',
       'TMP', 'CST229', 'MJ'], dtype=object)

In [68]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought
0,Good morning Miko Team,Yvonne Leong,0:33,00:00:33,1,0,4,0,0,0,0,0
1,Oss,Eve Kang,0:43,00:00:43,1,0,1,0,0,0,0,0
2,Good morning everyone,Yvonne Leong,0:49,00:00:49,1,0,3,0,0,0,0,0
3,Oss,Eve Kang,0:58,00:00:58,1,0,1,0,0,0,0,0
4,Mrg Miko OSS,Samuel Goh,0:58,00:00:58,1,0,3,0,0,0,0,0
5,OSS,Tan Siew Kuan,1:03,00:01:03,1,0,1,0,0,0,0,0
6,Hello,Jacqueline Hoe,1:03,00:01:03,1,0,1,0,0,0,0,0
7,Gd morning miko team,Jacqueline Hoe,1:13,00:01:13,1,0,4,0,0,0,0,0
8,OSS,Yvonne Leong,1:15,00:01:15,1,0,1,0,0,0,0,0
9,Good morning Miko! Got milkfish today?,Geraldine Chew,1:16,00:01:16,1,0,6,0,0,0,0,0


### Price of Products

**New Column to identify the price of the products**

This new column is created to identify the regex of the unique product codes and their corresponding prices, as adviced by the seller.

In [69]:
def price(comment):
    if re.search(r'(\@)(\$)( ?)(.*)', comment):
        return str(re.search(r'(\$)( ?)(.*)', comment).group(0)[:-23])
    else:
        return int(0)

In [70]:
df['productPrice'] = df['postComment'].apply(lambda x:price(x))

In [71]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought,productPrice
0,Good morning Miko Team,Yvonne Leong,0:33,00:00:33,1,0,4,0,0,0,0,0,0
1,Oss,Eve Kang,0:43,00:00:43,1,0,1,0,0,0,0,0,0
2,Good morning everyone,Yvonne Leong,0:49,00:00:49,1,0,3,0,0,0,0,0,0
3,Oss,Eve Kang,0:58,00:00:58,1,0,1,0,0,0,0,0,0
4,Mrg Miko OSS,Samuel Goh,0:58,00:00:58,1,0,3,0,0,0,0,0,0
5,OSS,Tan Siew Kuan,1:03,00:01:03,1,0,1,0,0,0,0,0,0
6,Hello,Jacqueline Hoe,1:03,00:01:03,1,0,1,0,0,0,0,0,0
7,Gd morning miko team,Jacqueline Hoe,1:13,00:01:13,1,0,4,0,0,0,0,0,0
8,OSS,Yvonne Leong,1:15,00:01:15,1,0,1,0,0,0,0,0,0
9,Good morning Miko! Got milkfish today?,Geraldine Chew,1:16,00:01:16,1,0,6,0,0,0,0,0,0


We noticed that each comment has a word 'Comment' in the middle of the extracted string of comments for the column 'productPrice'. Hence, we will remove the mentioned word.

In [72]:
df['productPrice'] = df['productPrice'].replace(value='', regex=r'Comment')

In [73]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought,productPrice
0,Good morning Miko Team,Yvonne Leong,0:33,00:00:33,1,0,4,0,0,0,0,0,0
1,Oss,Eve Kang,0:43,00:00:43,1,0,1,0,0,0,0,0,0
2,Good morning everyone,Yvonne Leong,0:49,00:00:49,1,0,3,0,0,0,0,0,0
3,Oss,Eve Kang,0:58,00:00:58,1,0,1,0,0,0,0,0,0
4,Mrg Miko OSS,Samuel Goh,0:58,00:00:58,1,0,3,0,0,0,0,0,0
5,OSS,Tan Siew Kuan,1:03,00:01:03,1,0,1,0,0,0,0,0,0
6,Hello,Jacqueline Hoe,1:03,00:01:03,1,0,1,0,0,0,0,0,0
7,Gd morning miko team,Jacqueline Hoe,1:13,00:01:13,1,0,4,0,0,0,0,0,0
8,OSS,Yvonne Leong,1:15,00:01:15,1,0,1,0,0,0,0,0,0
9,Good morning Miko! Got milkfish today?,Geraldine Chew,1:16,00:01:16,1,0,6,0,0,0,0,0,0


In [74]:
df['productPrice'].unique()

array([0, '$39.90  EMP399+1', '$24.90  RES249+1', '$78.00  BTW78+1 b',
       '$168.00  BTW168+1', '$29.90  PWT299+1', '$9.90  MAC99+1',
       '$39.90  WG399+1', '$88.00  RG88+1', '$33.33  RS33+1',
       '$99.00  CP99+1', '$88.00  CP88+1', '$77.00  CP77+1',
       '$22.00    CP22+1', '$14.00  CP14+1', '$29.90  MC299+1',
       '$19.90  WGF+1', '$12.90  WGH+1', '$39.90  BTT399+1',
       '$19.90  SLD+1 ', '$24.90  WAK+1 ', '$36.00  STF+1 ',
       '$13.90  CCL+1 ', '$13.90  SBY+1 ', '$13.90  VNL+1 ',
       '$13.90  MCA+1 ', '$13.90  CCL+1 bel', '$11.11  TMP+1 ',
       '$11.11  BCP+1 ', '$24.00  BSS+1', '$9.90  WBF+1 ',
       '$16.00 ( PWP )  WAKM+1', '$18.00  WGT+1', '$13.90  BST+1',
       '$11.11  AMB+1', '$15.90  BP+1', '$11.90  KN119+1 ',
       '$12.90 ( PWP )  BTF+1 ', '$9.90  FG+1', '$10.00  QM+1 ',
       '$ 18.80  SCP+1 ', '$15.90  BLJ+1', '$22.90 ( PWP )  CST229+1',
       '$9.90  WBC+1', '$21.90  SQ+1', '$ 5.90  KBP+1', '$59.90  FCF+1 ',
       '$11.11  WB11+1', '$30.00 

Excluding the comments where there was no product codes as adviced by the seller, we find the number of unique products offered by the seller.

In [75]:
#number of unique products offered by the seller
int(df['productPrice'].nunique()) - int(1)

48

In [76]:
#total number of products offered
va['numProducts'] = int(df['productPrice'].nunique()) - int(1)

In [77]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts
0,OCEANSTARLIVE/videos/440316487506965,20,523,4730,120,2351,4,56,48


**Drop irrelevant columns**

The following column was dropped for the following reasons:

2. 'notSeller'
- This column 'notSeller' was solely created to calculate the quantity of sales made, and the products purchased by the customer. Hence, we are able to delete it after the quantity of sales have been calculated and the identification of the products purchased by the customers have been identified.
- Notwithstanding the above, a new column 'isSeller' has been feature engineered out as well, which will tell us the same results on whether the comment is posted & written by a seller or not. 

3. 'sales'
- This column identifies the quantity of products to the unique & specific product codes. However, it includes the product codes posted by the sellers as well. Hence, this column was solely created to be multiplied against the column 'notSeller' to calculate the true quantity of sales made by customers only. Hence, we are able to delete it after the quantity of sales have been calculated.

4. 'product'
- This column 'product' was solely created to identify the products purchased by the customers. Hence, we are able to delete it after the products purchased by the customers have been identified - especially since not all products offered by the seller is being bought by the customers.

In [78]:
#drop unwanted columns
df.drop(['postCommentTime', 'notSeller', 'sales', 'product'], axis=1, inplace=True)

In [79]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought,productPrice
0,Good morning Miko Team,Yvonne Leong,00:00:33,0,4,0,0,0,0
1,Oss,Eve Kang,00:00:43,0,1,0,0,0,0
2,Good morning everyone,Yvonne Leong,00:00:49,0,3,0,0,0,0
3,Oss,Eve Kang,00:00:58,0,1,0,0,0,0
4,Mrg Miko OSS,Samuel Goh,00:00:58,0,3,0,0,0,0


### Revenue from the sale of the products

**Dummify the products bought to find the revenue**

In [80]:
#getdummies the products bought
df = pd.get_dummies(df, columns = ['productBought'], drop_first = True)

In [81]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_SP,productBought_STF,productBought_TMP,productBought_WAK,productBought_WAKM,productBought_WB,productBought_WBF,productBought_WGF,productBought_WGH,productBought_WST299
0,Good morning Miko Team,Yvonne Leong,00:00:33,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Oss,Eve Kang,00:00:43,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Good morning everyone,Yvonne Leong,00:00:49,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Oss,Eve Kang,00:00:58,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Mrg Miko OSS,Samuel Goh,00:00:58,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
# iterating the columns
for col in df.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_4D
productBought_AMB
productBought_BCP
productBought_BP
productBought_BSS
productBought_BST
productBought_BTF
productBought_BTT399
productBought_CCL
productBought_CST229
productBought_CTM
productBought_FG
productBought_KN119
productBought_LAMB
productBought_MAC99
productBought_MC299
productBought_MJ
productBought_MP
productBought_SLD
productBought_SP
productBought_STF
productBought_TMP
productBought_WAK
productBought_WAKM
productBought_WB
productBought_WBF
productBought_WGF
productBought_WGH
productBought_WST299


After the column 'productBought' has been dummified, the cells will return a '1' if that particular item is bought, and a '0' if it is not. Hence, we will replace all the '1' with the price of the product.

Then, we will create a new revenue column which is a multiplication of the column 'salesQuantity' against the price of the product (i.e. the dummified 'productBought' columnns).

Product 4D

In [83]:
df[df['postComment'].str.contains('4D', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_SP,productBought_STF,productBought_TMP,productBought_WAK,productBought_WAKM,productBought_WB,productBought_WBF,productBought_WGF,productBought_WGH,productBought_WST299
139,Next time i dream of u dont eat crepe leh. give me 4D no.,Stella Lim,00:29:47,0,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
142,Haha 4D+1,Jack Daniel,00:30:12,0,2,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


Product AMB

In [84]:
df[df['postComment'].str.contains('AMB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_SP,productBought_STF,productBought_TMP,productBought_WAK,productBought_WAKM,productBought_WB,productBought_WBF,productBought_WGF,productBought_WGH,productBought_WST299
201,AUS MELTIQUE BEEF STEAK 200G+-/ PCS @$11.11 Comment AMB+1 below to join the Sale,OceanStar Seafood,00:42:03,1,14,0,0,$11.11 AMB+1,0,0,...,0,0,0,0,0,0,0,0,0,0
202,AMB+1,Helmy Tan,00:42:42,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
203,AUS MELTIQUE BEEF STEAK 200G+-/ PCS @$11.11 Comment AMB+1 below to join the Sale,OceanStar Seafood,00:42:55,1,14,0,0,$11.11 AMB+1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
df['productBought_AMB'] = df['productBought_AMB'].map(lambda x:float(11.11) if x == int(1) else 0)

In [86]:
df['revenue_AMB'] = np.multiply(df['productBought_AMB'], df['salesQuantity'])

In [87]:
revenue_AMB = "The total revenue from the sale of the product {} is ${}". format ("AMB", format(df['revenue_AMB'].sum(), '.2f'))
print(revenue_AMB)

The total revenue from the sale of the product AMB is $11.11


Product BCP

In [88]:
df[df['postComment'].str.contains('BCP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_STF,productBought_TMP,productBought_WAK,productBought_WAKM,productBought_WB,productBought_WBF,productBought_WGF,productBought_WGH,productBought_WST299,revenue_AMB
158,2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.,OceanStar Seafood,00:32:48,1,15,0,0,$11.11 BCP+1,0,0.0,...,0,0,0,0,0,0,0,0,0,0.0
159,BCP+1,Chen Ml,00:33:04,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0.0
161,2 X BABY CHINESE POMFRET 450G+-/ PKT @$11.11 Comment BCP+1 below to join the Sale.,OceanStar Seafood,00:33:42,1,15,0,0,$11.11 BCP+1,0,0.0,...,0,0,0,0,0,0,0,0,0,0.0


In [89]:
df['productBought_BCP'] = df['productBought_BCP'].map(lambda x:float(11.11) if x == int(1) else 0)

In [90]:
df['revenue_BCP'] = np.multiply(df['productBought_BCP'], df['salesQuantity'])

In [91]:
revenue_BCP = "The total revenue from the sale of the product {} is ${}". format ("BCP", format(df['revenue_BCP'].sum(), '.2f'))
print(revenue_BCP)


The total revenue from the sale of the product BCP is $11.11


Product BP

In [92]:
df[df['postComment'].str.contains('BP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_TMP,productBought_WAK,productBought_WAKM,productBought_WB,productBought_WBF,productBought_WGF,productBought_WGH,productBought_WST299,revenue_AMB,revenue_BCP
206,2 X BLACK POMFRET 400-500G/ PCS @$15.90 Comment BP+1 below to join the Sale,OceanStar Seafood,00:44:57,1,14,0,0,$15.90 BP+1,0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
208,BP+1,Helmy Tan,00:46:22,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
236,BP+1,Patsy Teo,00:58:39,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
260,LAST 6 PKT!!!! KONG BAK PAO 4 PCS / PKT @$ 5.90 Comment KBP+1 below to join the Sale,OceanStar Seafood,01:10:09,1,19,0,0,$ 5.90 KBP+1,0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [93]:
df['productBought_BP'] = df['productBought_BP'].map(lambda x:float(15.90) if x == int(1) else 0)

In [94]:
df['revenue_BP'] = np.multiply(df['productBought_BP'], df['salesQuantity'])

In [95]:
revenue_BP = "The total revenue from the sale of the product {} is ${}". format ("BP", format(df['revenue_BP'].sum(), '.2f'))
print(revenue_BP)


The total revenue from the sale of the product BP is $31.80


Product BSS

In [96]:
df[df['postComment'].str.contains('BSS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_WAK,productBought_WAKM,productBought_WB,productBought_WBF,productBought_WGF,productBought_WGH,productBought_WST299,revenue_AMB,revenue_BCP,revenue_BP
165,3 X BATANG SOUP SLICED 300G/ PKT @$24.00 Comment BSS+1 below to join the Sale,OceanStar Seafood,00:34:31,1,15,0,0,$24.00 BSS+1,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
167,BSS+1,Huay Geok Ang,00:35:10,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
168,3 X BATANG SOUP SLICED 300G/ PKT @$24.00 Comment BSS+1 below to join the Sale,OceanStar Seafood,00:35:17,1,15,0,0,$24.00 BSS+1,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
169,BSS+2,Janice Lye,00:35:18,0,1,0,2,0,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
171,BSS+1,Helmy Tan,00:35:28,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
173,BSS+1,Chen Ml,00:35:37,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
249,MUST GRAB!!! 3 X BATANG SOUP SLICED 300G/ PKT @$24.00 Comment BSS+1 below to join the Sale,OceanStar Seafood,01:05:48,1,17,0,0,$24.00 BSS+1,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [97]:
df['productBought_BSS'] = df['productBought_BSS'].map(lambda x:float(24.00) if x == int(1) else 0)

In [98]:
df['revenue_BSS'] = np.multiply(df['productBought_BSS'], df['salesQuantity'])

In [99]:
revenue_BSS = "The total revenue from the sale of the product {} is ${}". format ("BSS", format(df['revenue_BSS'].sum(), '.2f'))
print(revenue_BSS)


The total revenue from the sale of the product BSS is $120.00


Product BST

In [100]:
df[df['postComment'].str.contains('BST', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_WAKM,productBought_WB,productBought_WBF,productBought_WGF,productBought_WGH,productBought_WST299,revenue_AMB,revenue_BCP,revenue_BP,revenue_BSS
190,2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale,OceanStar Seafood,00:39:50,1,14,0,0,$13.90 BST+1,0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
191,BST+1,Shimret Kaur,00:40:26,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
195,BST+1,Helmy Tan,00:40:53,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
200,2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale,OceanStar Seafood,00:41:29,1,14,0,0,$13.90 BST+1,0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [101]:
df['productBought_BST'] = df['productBought_BST'].map(lambda x:float(13.90) if x == int(1) else 0)

In [102]:
df['revenue_BST'] = np.multiply(df['productBought_BST'], df['salesQuantity'])

In [103]:
revenue_BST = "The total revenue from the sale of the product {} is ${}". format ("BST", format(df['revenue_BST'].sum(), '.2f'))
print(revenue_BST)


The total revenue from the sale of the product BST is $27.80


Product BTF

In [104]:
df[df['postComment'].str.contains('BTF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_WB,productBought_WBF,productBought_WGF,productBought_WGH,productBought_WST299,revenue_AMB,revenue_BCP,revenue_BP,revenue_BSS,revenue_BST
214,BALAI THREADFIN FILLET 300-350G/PKT @$12.90 ( PWP ) Comment BTF+1 below to join the Sale.,OceanStar Seafood,00:48:54,1,15,0,0,$12.90 ( PWP ) BTF+1,0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
217,BTF+1,Helmy Tan,00:49:37,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
218,BTF+1,June Wang,00:49:52,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
219,BALAI THREADFIN FILLET 300-350G/PKT @$12.90 ( PWP ) Comment BTF+1 below to join the Sale.,OceanStar Seafood,00:51:01,1,15,0,0,$12.90 ( PWP ) BTF+1,0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
220,BTF+1,Huay Geok Ang,00:51:14,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
267,BALAI THREADFIN FILLET 300-350G/PKT @$12.90 ( PWP ) Comment BTF+1 below to join the Sale.,OceanStar Seafood,01:13:40,1,15,0,0,$12.90 ( PWP ) BTF+1,0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [105]:
df['productBought_BTF'] = df['productBought_BTF'].map(lambda x:float(12.90) if x == int(1) else 0)

In [106]:
df['revenue_BTF'] = np.multiply(df['productBought_BTF'], df['salesQuantity'])

In [107]:
revenue_BTF = "The total revenue from the sale of the product {} is ${}". format ("BTF", format(df['revenue_BTF'].sum(), '.2f'))
print(revenue_BTF)


The total revenue from the sale of the product BTF is $38.70


Product BT399

In [108]:
df[df['postComment'].str.contains('BT399', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_WBF,productBought_WGF,productBought_WGH,productBought_WST299,revenue_AMB,revenue_BCP,revenue_BP,revenue_BSS,revenue_BST,revenue_BTF


Product CCL

In [109]:
df[df['postComment'].str.contains('CCL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_WBF,productBought_WGF,productBought_WGH,productBought_WST299,revenue_AMB,revenue_BCP,revenue_BP,revenue_BSS,revenue_BST,revenue_BTF
143,CHOCO JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment CCL+1 below to join the Sale.,OceanStar Seafood,00:30:23,1,16,0,0,$13.90 CCL+1,0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
148,CCL+1,Helmy Tan,00:31:10,0,1,0,1,0,0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
149,CREPE SERIES!!! CHOCO JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment CCL+1 below to join the Sale.,OceanStar Seafood,00:31:17,1,18,0,0,$13.90 CCL+1 bel,0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
150,CREPE SERIES!!! CHOCO JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment CCL+1 below to join the Sale.,OceanStar Seafood,00:31:17,1,18,0,0,$13.90 CCL+1,0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
152,CHOCO JAPAN HOKKAIDO CREPE CAKE 4 PCS/ PKT @$13.90 Comment CCL+1 below to join the Sale.,OceanStar Seafood,00:31:29,1,16,0,0,$13.90 CCL+1,0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
df['productBought_CCL'] = df['productBought_CCL'].map(lambda x:float(13.90) if x == int(1) else 0)

In [111]:
df['revenue_CCL'] = np.multiply(df['productBought_CCL'], df['salesQuantity'])

In [112]:
revenue_CCL = "The total revenue from the sale of the product {} is ${}". format ("CCL", format(df['revenue_CCL'].sum(), '.2f'))
print(revenue_CCL)


The total revenue from the sale of the product CCL is $13.90


Product CST299

In [113]:
df[df['postComment'].str.contains('CST299', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_WGF,productBought_WGH,productBought_WST299,revenue_AMB,revenue_BCP,revenue_BP,revenue_BSS,revenue_BST,revenue_BTF,revenue_CCL


Product CTM

In [114]:
df[df['postComment'].str.contains('CTM', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_WGF,productBought_WGH,productBought_WST299,revenue_AMB,revenue_BCP,revenue_BP,revenue_BSS,revenue_BST,revenue_BTF,revenue_CCL
132,3 X IQF CHICKEN CUBE 300G /PKT@ $9.90 Comment CTM+1 below to join the Sale,OceanStar Seafood,00:28:26,1,15,0,0,0,0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134,CTM+1,Chen Ml,00:28:44,0,1,0,1,0,0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
141,3 X IQF CHICKEN CUBE 300G /PKT@ $9.90 Comment CTM+1 below to join the Sale,OceanStar Seafood,00:30:01,1,15,0,0,0,0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
145,CTM+1,Helmy Tan,00:30:35,0,1,0,1,0,0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
df['productBought_CTM'] = df['productBought_CTM'].map(lambda x:float(9.90) if x == int(1) else 0)

In [116]:
df['revenue_CTM'] = np.multiply(df['productBought_CTM'], df['salesQuantity'])

In [117]:
revenue_CTM = "The total revenue from the sale of the product {} is ${}". format ("CTM", format(df['revenue_CTM'].sum(), '.2f'))
print(revenue_CTM)


The total revenue from the sale of the product CTM is $19.80


Product FG

In [118]:
df[df['postComment'].str.contains('FG', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_WGH,productBought_WST299,revenue_AMB,revenue_BCP,revenue_BP,revenue_BSS,revenue_BST,revenue_BTF,revenue_CCL,revenue_CTM
221,3 X FLOWER GROUPER 250-350G/ PCS @$9.90 Comment FG+1 below to join the Sale,OceanStar Seafood,00:51:54,1,14,0,0,$9.90 FG+1,0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
223,FG+1,Helmy Tan,00:52:44,0,1,0,1,0,0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
224,SUPER DEAL!! 3 X FLOWER GROUPER 250-350G/ PCS @$9.90 Comment FG+1 below to join the Sale,OceanStar Seafood,00:52:45,1,16,0,0,$9.90 FG+1,0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
271,3 X FLOWER GROUPER 250-350G/ PCS @$9.90 Comment FG+1 below to join the Sale,OceanStar Seafood,01:14:28,1,14,0,0,$9.90 FG+1,0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
df['productBought_FG'] = df['productBought_FG'].map(lambda x:float(9.90) if x == int(1) else 0)

In [120]:
df['revenue_FG'] = np.multiply(df['productBought_FG'], df['salesQuantity'])

In [121]:
revenue_FG = "The total revenue from the sale of the product {} is ${}". format ("FG", format(df['revenue_FG'].sum(), '.2f'))
print(revenue_FG)


The total revenue from the sale of the product FG is $9.90


Product KN119

In [122]:
df[df['postComment'].str.contains('KN119', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,productBought_WST299,revenue_AMB,revenue_BCP,revenue_BP,revenue_BSS,revenue_BST,revenue_BTF,revenue_CCL,revenue_CTM,revenue_FG
210,LAST 2 SET!!! 2 X KUNNING 500G/ PKT @$11.90 Comment KN119+1 below to join the Sale.,OceanStar Seafood,00:47:06,1,16,0,0,$11.90 KN119+1,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
225,KN119+1,Patsy Teo,00:53:12,0,1,0,1,0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
229,LAST SET!!!! 2 X KUNNING 500G/ PKT @$11.90 Comment KN119+1 below to join the Sale.,OceanStar Seafood,00:54:11,1,15,0,0,$11.90 KN119+1,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
df['productBought_KN119'] = df['productBought_KN119'].map(lambda x:float(11.90) if x == int(1) else 0)

In [124]:
df['revenue_KN119'] = np.multiply(df['productBought_KN119'], df['salesQuantity'])

In [125]:
revenue_KN119 = "The total revenue from the sale of the product {} is ${}". format ("KN119", format(df['revenue_KN119'].sum(), '.2f'))
print(revenue_KN119)


The total revenue from the sale of the product KN119 is $11.90


Product LAMB

In [126]:
df[df['postComment'].str.contains('LAMB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_AMB,revenue_BCP,revenue_BP,revenue_BSS,revenue_BST,revenue_BTF,revenue_CCL,revenue_CTM,revenue_FG,revenue_KN119


Product MAC99

In [127]:
df[df['postComment'].str.contains('MAC99', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_AMB,revenue_BCP,revenue_BP,revenue_BSS,revenue_BST,revenue_BTF,revenue_CCL,revenue_CTM,revenue_FG,revenue_KN119
39,MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,00:04:23,1,12,0,0,$9.90 MAC99+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
163,MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,00:33:59,1,12,0,0,$9.90 MAC99+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
166,MAC99+1,Helmy Tan,00:34:34,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
185,MACKEREL WHOLE 500-600G/ PCS @$9.90 Comment MAC99+1 below to join the Sale,OceanStar Seafood,00:38:22,1,12,0,0,$9.90 MAC99+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
df['productBought_MAC99'] = df['productBought_MAC99'].map(lambda x:float(9.90) if x == int(1) else 0)

In [129]:
df['revenue_MAC99'] = np.multiply(df['productBought_MAC99'], df['salesQuantity'])

In [130]:
revenue_MAC99 = "The total revenue from the sale of the product {} is ${}". format ("MAC99", format(df['revenue_MAC99'].sum(), '.2f'))
print(revenue_MAC99)


The total revenue from the sale of the product MAC99 is $9.90


Product MC299

In [131]:
df[df['postComment'].str.contains('MC299', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_BCP,revenue_BP,revenue_BSS,revenue_BST,revenue_BTF,revenue_CCL,revenue_CTM,revenue_FG,revenue_KN119,revenue_MAC99
65,MUST GRAB!!! 3 X MALE MUD CRAB 350-450G/ PCS @$29.90 Comment MC299+1 below to join the Sale,OceanStar Seafood,00:08:14,1,17,0,0,$29.90 MC299+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,MUST GRAB!!! 3 X MALE MUD CRAB 350-450G/ PCS @$29.90 Comment MC299+1 below to join the Sale,OceanStar Seafood,00:08:52,1,17,0,0,$29.90 MC299+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,SUPER DEAL!!! 3 X MALE MUD CRAB 350-450G/ PCS @$29.90 Comment MC299+1 below to join the Sale,OceanStar Seafood,00:21:11,1,17,0,0,$29.90 MC299+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
204,LAST 5 SET!!! 3 X MALE MUD CRAB 350-450G/ PCS @$29.90 Comment MC299+1 below to join the Sale,OceanStar Seafood,00:43:51,1,18,0,0,$29.90 MC299+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [132]:
df['productBought_MC299'] = df['productBought_MC299'].map(lambda x:float(29.90) if x == int(1) else 0)

In [133]:
df['revenue_MC299'] = np.multiply(df['productBought_MC299'], df['salesQuantity'])

In [134]:
revenue_MC299 = "The total revenue from the sale of the product {} is ${}". format ("MC299", format(df['revenue_MC299'].sum(), '.2f'))
print(revenue_MC299)


The total revenue from the sale of the product MC299 is $29.90


Product MJ

In [135]:
df[df['postComment'].str.contains('MJ', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_BP,revenue_BSS,revenue_BST,revenue_BTF,revenue_CCL,revenue_CTM,revenue_FG,revenue_KN119,revenue_MAC99,revenue_MC299
184,MJ +2,Sabrina Chu,00:38:20,0,2,0,2,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,2 X IQF CHICKEN MID JOINT 450G+-/ P@ $9.90 Comment MJ+1 below to join the Sale,OceanStar Seafood,01:07:04,1,16,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
265,MJ+1,Patsy Teo,01:12:49,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
df['productBought_MJ'] = df['productBought_MJ'].map(lambda x:float(9.90) if x == int(1) else 0)

In [137]:
df['revenue_MJ'] = np.multiply(df['productBought_MJ'], df['salesQuantity'])

In [138]:
revenue_MJ = "The total revenue from the sale of the product {} is ${}". format ("MJ", format(df['revenue_MJ'].sum(), '.2f'))
print(revenue_MJ)


The total revenue from the sale of the product MJ is $9.90


Product MP

In [139]:
df[df['postComment'].str.contains('MP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_BSS,revenue_BST,revenue_BTF,revenue_CCL,revenue_CTM,revenue_FG,revenue_KN119,revenue_MAC99,revenue_MC299,revenue_MJ
28,BIG FISHES!!! WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:02:45,1,15,0,0,$39.90 EMP399+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32,WILD CAUGHT EMPURAU 600-900G/ PCS @$39.90 Comment EMP399+1 below to join the Sale,OceanStar Seafood,00:03:10,1,13,0,0,$39.90 EMP399+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,RED EMPEROR SNAPPER WHOLE1.6-1.8 KG/ PCS @$24.90 Comment RES249+1 below to join the Sale,OceanStar Seafood,00:03:18,1,14,0,0,$24.90 RES249+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,2 X SARAWAK MINCED PORK 300G / P@ $6.00 Comment MP+1 below to join the Sale.,OceanStar Seafood,00:11:38,1,16,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,MUST GRAB!!! 2 X SARAWAK MINCED PORK 300G / P@ $6.00 Comment MP+1 below to join the Sale.,OceanStar Seafood,00:12:12,1,18,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,MP+1,Sabrina Chu,00:12:39,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,MP+1,Shimret Kaur,00:14:18,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,2 X SARAWAK MINCED PORK 300G / P@ $6.00 Comment MP+1 below to join the Sale.,OceanStar Seafood,00:20:44,1,16,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,BEST BUY!!! 2 X SARAWAK MINCED PORK 300G / P@ $6.00 Comment MP+1 below to join the Sale.,OceanStar Seafood,00:20:55,1,18,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104,MP+1,Huay Geok Ang,00:23:22,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
df['productBought_MP'] = df['productBought_MP'].map(lambda x:float(6.00) if x == int(1) else 0)

In [141]:
df['revenue_MP'] = np.multiply(df['productBought_MP'], df['salesQuantity'])

In [142]:
revenue_MP = "The total revenue from the sale of the product {} is ${}". format ("MP", format(df['revenue_MP'].sum(), '.2f'))
print(revenue_MP)


The total revenue from the sale of the product MP is $42.00


Product SLD

In [143]:
df[df['postComment'].str.contains('SLD', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_BST,revenue_BTF,revenue_CCL,revenue_CTM,revenue_FG,revenue_KN119,revenue_MAC99,revenue_MC299,revenue_MJ,revenue_MP
101,2 X SMALL SQUID 500G/ PKT @$19.90 Comment SLD+1 below to join the Sale.,OceanStar Seafood,00:21:52,1,14,0,0,$19.90 SLD+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105,2 X SMALL SQUID 500G/ PKT @$19.90 Comment SLD+1 below to join the Sale.,OceanStar Seafood,00:23:27,1,14,0,0,$19.90 SLD+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108,SLD+1,June Wang,00:24:17,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
230,2 X SMALL SQUID 500G/ PKT @$19.90 Comment SLD+1 below to join the Sale.,OceanStar Seafood,00:55:01,1,14,0,0,$19.90 SLD+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
df['productBought_SLD'] = df['productBought_SLD'].map(lambda x:float(19.90) if x == int(1) else 0)

In [145]:
df['revenue_SLD'] = np.multiply(df['productBought_SLD'], df['salesQuantity'])

In [146]:
revenue_SLD = "The total revenue from the sale of the product {} is ${}". format ("SLD", format(df['revenue_SLD'].sum(), '.2f'))
print(revenue_SLD)


The total revenue from the sale of the product SLD is $19.90


Product SP

In [147]:
df[df['postComment'].str.contains('SP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_BTF,revenue_CCL,revenue_CTM,revenue_FG,revenue_KN119,revenue_MAC99,revenue_MC299,revenue_MJ,revenue_MP,revenue_SLD
116,3 X SILVER POMFRET 350-450G/PCS@ $12.90 Comment SP+1 below to join the Sale,OceanStar Seafood,00:26:12,1,13,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
121,MUST GRAB!!!! 3 X SILVER POMFRET 350-450G/PCS@ $12.90 Comment SP+1 below to join the Sale,OceanStar Seafood,00:26:52,1,15,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122,SP+1,Helmy Tan,00:26:59,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125,SP+1,Chen Ml,00:27:41,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138,3 X SILVER POMFRET 350-450G/PCS@ $12.90 Comment SP+1 below to join the Sale,OceanStar Seafood,00:29:28,1,13,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140,SP+1,Janice Lye,00:29:57,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
205,MUST GRAB!!! 3 X SILVER POMFRET 350-450G/PCS@ $12.90 Comment SP+1 below to join the Sale,OceanStar Seafood,00:44:40,1,15,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
227,SP+1,Patsy Teo,00:53:58,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
df['productBought_SP'] = df['productBought_SP'].map(lambda x:float(12.90) if x == int(1) else 0)

In [149]:
df['revenue_SP'] = np.multiply(df['productBought_SP'], df['salesQuantity'])

In [150]:
revenue_SP = "The total revenue from the sale of the product {} is ${}". format ("SP", format(df['revenue_SP'].sum(), '.2f'))
print(revenue_SP)


The total revenue from the sale of the product SP is $64.50


Product STF

In [151]:
df[df['postComment'].str.contains('STF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_CCL,revenue_CTM,revenue_FG,revenue_KN119,revenue_MAC99,revenue_MC299,revenue_MJ,revenue_MP,revenue_SLD,revenue_SP
133,3 X SALMON TROUT FILLET 300-350G/ PCS @$36.00 Comment STF+1 below to join the Sale.,OceanStar Seafood,00:28:40,1,15,0,0,$36.00 STF+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
137,STF+1,June Wang,00:29:02,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
274,3 X SALMON TROUT FILLET 300-350G/ PCS @$36.00 Comment STF+1 below to join the Sale.,OceanStar Seafood,01:16:29,1,15,0,0,$36.00 STF+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [152]:
df['productBought_STF'] = df['productBought_STF'].map(lambda x:float(36.00) if x == int(1) else 0)

In [153]:
df['revenue_STF'] = np.multiply(df['productBought_STF'], df['salesQuantity'])

In [154]:
revenue_STF = "The total revenue from the sale of the product {} is ${}". format ("STF", format(df['revenue_STF'].sum(), '.2f'))
print(revenue_STF)


The total revenue from the sale of the product STF is $36.00


Product TMP

In [155]:
df[df['postComment'].str.contains('TMP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_CTM,revenue_FG,revenue_KN119,revenue_MAC99,revenue_MC299,revenue_MJ,revenue_MP,revenue_SLD,revenue_SP,revenue_STF
156,2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.,OceanStar Seafood,00:32:21,1,18,0,0,$11.11 TMP+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
240,2 X AUTHENTIC THAI MOO PING 5 PCS / PKT @$11.11 Comment TMP+1 below to join the Sale.,OceanStar Seafood,01:00:45,1,18,0,0,$11.11 TMP+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [156]:
df['productBought_TMP'] = df['productBought_TMP'].map(lambda x:float(11.11) if x == int(1) else 0)

In [157]:
df['revenue_TMP'] = np.multiply(df['productBought_TMP'], df['salesQuantity'])

In [158]:
revenue_TMP = "The total revenue from the sale of the product {} is ${}". format ("TMP", format(df['revenue_TMP'].sum(), '.2f'))
print(revenue_TMP)


The total revenue from the sale of the product TMP is $11.11


Product WAK

In [159]:
df[df['postComment'].str.contains('WAK+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_FG,revenue_KN119,revenue_MAC99,revenue_MC299,revenue_MJ,revenue_MP,revenue_SLD,revenue_SP,revenue_STF,revenue_TMP
107,2 X WILD CAUGHT ANG KA 500G/ POR @$24.90 Comment WAK+1 below to join the Sale.,OceanStar Seafood,00:24:12,1,16,0,0,$24.90 WAK+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109,2 X WILD CAUGHT ANG KA 500G/ POR @$24.90 Comment WAK+1 below to join the Sale.,OceanStar Seafood,00:25:12,1,16,0,0,$24.90 WAK+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115,WAK+1,Helmy Tan,00:26:10,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130,2 X WILD CAUGHT ANG KA 500G/ POR @$24.90 Comment WAK+1 below to join the Sale.,OceanStar Seafood,00:28:12,1,16,0,0,$24.90 WAK+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193,2 X WILD CAUGHT ANG KA 500G/ POR @$24.90 Comment WAK+1 below to join the Sale.,OceanStar Seafood,00:40:36,1,16,0,0,$24.90 WAK+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
261,2 X WILD CAUGHT ANG KA 500G/ POR @$24.90 Comment WAK+1 below to join the Sale.,OceanStar Seafood,01:10:45,1,16,0,0,$24.90 WAK+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [160]:
df['productBought_WAK'] = df['productBought_WAK'].map(lambda x:float(24.90) if x == int(1) else 0)

In [161]:
df['revenue_WAK'] = np.multiply(df['productBought_WAK'], df['salesQuantity'])

In [162]:
revenue_WAK = "The total revenue from the sale of the product {} is ${}". format ("WAK", format(df['revenue_WAK'].sum(), '.2f'))
print(revenue_WAK)


The total revenue from the sale of the product WAK is $24.90


Product WAKM

In [163]:
df[df['postComment'].str.contains('WAKM', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_KN119,revenue_MAC99,revenue_MC299,revenue_MJ,revenue_MP,revenue_SLD,revenue_SP,revenue_STF,revenue_TMP,revenue_WAK
178,SUPER DEAL!!! 2 X WILD ANG KA PEELED MEAT 500G/ PKT @$16.00 ( PWP ) Comment WAKM+1 below to join the Sale,OceanStar Seafood,00:36:27,1,22,0,0,$16.00 ( PWP ) WAKM+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180,SUPER DEAL!!! 2 X WILD ANG KA PEELED MEAT 500G/ PKT @$16.00 ( PWP ) Comment WAKM+1 below to join the Sale,OceanStar Seafood,00:37:04,1,22,0,0,$16.00 ( PWP ) WAKM+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,WAKM+1,Shimret Kaur,00:37:41,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [164]:
df['productBought_WAKM'] = df['productBought_WAKM'].map(lambda x:float(16.00) if x == int(1) else 0)

In [165]:
df['revenue_WAKM'] = np.multiply(df['productBought_WAKM'], df['salesQuantity'])

In [166]:
revenue_WAKM = "The total revenue from the sale of the product {} is ${}". format ("WAKM", format(df['revenue_WAKM'].sum(), '.2f'))
print(revenue_WAKM)


The total revenue from the sale of the product WAKM is $16.00


Product WB

In [167]:
df[df['postComment'].str.contains('WB+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_MAC99,revenue_MC299,revenue_MJ,revenue_MP,revenue_SLD,revenue_SP,revenue_STF,revenue_TMP,revenue_WAK,revenue_WAKM
42,WILD BARRAMUNDI WHOLE 3.5-3.9 KG/ PCS @ $49.90 Comment WB+1 below to join the Sale,OceanStar Seafood,00:04:55,1,15,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63,WB+1,Geraldine Chew,00:08:02,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188,WILD BARRAMUNDI WHOLE 3.5-3.9 KG/ PCS @ $49.90 Comment WB+1 below to join the Sale,OceanStar Seafood,00:39:38,1,15,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
df['productBought_WB'] = df['productBought_WB'].map(lambda x:float(49.90) if x == int(1) else 0)

In [169]:
df['revenue_WB'] = np.multiply(df['productBought_WB'], df['salesQuantity'])

In [170]:
revenue_WB = "The total revenue from the sale of the product {} is ${}". format ("WB", format(df['revenue_WB'].sum(), '.2f'))
print(revenue_WB)


The total revenue from the sale of the product WB is $49.90


Product WBF

In [171]:
df[df['postComment'].str.contains('WBF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_MC299,revenue_MJ,revenue_MP,revenue_SLD,revenue_SP,revenue_STF,revenue_TMP,revenue_WAK,revenue_WAKM,revenue_WB
176,2 X WILD BARRA FL 180-250G/ PCS @$9.90 Comment WBF+1 below to join the Sale.,OceanStar Seafood,00:35:51,1,15,0,0,$9.90 WBF+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
177,WBF+1,June Wang,00:36:04,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,WBF+1,Helmy Tan,00:37:46,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
df['productBought_WBF'] = df['productBought_WBF'].map(lambda x:float(9.90) if x == int(1) else 0)

In [173]:
df['revenue_WBF'] = np.multiply(df['productBought_WBF'], df['salesQuantity'])

In [174]:
revenue_WBF = "The total revenue from the sale of the product {} is ${}". format ("WBF", format(df['revenue_WBF'].sum(), '.2f'))
print(revenue_WBF)


The total revenue from the sale of the product WBF is $19.80


Product WGF

In [175]:
df[df['postComment'].str.contains('WGF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_MJ,revenue_MP,revenue_SLD,revenue_SP,revenue_STF,revenue_TMP,revenue_WAK,revenue_WAKM,revenue_WB,revenue_WBF
81,2 X WILD GROUPER FILLET 300-350G/ PCS @$19.90 Comment WGF+1 below to join the Sale,OceanStar Seafood,00:13:30,1,15,0,0,$19.90 WGF+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82,WGF+1,Shimret Kaur,00:14:06,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,WGF+1,June Wang,00:14:18,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87,WGF+1,Helmy Tan,00:15:11,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,MUST GRAB!!!! 2 X WILD GROUPER FILLET 300-350G/ PCS @$19.90 Comment WGF+1 below to join the Sale,OceanStar Seafood,00:23:00,1,17,0,0,$19.90 WGF+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [176]:
df['productBought_WGF'] = df['productBought_WGF'].map(lambda x:float(19.90) if x == int(1) else 0)

In [177]:
df['revenue_WGF'] = np.multiply(df['productBought_WGF'], df['salesQuantity'])

In [178]:
revenue_WGF = "The total revenue from the sale of the product {} is ${}". format ("WGF", format(df['revenue_WGF'].sum(), '.2f'))
print(revenue_WGF)


The total revenue from the sale of the product WGF is $59.70


Product WGH

In [179]:
df[df['postComment'].str.contains('WGH', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_MP,revenue_SLD,revenue_SP,revenue_STF,revenue_TMP,revenue_WAK,revenue_WAKM,revenue_WB,revenue_WBF,revenue_WGF
86,2 X WILD GROUPER HEAD 1.0-1.3 KG/ PCS @$12.90 Comment WGH+1 below to join the Sale,OceanStar Seafood,00:15:06,1,16,0,0,$12.90 WGH+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,2 X WILD GROUPER HEAD 1.0-1.3 KG/ PCS @$12.90 Comment WGH+1 below to join the Sale,OceanStar Seafood,00:19:22,1,16,0,0,$12.90 WGH+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
94,WGH+1,Helmy Tan,00:20:07,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,LAST 2 SET!!! 2 X WILD GROUPER HEAD 1.0-1.3 KG/ PCS @$12.90 Comment WGH+1 below to join the Sale,OceanStar Seafood,00:20:27,1,19,0,0,$12.90 WGH+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,WGH+1,Chen Ml,00:21:03,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,WGH+1,Huay Geok Ang,00:21:21,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [180]:
df['productBought_WGH'] = df['productBought_WGH'].map(lambda x:float(12.90) if x == int(1) else 0)

In [181]:
df['revenue_WGH'] = np.multiply(df['productBought_WGH'], df['salesQuantity'])

In [182]:
revenue_WGH = "The total revenue from the sale of the product {} is ${}". format ("WGH", format(df['revenue_WGH'].sum(), '.2f'))
print(revenue_WGH)


The total revenue from the sale of the product WGH is $38.70


Product WST299

In [183]:
df[df['postComment'].str.contains('WST299', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_SLD,revenue_SP,revenue_STF,revenue_TMP,revenue_WAK,revenue_WAKM,revenue_WB,revenue_WBF,revenue_WGF,revenue_WGH
112,2 X WILD CAUGHT SEA TIGER PRAWN 500G/ PORTION @ $29.90 Comment WST299+1 below to join the Sale.,OceanStar Seafood,00:25:31,1,18,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119,WST299+1,Shimret Kaur,00:26:36,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [184]:
df['productBought_WST299'] = df['productBought_WST299'].map(lambda x:float(29.90) if x == int(1) else 0)

In [185]:
df['revenue_WST299'] = np.multiply(df['productBought_WST299'], df['salesQuantity'])

In [186]:
revenue_WST299 = "The total revenue from the sale of the product {} is ${}". format ("WST299", format(df['revenue_WST299'].sum(), '.2f'))
print(revenue_WST299)


The total revenue from the sale of the product WST299 is $29.90


In [187]:
# iterating the columns
for col in df.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_4D
productBought_AMB
productBought_BCP
productBought_BP
productBought_BSS
productBought_BST
productBought_BTF
productBought_BTT399
productBought_CCL
productBought_CST229
productBought_CTM
productBought_FG
productBought_KN119
productBought_LAMB
productBought_MAC99
productBought_MC299
productBought_MJ
productBought_MP
productBought_SLD
productBought_SP
productBought_STF
productBought_TMP
productBought_WAK
productBought_WAKM
productBought_WB
productBought_WBF
productBought_WGF
productBought_WGH
productBought_WST299
revenue_AMB
revenue_BCP
revenue_BP
revenue_BSS
revenue_BST
revenue_BTF
revenue_CCL
revenue_CTM
revenue_FG
revenue_KN119
revenue_MAC99
revenue_MC299
revenue_MJ
revenue_MP
revenue_SLD
revenue_SP
revenue_STF
revenue_TMP
revenue_WAK
revenue_WAKM
revenue_WB
revenue_WBF
revenue_WGF
revenue_WGH
revenue_WST299


**Sum of total revenue from the video**

In [188]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_SP,revenue_STF,revenue_TMP,revenue_WAK,revenue_WAKM,revenue_WB,revenue_WBF,revenue_WGF,revenue_WGH,revenue_WST299
0,Good morning Miko Team,Yvonne Leong,00:00:33,0,4,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Oss,Eve Kang,00:00:43,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Good morning everyone,Yvonne Leong,00:00:49,0,3,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Oss,Eve Kang,00:00:58,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Mrg Miko OSS,Samuel Goh,00:00:58,0,3,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [189]:
#total revenue from the video
total_revenue = df.loc[:, 'revenue_AMB': 'revenue_WST299'].values.sum()

#round the total revenue to 2 decimals places
total_revenue_rounded = format(total_revenue, '.2f')

print(f"The total revenue for the sale of products for the video is ${total_revenue_rounded}")


The total revenue for the sale of products for the video is $758.13


In [190]:
va['totalRevenue'] = total_revenue_rounded
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue
0,OCEANSTARLIVE/videos/440316487506965,20,523,4730,120,2351,4,56,48,758.13


**New Column for the total revenue at that comment**

In [191]:
#https://stackoverflow.com/questions/42063716/pandas-sum-up-multiple-columns-into-one-column-without-last-column
df['revenue'] = df.loc[:, 'revenue_AMB': 'revenue_WST299'].sum(axis=1)
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_4D,productBought_AMB,...,revenue_STF,revenue_TMP,revenue_WAK,revenue_WAKM,revenue_WB,revenue_WBF,revenue_WGF,revenue_WGH,revenue_WST299,revenue
0,Good morning Miko Team,Yvonne Leong,00:00:33,0,4,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Oss,Eve Kang,00:00:43,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Good morning everyone,Yvonne Leong,00:00:49,0,3,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Oss,Eve Kang,00:00:58,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Mrg Miko OSS,Samuel Goh,00:00:58,0,3,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,OSS,Tan Siew Kuan,00:01:03,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Hello,Jacqueline Hoe,00:01:03,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Gd morning miko team,Jacqueline Hoe,00:01:13,0,4,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,OSS,Yvonne Leong,00:01:15,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Good morning Miko! Got milkfish today?,Geraldine Chew,00:01:16,0,6,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [192]:
#https://www.geeksforgeeks.org/how-to-move-a-column-to-first-position-in-pandas-dataframe/
#shift the revenue column to be the 8th column, i.e at position 7
eighth_column = df.pop('revenue')

# insert column using insert(position,column_name,ninth_column) function
df.insert(7, 'revenue', eighth_column)
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,productPrice,productBought_4D,...,revenue_SP,revenue_STF,revenue_TMP,revenue_WAK,revenue_WAKM,revenue_WB,revenue_WBF,revenue_WGF,revenue_WGH,revenue_WST299
0,Good morning Miko Team,Yvonne Leong,00:00:33,0,4,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Oss,Eve Kang,00:00:43,0,1,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Good morning everyone,Yvonne Leong,00:00:49,0,3,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Oss,Eve Kang,00:00:58,0,1,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Mrg Miko OSS,Samuel Goh,00:00:58,0,3,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,OSS,Tan Siew Kuan,00:01:03,0,1,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Hello,Jacqueline Hoe,00:01:03,0,1,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Gd morning miko team,Jacqueline Hoe,00:01:13,0,4,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,OSS,Yvonne Leong,00:01:15,0,1,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Good morning Miko! Got milkfish today?,Geraldine Chew,00:01:16,0,6,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


After the revenue of the sales have been calculated, all the dummified product columns and the column 'productPrice' will be dropped as they are no longer required to be checked against to identify the price of the product.

In [193]:
df = df.loc[: ,'postComment':'revenue']
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue
0,Good morning Miko Team,Yvonne Leong,00:00:33,0,4,0,0,0.0
1,Oss,Eve Kang,00:00:43,0,1,0,0,0.0
2,Good morning everyone,Yvonne Leong,00:00:49,0,3,0,0,0.0
3,Oss,Eve Kang,00:00:58,0,1,0,0,0.0
4,Mrg Miko OSS,Samuel Goh,00:00:58,0,3,0,0,0.0
5,OSS,Tan Siew Kuan,00:01:03,0,1,0,0,0.0
6,Hello,Jacqueline Hoe,00:01:03,0,1,0,0,0.0
7,Gd morning miko team,Jacqueline Hoe,00:01:13,0,4,0,0,0.0
8,OSS,Yvonne Leong,00:01:15,0,1,0,0,0.0
9,Good morning Miko! Got milkfish today?,Geraldine Chew,00:01:16,0,6,0,0,0.0


**New Column to identify the frequency of the seller's comments in the video**

In [194]:
#frequency of seller's comments
va['frequencySeller']= np.divide(va['videoLength'].iloc[0],va['numSellerComments'])
#seller's comment appears on average of every 39 seconds

In [195]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller
0,OCEANSTARLIVE/videos/440316487506965,20,523,4730,120,2351,4,56,48,758.13,39.416667


**New Column to identify the seller**

In [196]:
df['seller'] = 'OCEANSTARLIVE'

In [197]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Good morning Miko Team,Yvonne Leong,00:00:33,0,4,0,0,0.0,OCEANSTARLIVE
1,Oss,Eve Kang,00:00:43,0,1,0,0,0.0,OCEANSTARLIVE
2,Good morning everyone,Yvonne Leong,00:00:49,0,3,0,0,0.0,OCEANSTARLIVE
3,Oss,Eve Kang,00:00:58,0,1,0,0,0.0,OCEANSTARLIVE
4,Mrg Miko OSS,Samuel Goh,00:00:58,0,3,0,0,0.0,OCEANSTARLIVE


**New Column for the Average Compound Score from Sentiment Analysis on raw & uncleaned comments for video**

In [198]:
# Instantiate Sentiment Intensity Analyzer
sent = SentimentIntensityAnalyzer()

In [199]:
df['sentiment_score'] = df['postComment'].apply(sent.polarity_scores)
df['compound'] = [sent.polarity_scores(x)['compound'] for x in df['postComment']]
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller,sentiment_score,compound
0,Good morning Miko Team,Yvonne Leong,00:00:33,0,4,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}",0.4404
1,Oss,Eve Kang,00:00:43,0,1,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
2,Good morning everyone,Yvonne Leong,00:00:49,0,3,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 0.408, 'pos': 0.592, 'compound': 0.4404}",0.4404
3,Oss,Eve Kang,00:00:58,0,1,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
4,Mrg Miko OSS,Samuel Goh,00:00:58,0,3,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0


In [200]:
#average compound scores for the video
#df.shape[0] calculates the total number of rows in the dataframe
va['averageCompound']= (df['compound'].sum())/(df['compound'].sum())/df.shape[0]
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller,averageCompound
0,OCEANSTARLIVE/videos/440316487506965,20,523,4730,120,2351,4,56,48,758.13,39.416667,0.003509


In [201]:
#drop the columns with regarding to the sentiment analysis on the raw & uncleaned comments
#as we will perform sentiment analysis at the comment level on processed comments
df = df.loc[: ,'postComment':'seller']
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Good morning Miko Team,Yvonne Leong,00:00:33,0,4,0,0,0.0,OCEANSTARLIVE
1,Oss,Eve Kang,00:00:43,0,1,0,0,0.0,OCEANSTARLIVE
2,Good morning everyone,Yvonne Leong,00:00:49,0,3,0,0,0.0,OCEANSTARLIVE
3,Oss,Eve Kang,00:00:58,0,1,0,0,0.0,OCEANSTARLIVE
4,Mrg Miko OSS,Samuel Goh,00:00:58,0,3,0,0,0.0,OCEANSTARLIVE


### Saving the cleaned dataframes

In [202]:
# export to csv - change the name of the data file for each video
va.to_csv('../../data/cleaned_data/cleaned_va_OCEANSTARLIVE_440316487506965.csv', index=False)

In [203]:
#check for nulls
#displaying only the columns with nulls and their sum
df[df.columns[df.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [204]:
# export to csv - change the name of the data file for each video
df.to_csv('../../data/cleaned_data/cleaned_OCEANSTARLIVE_440316487506965.csv', index=False)