# Data Import & Cleaning

### Contents:

- [Data Import](#Data-Import)
- [Data Cleaning](#Data-Cleaning)
- [Feature Engineering](#Feature-Engineering)

### Import Libraries

In [1]:
#import standard libraries
import pandas as pd
import numpy as np

#import emoji
import emoji

from natsort import natsorted, index_natsorted, order_by_index

#import warnings to ignore flags when the project is complete
#import warnings
#warnings.filterwarnings('ignore')

#import pre-processing libraries for data cleaning
import string
import re
import nltk
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Data Import

**Read scrapped data for the following videos**

In [2]:
va = pd.read_csv('../../data/scrapped_data/va_OCEANSTARLIVE_2174138006061999.csv')

In [3]:
va

Unnamed: 0,video_for,totalEmojiReaction,views
0,OCEANSTARLIVE/videos/2174138006061999,27,1.2K


In [4]:
va['views'].iloc[0]

'1.2K'

In [5]:
#drop the K and replace it with 2 '0's behind
va['views'] = va['views'].str.replace("K", "00", regex=True)
#drop the dot
va['views'] = va['views'].str.replace(".", "", regex=True)
#change the string to be an integer
va['views'] = int(va['views'].iloc[0])

In [6]:
va

Unnamed: 0,video_for,totalEmojiReaction,views
0,OCEANSTARLIVE/videos/2174138006061999,27,1200


In [7]:
va.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   video_for           1 non-null      object
 1   totalEmojiReaction  1 non-null      int64 
 2   views               1 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 152.0+ bytes


In [8]:
df = pd.read_csv('../../data/scrapped_data/OCEANSTARLIVE_2174138006061999.csv', encoding='utf-8')

In [9]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
0,Morning OSS,Jennifer Quek,0:34
1,hello miko,Ting Wen,0:36
2,LNS,Jennie Gan,0:43
3,LNS OSS,Jennifer Quek,0:45
4,Morning oss,Goh Hui Ling Veron,0:46
5,Good morning Miko and Kelvin,Angela Tay,0:47
6,Good morning miko and OSS team,Jennie Gan,0:49
7,Oss,Angela Tay,0:53
8,OSS,Jennie Gan,0:53
9,"<a class=""oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 nc684nl6 p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl oo9gr5id gpro0wi8 lrazzd5p"" href=""https://www.facebook.com/tingwen?__tn__=R"" role=""link"" tabindex=""0""><span class=""nc684nl6""><span>Ting Wen</span></span></a> yes i wake up le",Clarice Goh,0:55


In [10]:
#https://stackoverflow.com/questions/32072076/find-the-unique-values-in-a-column-and-then-sort-them
#check if the seller uses multiple accounts to reply
postCommentAuthor_unique = df['postCommentAuthor'].unique()
print(sorted(postCommentAuthor_unique))

['Adeline Neo', 'Angela Tay', 'Angela Yeo', 'Anne Ng', 'Chor Kee Lee', 'Cindy Lau', 'Cindy Ong', 'Clarice Goh', 'Cynthia Sew', 'Del Del', 'Doreen Wah', 'Esther Loo', 'Eve Kang', 'Goh Hui Ling Veron', 'Hear Their Voices', 'Iris Teoh', 'Jasmine Gn', 'Jennie Gan', 'Jennifer Quek', 'Jenny Ang', 'Jenny Ong', 'Jolynn Toh', 'Joy Mah', 'Judy Teo', 'June Ng', 'Kath Choo', 'Mariania Chan', 'Ng Fui Yong', 'Nur Nurr', 'OceanStar Seafood', 'Patrick Ong', 'Pearlyn Chua', 'Samuel Goh', 'Sharon Chua', 'Sharon Lau', 'Sian Foong Lim', 'Stella Lim', 'Ting Wen', 'Wendy Chia', 'Winnie Wu', 'Yen Ling Chan', 'Yvonne Chia', 'き リーサン', '小包龙']


In [11]:
#find comments posted by the seller only
df.loc[df['postCommentAuthor'] == 'OceanStar Seafood']

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
11,"BIG FISHES!!!</div><div dir=""auto"" style=""text-align: start;"">NZ KING SALMON WHOLE 4.2-4.6 KG/PCS@$168.00</div><div dir=""auto"" style=""text-align: start;"">Comment「KS168+1」below to join the Sale",OceanStar Seafood,0:57
12,"ANG KO LI HEAD 1.5KG+-/ PCS @$9.90</div><div dir=""auto"" style=""text-align: start;"">Comment「AKLH+1」below to join the Sale",OceanStar Seafood,1:00:00
13,"SLIPPER LOBSTER 6-8 PCS/ KG@$23.90</div><div dir=""auto"" style=""text-align: start;"">Comment「SL239+1」below to join the Sale.",OceanStar Seafood,1:00:14
18,"2 X SQUID 500-600G/ PORTION @$21.90</div><div dir=""auto"" style=""text-align: start;"">Comment「SQ+1」below to join the Sale.",OceanStar Seafood,1:01:30
19,"2 X KUNNING 500G/ PKT @$11.90</div><div dir=""auto"" style=""text-align: start;"">Comment「KN119+1」below to join the Sale.",OceanStar Seafood,1:01:46
20,"LAST 3 PCS!!!</div><div dir=""auto"" style=""text-align: start;"">LOCAL CATCH WHITE POMFRET 300-400G/ PCS @$13.00</div><div dir=""auto"" style=""text-align: start;"">Comment「WP13+1」below to join the Sale",OceanStar Seafood,1:02:12
25,"2 X SILVER POMFRET 350-450G/PCS@ $9.90</div><div dir=""auto"" style=""text-align: start;"">Comment「SP+1」below to join the Sale",OceanStar Seafood,1:03:38
27,"2 X BLACK POMFRET 350-450G/ PCS @$15.90</div><div dir=""auto"" style=""text-align: start;"">Comment「BP+1」below to join the Sale",OceanStar Seafood,1:04:04
30,"MINI BALAI THREADFIN 800-1.1KG/ PCS @$11.11 ( PWP)</div><div dir=""auto"" style=""text-align: start;"">Comment「MBT11+1」below to join the Sale",OceanStar Seafood,1:04:18
36,"LAST 2 PCS!!!</div><div dir=""auto"" style=""text-align: start;"">CHINESE POMFRET 800-900G/ PCS @$29.90</div><div dir=""auto"" style=""text-align: start;"">Comment「CP299+1」below to join the Sale",OceanStar Seafood,1:05:53


In [12]:
#check the comments to have a gauge
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' sori pork Rib from where', '2 X BATANG SOUP SLICED 300G/ PKT @$21.90</div><div dir="auto" style="text-align: start;">Comment「BSS+1」below to join the Sale', '2 X BLACK POMFRET 350-450G/ PCS @$15.90</div><div dir="auto" style="text-align: start;">Comment「BP+1」below to join the Sale', '2 X CHINESE POMFRET 250-400G/ PCS @$14.00</div><div dir="auto" style="text-align: start;">Comment「CP14+1」below to join the Sale', '2 X CLAM SOUP 500G/ PKT @ $12.90</div><div dir="auto" style="text-align: start;">Comment「CS+1」below to join the Sale', '2 X FLOWER GROUPER 300-400G/ PCS @$9.90</div><div dir="auto" style="text-align: start;">Comment「FG+1」below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90</div><div dir="auto" style="text-align: start;">Comment「FCF+1」below to join the Sale', '2 X IQF CHICKEN MID JOINT 450G+-/ P@$9.90</div><div dir="auto" style="text-align: start;">Comment「MJ+1」below to join the Sale', '2 X JP HAMACHI COLLAR 400-500G @ $38.00</div><div dir="auto" style="text-align

## Data Cleaning

### Convert the emojis to text for easy cleaning

We noticed that the emojis have html parsers attached to it. Hence, we will convert the images of the emojis to text first, to remove the html parsers to the emojis, while retaining the emoji's text. We will convert it back to emoji afterwards.

In [13]:
df['postComment'] = df['postComment'].apply(emoji.demojize)

In [14]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' sori pork Rib from where', '2 X BATANG SOUP SLICED 300G/ PKT @$21.90</div><div dir="auto" style="text-align: start;">Comment「BSS+1」below to join the Sale', '2 X BLACK POMFRET 350-450G/ PCS @$15.90</div><div dir="auto" style="text-align: start;">Comment「BP+1」below to join the Sale', '2 X CHINESE POMFRET 250-400G/ PCS @$14.00</div><div dir="auto" style="text-align: start;">Comment「CP14+1」below to join the Sale', '2 X CLAM SOUP 500G/ PKT @ $12.90</div><div dir="auto" style="text-align: start;">Comment「CS+1」below to join the Sale', '2 X FLOWER GROUPER 300-400G/ PCS @$9.90</div><div dir="auto" style="text-align: start;">Comment「FG+1」below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90</div><div dir="auto" style="text-align: start;">Comment「FCF+1」below to join the Sale', '2 X IQF CHICKEN MID JOINT 450G+-/ P@$9.90</div><div dir="auto" style="text-align: start;">Comment「MJ+1」below to join the Sale', '2 X JP HAMACHI COLLAR 400-500G @ $38.00</div><div dir="auto" style="text-align

### Clean the comments using Regex

From the above unique values of the comments, there are several cleaning issues that needs to be addressed.

    1. Removal of links or URLs. 
        - URLs are present when the Facebook users manually type a link in the comment. 
        - Additionally, when users are tagged, their Facebook profile URL is printed as a result.
        - Similarly, there is a unique URL linked to each emoji as well. 
        
    2. Removal of HTML special entities
        - Examples of HTML special entities are '&amp' and '&gt'. 
        
    3. Removal of other HTML special terms
        - Examples of other HTML special entities are whitespace HTML special entities like '#x200B' and '#xa0'.
        
    4. Removal of HTML Parsers to the Emojis
        - Previously, the emojis have been demojized to text already. However, the HTML parsers to the mojis remain. Hence, they are required to be removed as well.
        
    5. Removal of HTML Parcers to Whitespaces
        - When a next line is entered in the same comment, there will be a HTML parser to the this whitespace. Hence, this are to be removed as well.
        
    6. Removal of HTML Parsers to tagged names
        - When users are tagged in the comments, in addition to their Facebook profile URL, there will be HTML parsers to the tagged names as well. Hence, this are to be removed as well.
        
    7. Removsl of other HTML Parsers   

In [15]:
def clean(row):
    
    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*\.png', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove HTML special entities (e.g.. &amp, &gt;)
    row['postComment'] = re.sub(
        pattern=r'\&\w*;',
        repl='',
        string=row['postComment'])

    # Remove emoji html parsers
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'<img\salt=\"(\:\w*)(\-?)(\w*\:)\"\s[a-z]{6}\=\"\d\d\"\s[a-z]{14}\=.{26}\s[a-z]{3}\=\"\"\s[a-z]{5}\=\"\d\d\"\/>(<\/span>)?',
        repl=r'\1\2\3', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove whitespaces
    row['postComment'] = re.sub(
        pattern=r'<\/div><div\sdir\=\"auto\"\sstyle\=\"text\-align\:\sstart\;\">',
        repl=' ',
        string=row['postComment'],
        flags=re.M)
    
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern = r'<span\sclass\=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\">',
        repl=r' ',
        string=row['postComment'],
        flags=re.M)
   
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern=r'<\/span>', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)
 
    # Remove consecutive non-ASCII characters
    # This will remove the chinese comments
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'[^\x00-\x7F]+', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)

    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*tabindex\=\"\d\"', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<a class="oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 nc684nl6 p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl oo9gr5id gpro0wi8 lrazzd5p" href=">',
        repl='',
        string=row['postComment'],
        flags=re.M)    
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<span\sclass\=\"([a-z0-9]{8})\">',
        repl='',
        string=row['postComment'],
        flags=re.M)
    
    #remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<span.*<\/a>',
        repl='',
        string=row['postComment'],
        flags=re.M)
    
    return row
    

In [16]:
df = df.apply(clean, axis=1)

In [17]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' ', '   :waving_hand:', ' :hand_with_fingers_splayed_light_skin_tone:', ' :rolling_on_the_floor_laughing:', ' Bo tag again ', ' fall in', ' miko,bye bye', ' sori pork Rib from where', ' ur fav', ' yes i wake up le', '2 X BATANG SOUP SLICED 300G/ PKT @$21.90 Comment BSS+1 below to join the Sale', '2 X BLACK POMFRET 350-450G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X CHINESE POMFRET 250-400G/ PCS @$14.00 Comment CP14+1 below to join the Sale', '2 X CLAM SOUP 500G/ PKT @ $12.90 Comment CS+1 below to join the Sale', '2 X FLOWER GROUPER 300-400G/ PCS @$9.90 Comment FG+1 below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 below to join the Sale', '2 X IQF CHICKEN MID JOINT 450G+-/ P@$9.90 Comment MJ+1 below to join the Sale', '2 X JP HAMACHI COLLAR 400-500G @ $38.00 Comment HC+1 below to join the Sale', '2 X JP KANIKAMA 500G/PKT @ $29.90 Comment KKM+1 below to join the Sale', '2 X KING SALMON FILLET 300-350G / PCS @ $39.90 Comment KSF+1 below to joi

**Convert encoded emoji text back to emojis**

In [18]:
df['postComment'] = df['postComment'].apply(emoji.emojize)

In [19]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' ', '   👋', ' Bo tag again ', ' fall in', ' miko,bye bye', ' sori pork Rib from where', ' ur fav', ' yes i wake up le', ' 🖐🏻', ' 🤣', '2 X BATANG SOUP SLICED 300G/ PKT @$21.90 Comment BSS+1 below to join the Sale', '2 X BLACK POMFRET 350-450G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X CHINESE POMFRET 250-400G/ PCS @$14.00 Comment CP14+1 below to join the Sale', '2 X CLAM SOUP 500G/ PKT @ $12.90 Comment CS+1 below to join the Sale', '2 X FLOWER GROUPER 300-400G/ PCS @$9.90 Comment FG+1 below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 below to join the Sale', '2 X IQF CHICKEN MID JOINT 450G+-/ P@$9.90 Comment MJ+1 below to join the Sale', '2 X JP HAMACHI COLLAR 400-500G @ $38.00 Comment HC+1 below to join the Sale', '2 X JP KANIKAMA 500G/PKT @ $29.90 Comment KKM+1 below to join the Sale', '2 X KING SALMON FILLET 300-350G / PCS @ $39.90 Comment KSF+1 below to join the Sale.', '2 X KOREAN FLOWER CLAM 500G/ PKT @$8.00 Comment KFC+1 below to join 

### Drop Empty Posts

Blank comments are dropped to ensure that unique comments are obtained.

In [20]:
#drop empty posts
df = df.loc[((df['postComment'] != ' ')),:]

In [21]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

['   👋', ' Bo tag again ', ' fall in', ' miko,bye bye', ' sori pork Rib from where', ' ur fav', ' yes i wake up le', ' 🖐🏻', ' 🤣', '2 X BATANG SOUP SLICED 300G/ PKT @$21.90 Comment BSS+1 below to join the Sale', '2 X BLACK POMFRET 350-450G/ PCS @$15.90 Comment BP+1 below to join the Sale', '2 X CHINESE POMFRET 250-400G/ PCS @$14.00 Comment CP14+1 below to join the Sale', '2 X CLAM SOUP 500G/ PKT @ $12.90 Comment CS+1 below to join the Sale', '2 X FLOWER GROUPER 300-400G/ PCS @$9.90 Comment FG+1 below to join the Sale', '2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 below to join the Sale', '2 X IQF CHICKEN MID JOINT 450G+-/ P@$9.90 Comment MJ+1 below to join the Sale', '2 X JP HAMACHI COLLAR 400-500G @ $38.00 Comment HC+1 below to join the Sale', '2 X JP KANIKAMA 500G/PKT @ $29.90 Comment KKM+1 below to join the Sale', '2 X KING SALMON FILLET 300-350G / PCS @ $39.90 Comment KSF+1 below to join the Sale.', '2 X KOREAN FLOWER CLAM 500G/ PKT @$8.00 Comment KFC+1 below to join the S

### Reindexing the dataframe 
**New Column to reindex the dataframe in accordance to time**

From the data, we can tell that there is an inconsistent timestamp being used in the column 'postCommentTime'. For example, there are times like '0:57' and '1:00:14' which indicates 0 hour 0 mins 57 secs, and 1 hour 0 mins 14 secs. 

Hence, a new column 'postCommentTime_final' is created to ensure that a timestamp of HH:MM:SS is being used consistently throughout. However, we note that the number of days is being included for TimeDeltaIndex.

As a result, we will read the timestamp, by excluding the number of days.  

In [22]:
#TimedeltaIndex
#https://stackoverflow.com/questions/54877467/pandas-convert-hhmm-and-hhmmss-to-standard-hhmmss-in-python
# for example, time of '0:57' will then be 00:00:57; 0 hours 0 mins 57 secs
df['postCommentTime_final'] = pd.to_timedelta(np.where(df['postCommentTime'].str.count(':') == 1, '00:' + df['postCommentTime'], df['postCommentTime']))

In [23]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Morning OSS,Jennifer Quek,0:34,0 days 00:00:34
1,hello miko,Ting Wen,0:36,0 days 00:00:36
2,LNS,Jennie Gan,0:43,0 days 00:00:43
3,LNS OSS,Jennifer Quek,0:45,0 days 00:00:45
4,Morning oss,Goh Hui Ling Veron,0:46,0 days 00:00:46


In [24]:
df['postCommentTime_final'] = df['postCommentTime_final'].astype(str).map(lambda x: x[7:])

In [25]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Morning OSS,Jennifer Quek,0:34,00:00:34
1,hello miko,Ting Wen,0:36,00:00:36
2,LNS,Jennie Gan,0:43,00:00:43
3,LNS OSS,Jennifer Quek,0:45,00:00:45
4,Morning oss,Goh Hui Ling Veron,0:46,00:00:46
5,Good morning Miko and Kelvin,Angela Tay,0:47,00:00:47
6,Good morning miko and OSS team,Jennie Gan,0:49,00:00:49
7,Oss,Angela Tay,0:53,00:00:53
8,OSS,Jennie Gan,0:53,00:00:53
9,yes i wake up le,Clarice Goh,0:55,00:00:55


In [26]:
#reindex according to postCommentTime_final
#previous natsort in data collection didnt take into account the different timestamp format
df = df.reindex(index=order_by_index(df.index, index_natsorted(df.postCommentTime_final)))

In [27]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Morning OSS,Jennifer Quek,0:34,00:00:34
1,hello miko,Ting Wen,0:36,00:00:36
2,LNS,Jennie Gan,0:43,00:00:43
3,LNS OSS,Jennifer Quek,0:45,00:00:45
4,Morning oss,Goh Hui Ling Veron,0:46,00:00:46
5,Good morning Miko and Kelvin,Angela Tay,0:47,00:00:47
6,Good morning miko and OSS team,Jennie Gan,0:49,00:00:49
7,Oss,Angela Tay,0:53,00:00:53
8,OSS,Jennie Gan,0:53,00:00:53
9,yes i wake up le,Clarice Goh,0:55,00:00:55


In [28]:
#reset the index for the dataframe
#https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-dataframe

df = df.reset_index(drop=True)

In [29]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Morning OSS,Jennifer Quek,0:34,00:00:34
1,hello miko,Ting Wen,0:36,00:00:36
2,LNS,Jennie Gan,0:43,00:00:43
3,LNS OSS,Jennifer Quek,0:45,00:00:45
4,Morning oss,Goh Hui Ling Veron,0:46,00:00:46
5,Good morning Miko and Kelvin,Angela Tay,0:47,00:00:47
6,Good morning miko and OSS team,Jennie Gan,0:49,00:00:49
7,Oss,Angela Tay,0:53,00:00:53
8,OSS,Jennie Gan,0:53,00:00:53
9,yes i wake up le,Clarice Goh,0:55,00:00:55


**Obtain the length of the video**

Assuming that the last comment of the video is the total length of the video, we will input the length of the video from the comments dataframe into the video attributes dataframe.

In [30]:
#retrieve last comment to obtain the length of the video
df['postCommentTime_final'].iloc[-1]

'01:13:33'

In [31]:
#https://stackoverflow.com/questions/6402812/how-to-convert-an-hmmss-time-string-to-seconds-in-python
def get_sec(time_str):
    """Get Seconds from time."""
    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

In [32]:
#retrieve last comment to obtain the length of the video in seconds
va['videoLength']= get_sec(df['postCommentTime_final'].iloc[-1])

In [33]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength
0,OCEANSTARLIVE/videos/2174138006061999,27,1200,4413


## Feature Engineering

### Comments made by the Seller

**New Column to identify the number of comments made by the seller in the video**

From the total sum of comments made by the seller, we will input it into the video attributes dataframe.

In [34]:
(df['postCommentAuthor']=='OceanStar Seafood').sum()

97

In [35]:
va['numSellerComments'] = (df['postCommentAuthor']=='OceanStar Seafood').sum()

In [36]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments
0,OCEANSTARLIVE/videos/2174138006061999,27,1200,4413,97


**New Column to identify if the comment is made by the Seller or not**

In [37]:
#to delete column 'notSeller' in due course
df['notSeller'] = df['postCommentAuthor'].map(lambda x:1 if x !='OceanStar Seafood' else 0)

In [38]:
#create a new column to show if the comment is made by the seller or not
df['isSeller'] = df['postCommentAuthor'].map(lambda x:1 if x =='OceanStar Seafood' else 0)

In [39]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller
0,Morning OSS,Jennifer Quek,0:34,00:00:34,1,0
1,hello miko,Ting Wen,0:36,00:00:36,1,0
2,LNS,Jennie Gan,0:43,00:00:43,1,0
3,LNS OSS,Jennifer Quek,0:45,00:00:45,1,0
4,Morning oss,Goh Hui Ling Veron,0:46,00:00:46,1,0


In [40]:
df['isSeller'].value_counts()

0    193
1     97
Name: isSeller, dtype: int64

In [41]:
#show all the seller's comments
df.loc[df['isSeller'] == 1]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller
11,BIG FISHES!!! NZ KING SALMON WHOLE 4.2-4.6 KG/PCS@$168.00 Comment KS168+1 below to join the Sale,OceanStar Seafood,0:57,00:00:57,0,1
19,NZ KING SALMON WHOLE 4.2-4.6 KG/PCS@$168.00 Comment KS168+1 below to join the Sale,OceanStar Seafood,1:16,00:01:16,0,1
20,BALAI THREADFIN WHOLE 5.6-5.9 KG/ PCS @$ 138.00 Comment BTW138+1 below to join the Sale,OceanStar Seafood,1:27,00:01:27,0,1
21,BALAI THREADFIN WHOLE 5.6-5.9 KG/ PCS @$ 138.00 Comment BTW138+1 below to join the Sale,OceanStar Seafood,1:48,00:01:48,0,1
23,WILD GROUPER WHOLE 2.2-2.5 KG/ PCS @$39.90 Comment WG399+1 below to join the Sale,OceanStar Seafood,2:04,00:02:04,0,1
24,SAMPAN CAUGHT QING YI 2.2-2.5 KG/ PCS @$29.90 Comment GP299+1 below to join the Sale,OceanStar Seafood,2:14,00:02:14,0,1
27,RED SNAPPER 1.7-1.9 KG/ PCS @$29.90 Comment RS299+1 below to join the Sale,OceanStar Seafood,2:25,00:02:25,0,1
28,ANG KO LI WHOLE 2.5-2.8 KG/ PCS @$39.90 Comment AKL+1 below to join the Sale.,OceanStar Seafood,2:35,00:02:35,0,1
29,SALMON TROUT WHOLE 4.0-4.3 KG/ PCS @$108.00 Comment STW+1 below to join the Sale,OceanStar Seafood,2:44,00:02:44,0,1
32,MUST GRAB!!! RED GROUPER WHOLE 1.1-1.3KG / PCS @ $23.90 Comment RG239+1 below to join the Sale,OceanStar Seafood,2:57,00:02:57,0,1


### Length of comments

**New Column to identify the length of each comment**

From the comments, we take the length of each comment as the total number of words each comment has.

In [42]:
#length of each comment
#https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe
df['postCommentLength'] = df['postComment'].str.split().str.len()

In [43]:
df.head(10)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength
0,Morning OSS,Jennifer Quek,0:34,00:00:34,1,0,2
1,hello miko,Ting Wen,0:36,00:00:36,1,0,2
2,LNS,Jennie Gan,0:43,00:00:43,1,0,1
3,LNS OSS,Jennifer Quek,0:45,00:00:45,1,0,2
4,Morning oss,Goh Hui Ling Veron,0:46,00:00:46,1,0,2
5,Good morning Miko and Kelvin,Angela Tay,0:47,00:00:47,1,0,5
6,Good morning miko and OSS team,Jennie Gan,0:49,00:00:49,1,0,6
7,Oss,Angela Tay,0:53,00:00:53,1,0,1
8,OSS,Jennie Gan,0:53,00:00:53,1,0,1
9,yes i wake up le,Clarice Goh,0:55,00:00:55,1,0,5


**New Column to identify the total number of comments in the video**

From the total number of comments in the video, we will input it into the video attributes dataframe.

In [44]:
#total number of comments
df['postCommentLength'].sum()

1972

In [45]:
va['numComments'] = df['postCommentLength'].sum()

In [46]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments
0,OCEANSTARLIVE/videos/2174138006061999,27,1200,4413,97,1972


### LNS

LNS is an acronym that stands for 'like and share'. It is a form of customer engagement as it indicates by the customers to the sellers that they have liked and shared the video on their Facebook wall. 

**New Column to identify if Customers are engaging in liking and sharing the video**

In [47]:
#if the customer has commented 'lns' or 'ls' which stands for 'like & shared' & 'like shared' respectively
def lns(comment):
    if re.search(r'(l)(n?)(s)', comment, re.IGNORECASE):
        return int(1)
    else:
        return int(0)

In [48]:
df['lns'] = df['postComment'].map(lambda x:lns(x))

In [49]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns
0,Morning OSS,Jennifer Quek,0:34,00:00:34,1,0,2,0
1,hello miko,Ting Wen,0:36,00:00:36,1,0,2,0
2,LNS,Jennie Gan,0:43,00:00:43,1,0,1,1
3,LNS OSS,Jennifer Quek,0:45,00:00:45,1,0,2,1
4,Morning oss,Goh Hui Ling Veron,0:46,00:00:46,1,0,2,0


**New Column to identify if the number of Customers who explicitly inform the sellers that they are engaging in liking and sharing the video**

In [50]:
#range of customer's engagement for LNS
df['lns'].value_counts()

0    281
1      9
Name: lns, dtype: int64

In [51]:
(df['lns']==1).sum()

9

In [52]:
va['lnsQuantity'] = (df['lns']==1).sum()

In [53]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity
0,OCEANSTARLIVE/videos/2174138006061999,27,1200,4413,97,1972,9


## Sales Quantity

**New Columns to identify the quantity of sales made**

From the comments, using regex, we first see an overview of the comments that are related to the sale of the products. 

In [54]:
#products offered by the seller
df[df['postComment'].str.contains('(\w*\+\d)', regex=True)]

  df[df['postComment'].str.contains('(\w*\+\d)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns
11,BIG FISHES!!! NZ KING SALMON WHOLE 4.2-4.6 KG/PCS@$168.00 Comment KS168+1 below to join the Sale,OceanStar Seafood,0:57,00:00:57,0,1,15,0
16,Es99+1,Sian Foong Lim,1:08,00:01:08,1,0,1,0
19,NZ KING SALMON WHOLE 4.2-4.6 KG/PCS@$168.00 Comment KS168+1 below to join the Sale,OceanStar Seafood,1:16,00:01:16,0,1,13,0
20,BALAI THREADFIN WHOLE 5.6-5.9 KG/ PCS @$ 138.00 Comment BTW138+1 below to join the Sale,OceanStar Seafood,1:27,00:01:27,0,1,15,0
21,BALAI THREADFIN WHOLE 5.6-5.9 KG/ PCS @$ 138.00 Comment BTW138+1 below to join the Sale,OceanStar Seafood,1:48,00:01:48,0,1,15,0
23,WILD GROUPER WHOLE 2.2-2.5 KG/ PCS @$39.90 Comment WG399+1 below to join the Sale,OceanStar Seafood,2:04,00:02:04,0,1,14,0
24,SAMPAN CAUGHT QING YI 2.2-2.5 KG/ PCS @$29.90 Comment GP299+1 below to join the Sale,OceanStar Seafood,2:14,00:02:14,0,1,15,0
27,RED SNAPPER 1.7-1.9 KG/ PCS @$29.90 Comment RS299+1 below to join the Sale,OceanStar Seafood,2:25,00:02:25,0,1,13,0
28,ANG KO LI WHOLE 2.5-2.8 KG/ PCS @$39.90 Comment AKL+1 below to join the Sale.,OceanStar Seafood,2:35,00:02:35,0,1,15,0
29,SALMON TROUT WHOLE 4.0-4.3 KG/ PCS @$108.00 Comment STW+1 below to join the Sale,OceanStar Seafood,2:44,00:02:44,0,1,14,0


In [55]:
def sale(comment):
    if re.search(r'(\w*\+)(\d)', comment):
        return int(re.search(r'\w*\+\d', comment).group(0)[-1])
    else:
        return int(0)

In [56]:
df['sales'] = df['postComment'].apply(lambda x:sale(x))

In [57]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales
0,Morning OSS,Jennifer Quek,0:34,00:00:34,1,0,2,0,0
1,hello miko,Ting Wen,0:36,00:00:36,1,0,2,0,0
2,LNS,Jennie Gan,0:43,00:00:43,1,0,1,1,0
3,LNS OSS,Jennifer Quek,0:45,00:00:45,1,0,2,1,0
4,Morning oss,Goh Hui Ling Veron,0:46,00:00:46,1,0,2,0,0


In [58]:
#sales made by the customers; exclude the seller's comments on the codes for the sales
#https://stackoverflow.com/questions/19914937/applying-function-with-multiple-arguments-to-create-a-new-pandas-column
df['salesQuantity'] = np.multiply(df['notSeller'], df['sales'])

In [59]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity
0,Morning OSS,Jennifer Quek,0:34,00:00:34,1,0,2,0,0,0
1,hello miko,Ting Wen,0:36,00:00:36,1,0,2,0,0,0
2,LNS,Jennie Gan,0:43,00:00:43,1,0,1,1,0,0
3,LNS OSS,Jennifer Quek,0:45,00:00:45,1,0,2,1,0,0
4,Morning oss,Goh Hui Ling Veron,0:46,00:00:46,1,0,2,0,0,0


In [60]:
#range of sales quantity
df['salesQuantity'].value_counts()

0    198
1     80
2     11
3      1
Name: salesQuantity, dtype: int64

In [61]:
#total number of orders made
df['salesQuantity'].sum()

105

In [62]:
va['salesQuantity'] = df['salesQuantity'].sum()

In [63]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity
0,OCEANSTARLIVE/videos/2174138006061999,27,1200,4413,97,1972,9,105


## Products 

The seller will comment and post the unique product codes for each product. Thereafter, customers who are keen on purchasing the products will explicitly comment out the specific & unique product codes, in addition to the quanityt of each product that they wish to purchase.

**New Columns to identify the products purchased by the Customers**

Regex is used to identify the products being offered and the products being purchased as well.

In [64]:
#function to identify the code of the product bought
def sale2(comment):
    if re.search(r'(\w*\+)(\d)', comment):
        return str(re.search(r'\w*\+\d', comment).group(0)[:-2])
    else:
        return int(0)

In [65]:
#identifies all comments that have the codes of the products, including the seller's comments.
#this column will be dropped afterwards.
df['product'] = df['postComment'].apply(lambda x:sale2(x))

In [66]:
#products bought by Customers; exclude the seller's comments on the product details 
df['productBought'] = np.multiply(df['notSeller'], df['product'])

In [67]:
df['productBought'].unique()

array([0, '', 'Es99', 'RG239', 'CP299', 'WB69', 'Wb69', 'CP14', 'FG',
       'KFC', 'Kfc', 'Rg239', 'Cp14', 'Fg', 'MBT11', 'Mbt11', 'OLP239',
       'WAK', 'Wak', 'KSF', 'Ksf', 'Ksh', 'Ksb', 'KSB', 'MP', 'PR', 'MJ',
       'FJ', 'KN119', 'Mj', 'KBP', 'FCF', 'BSS', 'Rgss', 'BST', 'Bst',
       'BMC', 'Aklt', 'RGSS', 'SQ', 'Kn119', 'Wp13', 'Sp', 'WP13', 'AKLH',
       'Rgf', 'CS'], dtype=object)

In [68]:
#https://stackoverflow.com/questions/13445241/replacing-blank-values-white-space-with-nan-in-pandas
df['productBought'] = df['productBought'].replace(r'^\s*$', int(0), regex=True)

In [69]:
df['productBought'].unique()

array([0, 'Es99', 'RG239', 'CP299', 'WB69', 'Wb69', 'CP14', 'FG', 'KFC',
       'Kfc', 'Rg239', 'Cp14', 'Fg', 'MBT11', 'Mbt11', 'OLP239', 'WAK',
       'Wak', 'KSF', 'Ksf', 'Ksh', 'Ksb', 'KSB', 'MP', 'PR', 'MJ', 'FJ',
       'KN119', 'Mj', 'KBP', 'FCF', 'BSS', 'Rgss', 'BST', 'Bst', 'BMC',
       'Aklt', 'RGSS', 'SQ', 'Kn119', 'Wp13', 'Sp', 'WP13', 'AKLH', 'Rgf',
       'CS'], dtype=object)

In [70]:
#change the produce codes to be uppercase for consistency
#https://stackoverflow.com/questions/39512002/convert-whole-dataframe-from-lower-case-to-upper-case-with-pandas
df['productBought'] = df['productBought'].astype(str).str.upper()

In [71]:
df['productBought'].unique()

array(['0', 'ES99', 'RG239', 'CP299', 'WB69', 'CP14', 'FG', 'KFC',
       'MBT11', 'OLP239', 'WAK', 'KSF', 'KSH', 'KSB', 'MP', 'PR', 'MJ',
       'FJ', 'KN119', 'KBP', 'FCF', 'BSS', 'RGSS', 'BST', 'BMC', 'AKLT',
       'SQ', 'WP13', 'SP', 'AKLH', 'RGF', 'CS'], dtype=object)

In [72]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought
0,Morning OSS,Jennifer Quek,0:34,00:00:34,1,0,2,0,0,0,0,0
1,hello miko,Ting Wen,0:36,00:00:36,1,0,2,0,0,0,0,0
2,LNS,Jennie Gan,0:43,00:00:43,1,0,1,1,0,0,0,0
3,LNS OSS,Jennifer Quek,0:45,00:00:45,1,0,2,1,0,0,0,0
4,Morning oss,Goh Hui Ling Veron,0:46,00:00:46,1,0,2,0,0,0,0,0
5,Good morning Miko and Kelvin,Angela Tay,0:47,00:00:47,1,0,5,0,0,0,0,0
6,Good morning miko and OSS team,Jennie Gan,0:49,00:00:49,1,0,6,0,0,0,0,0
7,Oss,Angela Tay,0:53,00:00:53,1,0,1,0,0,0,0,0
8,OSS,Jennie Gan,0:53,00:00:53,1,0,1,0,0,0,0,0
9,yes i wake up le,Clarice Goh,0:55,00:00:55,1,0,5,0,0,0,0,0


### Price of Products

**New Column to identify the price of the products**

This new column is created to identify the regex of the unique product codes and their corresponding prices, as adviced by the seller.

In [73]:
def price(comment):
    if re.search(r'(\@)(\$)( ?)(.*)', comment):
        return str(re.search(r'(\$)( ?)(.*)', comment).group(0)[:-23])
    else:
        return int(0)

In [74]:
df['productPrice'] = df['postComment'].apply(lambda x:price(x))

In [75]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought,productPrice
0,Morning OSS,Jennifer Quek,0:34,00:00:34,1,0,2,0,0,0,0,0,0
1,hello miko,Ting Wen,0:36,00:00:36,1,0,2,0,0,0,0,0,0
2,LNS,Jennie Gan,0:43,00:00:43,1,0,1,1,0,0,0,0,0
3,LNS OSS,Jennifer Quek,0:45,00:00:45,1,0,2,1,0,0,0,0,0
4,Morning oss,Goh Hui Ling Veron,0:46,00:00:46,1,0,2,0,0,0,0,0,0
5,Good morning Miko and Kelvin,Angela Tay,0:47,00:00:47,1,0,5,0,0,0,0,0,0
6,Good morning miko and OSS team,Jennie Gan,0:49,00:00:49,1,0,6,0,0,0,0,0,0
7,Oss,Angela Tay,0:53,00:00:53,1,0,1,0,0,0,0,0,0
8,OSS,Jennie Gan,0:53,00:00:53,1,0,1,0,0,0,0,0,0
9,yes i wake up le,Clarice Goh,0:55,00:00:55,1,0,5,0,0,0,0,0,0


We noticed that each comment has a word 'Comment' in the middle of the extracted string of comments for the column 'productPrice'. Hence, we will remove the mentioned word.

In [76]:
df['productPrice'] = df['productPrice'].replace(value='', regex=r'Comment')

In [77]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,notSeller,isSeller,postCommentLength,lns,sales,salesQuantity,product,productBought,productPrice
0,Morning OSS,Jennifer Quek,0:34,00:00:34,1,0,2,0,0,0,0,0,0
1,hello miko,Ting Wen,0:36,00:00:36,1,0,2,0,0,0,0,0,0
2,LNS,Jennie Gan,0:43,00:00:43,1,0,1,1,0,0,0,0,0
3,LNS OSS,Jennifer Quek,0:45,00:00:45,1,0,2,1,0,0,0,0,0
4,Morning oss,Goh Hui Ling Veron,0:46,00:00:46,1,0,2,0,0,0,0,0,0
5,Good morning Miko and Kelvin,Angela Tay,0:47,00:00:47,1,0,5,0,0,0,0,0,0
6,Good morning miko and OSS team,Jennie Gan,0:49,00:00:49,1,0,6,0,0,0,0,0,0
7,Oss,Angela Tay,0:53,00:00:53,1,0,1,0,0,0,0,0,0
8,OSS,Jennie Gan,0:53,00:00:53,1,0,1,0,0,0,0,0,0
9,yes i wake up le,Clarice Goh,0:55,00:00:55,1,0,5,0,0,0,0,0,0


In [78]:
df['productPrice'].unique()

array([0, '$168.00  KS168+1', '$ 138.00  BTW138+1', '$39.90  WG399+1',
       '$29.90  GP299+1', '$29.90  RS299+1', '$39.90  AKL+1 ',
       '$108.00  STW+1', '$29.90  CP299+1', '$6.90  WB69+1',
       '$14.00  CP14+1', '$9.90  FG+1', '$8.00  KFC+1', '$ 18.80  SCP+1 ',
       '$11.11 ( PWP)  MBT11+1', '$23.90  OLP239+1 ',
       '$11.11 ( PWP)  WAK+1 ', '$21.90  SQ+1 ', '$ 5.50  KSH+1',
       '$16.00  KSB+1', '$9.90  MJ+1', '$59.90  FCF+1', '$ 5.90  KBP+1',
       '$13.90  BST+1', '$29.90  RGSS+1', '$21.90  BSS+1',
       '$9.90  AKLH+1', '$23.90  SL239+1 ', '$11.90  KN119+1 ',
       '$13.00  WP13+1', '$15.90  BP+1', '$36.00  STF+1 ',
       '$24.00  LP24+1 ', '$19.90  TP199+1 ', '$9.90  CS99+1'],
      dtype=object)

Excluding the comments where there was no product codes as adviced by the seller, we find the number of unique products offered by the seller.

In [79]:
#number of unique products offered by the seller
int(df['productPrice'].nunique()) - int(1)

34

In [80]:
#total number of products offered
va['numProducts'] = int(df['productPrice'].nunique()) - int(1)

In [81]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts
0,OCEANSTARLIVE/videos/2174138006061999,27,1200,4413,97,1972,9,105,34


**Drop irrelevant columns**

The following column was dropped for the following reasons:

2. 'notSeller'
- This column 'notSeller' was solely created to calculate the quantity of sales made, and the products purchased by the customer. Hence, we are able to delete it after the quantity of sales have been calculated and the identification of the products purchased by the customers have been identified.
- Notwithstanding the above, a new column 'isSeller' has been feature engineered out as well, which will tell us the same results on whether the comment is posted & written by a seller or not. 

3. 'sales'
- This column identifies the quantity of products to the unique & specific product codes. However, it includes the product codes posted by the sellers as well. Hence, this column was solely created to be multiplied against the column 'notSeller' to calculate the true quantity of sales made by customers only. Hence, we are able to delete it after the quantity of sales have been calculated.

4. 'product'
- This column 'product' was solely created to identify the products purchased by the customers. Hence, we are able to delete it after the products purchased by the customers have been identified - especially since not all products offered by the seller is being bought by the customers.

In [82]:
#drop unwanted columns
df.drop(['postCommentTime', 'notSeller', 'sales', 'product'], axis=1, inplace=True)

In [83]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought,productPrice
0,Morning OSS,Jennifer Quek,00:00:34,0,2,0,0,0,0
1,hello miko,Ting Wen,00:00:36,0,2,0,0,0,0
2,LNS,Jennie Gan,00:00:43,0,1,1,0,0,0
3,LNS OSS,Jennifer Quek,00:00:45,0,2,1,0,0,0
4,Morning oss,Goh Hui Ling Veron,00:00:46,0,2,0,0,0,0


### Revenue from the sale of the products

**Dummify the products bought to find the revenue**

In [84]:
#getdummies the products bought
df = pd.get_dummies(df, columns = ['productBought'], drop_first = True)

In [85]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_OLP239,productBought_PR,productBought_RG239,productBought_RGF,productBought_RGSS,productBought_SP,productBought_SQ,productBought_WAK,productBought_WB69,productBought_WP13
0,Morning OSS,Jennifer Quek,00:00:34,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,hello miko,Ting Wen,00:00:36,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,LNS,Jennie Gan,00:00:43,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,LNS OSS,Jennifer Quek,00:00:45,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Morning oss,Goh Hui Ling Veron,00:00:46,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
# iterating the columns
for col in df.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_AKLH
productBought_AKLT
productBought_BMC
productBought_BSS
productBought_BST
productBought_CP14
productBought_CP299
productBought_CS
productBought_ES99
productBought_FCF
productBought_FG
productBought_FJ
productBought_KBP
productBought_KFC
productBought_KN119
productBought_KSB
productBought_KSF
productBought_KSH
productBought_MBT11
productBought_MJ
productBought_MP
productBought_OLP239
productBought_PR
productBought_RG239
productBought_RGF
productBought_RGSS
productBought_SP
productBought_SQ
productBought_WAK
productBought_WB69
productBought_WP13


After the column 'productBought' has been dummified, the cells will return a '1' if that particular item is bought, and a '0' if it is not. Hence, we will replace all the '1' with the price of the product.

Then, we will create a new revenue column which is a multiplication of the column 'salesQuantity' against the price of the product (i.e. the dummified 'productBought' columnns).

Product AKLH

In [87]:
df[df['postComment'].str.contains('AKLH', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_OLP239,productBought_PR,productBought_RG239,productBought_RGF,productBought_RGSS,productBought_SP,productBought_SQ,productBought_WAK,productBought_WB69,productBought_WP13
241,ANG KO LI HEAD 1.5KG+-/ PCS @$9.90 Comment AKLH+1 below to join the Sale,OceanStar Seafood,01:00:00,1,14,0,0,$9.90 AKLH+1,0,0,...,0,0,0,0,0,0,0,0,0,0
268,AKLH+1,Wendy Chia,01:07:49,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
df['productBought_AKLH'] = df['productBought_AKLH'].map(lambda x:float(9.90) if x == int(1) else 0)

In [89]:
df['revenue_AKLH'] = np.multiply(df['productBought_AKLH'], df['salesQuantity'])

In [90]:
revenue_AKLH = "The total revenue from the sale of the product {} is ${}". format ("AKLH", format(df['revenue_AKLH'].sum(), '.2f'))
print(revenue_AKLH)


The total revenue from the sale of the product AKLH is $9.90


Product AKLT

In [91]:
df[df['postComment'].str.contains('AKLT', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_PR,productBought_RG239,productBought_RGF,productBought_RGSS,productBought_SP,productBought_SQ,productBought_WAK,productBought_WB69,productBought_WP13,revenue_AKLH
232,ANG KO LI TAIL 1.1-1.2 KG/ PCS @ $19.90 Comment AKLT+1 below to join the Sale,OceanStar Seafood,00:57:31,1,16,0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [92]:
df['productBought_AKLT'] = df['productBought_AKLT'].map(lambda x:float(19.90) if x == int(1) else 0)

In [93]:
df['revenue_AKLT'] = np.multiply(df['productBought_AKLT'], df['salesQuantity'])

In [94]:
revenue_AKLT = "The total revenue from the sale of the product {} is ${}". format ("AKLT", format(df['revenue_AKLT'].sum(), '.2f'))
print(revenue_AKLT)

The total revenue from the sale of the product AKLT is $19.90


Product BMC

In [95]:
df[df['postComment'].str.contains('BMC', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_RG239,productBought_RGF,productBought_RGSS,productBought_SP,productBought_SQ,productBought_WAK,productBought_WB69,productBought_WP13,revenue_AKLH,revenue_AKLT
231,BRAISED MEI CAI / PORTION @ $ 8.90 Comment BMC+1 below to join the Sale,OceanStar Seafood,00:57:20,1,15,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
233,BMC+1,Pearlyn Chua,00:57:31,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [96]:
df['productBought_BMC'] = df['productBought_BMC'].map(lambda x:float(8.90) if x == int(1) else 0)

In [97]:
df['revenue_BMC'] = np.multiply(df['productBought_BMC'], df['salesQuantity'])

In [98]:
revenue_BMC = "The total revenue from the sale of the product {} is ${}". format ("BMC", format(df['revenue_BMC'].sum(), '.2f'))
print(revenue_BMC)


The total revenue from the sale of the product BMC is $8.90


Product BSS

In [99]:
df[df['postComment'].str.contains('BSS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_RGF,productBought_RGSS,productBought_SP,productBought_SQ,productBought_WAK,productBought_WB69,productBought_WP13,revenue_AKLH,revenue_AKLT,revenue_BMC
219,BSS+1,Sian Foong Lim,00:55:12,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
229,2 X BATANG SOUP SLICED 300G/ PKT @$21.90 Comment BSS+1 below to join the Sale,OceanStar Seafood,00:57:07,1,15,0,0,$21.90 BSS+1,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [100]:
df['productBought_BSS'] = df['productBought_BSS'].map(lambda x:float(21.90) if x == int(1) else 0)

In [101]:
df['revenue_BSS'] = np.multiply(df['productBought_BSS'], df['salesQuantity'])

In [102]:
revenue_BSS = "The total revenue from the sale of the product {} is ${}". format ("BSS", format(df['revenue_BSS'].sum(), '.2f'))
print(revenue_BSS)


The total revenue from the sale of the product BSS is $21.90


Product BST

In [103]:
df[df['postComment'].str.contains('BST', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_RGSS,productBought_SP,productBought_SQ,productBought_WAK,productBought_WB69,productBought_WP13,revenue_AKLH,revenue_AKLT,revenue_BMC,revenue_BSS
220,LAST 6 SET!!! 2 X BATANG STEAK 250-350G/ PKT @$13.90 Comment BST+1 below to join the Sale,OceanStar Seafood,00:55:20,1,17,0,0,$13.90 BST+1,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
222,BST+1,Adeline Neo,00:55:43,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
224,BST+1,Jasmine Gn,00:56:00,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
242,SLIPPER LOBSTER 6-8 PCS/ KG@$23.90 Comment SL239+1 below to join the Sale.,OceanStar Seafood,01:00:14,1,12,0,0,$23.90 SL239+1,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [104]:
df['productBought_BST'] = df['productBought_BST'].map(lambda x:float(13.90) if x == int(1) else 0)

In [105]:
df['revenue_BST'] = np.multiply(df['productBought_BST'], df['salesQuantity'])

In [106]:
revenue_BST = "The total revenue from the sale of the product {} is ${}". format ("BST", format(df['revenue_BST'].sum(), '.2f'))
print(revenue_BST)


The total revenue from the sale of the product BST is $55.60


Product CP14

In [107]:
df[df['postComment'].str.contains('CP14', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_SP,productBought_SQ,productBought_WAK,productBought_WB69,productBought_WP13,revenue_AKLH,revenue_AKLT,revenue_BMC,revenue_BSS,revenue_BST
62,2 X PONTIAN CHINESE POMFRET 250-400G/ PCS @$14.00 Comment CP14+1 below to join the Sale,OceanStar Seafood,00:11:47,1,15,0,0,$14.00 CP14+1,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
64,CP14+1,Anne Ng,00:12:15,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
70,CP14+1,Iris Teoh,00:13:12,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
117,2 X CHINESE POMFRET 250-400G/ PCS @$14.00 Comment CP14+1 below to join the Sale,OceanStar Seafood,00:24:57,1,14,0,0,$14.00 CP14+1,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
131,CP14+1,Yvonne Chia,00:27:28,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [108]:
df['productBought_CP14'] = df['productBought_CP14'].map(lambda x:float(14.00) if x == int(1) else 0)

In [109]:
df['revenue_CP14'] = np.multiply(df['productBought_CP14'], df['salesQuantity'])

In [110]:
revenue_CP14 = "The total revenue from the sale of the product {} is ${}". format ("CP14", format(df['revenue_CP14'].sum(), '.2f'))
print(revenue_CP14)


The total revenue from the sale of the product CP14 is $56.00


Product CP299

In [111]:
df[df['postComment'].str.contains('CP299', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_SQ,productBought_WAK,productBought_WB69,productBought_WP13,revenue_AKLH,revenue_AKLT,revenue_BMC,revenue_BSS,revenue_BST,revenue_CP14
46,LAST 3 PCS!!! PONTIAN CHINESE POMFRET 800-900G/ PCS @$29.90 Comment CP299+1 below to join the Sale,OceanStar Seafood,00:07:06,1,16,0,0,$29.90 CP299+1,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
53,CP299+1,Ng Fui Yong,00:09:20,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
59,LAST 2 PCS!! CHINESE POMFRET 800-900G/ PCS @$29.90 Comment CP299+1 below to join the Sale,OceanStar Seafood,00:10:10,1,15,0,0,$29.90 CP299+1,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
100,LAST 2 PCS!!! CHINESE POMFRET 800-900G/ PCS @$29.90 Comment CP299+1 below to join the Sale,OceanStar Seafood,00:21:06,1,15,0,0,$29.90 CP299+1,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
262,LAST 2 PCS!!! CHINESE POMFRET 800-900G/ PCS @$29.90 Comment CP299+1 below to join the Sale,OceanStar Seafood,01:05:53,1,15,0,0,$29.90 CP299+1,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
265,CHINESE POMFRET 800-900G/ PCS @$29.90 Comment CP299+1 below to join the Sale,OceanStar Seafood,01:06:22,1,12,0,0,$29.90 CP299+1,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
266,CP299+2,Wendy Chia,01:06:48,0,1,0,2,0,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
df['productBought_CP299'] = df['productBought_CP299'].map(lambda x:float(29.90) if x == int(1) else 0)

In [113]:
df['revenue_CP299'] = np.multiply(df['productBought_CP299'], df['salesQuantity'])

In [114]:
revenue_CP299 = "The total revenue from the sale of the product {} is ${}". format ("CP299", format(df['revenue_CP299'].sum(), '.2f'))
print(revenue_CP299)


The total revenue from the sale of the product CP299 is $89.70


Product CS

In [115]:
df[df['postComment'].str.contains('CS+', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_WAK,productBought_WB69,productBought_WP13,revenue_AKLH,revenue_AKLT,revenue_BMC,revenue_BSS,revenue_BST,revenue_CP14,revenue_CP299
93,2 X CLAM SOUP 500G/ PKT @ $12.90 Comment CS+1 below to join the Sale,OceanStar Seafood,00:17:10,1,15,0,0,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,2 X CLAM SOUP 500G/ PKT @ $12.90 Comment CS+1 below to join the Sale,OceanStar Seafood,00:17:51,1,15,0,0,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
282,CHICKEN SATAY 20 PCS+ SAUCE / PKT @$9.90 Comment CS99+1 below to join the Sale,OceanStar Seafood,01:11:11,1,15,0,0,$9.90 CS99+1,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
288,2 X CLAM SOUP 500G/ PKT @ $12.90 Comment CS+1 below to join the Sale,OceanStar Seafood,01:12:59,1,15,0,0,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289,CS+1,Pearlyn Chua,01:13:33,0,1,0,1,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
df['productBought_CS'] = df['productBought_CS'].map(lambda x:float(12.90) if x == int(1) else 0)

In [117]:
df['revenue_CS'] = np.multiply(df['productBought_CS'], df['salesQuantity'])

In [118]:
revenue_CS = "The total revenue from the sale of the product {} is ${}". format ("CS", format(df['revenue_CS'].sum(), '.2f'))
print(revenue_CS)


The total revenue from the sale of the product CS is $12.90


Product ES99

In [119]:
df[df['postComment'].str.contains('ES99', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_WB69,productBought_WP13,revenue_AKLH,revenue_AKLT,revenue_BMC,revenue_BSS,revenue_BST,revenue_CP14,revenue_CP299,revenue_CS


Product FCF

In [120]:
df[df['postComment'].str.contains('FCF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_WB69,productBought_WP13,revenue_AKLH,revenue_AKLT,revenue_BMC,revenue_BSS,revenue_BST,revenue_CP14,revenue_CP299,revenue_CS
208,2 X FRANCE COD FL 400-500G/ PKT @$59.90 Comment FCF+1 below to join the Sale,OceanStar Seafood,00:51:16,1,15,0,0,$59.90 FCF+1,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
210,FCF+1,Jolynn Toh,00:52:23,0,1,0,1,0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
df['productBought_FCF'] = df['productBought_FCF'].map(lambda x:float(59.90) if x == int(1) else 0)

In [122]:
df['revenue_FCF'] = np.multiply(df['productBought_FCF'], df['salesQuantity'])

In [123]:
revenue_FCF = "The total revenue from the sale of the product {} is ${}". format ("FCF", format(df['revenue_FCF'].sum(), '.2f'))
print(revenue_FCF)


The total revenue from the sale of the product FCF is $59.90


Product FG

In [124]:
df[df['postComment'].str.contains('FG', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,productBought_WP13,revenue_AKLH,revenue_AKLT,revenue_BMC,revenue_BSS,revenue_BST,revenue_CP14,revenue_CP299,revenue_CS,revenue_FCF
72,2 X FLOWER GROUPER 300-400G/ PCS @$9.90 Comment FG+1 below to join the Sale,OceanStar Seafood,00:13:26,1,14,0,0,$9.90 FG+1,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77,FG+1,Anne Ng,00:14:25,0,1,0,1,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,FG+1,Hear Their Voices,00:14:31,0,1,0,1,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
121,2 X FLOWER GROUPER 300-400G/ PCS @$9.90 Comment FG+1 below to join the Sale,OceanStar Seafood,00:25:47,1,14,0,0,$9.90 FG+1,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [125]:
df['productBought_FG'] = df['productBought_FG'].map(lambda x:float(9.90) if x == int(1) else 0)

In [126]:
df['revenue_FG'] = np.multiply(df['productBought_FG'], df['salesQuantity'])

In [127]:
revenue_FG = "The total revenue from the sale of the product {} is ${}". format ("FG", format(df['revenue_FG'].sum(), '.2f'))
print(revenue_FG)


The total revenue from the sale of the product FG is $29.70


Product FJ

In [128]:
df[df['postComment'].str.contains('FJ', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_AKLH,revenue_AKLT,revenue_BMC,revenue_BSS,revenue_BST,revenue_CP14,revenue_CP299,revenue_CS,revenue_FCF,revenue_FG
197,FJ+1,Esther Loo,00:48:07,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [129]:
df[df['postComment'].str.contains('fj', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_AKLH,revenue_AKLT,revenue_BMC,revenue_BSS,revenue_BST,revenue_CP14,revenue_CP299,revenue_CS,revenue_FCF,revenue_FG


Product KBP

In [130]:
df[df['postComment'].str.contains('KBP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_AKLH,revenue_AKLT,revenue_BMC,revenue_BSS,revenue_BST,revenue_CP14,revenue_CP299,revenue_CS,revenue_FCF,revenue_FG
206,KBP+1,Yvonne Chia,00:50:39,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
211,KONG BAK PAO 4 PCS / PKT @$ 5.90 Comment KBP+1 below to join the Sale,OceanStar Seafood,00:52:25,1,16,0,0,$ 5.90 KBP+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
212,KBP+1,Adeline Neo,00:52:52,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214,KBP+1,Jolynn Toh,00:53:18,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
269,KBP+2,Wendy Chia,01:08:07,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [131]:
df['productBought_KBP'] = df['productBought_KBP'].map(lambda x:float(5.90) if x == int(1) else 0)

In [132]:
df['revenue_KBP'] = np.multiply(df['productBought_KBP'], df['salesQuantity'])

In [133]:
revenue_KBP = "The total revenue from the sale of the product {} is ${}". format ("KBP", format(df['revenue_KBP'].sum(), '.2f'))
print(revenue_KBP)


The total revenue from the sale of the product KBP is $29.50


Product KFC

In [134]:
df[df['postComment'].str.contains('KFC', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_AKLT,revenue_BMC,revenue_BSS,revenue_BST,revenue_CP14,revenue_CP299,revenue_CS,revenue_FCF,revenue_FG,revenue_KBP
85,2 X KOREAN FLOWER CLAM 500G/ PKT @$8.00 Comment KFC+1 below to join the Sale,OceanStar Seafood,00:15:28,1,15,0,0,$8.00 KFC+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,2 X KOREAN FLOWER CLAM 500G/ PKT @$8.00 Comment KFC+1 below to join the Sale,OceanStar Seafood,00:15:34,1,15,0,0,$8.00 KFC+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89,KFC+1,Anne Ng,00:16:29,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142,KFC+1,Yvonne Chia,00:29:28,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
df['productBought_KFC'] = df['productBought_KFC'].map(lambda x:float(8.00) if x == int(1) else 0)

In [136]:
df['revenue_KFC'] = np.multiply(df['productBought_KFC'], df['salesQuantity'])

In [137]:
revenue_KFC = "The total revenue from the sale of the product {} is ${}". format ("KFC", format(df['revenue_KFC'].sum(), '.2f'))
print(revenue_KFC)


The total revenue from the sale of the product KFC is $24.00


Product KN119

In [138]:
df[df['postComment'].str.contains('KN119', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_BMC,revenue_BSS,revenue_BST,revenue_CP14,revenue_CP299,revenue_CS,revenue_FCF,revenue_FG,revenue_KBP,revenue_KFC
201,KN119+1,Adeline Neo,00:48:33,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
246,2 X KUNNING 500G/ PKT @$11.90 Comment KN119+1 below to join the Sale.,OceanStar Seafood,01:01:46,1,13,0,0,$11.90 KN119+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [139]:
df['productBought_KN119'] = df['productBought_KN119'].map(lambda x:float(11.90) if x == int(1) else 0)

In [140]:
df['revenue_KN119'] = np.multiply(df['productBought_KN119'], df['salesQuantity'])

In [141]:
revenue_KN119 = "The total revenue from the sale of the product {} is ${}". format ("KN119", format(df['revenue_KN119'].sum(), '.2f'))
print(revenue_KN119)


The total revenue from the sale of the product KN119 is $23.80


Product KSB

In [142]:
df[df['postComment'].str.contains('KSB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_BSS,revenue_BST,revenue_CP14,revenue_CP299,revenue_CS,revenue_FCF,revenue_FG,revenue_KBP,revenue_KFC,revenue_KN119
179,KING SALMON BELLY 2 PCS / PKT @$16.00 Comment KSB+1 below to join the Sale,OceanStar Seafood,00:41:11,1,15,0,0,$16.00 KSB+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,KSB+1,Joy Mah,00:42:25,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
187,LAST PKT!!! KING SALMON BELLY 2 PCS / PKT @$16.00 Comment KSB+1 below to join the Sale,OceanStar Seafood,00:43:06,1,17,0,0,$16.00 KSB+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [143]:
df['productBought_KSB'] = df['productBought_KSB'].map(lambda x:float(16.00) if x == int(1) else 0)

In [144]:
df['revenue_KSB'] = np.multiply(df['productBought_KSB'], df['salesQuantity'])

In [145]:
revenue_KSB = "The total revenue from the sale of the product {} is ${}". format ("KSB", format(df['revenue_KSB'].sum(), '.2f'))
print(revenue_KSB)


The total revenue from the sale of the product KSB is $48.00


Product KSF

In [146]:
df[df['postComment'].str.contains('KSF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_BST,revenue_CP14,revenue_CP299,revenue_CS,revenue_FCF,revenue_FG,revenue_KBP,revenue_KFC,revenue_KN119,revenue_KSB
165,2 X KING SALMON FILLET 300-350G / PCS @ $39.90 Comment KSF+1 below to join the Sale.,OceanStar Seafood,00:36:29,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
167,KSF+1,Pearlyn Chua,00:36:58,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
169,KSF+1,Ng Fui Yong,00:37:19,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171,KSF+1,Anne Ng,00:37:47,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,LAST 3 SET!!! 2 X KING SALMON FILLET 300-350G / PCS @ $39.90 Comment KSF+1 below to join the Sale.,OceanStar Seafood,00:38:50,1,20,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178,KSF+1,June Ng,00:40:10,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
181,KSF+1,Cindy Ong,00:41:53,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [147]:
df['productBought_KSF'] = df['productBought_KSF'].map(lambda x:float(39.90) if x == int(1) else 0)

In [148]:
df['revenue_KSF'] = np.multiply(df['productBought_KSF'], df['salesQuantity'])

In [149]:
revenue_KSF = "The total revenue from the sale of the product {} is ${}". format ("KSF", format(df['revenue_KSF'].sum(), '.2f'))
print(revenue_KSF)


The total revenue from the sale of the product KSF is $319.20


Product KSH

In [150]:
df[df['postComment'].str.contains('KSH', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_CP14,revenue_CP299,revenue_CS,revenue_FCF,revenue_FG,revenue_KBP,revenue_KFC,revenue_KN119,revenue_KSB,revenue_KSF
174,KING SALMON HEAD / PCS @$ 5.50 Comment KSH+1 below to join the Sale,OceanStar Seafood,00:39:04,1,14,0,0,$ 5.50 KSH+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
175,LAST 2 PCS!!! KING SALMON HEAD / PCS @$ 5.50 Comment KSH+1 below to join the Sale,OceanStar Seafood,00:39:15,1,17,0,0,$ 5.50 KSH+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [151]:
df['productBought_KSH'] = df['productBought_KSH'].map(lambda x:float(5.50) if x == int(1) else 0)

In [152]:
df['revenue_KSH'] = np.multiply(df['productBought_KSH'], df['salesQuantity'])

In [153]:
revenue_KSH = "The total revenue from the sale of the product {} is ${}". format ("KSH", format(df['revenue_KSH'].sum(), '.2f'))
print(revenue_KSH)


The total revenue from the sale of the product KSH is $16.50


Product MBT11

In [154]:
df[df['postComment'].str.contains('MBT11', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_CP299,revenue_CS,revenue_FCF,revenue_FG,revenue_KBP,revenue_KFC,revenue_KN119,revenue_KSB,revenue_KSF,revenue_KSH
122,MINI BALAI THREADFIN 800-1.1KG/ PCS @$11.11 ( PWP) Comment MBT11+1 below to join the Sale,OceanStar Seafood,00:26:04,1,15,0,0,$11.11 ( PWP) MBT11+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
125,SUPER DEAL!!! MINI BALAI THREADFIN 800-1.1KG/ PCS @$11.11 ( PWP) Comment MBT11+1 below to join the Sale,OceanStar Seafood,00:26:53,1,17,0,0,$11.11 ( PWP) MBT11+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
128,MBT11+1,Ng Fui Yong,00:27:04,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
132,MINI BALAI THREADFIN 800-1.1KG/ PCS @$11.11 ( PWP) Comment MBT11+1 below to join the Sale,OceanStar Seafood,00:27:30,1,15,0,0,$11.11 ( PWP) MBT11+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
134,MBT11+2,Yvonne Chia,00:28:12,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135,MINI BALAI THREADFIN 800-1.1KG/ PCS @$11.11 ( PWP) Comment MBT11+1 below to join the Sale,OceanStar Seafood,00:28:32,1,15,0,0,$11.11 ( PWP) MBT11+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
209,MUST GRAB!!! MINI BALAI THREADFIN 800-1.1KG/ PCS @$11.11 ( PWP) Comment MBT11+1 below to join the Sale,OceanStar Seafood,00:52:15,1,17,0,0,$11.11 ( PWP) MBT11+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
215,MUST GRAB!!! MINI BALAI THREADFIN 800-1.1KG/ PCS @$11.11 ( PWP) Comment MBT11+1 below to join the Sale,OceanStar Seafood,00:54:08,1,17,0,0,$11.11 ( PWP) MBT11+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
256,MINI BALAI THREADFIN 800-1.1KG/ PCS @$11.11 ( PWP) Comment MBT11+1 below to join the Sale,OceanStar Seafood,01:04:18,1,15,0,0,$11.11 ( PWP) MBT11+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
259,MBT11+1,Hear Their Voices,01:05:00,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [155]:
df['productBought_MBT11'] = df['productBought_MBT11'].map(lambda x:float(11.11) if x == int(1) else 0)

In [156]:
df['revenue_MBT11'] = np.multiply(df['productBought_MBT11'], df['salesQuantity'])

In [157]:
revenue_MBT11 = "The total revenue from the sale of the product {} is ${}". format ("MBT11", format(df['revenue_MBT11'].sum(), '.2f'))
print(revenue_MBT11)


The total revenue from the sale of the product MBT11 is $122.21


Product MJ

In [158]:
df[df['postComment'].str.contains('MJ', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_CS,revenue_FCF,revenue_FG,revenue_KBP,revenue_KFC,revenue_KN119,revenue_KSB,revenue_KSF,revenue_KSH,revenue_MBT11
190,2 X IQF CHICKEN MID JOINT 450G+-/ P@$9.90 Comment MJ+1 below to join the Sale,OceanStar Seafood,00:45:08,1,15,0,0,$9.90 MJ+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
195,MJ+1,Yvonne Chia,00:47:58,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196,MJ+1,Iris Teoh,00:48:02,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199,2 X IQF CHICKEN MID JOINT 450G+-/ P@$9.90 Comment MJ+1 below to join the Sale,OceanStar Seafood,00:48:20,1,15,0,0,$9.90 MJ+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200,MJ+1,Esther Loo,00:48:29,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [159]:
df['productBought_MJ'] = df['productBought_MJ'].map(lambda x:float(9.90) if x == int(1) else 0)

In [160]:
df['revenue_MJ'] = np.multiply(df['productBought_MJ'], df['salesQuantity'])

In [161]:
revenue_MJ = "The total revenue from the sale of the product {} is ${}". format ("MJ", format(df['revenue_MJ'].sum(), '.2f'))
print(revenue_MJ)


The total revenue from the sale of the product MJ is $39.60


Product MP

In [162]:
df[df['postComment'].str.contains('MP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_FCF,revenue_FG,revenue_KBP,revenue_KFC,revenue_KN119,revenue_KSB,revenue_KSF,revenue_KSH,revenue_MBT11,revenue_MJ
24,SAMPAN CAUGHT QING YI 2.2-2.5 KG/ PCS @$29.90 Comment GP299+1 below to join the Sale,OceanStar Seafood,00:02:14,1,15,0,0,$29.90 GP299+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,ORGANIC COOKED SHRIMP 31-40PCS/ PKT @$ 18.80 Comment SCP+1 below to join the Sale.,OceanStar Seafood,00:19:27,1,14,0,0,$ 18.80 SCP+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,ORGANIC COOKED SHRIMP 31-40PCS/ PKT @$ 18.80 Comment SCP+1 below to join the Sale.,OceanStar Seafood,00:21:11,1,14,0,0,$ 18.80 SCP+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
186,MP+2,Esther Loo,00:43:04,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216,SAMPAN CAUGHT QING YI 2.2-2.5 KG/ PCS @$29.90 Comment GP299+1 below to join the Sale,OceanStar Seafood,00:54:26,1,15,0,0,$29.90 GP299+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Product OLP239

In [163]:
df[df['postComment'].str.contains('OLP239', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_FCF,revenue_FG,revenue_KBP,revenue_KFC,revenue_KN119,revenue_KSB,revenue_KSF,revenue_KSH,revenue_MBT11,revenue_MJ
133,2 X ORGANIC LIVE PRAWN 500G/ PORTION @$23.90 Comment OLP239+1 below to join the Sale.,OceanStar Seafood,00:27:50,1,15,0,0,$23.90 OLP239+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139,2 X ORGANIC LIVE PRAWN 500G/ PORTION @$23.90 Comment OLP239+1 below to join the Sale.,OceanStar Seafood,00:29:13,1,15,0,0,$23.90 OLP239+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
141,OLP239+1,Esther Loo,00:29:27,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
280,2 X ORGANIC LIVE PRAWN 500G/ PORTION @$23.90 Comment OLP239+1 below to join the Sale.,OceanStar Seafood,01:10:37,1,15,0,0,$23.90 OLP239+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [164]:
df['productBought_OLP239'] = df['productBought_OLP239'].map(lambda x:float(23.90) if x == int(1) else 0)

In [165]:
df['revenue_OLP239'] = np.multiply(df['productBought_OLP239'], df['salesQuantity'])

In [166]:
revenue_OLP239 = "The total revenue from the sale of the product {} is ${}". format ("OLP239", format(df['revenue_OLP239'].sum(), '.2f'))
print(revenue_OLP239)


The total revenue from the sale of the product OLP239 is $23.90


Product PR

In [167]:
df[df['postComment'].str.contains('PR', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_FG,revenue_KBP,revenue_KFC,revenue_KN119,revenue_KSB,revenue_KSF,revenue_KSH,revenue_MBT11,revenue_MJ,revenue_OLP239
133,2 X ORGANIC LIVE PRAWN 500G/ PORTION @$23.90 Comment OLP239+1 below to join the Sale.,OceanStar Seafood,00:27:50,1,15,0,0,$23.90 OLP239+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
139,2 X ORGANIC LIVE PRAWN 500G/ PORTION @$23.90 Comment OLP239+1 below to join the Sale.,OceanStar Seafood,00:29:13,1,15,0,0,$23.90 OLP239+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
189,3 X IQF PRIME RIBS 500G+- / P @ $21.90 Comment PR+1 below to join the Sale,OceanStar Seafood,00:44:50,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191,PR+1,Esther Loo,00:46:09,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,PR+1,Adeline Neo,00:48:16,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
205,2 X WILD CAUGHT BLUE TAIL PRAWN 500G/ PORTION @ $18.80 Comment BTP188+1 below to join the Sale.,OceanStar Seafood,00:50:34,1,18,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277,2 X LIVE PRAWN 500G/ PORTION @$24.00 Comment LP24+1 below to join the Sale.,OceanStar Seafood,01:10:16,1,14,0,0,$24.00 LP24+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
279,2 X TIGER PRAWN 500G/ PORTION @$19.90 Comment TP199+1 below to join the Sale.,OceanStar Seafood,01:10:24,1,14,0,0,$19.90 TP199+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
280,2 X ORGANIC LIVE PRAWN 500G/ PORTION @$23.90 Comment OLP239+1 below to join the Sale.,OceanStar Seafood,01:10:37,1,15,0,0,$23.90 OLP239+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
284,3 X IQF PRIME RIBS 500G+- / P @ $21.90 Comment PR+1 below to join the Sale,OceanStar Seafood,01:11:59,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
df['productBought_PR'] = df['productBought_PR'].map(lambda x:float(21.90) if x == int(1) else 0)

In [169]:
df['revenue_PR'] = np.multiply(df['productBought_PR'], df['salesQuantity'])

In [170]:
revenue_PR = "The total revenue from the sale of the product {} is ${}". format ("PR", format(df['revenue_PR'].sum(), '.2f'))
print(revenue_PR)


The total revenue from the sale of the product PR is $43.80


Product RG239

In [171]:
df[df['postComment'].str.contains('RG239', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_KBP,revenue_KFC,revenue_KN119,revenue_KSB,revenue_KSF,revenue_KSH,revenue_MBT11,revenue_MJ,revenue_OLP239,revenue_PR
32,MUST GRAB!!! RED GROUPER WHOLE 1.1-1.3KG / PCS @ $23.90 Comment RG239+1 below to join the Sale,OceanStar Seafood,00:02:57,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,RG239+1,Anne Ng,00:04:08,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39,MUST GRAB!!! RED GROUPER WHOLE 1.1-1.3KG / PCS @ $23.90 Comment RG239+1 below to join the Sale,OceanStar Seafood,00:05:24,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,RG239+1,Joy Mah,00:06:01,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,MUST GRAB!!! RED GROUPER WHOLE 1.1-1.3KG / PCS @ $23.90 Comment RG239+1 below to join the Sale,OceanStar Seafood,00:06:11,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55,MUST GRAB!! RED GROUPER WHOLE 1.1-1.3KG / PCS @ $23.90 Comment RG239+1 below to join the Sale,OceanStar Seafood,00:09:41,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,RED GROUPER WHOLE 1.1-1.3KG / PCS @ $23.90 Comment RG239+1 below to join the Sale,OceanStar Seafood,00:12:23,1,15,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,MUST GRAB!!!! RED GROUPER WHOLE 1.1-1.3KG / PCS @ $23.90 Comment RG239+1 below to join the Sale,OceanStar Seafood,00:14:36,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87,RG239+1,June Ng,00:15:56,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,RED GROUPER WHOLE 1.1-1.3KG / PCS @ $23.90 Comment RG239+1 below to join the Sale,OceanStar Seafood,00:21:38,1,15,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
df['productBought_RG239'] = df['productBought_RG239'].map(lambda x:float(23.90) if x == int(1) else 0)

In [173]:
df['revenue_RG239'] = np.multiply(df['productBought_RG239'], df['salesQuantity'])

In [174]:
revenue_RG239 = "The total revenue from the sale of the product {} is ${}". format ("RG239", format(df['revenue_RG239'].sum(), '.2f'))
print(revenue_RG239)


The total revenue from the sale of the product RG239 is $334.60


Product RGF

In [175]:
df[df['postComment'].str.contains('RGF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_KFC,revenue_KN119,revenue_KSB,revenue_KSF,revenue_KSH,revenue_MBT11,revenue_MJ,revenue_OLP239,revenue_PR,revenue_RG239
213,RGF,Winnie Wu,00:53:18,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
275,2 X RED GROUPER FILLET 300-350G / PCS @ $29.90 Comment RGF+1 below to join the Sale,OceanStar Seafood,01:09:07,1,17,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [176]:
df['productBought_RGF'] = df['productBought_RGF'].map(lambda x:float(29.90) if x == int(1) else 0)

In [177]:
df['revenue_RGF'] = np.multiply(df['productBought_RGF'], df['salesQuantity'])

In [178]:
revenue_RGF = "The total revenue from the sale of the product {} is ${}". format ("RGF", format(df['revenue_RGF'].sum(), '.2f'))
print(revenue_RGF)


The total revenue from the sale of the product RGF is $29.90


Product RGSS

In [179]:
df[df['postComment'].str.contains('RGSS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_KN119,revenue_KSB,revenue_KSF,revenue_KSH,revenue_MBT11,revenue_MJ,revenue_OLP239,revenue_PR,revenue_RG239,revenue_RGF
227,2 X RED GROUPER SOUP SLICED 300G/ PKT @$29.90 Comment RGSS+1 below to join the Sale,OceanStar Seafood,00:56:52,1,16,0,0,$29.90 RGSS+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239,RGSS+1,June Ng,00:59:29,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [180]:
df['productBought_RGSS'] = df['productBought_RGSS'].map(lambda x:float(29.90) if x == int(1) else 0)

In [181]:
df['revenue_RGSS'] = np.multiply(df['productBought_RGSS'], df['salesQuantity'])

In [182]:
revenue_RGSS = "The total revenue from the sale of the product {} is ${}". format ("RGSS", format(df['revenue_RGSS'].sum(), '.2f'))
print(revenue_RGSS)


The total revenue from the sale of the product RGSS is $59.80


Product SP

In [183]:
df[df['postComment'].str.contains('SP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_KSB,revenue_KSF,revenue_KSH,revenue_MBT11,revenue_MJ,revenue_OLP239,revenue_PR,revenue_RG239,revenue_RGF,revenue_RGSS
252,2 X SILVER POMFRET 350-450G/PCS@ $9.90 Comment SP+1 below to join the Sale,OceanStar Seafood,01:03:38,1,13,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [184]:
df['productBought_SP'] = df['productBought_SP'].map(lambda x:float(9.90) if x == int(1) else 0)

In [185]:
df['revenue_SP'] = np.multiply(df['productBought_SP'], df['salesQuantity'])

In [186]:
revenue_SP = "The total revenue from the sale of the product {} is ${}". format ("SP", format(df['revenue_SP'].sum(), '.2f'))
print(revenue_SP)


The total revenue from the sale of the product SP is $9.90


Product SQ

In [187]:
df[df['postComment'].str.contains('SQ', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_KSF,revenue_KSH,revenue_MBT11,revenue_MJ,revenue_OLP239,revenue_PR,revenue_RG239,revenue_RGF,revenue_RGSS,revenue_SP
164,2 X SQUID 500-600G/ PORTION @$21.90 Comment SQ+1 below to join the Sale.,OceanStar Seafood,00:36:15,1,13,0,0,$21.90 SQ+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
245,2 X SQUID 500-600G/ PORTION @$21.90 Comment SQ+1 below to join the Sale.,OceanStar Seafood,01:01:30,1,13,0,0,$21.90 SQ+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
248,SQ+1,Jolynn Toh,01:02:43,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [188]:
df['productBought_SQ'] = df['productBought_SQ'].map(lambda x:float(21.90) if x == int(1) else 0)

In [189]:
df['revenue_SQ'] = np.multiply(df['productBought_SQ'], df['salesQuantity'])

In [190]:
revenue_SQ = "The total revenue from the sale of the product {} is ${}". format ("SQ", format(df['revenue_SQ'].sum(), '.2f'))
print(revenue_SQ)


The total revenue from the sale of the product SQ is $21.90


Product WAK

In [191]:
df[df['postComment'].str.contains('WAK', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_KSH,revenue_MBT11,revenue_MJ,revenue_OLP239,revenue_PR,revenue_RG239,revenue_RGF,revenue_RGSS,revenue_SP,revenue_SQ
144,2 X WILD CAUGHT ANG KA 500G/ POR @$11.11 ( PWP) Comment WAK+1 below to join the Sale.,OceanStar Seafood,00:30:13,1,18,0,0,$11.11 ( PWP) WAK+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
145,SUPER DEAL!!! 2 X WILD CAUGHT ANG KA 500G/ POR @$11.11 ( PWP) Comment WAK+1 below to join the Sale.,OceanStar Seafood,00:30:23,1,20,0,0,$11.11 ( PWP) WAK+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146,SUPER DEAL!!! 2 X WILD CAUGHT ANG KA 500G/ POR @$11.11 ( PWP) Comment WAK+1 below to join the Sale.,OceanStar Seafood,00:31:28,1,20,0,0,$11.11 ( PWP) WAK+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147,WAK+1,Yvonne Chia,00:31:36,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149,WAK+1,Iris Teoh,00:31:52,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,WAK+1,June Ng,00:32:03,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151,WAK+1,Cynthia Sew,00:32:23,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153,WAK+1,Joy Mah,00:32:50,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
163,MUST GRAB!!! 2 X WILD CAUGHT ANG KA 500G/ POR @$11.11 ( PWP) Comment WAK+1 below to join the Sale.,OceanStar Seafood,00:35:57,1,20,0,0,$11.11 ( PWP) WAK+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
166,WAK+1,Anne Ng,00:36:29,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [192]:
df['productBought_WAK'] = df['productBought_WAK'].map(lambda x:float(11.11) if x == int(1) else 0)

In [193]:
df['revenue_WAK'] = np.multiply(df['productBought_WAK'], df['salesQuantity'])

In [194]:
revenue_WAK = "The total revenue from the sale of the product {} is ${}". format ("WAK", format(df['revenue_WAK'].sum(), '.2f'))
print(revenue_WAK)


The total revenue from the sale of the product WAK is $133.32


Product WB69

In [195]:
df[df['postComment'].str.contains('WB69', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_MBT11,revenue_MJ,revenue_OLP239,revenue_PR,revenue_RG239,revenue_RGF,revenue_RGSS,revenue_SP,revenue_SQ,revenue_WAK
49,WILD BARRAMUNDI 450-600G/ PCS @$6.90 Comment WB69+1 below to join the Sale,OceanStar Seafood,00:08:17,1,12,0,0,$6.90 WB69+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,WILD BARRAMUNDI 450-600G/ PCS @$6.90 Comment WB69+1 below to join the Sale,OceanStar Seafood,00:08:17,1,12,0,0,$6.90 WB69+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51,WILD BARRAMUNDI 450-600G/ PCS @$6.90 Comment WB69+1 below to join the Sale,OceanStar Seafood,00:09:16,1,12,0,0,$6.90 WB69+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,WB69+3,Anne Ng,00:09:32,0,1,0,3,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [196]:
df['productBought_WB69'] = df['productBought_WB69'].map(lambda x:float(6.90) if x == int(1) else 0)

In [197]:
df['revenue_WB69'] = np.multiply(df['productBought_WB69'], df['salesQuantity'])

In [198]:
revenue_WB69 = "The total revenue from the sale of the product {} is ${}". format ("WB69", format(df['revenue_WB69'].sum(), '.2f'))
print(revenue_WB69)


The total revenue from the sale of the product WB69 is $34.50


Product WP13

In [199]:
df[df['postComment'].str.contains('WP13', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_MJ,revenue_OLP239,revenue_PR,revenue_RG239,revenue_RGF,revenue_RGSS,revenue_SP,revenue_SQ,revenue_WAK,revenue_WB69
247,LAST 3 PCS!!! LOCAL CATCH WHITE POMFRET 300-400G/ PCS @$13.00 Comment WP13+1 below to join the Sale,OceanStar Seafood,01:02:12,1,17,0,0,$13.00 WP13+1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
258,WP13+1,Pearlyn Chua,01:04:56,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [200]:
df['productBought_WP13'] = df['productBought_WP13'].map(lambda x:float(13.00) if x == int(1) else 0)

In [201]:
df['revenue_WP13'] = np.multiply(df['productBought_WP13'], df['salesQuantity'])

In [202]:
revenue_WP13 = "The total revenue from the sale of the product {} is ${}". format ("WP13", format(df['revenue_WP13'].sum(), '.2f'))
print(revenue_WP13)


The total revenue from the sale of the product WP13 is $39.00


In [203]:
# iterating the columns
for col in df.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_AKLH
productBought_AKLT
productBought_BMC
productBought_BSS
productBought_BST
productBought_CP14
productBought_CP299
productBought_CS
productBought_ES99
productBought_FCF
productBought_FG
productBought_FJ
productBought_KBP
productBought_KFC
productBought_KN119
productBought_KSB
productBought_KSF
productBought_KSH
productBought_MBT11
productBought_MJ
productBought_MP
productBought_OLP239
productBought_PR
productBought_RG239
productBought_RGF
productBought_RGSS
productBought_SP
productBought_SQ
productBought_WAK
productBought_WB69
productBought_WP13
revenue_AKLH
revenue_AKLT
revenue_BMC
revenue_BSS
revenue_BST
revenue_CP14
revenue_CP299
revenue_CS
revenue_FCF
revenue_FG
revenue_KBP
revenue_KFC
revenue_KN119
revenue_KSB
revenue_KSF
revenue_KSH
revenue_MBT11
revenue_MJ
revenue_OLP239
revenue_PR
revenue_RG239
revenue_RGF
revenue_RGSS
revenue_SP
revenue_SQ
revenue_WAK
r

**Sum of total revenue from the video**

In [204]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_OLP239,revenue_PR,revenue_RG239,revenue_RGF,revenue_RGSS,revenue_SP,revenue_SQ,revenue_WAK,revenue_WB69,revenue_WP13
0,Morning OSS,Jennifer Quek,00:00:34,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hello miko,Ting Wen,00:00:36,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,LNS,Jennie Gan,00:00:43,0,1,1,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,LNS OSS,Jennifer Quek,00:00:45,0,2,1,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Morning oss,Goh Hui Ling Veron,00:00:46,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [205]:
#total revenue from the video
total_revenue = df.loc[:, 'revenue_AKLH': 'revenue_WP13'].values.sum()

#round the total revenue to 2 decimals places
total_revenue_rounded = format(total_revenue, '.2f')

print(f"The total revenue for the sale of products for the video is ${total_revenue_rounded}")


The total revenue for the sale of products for the video is $1717.83


In [206]:
va['totalRevenue'] = total_revenue_rounded
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue
0,OCEANSTARLIVE/videos/2174138006061999,27,1200,4413,97,1972,9,105,34,1717.83


**New Column for the total revenue at that comment**

In [207]:
#https://stackoverflow.com/questions/42063716/pandas-sum-up-multiple-columns-into-one-column-without-last-column
df['revenue'] = df.loc[:, 'revenue_AKLH': 'revenue_WP13'].sum(axis=1)
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_AKLH,productBought_AKLT,...,revenue_PR,revenue_RG239,revenue_RGF,revenue_RGSS,revenue_SP,revenue_SQ,revenue_WAK,revenue_WB69,revenue_WP13,revenue
0,Morning OSS,Jennifer Quek,00:00:34,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hello miko,Ting Wen,00:00:36,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,LNS,Jennie Gan,00:00:43,0,1,1,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,LNS OSS,Jennifer Quek,00:00:45,0,2,1,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Morning oss,Goh Hui Ling Veron,00:00:46,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Good morning Miko and Kelvin,Angela Tay,00:00:47,0,5,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Good morning miko and OSS team,Jennie Gan,00:00:49,0,6,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Oss,Angela Tay,00:00:53,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,OSS,Jennie Gan,00:00:53,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,yes i wake up le,Clarice Goh,00:00:55,0,5,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [208]:
#https://www.geeksforgeeks.org/how-to-move-a-column-to-first-position-in-pandas-dataframe/
#shift the revenue column to be the 8th column, i.e at position 7
eighth_column = df.pop('revenue')

# insert column using insert(position,column_name,ninth_column) function
df.insert(7, 'revenue', eighth_column)
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,productPrice,productBought_AKLH,...,revenue_OLP239,revenue_PR,revenue_RG239,revenue_RGF,revenue_RGSS,revenue_SP,revenue_SQ,revenue_WAK,revenue_WB69,revenue_WP13
0,Morning OSS,Jennifer Quek,00:00:34,0,2,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hello miko,Ting Wen,00:00:36,0,2,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,LNS,Jennie Gan,00:00:43,0,1,1,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,LNS OSS,Jennifer Quek,00:00:45,0,2,1,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Morning oss,Goh Hui Ling Veron,00:00:46,0,2,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Good morning Miko and Kelvin,Angela Tay,00:00:47,0,5,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Good morning miko and OSS team,Jennie Gan,00:00:49,0,6,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Oss,Angela Tay,00:00:53,0,1,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,OSS,Jennie Gan,00:00:53,0,1,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,yes i wake up le,Clarice Goh,00:00:55,0,5,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


After the revenue of the sales have been calculated, all the dummified product columns and the column 'productPrice' will be dropped as they are no longer required to be checked against to identify the price of the product.

In [209]:
df = df.loc[: ,'postComment':'revenue']
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue
0,Morning OSS,Jennifer Quek,00:00:34,0,2,0,0,0.0
1,hello miko,Ting Wen,00:00:36,0,2,0,0,0.0
2,LNS,Jennie Gan,00:00:43,0,1,1,0,0.0
3,LNS OSS,Jennifer Quek,00:00:45,0,2,1,0,0.0
4,Morning oss,Goh Hui Ling Veron,00:00:46,0,2,0,0,0.0
5,Good morning Miko and Kelvin,Angela Tay,00:00:47,0,5,0,0,0.0
6,Good morning miko and OSS team,Jennie Gan,00:00:49,0,6,0,0,0.0
7,Oss,Angela Tay,00:00:53,0,1,0,0,0.0
8,OSS,Jennie Gan,00:00:53,0,1,0,0,0.0
9,yes i wake up le,Clarice Goh,00:00:55,0,5,0,0,0.0


**New Column to identify the frequency of the seller's comments in the video**

In [210]:
#frequency of seller's comments
va['frequencySeller']= np.divide(va['videoLength'].iloc[0],va['numSellerComments'])
#seller's comment appears on average of every 45 seconds

In [211]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller
0,OCEANSTARLIVE/videos/2174138006061999,27,1200,4413,97,1972,9,105,34,1717.83,45.494845


**New Column to identify the seller**

In [212]:
df['seller'] = 'OCEANSTARLIVE'

In [213]:
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Morning OSS,Jennifer Quek,00:00:34,0,2,0,0,0.0,OCEANSTARLIVE
1,hello miko,Ting Wen,00:00:36,0,2,0,0,0.0,OCEANSTARLIVE
2,LNS,Jennie Gan,00:00:43,0,1,1,0,0.0,OCEANSTARLIVE
3,LNS OSS,Jennifer Quek,00:00:45,0,2,1,0,0.0,OCEANSTARLIVE
4,Morning oss,Goh Hui Ling Veron,00:00:46,0,2,0,0,0.0,OCEANSTARLIVE


**New Column for the Average Compound Score from Sentiment Analysis on raw & uncleaned comments for video**

In [214]:
# Instantiate Sentiment Intensity Analyzer
sent = SentimentIntensityAnalyzer()

In [215]:
df['sentiment_score'] = df['postComment'].apply(sent.polarity_scores)
df['compound'] = [sent.polarity_scores(x)['compound'] for x in df['postComment']]
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller,sentiment_score,compound
0,Morning OSS,Jennifer Quek,00:00:34,0,2,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
1,hello miko,Ting Wen,00:00:36,0,2,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
2,LNS,Jennie Gan,00:00:43,0,1,1,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
3,LNS OSS,Jennifer Quek,00:00:45,0,2,1,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
4,Morning oss,Goh Hui Ling Veron,00:00:46,0,2,0,0,0.0,OCEANSTARLIVE,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0


In [216]:
#average compound scores for the video
#df.shape[0] calculates the total number of rows in the dataframe
va['averageCompound']= (df['compound'].sum())/(df['compound'].sum())/df.shape[0]
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller,averageCompound
0,OCEANSTARLIVE/videos/2174138006061999,27,1200,4413,97,1972,9,105,34,1717.83,45.494845,0.003448


In [217]:
#drop the columns with regarding to the sentiment analysis on the raw & uncleaned comments
#as we will perform sentiment analysis at the comment level on processed comments
df = df.loc[: ,'postComment':'seller']
df.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Morning OSS,Jennifer Quek,00:00:34,0,2,0,0,0.0,OCEANSTARLIVE
1,hello miko,Ting Wen,00:00:36,0,2,0,0,0.0,OCEANSTARLIVE
2,LNS,Jennie Gan,00:00:43,0,1,1,0,0.0,OCEANSTARLIVE
3,LNS OSS,Jennifer Quek,00:00:45,0,2,1,0,0.0,OCEANSTARLIVE
4,Morning oss,Goh Hui Ling Veron,00:00:46,0,2,0,0,0.0,OCEANSTARLIVE


### Saving the cleaned dataframes

In [218]:
# export to csv - change the name of the data file for each video
va.to_csv('../../data/cleaned_data/cleaned_va_OCEANSTARLIVE_2174138006061999.csv', index=False)

In [219]:
#check for nulls
#displaying only the columns with nulls and their sum
df[df.columns[df.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [220]:
# export to csv - change the name of the data file for each video
df.to_csv('../../data/cleaned_data/cleaned_OCEANSTARLIVE_2174138006061999.csv', index=False)