# Data Import & Cleaning

### Contents:

- [Data Import](#Data-Import)
- [Data Cleaning](#Data-Cleaning)
- [Feature Engineering](#Feature-Engineering)

### Import Libraries

In [1]:
#import standard libraries
import pandas as pd
import numpy as np

#import emoji
import emoji

from natsort import natsorted, index_natsorted, order_by_index

#import warnings to ignore flags when the project is complete
#import warnings
#warnings.filterwarnings('ignore')

#import pre-processing libraries for data cleaning
import string
import re
import nltk
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Data Import

**Read scrapped data for the following videos**

In [2]:
va = pd.read_csv('../../data/scrapped_data/va_ebeveadmin_331382632125428.csv')

In [3]:
va

Unnamed: 0,video_for,totalEmojiReaction,views
0,ebeveadmin/videos/331382632125428,21,1.1K


In [4]:
#retrieve the number of views for the video
va['views'].iloc[0]

'1.1K'

In [5]:
#drop the K and replace it with 2 '0's behind
va['views'] = va['views'].str.replace("K", "00", regex=True)
#drop the dot
va['views'] = va['views'].str.replace(".", "", regex=True)
#change the string to be an integer
va['views'] = int(va['views'].iloc[0])

In [6]:
va

Unnamed: 0,video_for,totalEmojiReaction,views
0,ebeveadmin/videos/331382632125428,21,1100


In [7]:
va.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   video_for           1 non-null      object
 1   totalEmojiReaction  1 non-null      int64 
 2   views               1 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 152.0+ bytes


In [8]:
df = pd.read_csv('../../data/scrapped_data/ebeveadmin_331382632125428.csv', encoding='utf-8')

In [9]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
0,Have lobster today?,Leonardo Lim,0:16
1,Good morning,David Koh,0:37
2,Hello hello,Shuganya Devi,0:42
3,"WHOLE BATANG 2.8KG $44</div><div dir=""auto"" style=""text-align: start;"">WBT+1",E-Beve,1:01:43
4,WBT+1,Shirley Ang,1:01:46
5,Thank u,Shirley Ang,1:02:52
6,"Tiff Batang full steak cut, about half inch thickness. Thank u",Shirley Ang,1:04:22
7,What else u have left?,Shirley Ang,1:04:44
8,Hello… Got what fish good to heal injuries ah… like elderly go for surgery,Clement Lim,1:05:22
9,How much is the ngor he ah? Got fillet kind?,Clement Lim,1:06:19


In [10]:
#https://stackoverflow.com/questions/32072076/find-the-unique-values-in-a-column-and-then-sort-them
#check if the seller uses multiple accounts to reply
postCommentAuthor_unique = df['postCommentAuthor'].unique()
print(sorted(postCommentAuthor_unique))

['Aaron Looi', 'Aisha Ahmad', 'Aysha Khamarudin Al Takhi', 'Christine Lai', 'Clement Lim', 'David Koh', 'Diana Ng', 'E-Beve', 'Firdaus Nordin', 'Helen Lee', 'Irene Lee', 'Kham N Ash Koh', 'Leonardo Lim', 'Lily Koh', 'Melissa Kang', 'Min Xuan', 'Nani Baqawali', 'Pauline Ng', 'Raji Andrew', 'Richard Ling', 'Shirley Ang', 'Shuganya Devi', 'Stefanie Teo', 'Veon Veon', '马小玲']


In [11]:
#find comments posted by the seller only
df.loc[df['postCommentAuthor'] == 'E-Beve']

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
3,"WHOLE BATANG 2.8KG $44</div><div dir=""auto"" style=""text-align: start;"">WBT+1",E-Beve,1:01:43
22,Whatsapp 88896368,E-Beve,4:08
31,STINGRAY +1,E-Beve,8:25
35,"GOL+1 </div><div dir=""auto"" style=""text-align: start;"">Golden pomfret $6",E-Beve,11:00
41,"RED SNAPPER 600G $11</div><div dir=""auto"" style=""text-align: start;"">RS+1",E-Beve,12:43
45,"TIGER PRAWN $18</div><div dir=""auto"" style=""text-align: start;"">TP+1",E-Beve,14:46
55,"LIVE PRAWNS $20</div><div dir=""auto"" style=""text-align: start;"">LP+1",E-Beve,17:21
56,"COCKLES $8</div><div dir=""auto"" style=""text-align: start;"">CC+1",E-Beve,19:10
60,"TERUBOK 700G $15</div><div dir=""auto"" style=""text-align: start;"">TRB+1",E-Beve,21:24
61,"CENCARU SET OF5 $10</div><div dir=""auto"" style=""text-align: start;"">CEN+1",E-Beve,24:52


In [12]:
#check the comments to have a gauge
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' Thanks <span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu"><img alt="🙏" height="16" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/t1f/2/16/1f64f.png" width="16"/></span> ', '1 wing', '88896368', '<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu"><img alt="👋" height="16" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/t99/2/16/1f44b.png" width="16"/></span> ', '<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 k7cz35w2 bsnbvmp4"><img alt="👌" height="32" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/td4/2/32/1f44c.png" width="32"/></span>', 'All miy orders in?', 'All seafoods are fresh', 'All sister, brother good morning<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 

## Data Cleaning

### Convert the emojis to text for easy cleaning

We noticed that the emojis have html parsers attached to it. Hence, we will convert the images of the emojis to text first, to remove the html parsers to the emojis, while retaining the emoji's text. We will convert it back to emoji afterwards.

In [13]:
df['postComment'] = df['postComment'].apply(emoji.demojize)

In [14]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' Thanks <span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu"><img alt=":folded_hands:" height="16" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/t1f/2/16/1f64f.png" width="16"/></span> ', '1 wing', '88896368', '<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 gl3lb2sf hhz5lgdu"><img alt=":waving_hand:" height="16" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/t99/2/16/1f44b.png" width="16"/></span> ', '<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 k7cz35w2 bsnbvmp4"><img alt=":OK_hand:" height="32" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/td4/2/32/1f44c.png" width="32"/></span>', 'All miy orders in?', 'All seafoods are fresh', 'All sister, brother good morning<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 dit

### Clean the comments using Regex

From the above unique values of the comments, there are several cleaning issues that needs to be addressed.

    1. Removal of links or URLs. 
        - URLs are present when the Facebook users manually type a link in the comment. 
        - Additionally, when users are tagged, their Facebook profile URL is printed as a result.
        - Similarly, there is a unique URL linked to each emoji as well. 
        
    2. Removal of HTML special entities
        - Examples of HTML special entities are '&amp' and '&gt'. 
        
    3. Removal of other HTML special terms
        - Examples of other HTML special entities are whitespace HTML special entities like '#x200B' and '#xa0'.
        
    4. Removal of HTML Parsers to the Emojis
        - Previously, the emojis have been demojized to text already. However, the HTML parsers to the mojis remain. Hence, they are required to be removed as well.
        
    5. Removal of HTML Parcers to Whitespaces
        - When a next line is entered in the same comment, there will be a HTML parser to the this whitespace. Hence, this are to be removed as well.
        
    6. Removal of HTML Parsers to tagged names
        - When users are tagged in the comments, in addition to their Facebook profile URL, there will be HTML parsers to the tagged names as well. Hence, this are to be removed as well.
        
    7. Removsl of other HTML Parsers   

In [15]:
def clean(row):
    
    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*\/\w*', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove HTML special entities (e.g.. &amp, &gt;)
    row['postComment'] = re.sub(
        pattern=r'\&\w*;',
        repl='',
        string=row['postComment'],
        flags=re.M) 
    
    # Remove emoji html parsers
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'<span\sclass=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\">',
        repl='', 
        string=row['postComment'],
        flags=re.M)
 
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern = r'<img\salt=\"(\:\w*)(\-?)(\w*\:)\"\s[a-z]{6}=\"\d\d\"\s[a-z]{14}=.{26}\s[a-z]{3}=\">',
        repl=r'\1\2\3',
        string=row['postComment'],
        flags=re.M)
    
    # Remove whitespaces
    row['postComment'] = re.sub(
        pattern=r'<\/div><div\s.*\s.*>',
        repl=' ',
        string=row['postComment'],
        flags=re.M)

    # Remove whitespaces
    row['postComment'] = re.sub(
        pattern=r'<div\sdir=\"auto\"\s.*\s.*>',
        repl=' ',
        string=row['postComment'],
        flags=re.M)

    
    # Remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<a\sclass=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\"\shref=\">',
        repl='',
        string=row['postComment'],
        flags=re.M)
    

    # Remove consecutive non-ASCII characters
    # This will remove the chinese comments
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'[^\x00-\x7F]+', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)
    
    return row

In [16]:
df2 = df.apply(clean, axis=1)

In [17]:
postComment_unique2 = df2['postComment'].unique()
print(sorted(postComment_unique2))

[' ', ' Thanks :folded_hands: ', '1 wing', '88896368', ':OK_hand:', ':waving_hand: ', 'All miy orders in?', 'All seafoods are fresh', 'All sister, brother good morning:person_raising_hand:', 'Any day u hv whole salmon..i want..thks', 'Any tuna fish', 'Asam pedas', 'BBAG+1', 'Baby ang go li $10 BBAG+1', 'Bag+1', 'Banana Leaf sambal chili', 'Bbag+1', 'CC + 1', 'CC+1', 'CEN+1', 'CENCARU SET OF5 $10 CEN+1', 'COCKLES $8 CC+1', 'Cc+1', 'Cen+1', 'Cereal prawns', 'Code for ikan duri', 'Congratulations', 'Curry cook', 'Cut a bit smaller for me', 'Cut steak..head cut 2..thks', 'Cute cute', 'DR26', 'Dear..u owned me tiger prawn:grinning_face:', 'Deep dry  wasabi sauce', 'Deep fry BBQ sauce', 'Deep fry can eat whole fish', 'Deep fry sambal chili', 'Did i get my seabass', 'Do you have crab,crayfish or lobster ', 'Don t forget my sotong ', "Don't want flower lala", 'Drunken prawns', 'Drunken prawns ', 'Duri - $26, $33, $35 DR26, DR33,DR35', 'FLL +1', 'FLOWER CLAM 700G $6 FLL+1', 'GLASS FISH 1KG $3.5

**Convert encoded emoji text back to emojis**

In [18]:
df2['postComment'] = df2['postComment'].apply(emoji.emojize)

In [19]:
postComment_unique2 = df2['postComment'].unique()
print(sorted(postComment_unique2))

[' ', ' Thanks 🙏 ', '1 wing', '88896368', 'All miy orders in?', 'All seafoods are fresh', 'All sister, brother good morning🙋', 'Any day u hv whole salmon..i want..thks', 'Any tuna fish', 'Asam pedas', 'BBAG+1', 'Baby ang go li $10 BBAG+1', 'Bag+1', 'Banana Leaf sambal chili', 'Bbag+1', 'CC + 1', 'CC+1', 'CEN+1', 'CENCARU SET OF5 $10 CEN+1', 'COCKLES $8 CC+1', 'Cc+1', 'Cen+1', 'Cereal prawns', 'Code for ikan duri', 'Congratulations', 'Curry cook', 'Cut a bit smaller for me', 'Cut steak..head cut 2..thks', 'Cute cute', 'DR26', 'Dear..u owned me tiger prawn😀', 'Deep dry  wasabi sauce', 'Deep fry BBQ sauce', 'Deep fry can eat whole fish', 'Deep fry sambal chili', 'Did i get my seabass', 'Do you have crab,crayfish or lobster ', 'Don t forget my sotong ', "Don't want flower lala", 'Drunken prawns', 'Drunken prawns ', 'Duri - $26, $33, $35 DR26, DR33,DR35', 'FLL +1', 'FLOWER CLAM 700G $6 FLL+1', 'GLASS FISH 1KG $3.50 GLASS+1', 'GOL+1', 'GOL+1  Golden pomfret $6', 'Glass+1', 'Gol+1', 'Good mor

### Reindexing the dataframe 
**New Column to reindex the dataframe in accordance to time**

From the data, we can tell that there is an inconsistent timestamp being used in the column 'postCommentTime'. For example, there are times like '0:57' and '1:00:14' which indicates 0 hour 0 mins 57 secs, and 1 hour 0 mins 14 secs. 

Hence, a new column 'postCommentTime_final' is created to ensure that a timestamp of HH:MM:SS is being used consistently throughout. However, we note that the number of days is being included for TimeDeltaIndex.

As a result, we will read the timestamp, by excluding the number of days.  

In [20]:
#TimedeltaIndex
#https://stackoverflow.com/questions/54877467/pandas-convert-hhmm-and-hhmmss-to-standard-hhmmss-in-python
# for example, time of '0:57' will then be 00:00:57; 0 hours 0 mins 57 secs
df2['postCommentTime_final'] = pd.to_timedelta(np.where(df2['postCommentTime'].str.count(':') == 1, '00:' + df2['postCommentTime'], df2['postCommentTime']))

In [21]:
df2.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Have lobster today?,Leonardo Lim,0:16,0 days 00:00:16
1,Good morning,David Koh,0:37,0 days 00:00:37
2,Hello hello,Shuganya Devi,0:42,0 days 00:00:42
3,WHOLE BATANG 2.8KG $44 WBT+1,E-Beve,1:01:43,0 days 01:01:43
4,WBT+1,Shirley Ang,1:01:46,0 days 01:01:46


In [22]:
df2['postCommentTime_final'] = df2['postCommentTime_final'].astype(str).map(lambda x: x[7:])

In [23]:
df2

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Have lobster today?,Leonardo Lim,0:16,00:00:16
1,Good morning,David Koh,0:37,00:00:37
2,Hello hello,Shuganya Devi,0:42,00:00:42
3,WHOLE BATANG 2.8KG $44 WBT+1,E-Beve,1:01:43,01:01:43
4,WBT+1,Shirley Ang,1:01:46,01:01:46
5,Thank u,Shirley Ang,1:02:52,01:02:52
6,"Tiff Batang full steak cut, about half inch thickness. Thank u",Shirley Ang,1:04:22,01:04:22
7,What else u have left?,Shirley Ang,1:04:44,01:04:44
8,Hello Got what fish good to heal injuries ah like elderly go for surgery,Clement Lim,1:05:22,01:05:22
9,How much is the ngor he ah? Got fillet kind?,Clement Lim,1:06:19,01:06:19


In [24]:
#reindex according to postCommentTime_final
#previous natsort in data collection didnt take into account the different timestamp format
df3 = df2.reindex(index=order_by_index(df2.index, index_natsorted(df2.postCommentTime_final)))

In [25]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Have lobster today?,Leonardo Lim,0:16,00:00:16
1,Good morning,David Koh,0:37,00:00:37
2,Hello hello,Shuganya Devi,0:42,00:00:42
18,I cctv.. U also know..,马小玲,2:42,00:02:42
19,👋,Helen Lee,2:52,00:02:52
20,👌,Helen Lee,3:20,00:03:20
21,How to preorder??,马小玲,3:40,00:03:40
22,Whatsapp 88896368,E-Beve,4:08,00:04:08
23,Noted..,马小玲,4:12,00:04:12
24,Lns,Lily Koh,5:05,00:05:05


In [26]:
#reset the index for the dataframe
#https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-dataframe

df3 = df3.reset_index(drop=True)

In [27]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Have lobster today?,Leonardo Lim,0:16,00:00:16
1,Good morning,David Koh,0:37,00:00:37
2,Hello hello,Shuganya Devi,0:42,00:00:42
3,I cctv.. U also know..,马小玲,2:42,00:02:42
4,👋,Helen Lee,2:52,00:02:52
5,👌,Helen Lee,3:20,00:03:20
6,How to preorder??,马小玲,3:40,00:03:40
7,Whatsapp 88896368,E-Beve,4:08,00:04:08
8,Noted..,马小玲,4:12,00:04:12
9,Lns,Lily Koh,5:05,00:05:05


**Obtain the length of the video**

Assuming that the last comment of the video is the total length of the video, we will input the length of the video from the comments dataframe into the video attributes dataframe.

In [28]:
#retrieve last comment to obtain the length of the video
df3['postCommentTime_final'].iloc[-1]

'01:10:46'

In [29]:
#https://stackoverflow.com/questions/6402812/how-to-convert-an-hmmss-time-string-to-seconds-in-python
def get_sec(time_str):
    """Get Seconds from time."""
    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

In [30]:
#retrieve last comment to obtain the length of the video in seconds
va['videoLength']= get_sec(df3['postCommentTime_final'].iloc[-1])

In [31]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength
0,ebeveadmin/videos/331382632125428,21,1100,4246


## Feature Engineering

### Comments made by the Seller

**New Column to identify the number of comments made by the seller in the video**

From the total sum of comments made by the seller, we will input it into the video attributes dataframe.

In [32]:
(df3['postCommentAuthor']=='E-Beve').sum()

21

In [33]:
va['numSellerComments'] = (df3['postCommentAuthor']=='E-Beve').sum()

In [34]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments
0,ebeveadmin/videos/331382632125428,21,1100,4246,21


**New Column to identify if the comment is made by the Seller or not**

In [35]:
#create a new column to show if the comment is made by the seller or not
df3['isSeller'] = df3['postCommentAuthor'].map(lambda x:1 if x =='E-Beve' else 0)

In [36]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller
0,Have lobster today?,Leonardo Lim,0:16,00:00:16,0
1,Good morning,David Koh,0:37,00:00:37,0
2,Hello hello,Shuganya Devi,0:42,00:00:42,0
3,I cctv.. U also know..,马小玲,2:42,00:02:42,0
4,👋,Helen Lee,2:52,00:02:52,0


In [37]:
df3['isSeller'].value_counts()

0    126
1     21
Name: isSeller, dtype: int64

In [38]:
#show all the seller's comments
df3.loc[df3['isSeller'] == 1]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller
7,Whatsapp 88896368,E-Beve,4:08,00:04:08,1
16,STINGRAY +1,E-Beve,8:25,00:08:25,1
20,GOL+1 Golden pomfret $6,E-Beve,11:00,00:11:00,1
26,RED SNAPPER 600G $11 RS+1,E-Beve,12:43,00:12:43,1
30,TIGER PRAWN $18 TP+1,E-Beve,14:46,00:14:46,1
40,LIVE PRAWNS $20 LP+1,E-Beve,17:21,00:17:21,1
41,COCKLES $8 CC+1,E-Beve,19:10,00:19:10,1
45,TERUBOK 700G $15 TRB+1,E-Beve,21:24,00:21:24,1
46,CENCARU SET OF5 $10 CEN+1,E-Beve,24:52,00:24:52,1
52,GLASS FISH 1KG $3.50 GLASS+1,E-Beve,27:18,00:27:18,1


### Length of comments

**New Column to identify the length of each comment**

From the comments, we take the length of each comment as the total number of words each comment has.

In [39]:
#length of each comment
#https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe
df3['postCommentLength'] = df3['postComment'].str.split().str.len()

In [40]:
df3.head(10)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength
0,Have lobster today?,Leonardo Lim,0:16,00:00:16,0,3
1,Good morning,David Koh,0:37,00:00:37,0,2
2,Hello hello,Shuganya Devi,0:42,00:00:42,0,2
3,I cctv.. U also know..,马小玲,2:42,00:02:42,0,5
4,👋,Helen Lee,2:52,00:02:52,0,1
5,👌,Helen Lee,3:20,00:03:20,0,1
6,How to preorder??,马小玲,3:40,00:03:40,0,3
7,Whatsapp 88896368,E-Beve,4:08,00:04:08,1,2
8,Noted..,马小玲,4:12,00:04:12,0,1
9,Lns,Lily Koh,5:05,00:05:05,0,1


**New Column to identify the total number of comments in the video**

From the total number of comments in the video, we will input it into the video attributes dataframe.

In [41]:
#total number of comments
df3['postCommentLength'].sum()

515

In [42]:
va['numComments'] = df3['postCommentLength'].sum()

In [43]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments
0,ebeveadmin/videos/331382632125428,21,1100,4246,21,515


### LNS

LNS is an acronym that stands for 'like and share'. It is a form of customer engagement as it indicates by the customers to the sellers that they have liked and shared the video on their Facebook wall. 

**New Column to identify if Customers are engaging in liking and sharing the video**

In [44]:
#if the customer has commented 'lns' or 'ls' which stands for 'like & shared' & 'like shared' respectively
def lns(comment):
    if re.search(r'(l)(n?)(s)', comment, re.IGNORECASE):
        return int(1)
    else:
        return int(0)

In [45]:
df3['lns'] = df3['postComment'].map(lambda x:lns(x))

In [46]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns
0,Have lobster today?,Leonardo Lim,0:16,00:00:16,0,3,0
1,Good morning,David Koh,0:37,00:00:37,0,2,0
2,Hello hello,Shuganya Devi,0:42,00:00:42,0,2,0
3,I cctv.. U also know..,马小玲,2:42,00:02:42,0,5,1
4,👋,Helen Lee,2:52,00:02:52,0,1,0


**New Column to identify if the number of Customers who explicitly inform the sellers that they are engaging in liking and sharing the video**

In [47]:
#range of customer's engagement for LNS
df3['lns'].value_counts()

0    141
1      6
Name: lns, dtype: int64

In [48]:
(df3['lns']==1).sum()

6

In [49]:
va['lnsQuantity'] = (df3['lns']==1).sum()

In [50]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity
0,ebeveadmin/videos/331382632125428,21,1100,4246,21,515,6


## Sales Quantity

**New Columns to identify the quantity of sales made**

In [51]:
#overview of the sales
df3[df3['postComment'].str.contains('(\w*)(\s)*(\+)(\s)*(\d*)', regex=True)]

  df3[df3['postComment'].str.contains('(\w*)(\s)*(\+)(\s)*(\d*)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns
14,Stingray +1,Helen Lee,7:09,00:07:09,0,2,0
16,STINGRAY +1,E-Beve,8:25,00:08:25,1,2,0
20,GOL+1 Golden pomfret $6,E-Beve,11:00,00:11:00,1,4,0
21,GOL+1,Helen Lee,11:26,00:11:26,0,1,0
23,Gol+1,Melissa Kang,11:49,00:11:49,0,1,0
26,RED SNAPPER 600G $11 RS+1,E-Beve,12:43,00:12:43,1,5,0
28,RS+1,Aisha Ahmad,13:31,00:13:31,0,1,0
30,TIGER PRAWN $18 TP+1,E-Beve,14:46,00:14:46,1,4,0
31,TP+1,Aisha Ahmad,14:56,00:14:56,0,1,0
40,LIVE PRAWNS $20 LP+1,E-Beve,17:21,00:17:21,1,4,0


In [52]:
def sale(comment):
    if re.findall(r'(\+)(\s)?(\d)', comment):
        results = re.findall(r'\+\s?\d', comment)
        total = 0
        for r in results:
            total += int(r[-1])
        return total
    else:
        return int(0)

In [53]:
df3['sales'] = df3['postComment'].apply(lambda x:sale(x))

In [54]:
df3.head(10)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales
0,Have lobster today?,Leonardo Lim,0:16,00:00:16,0,3,0,0
1,Good morning,David Koh,0:37,00:00:37,0,2,0,0
2,Hello hello,Shuganya Devi,0:42,00:00:42,0,2,0,0
3,I cctv.. U also know..,马小玲,2:42,00:02:42,0,5,1,0
4,👋,Helen Lee,2:52,00:02:52,0,1,0,0
5,👌,Helen Lee,3:20,00:03:20,0,1,0,0
6,How to preorder??,马小玲,3:40,00:03:40,0,3,0,0
7,Whatsapp 88896368,E-Beve,4:08,00:04:08,1,2,0,0
8,Noted..,马小玲,4:12,00:04:12,0,1,0,0
9,Lns,Lily Koh,5:05,00:05:05,0,1,1,0


In [55]:
#if the comments consist the sale information for the product, we will indicate it as '0', otherwise '1'
def no_sale_info(comment):
    if re.search(r'(\$)(\s)?(.*)', comment, re.IGNORECASE):
        return int(0)
    else:
        return int(1)

In [56]:
df3['no_sale_info'] = df3['postComment'].map(lambda x:no_sale_info(x))

In [57]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info
0,Have lobster today?,Leonardo Lim,0:16,00:00:16,0,3,0,0,1
1,Good morning,David Koh,0:37,00:00:37,0,2,0,0,1
2,Hello hello,Shuganya Devi,0:42,00:00:42,0,2,0,0,1
3,I cctv.. U also know..,马小玲,2:42,00:02:42,0,5,1,0,1
4,👋,Helen Lee,2:52,00:02:52,0,1,0,0,1
5,👌,Helen Lee,3:20,00:03:20,0,1,0,0,1
6,How to preorder??,马小玲,3:40,00:03:40,0,3,0,0,1
7,Whatsapp 88896368,E-Beve,4:08,00:04:08,1,2,0,0,1
8,Noted..,马小玲,4:12,00:04:12,0,1,0,0,1
9,Lns,Lily Koh,5:05,00:05:05,0,1,1,0,1


In [58]:
#sales made by the customers; exclude the seller's comments on the codes for the sales
#https://stackoverflow.com/questions/19914937/applying-function-with-multiple-arguments-to-create-a-new-pandas-column
df3['salesQuantity'] = np.multiply(df3['no_sale_info'], df3['sales'])

In [59]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity
0,Have lobster today?,Leonardo Lim,0:16,00:00:16,0,3,0,0,1,0
1,Good morning,David Koh,0:37,00:00:37,0,2,0,0,1,0
2,Hello hello,Shuganya Devi,0:42,00:00:42,0,2,0,0,1,0
3,I cctv.. U also know..,马小玲,2:42,00:02:42,0,5,1,0,1,0
4,👋,Helen Lee,2:52,00:02:52,0,1,0,0,1,0
5,👌,Helen Lee,3:20,00:03:20,0,1,0,0,1,0
6,How to preorder??,马小玲,3:40,00:03:40,0,3,0,0,1,0
7,Whatsapp 88896368,E-Beve,4:08,00:04:08,1,2,0,0,1,0
8,Noted..,马小玲,4:12,00:04:12,0,1,0,0,1,0
9,Lns,Lily Koh,5:05,00:05:05,0,1,1,0,1,0


In [60]:
#range of sales quantity
df3['salesQuantity'].value_counts()

0    123
1     20
2      4
Name: salesQuantity, dtype: int64

In [61]:
#total number of orders made
df3['salesQuantity'].sum()

28

In [62]:
va['salesQuantity'] = df3['salesQuantity'].sum()

In [63]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity
0,ebeveadmin/videos/331382632125428,21,1100,4246,21,515,6,28


## Products 

The seller will comment and post the unique product codes for each product. Thereafter, customers who are keen on purchasing the products will explicitly comment out the specific & unique product codes, in addition to the quanityt of each product that they wish to purchase.

**New Columns to identify the products purchased by the Customers**

Regex is used to identify the products being offered and the products being purchased as well.

In [64]:
#function to identify the code of the product bought
def sale2(comment):
    if re.search(r'(\w*)(\s?)[\-]?\+(\s?)(\d)', comment):
        return str(re.search(r'(\w*)(\s?)[\-]?\+(\s?)(\d)', comment).group(0)[:-2])
    else:
        return int(0)

In [65]:
#identifies all comments that have the codes of the products purchased by the Customers
#this column will be dropped afterwards.
df3['productBought'] = df3['postComment'].apply(lambda x:sale2(x))

In [66]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought
0,Have lobster today?,Leonardo Lim,0:16,00:00:16,0,3,0,0,1,0,0
1,Good morning,David Koh,0:37,00:00:37,0,2,0,0,1,0,0
2,Hello hello,Shuganya Devi,0:42,00:00:42,0,2,0,0,1,0,0
3,I cctv.. U also know..,马小玲,2:42,00:02:42,0,5,1,0,1,0,0
4,👋,Helen Lee,2:52,00:02:52,0,1,0,0,1,0,0
5,👌,Helen Lee,3:20,00:03:20,0,1,0,0,1,0,0
6,How to preorder??,马小玲,3:40,00:03:40,0,3,0,0,1,0,0
7,Whatsapp 88896368,E-Beve,4:08,00:04:08,1,2,0,0,1,0,0
8,Noted..,马小玲,4:12,00:04:12,0,1,0,0,1,0,0
9,Lns,Lily Koh,5:05,00:05:05,0,1,1,0,1,0,0


In [67]:
df3['productBought'].unique()

array([0, 'Stingray ', 'STINGRAY ', 'GOL', 'Gol', 'RS', 'TP', 'LP', 'CC',
       'CC +', 'Cc', 'TRB', 'CEN', 'Cen', 'GLASS', 'Glass', 'Ray +',
       'STR', 'Str', 'RE', 'SB', 'Sb', 'FLL', 'FLL ', 'ST', 'BBAG',
       'Bbag', 'Bag', 'WBT'], dtype=object)

**Change the produce codes to be uppercase for consistency**

In [68]:
#change the produce codes to be uppercase for consistency, and since python is case sensitive.
#https://stackoverflow.com/questions/39512002/convert-whole-dataframe-from-lower-case-to-upper-case-with-pandas
df3['productBought'] = df3['productBought'].astype(str).str.upper()

In [69]:
df3['productBought'].unique()

array(['0', 'STINGRAY ', 'GOL', 'RS', 'TP', 'LP', 'CC', 'CC +', 'TRB',
       'CEN', 'GLASS', 'RAY +', 'STR', 'RE', 'SB', 'FLL', 'FLL ', 'ST',
       'BBAG', 'BAG', 'WBT'], dtype=object)

In [70]:
df3['productBought'] = df3['productBought'].str.replace(pat='+', repl='', regex=True)

In [71]:
df3['productBought'].unique()

array(['0', 'STINGRAY ', 'GOL', 'RS', 'TP', 'LP', 'CC', 'CC ', 'TRB',
       'CEN', 'GLASS', 'RAY ', 'STR', 'RE', 'SB', 'FLL', 'FLL ', 'ST',
       'BBAG', 'BAG', 'WBT'], dtype=object)

Remove whitespaces at the end of the string

In [72]:
df3['productBought'] = df3['productBought'].str.rstrip()

In [73]:
df3['productBought'].unique()

array(['0', 'STINGRAY', 'GOL', 'RS', 'TP', 'LP', 'CC', 'TRB', 'CEN',
       'GLASS', 'RAY', 'STR', 'RE', 'SB', 'FLL', 'ST', 'BBAG', 'BAG',
       'WBT'], dtype=object)

In [74]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought
0,Have lobster today?,Leonardo Lim,0:16,00:00:16,0,3,0,0,1,0,0
1,Good morning,David Koh,0:37,00:00:37,0,2,0,0,1,0,0
2,Hello hello,Shuganya Devi,0:42,00:00:42,0,2,0,0,1,0,0
3,I cctv.. U also know..,马小玲,2:42,00:02:42,0,5,1,0,1,0,0
4,👋,Helen Lee,2:52,00:02:52,0,1,0,0,1,0,0
5,👌,Helen Lee,3:20,00:03:20,0,1,0,0,1,0,0
6,How to preorder??,马小玲,3:40,00:03:40,0,3,0,0,1,0,0
7,Whatsapp 88896368,E-Beve,4:08,00:04:08,1,2,0,0,1,0,0
8,Noted..,马小玲,4:12,00:04:12,0,1,0,0,1,0,0
9,Lns,Lily Koh,5:05,00:05:05,0,1,1,0,1,0,0


### Price of Products

**New Column to identify the price of the products**

This new column is created to identify the regex of the unique product codes and their corresponding prices, as adviced by the seller.

In [75]:
#products offered by the seller
df3[df3['postComment'].str.contains('(\$)(\s)?(.*)', regex=True)]

  df3[df3['postComment'].str.contains('(\$)(\s)?(.*)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought
20,GOL+1 Golden pomfret $6,E-Beve,11:00,00:11:00,1,4,0,1,0,0,GOL
26,RED SNAPPER 600G $11 RS+1,E-Beve,12:43,00:12:43,1,5,0,1,0,0,RS
30,TIGER PRAWN $18 TP+1,E-Beve,14:46,00:14:46,1,4,0,1,0,0,TP
40,LIVE PRAWNS $20 LP+1,E-Beve,17:21,00:17:21,1,4,0,1,0,0,LP
41,COCKLES $8 CC+1,E-Beve,19:10,00:19:10,1,3,0,1,0,0,CC
45,TERUBOK 700G $15 TRB+1,E-Beve,21:24,00:21:24,1,4,0,1,0,0,TRB
46,CENCARU SET OF5 $10 CEN+1,E-Beve,24:52,00:24:52,1,5,0,1,0,0,CEN
52,GLASS FISH 1KG $3.50 GLASS+1,E-Beve,27:18,00:27:18,1,5,0,1,0,0,GLASS
72,STINGRAY 800G $12 STR+1,E-Beve,32:41,00:32:41,1,4,0,1,0,0,STR
89,RED EMPEROR $18 RE+1,E-Beve,38:40,00:38:40,1,4,0,1,0,0,RE


In [76]:
def price(comment):
    if re.search(r'(\$)(\s)?(.*)', comment):
        return str(re.search(r'(\$)(\s)?(.*)', comment).group(0))
    else:
        return int(0)

In [77]:
df3['productPrice'] = df3['postComment'].apply(lambda x:price(x))

In [78]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,sales,no_sale_info,salesQuantity,productBought,productPrice
0,Have lobster today?,Leonardo Lim,0:16,00:00:16,0,3,0,0,1,0,0,0
1,Good morning,David Koh,0:37,00:00:37,0,2,0,0,1,0,0,0
2,Hello hello,Shuganya Devi,0:42,00:00:42,0,2,0,0,1,0,0,0
3,I cctv.. U also know..,马小玲,2:42,00:02:42,0,5,1,0,1,0,0,0
4,👋,Helen Lee,2:52,00:02:52,0,1,0,0,1,0,0,0
5,👌,Helen Lee,3:20,00:03:20,0,1,0,0,1,0,0,0
6,How to preorder??,马小玲,3:40,00:03:40,0,3,0,0,1,0,0,0
7,Whatsapp 88896368,E-Beve,4:08,00:04:08,1,2,0,0,1,0,0,0
8,Noted..,马小玲,4:12,00:04:12,0,1,0,0,1,0,0,0
9,Lns,Lily Koh,5:05,00:05:05,0,1,1,0,1,0,0,0


In [79]:
df3['productPrice'].unique()

array([0, '$6', '$11 RS+1', '$18 TP+1', '$20 LP+1', '$8 CC+1',
       '$15 TRB+1', '$10 CEN+1', '$3.50 GLASS+1', '$12 STR+1', '$18 RE+1',
       '$6.50 SB+1', '$6 FLL+1', '$20 ST+1', '$10 BBAG+1',
       '$26, $33, $35 DR26, DR33,DR35', '$44 WBT+1'], dtype=object)

In [80]:
df3.loc[20, 'productPrice']= 'GOL+1 $6'
df3.loc[123, 'productPrice']= '$26 DR26, $33 DR$33, $35 DR35'

In [81]:
df3['productPrice'].unique()

array([0, 'GOL+1 $6', '$11 RS+1', '$18 TP+1', '$20 LP+1', '$8 CC+1',
       '$15 TRB+1', '$10 CEN+1', '$3.50 GLASS+1', '$12 STR+1', '$18 RE+1',
       '$6.50 SB+1', '$6 FLL+1', '$20 ST+1', '$10 BBAG+1',
       '$26 DR26, $33 DR$33, $35 DR35', '$44 WBT+1'], dtype=object)

Excluding the comments where there was no product codes as adviced by the seller, we find the number of unique products offered by the seller.

In [82]:
#number of unique products offered by the seller
int(df3['productPrice'].nunique()) - int(1) + int(2) #since row 123 has 3 product codes instead of 1

18

In [83]:
#total number of products offered
va['numProducts'] = int(df3['productPrice'].nunique()) - int(1) + int(2)

In [84]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts
0,ebeveadmin/videos/331382632125428,21,1100,4246,21,515,6,28,18


**Drop irrelevant columns**

The following column was dropped for the following reasons:

1. 'postCommentTime'
- Since a new column 'postCommentTime_final' was created to ensure that a consistent timestamp of HH:MM:SS is used consistently throughout the dataframe, and the dataframe has been thereafter reindex and sorted in accordance to time in ascending order, we dropped the original inconsistent time column 'postCommentTime' as it had varying timestamp formats of HH:MM:SS, MM:SS and M:SS.

2. 'no_sale_info'
- This column 'notSeller' was solely created to calculate the quantity of sales made, and the products purchased by the customers. Hence, we are able to delete it after the quantity of sales have been calculated and the identification of the products purchased by the customers have been identified.

3. 'sales'
- This column identifies the quantity of products to the unique & specific product codes. However, it includes the comments which advises for the product sale information as well. Hence, this column was solely created to be multiplied against the column 'no_sale_info' to calculate the true quantity of sales made by customers only. Hence, we are able to delete it after the quantity of sales have been calculated.

In [85]:
#drop unwanted columns
df3.drop(['postCommentTime', 'no_sale_info', 'sales'], axis=1, inplace=True)

In [86]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought,productPrice
0,Have lobster today?,Leonardo Lim,00:00:16,0,3,0,0,0,0
1,Good morning,David Koh,00:00:37,0,2,0,0,0,0
2,Hello hello,Shuganya Devi,00:00:42,0,2,0,0,0,0
3,I cctv.. U also know..,马小玲,00:02:42,0,5,1,0,0,0
4,👋,Helen Lee,00:02:52,0,1,0,0,0,0


### Revenue from the sale of the products

**Dummify the products bought to find the revenue**

In [87]:
#getdummies the products bought
df3 = pd.get_dummies(df3, columns = ['productBought'], drop_first = True)

In [88]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_RAY,productBought_RE,productBought_RS,productBought_SB,productBought_ST,productBought_STINGRAY,productBought_STR,productBought_TP,productBought_TRB,productBought_WBT
0,Have lobster today?,Leonardo Lim,00:00:16,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Good morning,David Koh,00:00:37,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Hello hello,Shuganya Devi,00:00:42,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,I cctv.. U also know..,马小玲,00:02:42,0,5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,👋,Helen Lee,00:02:52,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
# iterating the columns
for col in df3.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_BAG
productBought_BBAG
productBought_CC
productBought_CEN
productBought_FLL
productBought_GLASS
productBought_GOL
productBought_LP
productBought_RAY
productBought_RE
productBought_RS
productBought_SB
productBought_ST
productBought_STINGRAY
productBought_STR
productBought_TP
productBought_TRB
productBought_WBT


After the column 'productBought' has been dummified, the cells will return a '1' if that particular item is bought, and a '0' if it is not. Hence, we will replace all the '1' with the price of the product.

Then, we will create a new revenue column which is a multiplication of the column 'salesQuantity' against the price of the product (i.e. the dummified 'productBought' columnns).

Product BAG

In [90]:
df3[df3['postComment'].str.contains('BAG', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_RAY,productBought_RE,productBought_RS,productBought_SB,productBought_ST,productBought_STINGRAY,productBought_STR,productBought_TP,productBought_TRB,productBought_WBT
117,Baby ang go li $10 BBAG+1,E-Beve,00:52:50,1,6,0,0,$10 BBAG+1,0,1,...,0,0,0,0,0,0,0,0,0,0
120,BBAG+1,Nani Baqawali,00:53:01,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


We noticed that there is no sale information with regards to the product code 'BAG'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'BAG'.

Product BBAG

In [91]:
df3[df3['postComment'].str.contains('BBAG', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_RAY,productBought_RE,productBought_RS,productBought_SB,productBought_ST,productBought_STINGRAY,productBought_STR,productBought_TP,productBought_TRB,productBought_WBT
117,Baby ang go li $10 BBAG+1,E-Beve,00:52:50,1,6,0,0,$10 BBAG+1,0,1,...,0,0,0,0,0,0,0,0,0,0
120,BBAG+1,Nani Baqawali,00:53:01,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [92]:
df3['productBought_BBAG'] = df3['productBought_BBAG'].map(lambda x:float(10.00) if x == int(1) else 0)

In [93]:
df3['revenue_BBAG'] = np.multiply(df3['productBought_BBAG'], df3['salesQuantity'])

In [94]:
revenue_BBAG = "The total revenue from the sale of the product {} is ${}". format ("BBAG", format(df3['revenue_BBAG'].sum(), '.2f'))
print(revenue_BBAG)


The total revenue from the sale of the product BBAG is $20.00


Product CC

In [95]:
df3[df3['postComment'].str.contains('CC', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_RE,productBought_RS,productBought_SB,productBought_ST,productBought_STINGRAY,productBought_STR,productBought_TP,productBought_TRB,productBought_WBT,revenue_BBAG
41,COCKLES $8 CC+1,E-Beve,00:19:10,1,3,0,0,$8 CC+1,0,0.0,...,0,0,0,0,0,0,0,0,0,0.0
42,CC+1,Aisha Ahmad,00:19:20,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0.0
43,CC + 1,David Koh,00:19:26,0,3,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0.0


In [96]:
df3['productBought_CC'] = df3['productBought_CC'].map(lambda x:float(8.00) if x == int(1) else 0)

In [97]:
df3['revenue_CC'] = np.multiply(df3['productBought_CC'], df3['salesQuantity'])

In [98]:
revenue_CC = "The total revenue from the sale of the product {} is ${}". format ("CC", format(df3['revenue_CC'].sum(), '.2f'))
print(revenue_CC)


The total revenue from the sale of the product CC is $24.00


Product CEN

In [99]:
df3[df3['postComment'].str.contains('CEN', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_RS,productBought_SB,productBought_ST,productBought_STINGRAY,productBought_STR,productBought_TP,productBought_TRB,productBought_WBT,revenue_BBAG,revenue_CC
46,CENCARU SET OF5 $10 CEN+1,E-Beve,00:24:52,1,5,0,0,$10 CEN+1,0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
48,CEN+1,Aisha Ahmad,00:24:57,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [100]:
df3['productBought_CEN'] = df3['productBought_CEN'].map(lambda x:float(10.00) if x == int(1) else 0)

In [101]:
df3['revenue_CEN'] = np.multiply(df3['productBought_CEN'], df3['salesQuantity'])

In [102]:
revenue_CEN = "The total revenue from the sale of the product {} is ${}". format ("CEN", format(df3['revenue_CEN'].sum(), '.2f'))
print(revenue_CEN)


The total revenue from the sale of the product CEN is $20.00


Product FLL

In [103]:
df3[df3['postComment'].str.contains('FLL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_SB,productBought_ST,productBought_STINGRAY,productBought_STR,productBought_TP,productBought_TRB,productBought_WBT,revenue_BBAG,revenue_CC,revenue_CEN
105,FLOWER CLAM 700G $6 FLL+1,E-Beve,00:47:03,1,5,0,0,$6 FLL+1,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
107,FLL +1,Stefanie Teo,00:47:20,0,2,0,1,0,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
108,Sis I comment FLL,Stefanie Teo,00:47:45,0,4,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [104]:
df3['productBought_FLL'] = df3['productBought_FLL'].map(lambda x:float(6.00) if x == int(1) else 0)

In [105]:
df3['revenue_FLL'] = np.multiply(df3['productBought_FLL'], df3['salesQuantity'])

In [106]:
revenue_FLL = "The total revenue from the sale of the product {} is ${}". format ("FLL", format(df3['revenue_FLL'].sum(), '.2f'))
print(revenue_FLL)


The total revenue from the sale of the product FLL is $6.00


Product GLASS

In [107]:
df3[df3['postComment'].str.contains('GLASS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_ST,productBought_STINGRAY,productBought_STR,productBought_TP,productBought_TRB,productBought_WBT,revenue_BBAG,revenue_CC,revenue_CEN,revenue_FLL
52,GLASS FISH 1KG $3.50 GLASS+1,E-Beve,00:27:18,1,5,0,0,$3.50 GLASS+1,0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
86,Repeat STR+2..GLASS FISH 1KG..THKS,Kham N Ash Koh,00:37:34,0,4,0,2,0,0,0.0,...,0,0,1,0,0,0,0.0,0.0,0.0,0.0


In [108]:
df3['productBought_GLASS'] = df3['productBought_GLASS'].map(lambda x:float(3.50) if x == int(1) else 0)

In [109]:
df3['revenue_GLASS'] = np.multiply(df3['productBought_GLASS'], df3['salesQuantity'])

In [110]:
revenue_GLASS = "The total revenue from the sale of the product {} is ${}". format ("GLASS", format(df3['revenue_GLASS'].sum(), '.2f'))
print(revenue_GLASS)


The total revenue from the sale of the product GLASS is $3.50


Product GOL

In [111]:
df3[df3['postComment'].str.contains('GOL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_STINGRAY,productBought_STR,productBought_TP,productBought_TRB,productBought_WBT,revenue_BBAG,revenue_CC,revenue_CEN,revenue_FLL,revenue_GLASS
20,GOL+1 Golden pomfret $6,E-Beve,00:11:00,1,4,0,0,GOL+1 $6,0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
21,GOL+1,Helen Lee,00:11:26,0,1,0,1,0,0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [112]:
df3['productBought_GOL'] = df3['productBought_GOL'].map(lambda x:float(6.00) if x == int(1) else 0)

In [113]:
df3['revenue_GOL'] = np.multiply(df3['productBought_GOL'], df3['salesQuantity'])

In [114]:
revenue_GOL = "The total revenue from the sale of the product {} is ${}". format ("GOL", format(df3['revenue_GOL'].sum(), '.2f'))
print(revenue_GOL)


The total revenue from the sale of the product GOL is $12.00


Product LP

In [115]:
df3[df3['postComment'].str.contains('LP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_STR,productBought_TP,productBought_TRB,productBought_WBT,revenue_BBAG,revenue_CC,revenue_CEN,revenue_FLL,revenue_GLASS,revenue_GOL
40,LIVE PRAWNS $20 LP+1,E-Beve,00:17:21,1,4,0,0,$20 LP+1,0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
df3['productBought_LP'] = df3['productBought_LP'].map(lambda x:float(20.00) if x == int(1) else 0)

In [117]:
df3['revenue_LP'] = np.multiply(df3['productBought_LP'], df3['salesQuantity'])

In [118]:
revenue_LP = "The total revenue from the sale of the product {} is ${}". format ("LP", format(df3['revenue_LP'].sum(), '.2f'))
print(revenue_LP)


The total revenue from the sale of the product LP is $0.00


Product RAY

In [119]:
df3[df3['postComment'].str.contains('RAY', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_TP,productBought_TRB,productBought_WBT,revenue_BBAG,revenue_CC,revenue_CEN,revenue_FLL,revenue_GLASS,revenue_GOL,revenue_LP
16,STINGRAY +1,E-Beve,00:08:25,1,2,0,1,0,0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,STINGRAY 800G $12 STR+1,E-Beve,00:32:41,1,4,0,0,$12 STR+1,0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We noticed that there is no sale information with regards to the product code 'RAY'. Perhaps a wrong product code has been typed out and posted. Hence, we will exclude the sales related to the code 'RAY'.

Product RE

In [120]:
df3[df3['postComment'].str.contains('RE', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_TP,productBought_TRB,productBought_WBT,revenue_BBAG,revenue_CC,revenue_CEN,revenue_FLL,revenue_GLASS,revenue_GOL,revenue_LP
26,RED SNAPPER 600G $11 RS+1,E-Beve,00:12:43,1,5,0,0,$11 RS+1,0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89,RED EMPEROR $18 RE+1,E-Beve,00:38:40,1,4,0,0,$18 RE+1,0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
df3['productBought_RE'] = df3['productBought_RE'].map(lambda x:float(11.00) if x == int(1) else 0)

In [122]:
df3['revenue_RE'] = np.multiply(df3['productBought_RE'], df3['salesQuantity'])

In [123]:
revenue_RE = "The total revenue from the sale of the product {} is ${}". format ("RE", format(df3['revenue_RE'].sum(), '.2f'))
print(revenue_RE)


The total revenue from the sale of the product RE is $0.00


Product RS

In [124]:
df3[df3['postComment'].str.contains('RS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_TRB,productBought_WBT,revenue_BBAG,revenue_CC,revenue_CEN,revenue_FLL,revenue_GLASS,revenue_GOL,revenue_LP,revenue_RE
26,RED SNAPPER 600G $11 RS+1,E-Beve,00:12:43,1,5,0,0,$11 RS+1,0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,RS+1,Aisha Ahmad,00:13:31,0,1,0,1,0,0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [125]:
df3['productBought_RS'] = df3['productBought_RS'].map(lambda x:float(11.00) if x == int(1) else 0)

In [126]:
df3['revenue_RS'] = np.multiply(df3['productBought_RS'], df3['salesQuantity'])

In [127]:
revenue_RS = "The total revenue from the sale of the product {} is ${}". format ("RS", format(df3['revenue_RS'].sum(), '.2f'))
print(revenue_RS)


The total revenue from the sale of the product RS is $11.00


Product SB

In [128]:
df3[df3['postComment'].str.contains('SB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,productBought_WBT,revenue_BBAG,revenue_CC,revenue_CEN,revenue_FLL,revenue_GLASS,revenue_GOL,revenue_LP,revenue_RE,revenue_RS
96,SEABASS $6.50 SB+1,E-Beve,00:42:41,1,3,0,0,$6.50 SB+1,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,SB+2,Kham N Ash Koh,00:42:58,0,1,0,2,0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,"SB+2, 1 fillet n 1 cut steak",Stefanie Teo,00:44:01,0,7,0,2,0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [129]:
df3['productBought_SB'] = df3['productBought_SB'].map(lambda x:float(6.50) if x == int(1) else 0)

In [130]:
df3['revenue_SB'] = np.multiply(df3['productBought_SB'], df3['salesQuantity'])

In [131]:
revenue_SB = "The total revenue from the sale of the product {} is ${}". format ("SB", format(df3['revenue_SB'].sum(), '.2f'))
print(revenue_SB)


The total revenue from the sale of the product SB is $32.50


Product ST

In [132]:
df3[df3['postComment'].str.contains('ST', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,revenue_BBAG,revenue_CC,revenue_CEN,revenue_FLL,revenue_GLASS,revenue_GOL,revenue_LP,revenue_RE,revenue_RS,revenue_SB
16,STINGRAY +1,E-Beve,00:08:25,1,2,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,STINGRAY 800G $12 STR+1,E-Beve,00:32:41,1,4,0,0,$12 STR+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73,STR+1,Nani Baqawali,00:33:01,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,Repeat STR+2..GLASS FISH 1KG..THKS,Kham N Ash Koh,00:37:34,0,4,0,2,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112,SOTONG 1.15KG $20 ST+1,E-Beve,00:49:17,1,4,0,0,$20 ST+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
df3['productBought_ST'] = df3['productBought_ST'].map(lambda x:float(20.00) if x == int(1) else 0)

In [134]:
df3['revenue_ST'] = np.multiply(df3['productBought_ST'], df3['salesQuantity'])

In [135]:
revenue_ST = "The total revenue from the sale of the product {} is ${}". format ("ST", format(df3['revenue_ST'].sum(), '.2f'))
print(revenue_ST)


The total revenue from the sale of the product ST is $0.00


Product STINGRAY

In [136]:
df3[df3['postComment'].str.contains('STINGRAY', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,revenue_CC,revenue_CEN,revenue_FLL,revenue_GLASS,revenue_GOL,revenue_LP,revenue_RE,revenue_RS,revenue_SB,revenue_ST
16,STINGRAY +1,E-Beve,00:08:25,1,2,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,STINGRAY 800G $12 STR+1,E-Beve,00:32:41,1,4,0,0,$12 STR+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We asssume that the customer has typed out the full name of the product instead of the product code for the product. 

In [137]:
df3['productBought_STINGRAY'] = df3['productBought_STINGRAY'].map(lambda x:float(12.00) if x == int(1) else 0)

In [138]:
df3['revenue_STINGRAY'] = np.multiply(df3['productBought_STINGRAY'], df3['salesQuantity'])

In [139]:
revenue_STINGRAY = "The total revenue from the sale of the product {} is ${}". format ("STINGRAY", format(df3['revenue_STINGRAY'].sum(), '.2f'))
print(revenue_STINGRAY)


The total revenue from the sale of the product STINGRAY is $24.00


Product STR

In [140]:
df3[df3['postComment'].str.contains('STR', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,revenue_CEN,revenue_FLL,revenue_GLASS,revenue_GOL,revenue_LP,revenue_RE,revenue_RS,revenue_SB,revenue_ST,revenue_STINGRAY
72,STINGRAY 800G $12 STR+1,E-Beve,00:32:41,1,4,0,0,$12 STR+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73,STR+1,Nani Baqawali,00:33:01,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,Repeat STR+2..GLASS FISH 1KG..THKS,Kham N Ash Koh,00:37:34,0,4,0,2,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:
df3['productBought_STR'] = df3['productBought_STR'].map(lambda x:float(12.00) if x == int(1) else 0)

In [142]:
df3['revenue_STR'] = np.multiply(df3['productBought_STR'], df3['salesQuantity'])

In [143]:
revenue_STR = "The total revenue from the sale of the product {} is ${}". format ("STR", format(df3['revenue_STR'].sum(), '.2f'))
print(revenue_STR)


The total revenue from the sale of the product STR is $60.00


Product TP

In [144]:
df3[df3['postComment'].str.contains('TP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,revenue_FLL,revenue_GLASS,revenue_GOL,revenue_LP,revenue_RE,revenue_RS,revenue_SB,revenue_ST,revenue_STINGRAY,revenue_STR
30,TIGER PRAWN $18 TP+1,E-Beve,00:14:46,1,4,0,0,$18 TP+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,TP+1,Aisha Ahmad,00:14:56,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
df3['productBought_TP'] = df3['productBought_TP'].map(lambda x:float(18.00) if x == int(1) else 0)

In [146]:
df3['revenue_TP'] = np.multiply(df3['productBought_TP'], df3['salesQuantity'])

In [147]:
revenue_TP = "The total revenue from the sale of the product {} is ${}". format ("TP", format(df3['revenue_TP'].sum(), '.2f'))
print(revenue_TP)


The total revenue from the sale of the product TP is $18.00


Product TRB

In [148]:
df3[df3['postComment'].str.contains('TRB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,revenue_GLASS,revenue_GOL,revenue_LP,revenue_RE,revenue_RS,revenue_SB,revenue_ST,revenue_STINGRAY,revenue_STR,revenue_TP
45,TERUBOK 700G $15 TRB+1,E-Beve,00:21:24,1,4,0,0,$15 TRB+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
df3['productBought_TRB'] = df3['productBought_TRB'].map(lambda x:float(15.00) if x == int(1) else 0)

In [150]:
df3['revenue_TRB'] = np.multiply(df3['productBought_TRB'], df3['salesQuantity'])

In [151]:
revenue_TRB = "The total revenue from the sale of the product {} is ${}". format ("TRB", format(df3['revenue_TRB'].sum(), '.2f'))
print(revenue_TRB)


The total revenue from the sale of the product TRB is $0.00


Product WBT

In [152]:
df3[df3['postComment'].str.contains('WBT', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,revenue_GOL,revenue_LP,revenue_RE,revenue_RS,revenue_SB,revenue_ST,revenue_STINGRAY,revenue_STR,revenue_TP,revenue_TRB
132,WHOLE BATANG 2.8KG $44 WBT+1,E-Beve,01:01:43,1,5,0,0,$44 WBT+1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133,WBT+1,Shirley Ang,01:01:46,0,1,0,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
df3['productBought_WBT'] = df3['productBought_WBT'].map(lambda x:float(44.00) if x == int(1) else 0)

In [154]:
df3['revenue_WBT'] = np.multiply(df3['productBought_WBT'], df3['salesQuantity'])

In [155]:
revenue_WBT = "The total revenue from the sale of the product {} is ${}". format ("WBT", format(df3['revenue_WBT'].sum(), '.2f'))
print(revenue_WBT)


The total revenue from the sale of the product WBT is $44.00


In [156]:
# iterating the columns
for col in df3.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_BAG
productBought_BBAG
productBought_CC
productBought_CEN
productBought_FLL
productBought_GLASS
productBought_GOL
productBought_LP
productBought_RAY
productBought_RE
productBought_RS
productBought_SB
productBought_ST
productBought_STINGRAY
productBought_STR
productBought_TP
productBought_TRB
productBought_WBT
revenue_BBAG
revenue_CC
revenue_CEN
revenue_FLL
revenue_GLASS
revenue_GOL
revenue_LP
revenue_RE
revenue_RS
revenue_SB
revenue_ST
revenue_STINGRAY
revenue_STR
revenue_TP
revenue_TRB
revenue_WBT


**Sum of total revenue from the video**

In [157]:
#total revenue from the video
total_revenue = df3.loc[:, 'revenue_BBAG': 'revenue_WBT'].values.sum()

#round the total revenue to 2 decimals places
total_revenue_rounded = format(total_revenue, '.2f')

print(f"The total revenue for the sale of products for the video is ${total_revenue_rounded}")


The total revenue for the sale of products for the video is $275.00


In [158]:
va['totalRevenue'] = total_revenue_rounded
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue
0,ebeveadmin/videos/331382632125428,21,1100,4246,21,515,6,28,18,275.0


**New Column for the total revenue at that comment**

In [159]:
#https://stackoverflow.com/questions/42063716/pandas-sum-up-multiple-columns-into-one-column-without-last-column
df3['revenue'] = df3.loc[:, 'revenue_BBAG': 'revenue_WBT'].sum(axis=1)
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BAG,productBought_BBAG,...,revenue_RE,revenue_RS,revenue_SB,revenue_ST,revenue_STINGRAY,revenue_STR,revenue_TP,revenue_TRB,revenue_WBT,revenue
0,Have lobster today?,Leonardo Lim,00:00:16,0,3,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Good morning,David Koh,00:00:37,0,2,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Hello hello,Shuganya Devi,00:00:42,0,2,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,I cctv.. U also know..,马小玲,00:02:42,0,5,1,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,👋,Helen Lee,00:02:52,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,👌,Helen Lee,00:03:20,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,How to preorder??,马小玲,00:03:40,0,3,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Whatsapp 88896368,E-Beve,00:04:08,1,2,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Noted..,马小玲,00:04:12,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Lns,Lily Koh,00:05:05,0,1,1,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [160]:
#https://www.geeksforgeeks.org/how-to-move-a-column-to-first-position-in-pandas-dataframe/
#shift the revenue column to be the 8th column, i.e at position 7
eighth_column = df3.pop('revenue')

# insert column using insert(position,column_name,ninth_column) function
df3.insert(7, 'revenue', eighth_column)
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,productPrice,productBought_BAG,...,revenue_LP,revenue_RE,revenue_RS,revenue_SB,revenue_ST,revenue_STINGRAY,revenue_STR,revenue_TP,revenue_TRB,revenue_WBT
0,Have lobster today?,Leonardo Lim,00:00:16,0,3,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Good morning,David Koh,00:00:37,0,2,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Hello hello,Shuganya Devi,00:00:42,0,2,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,I cctv.. U also know..,马小玲,00:02:42,0,5,1,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,👋,Helen Lee,00:02:52,0,1,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,👌,Helen Lee,00:03:20,0,1,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,How to preorder??,马小玲,00:03:40,0,3,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Whatsapp 88896368,E-Beve,00:04:08,1,2,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Noted..,马小玲,00:04:12,0,1,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Lns,Lily Koh,00:05:05,0,1,1,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


After the revenue of the sales have been calculated, all the dummified product columns and the column 'productPrice' will be dropped as they are no longer required to be checked against to identify the price of the product.

In [161]:
df3 = df3.loc[: ,'postComment':'revenue']
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue
0,Have lobster today?,Leonardo Lim,00:00:16,0,3,0,0,0.0
1,Good morning,David Koh,00:00:37,0,2,0,0,0.0
2,Hello hello,Shuganya Devi,00:00:42,0,2,0,0,0.0
3,I cctv.. U also know..,马小玲,00:02:42,0,5,1,0,0.0
4,👋,Helen Lee,00:02:52,0,1,0,0,0.0
5,👌,Helen Lee,00:03:20,0,1,0,0,0.0
6,How to preorder??,马小玲,00:03:40,0,3,0,0,0.0
7,Whatsapp 88896368,E-Beve,00:04:08,1,2,0,0,0.0
8,Noted..,马小玲,00:04:12,0,1,0,0,0.0
9,Lns,Lily Koh,00:05:05,0,1,1,0,0.0


**New Column to identify the frequency of the seller's comments in the video**

In [162]:
#frequency of seller's comments
va['frequencySeller']= np.divide(va['videoLength'].iloc[0],va['numSellerComments'])
#seller's comment appears on average of every 202 seconds

In [163]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller
0,ebeveadmin/videos/331382632125428,21,1100,4246,21,515,6,28,18,275.0,202.190476


**New Column to identify the seller**

In [164]:
df3['seller'] = 'ebeveadmin'

In [165]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Have lobster today?,Leonardo Lim,00:00:16,0,3,0,0,0.0,ebeveadmin
1,Good morning,David Koh,00:00:37,0,2,0,0,0.0,ebeveadmin
2,Hello hello,Shuganya Devi,00:00:42,0,2,0,0,0.0,ebeveadmin
3,I cctv.. U also know..,马小玲,00:02:42,0,5,1,0,0.0,ebeveadmin
4,👋,Helen Lee,00:02:52,0,1,0,0,0.0,ebeveadmin


**New Column for the Average Compound Score from Sentiment Analysis on raw & uncleaned comments for video**

In [166]:
# Instantiate Sentiment Intensity Analyzer
sent = SentimentIntensityAnalyzer()

In [167]:
df3['sentiment_score'] = df3['postComment'].apply(sent.polarity_scores)
df3['compound'] = [sent.polarity_scores(x)['compound'] for x in df3['postComment']]
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller,sentiment_score,compound
0,Have lobster today?,Leonardo Lim,00:00:16,0,3,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
1,Good morning,David Koh,00:00:37,0,2,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 0.256, 'pos': 0.744, 'compound': 0.4404}",0.4404
2,Hello hello,Shuganya Devi,00:00:42,0,2,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
3,I cctv.. U also know..,马小玲,00:02:42,0,5,1,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
4,👋,Helen Lee,00:02:52,0,1,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}",0.0


In [168]:
#average compound scores for the video
#df3.shape[0] calculates the total number of rows in the dataframe
va['averageCompound']= (df3['compound'].sum())/(df3['compound'].sum())/df3.shape[0]
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller,averageCompound
0,ebeveadmin/videos/331382632125428,21,1100,4246,21,515,6,28,18,275.0,202.190476,0.006803


In [169]:
#drop the columns with regarding to the sentiment analysis on the raw & uncleaned comments
#as we will perform sentiment analysis at the comment level on processed comments
df3 = df3.loc[: ,'postComment':'seller']
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Have lobster today?,Leonardo Lim,00:00:16,0,3,0,0,0.0,ebeveadmin
1,Good morning,David Koh,00:00:37,0,2,0,0,0.0,ebeveadmin
2,Hello hello,Shuganya Devi,00:00:42,0,2,0,0,0.0,ebeveadmin
3,I cctv.. U also know..,马小玲,00:02:42,0,5,1,0,0.0,ebeveadmin
4,👋,Helen Lee,00:02:52,0,1,0,0,0.0,ebeveadmin


### Saving the cleaned dataframes

In [170]:
# export to csv - change the name of the data file for each video
va.to_csv('../../data/cleaned_data/cleaned_va_ebeveadmin_331382632125428.csv', index=False)

In [171]:
#check for nulls
#displaying only the columns with nulls and their sum
df3[df3.columns[df3.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [172]:
# export to csv - change the name of the data file for each video
df3.to_csv('../../data/cleaned_data/cleaned_ebeveadmin_331382632125428.csv', index=False)