# Data Import & Cleaning

### Contents:

- [Data Import](#Data-Import)
- [Data Cleaning](#Data-Cleaning)
- [Feature Engineering](#Feature-Engineering)

### Import Libraries

In [1]:
#import standard libraries
import pandas as pd
import numpy as np

#import emoji
import emoji

from natsort import natsorted, index_natsorted, order_by_index

#import warnings to ignore flags when the project is complete
#import warnings
#warnings.filterwarnings('ignore')

#import pre-processing libraries for data cleaning
import string
import re
import nltk
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Data Import

**Read scrapped data for the following videos**

In [2]:
va = pd.read_csv('../../data/scrapped_data/va_ebeveadmin_241945361307367.csv')

In [3]:
va

Unnamed: 0,video_for,totalEmojiReaction,views
0,ebeveadmin/videos/241945361307367,46,1.8K


In [4]:
#retrieve the number of views for the video
va['views'].iloc[0]

'1.8K'

In [5]:
#drop the K and replace it with 2 '0's behind
va['views'] = va['views'].str.replace("K", "00", regex=True)
#drop the dot
va['views'] = va['views'].str.replace(".", "", regex=True)
#change the string to be an integer
va['views'] = int(va['views'].iloc[0])

In [6]:
va

Unnamed: 0,video_for,totalEmojiReaction,views
0,ebeveadmin/videos/241945361307367,46,1800


In [7]:
va.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   video_for           1 non-null      object
 1   totalEmojiReaction  1 non-null      int64 
 2   views               1 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 152.0+ bytes


In [8]:
df = pd.read_csv('../../data/scrapped_data/ebeveadmin_241945361307367.csv', encoding='utf-8')

In [9]:
df

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
0,Arigato,き リーサン,0:00
1,Goodnite,き リーサン,0:00
2,Live+1,Tan Poh Kim Irene,1:00:23
3,Code For ang kar hia pls!,E-Beve,1:00:27
4,Monsoon season,Richard Ling,1:00:37
5,WAK+1,Lily Koh,1:00:42
6,China quality,Eaden Peh,1:01:02
7,Any small grouper or seabass?,Tan Poh Kim Irene,1:02:01
8,【Product】UNAGI - S$16.00 | Keyword: UNA,Eileen Fok,1:02:08
9,Flower crab can help to chop and clean. Thanks.,E-Beve,1:02:22


In [10]:
#https://stackoverflow.com/questions/32072076/find-the-unique-values-in-a-column-and-then-sort-them
#check if the seller uses multiple accounts to reply
postCommentAuthor_unique = df['postCommentAuthor'].unique()
print(sorted(postCommentAuthor_unique))

['Ad JC', 'Alex Ong', 'Alex Tan Jinhuang', 'Amir Abdul Majid', 'Angela Yeo', 'Aysha Khamarudin Al Takhi', 'Catherine Chua', 'Chelsia Lee', 'Chonghao Goh', 'E-Beve', 'Eaden Peh', 'Eileen Fok', 'Ellie Lee', 'Ernest Tan', 'Eugene Tan', 'Irene Kan', 'Jeffrey Ng', 'Jess Lim', 'Jimmy Chang', 'Justina Tan', 'Kham N Ash Koh', 'Li Li', 'Lily Koh', 'Lukie Neo', 'Max Boimax', 'Mike Tan', 'Min Xuan', "N'Ridz Kayla", 'Pauline Ang', 'Richard Ling', 'Shedah Rahman', 'Shuganya Devi', 'Simon Teo', 'Snowy Sue', 'Su Panda', "Sumi Adam's", 'Tan Poh Kim Irene', 'Veon Veon', 'Wong Chow Ching', 'Yap Yip', 'Zamzarina Hashim', 'اسماعيل ڤسچل', 'き リーサン', '梁文斌']


In [11]:
#find comments posted by the seller only
df.loc[df['postCommentAuthor'] == 'E-Beve']

Unnamed: 0,postComment,postCommentAuthor,postCommentTime
3,Code For ang kar hia pls!,E-Beve,1:00:27
9,Flower crab can help to chop and clean. Thanks.,E-Beve,1:02:22
21,Got 海瓜子?,E-Beve,1:06:10
26,FLL+2,E-Beve,1:08:47
34,TP+1,E-Beve,1:11:45
37,【Product】Kukup Golden Pomfret (400g) - S$6.50 | Keyword: GOL,E-Beve,1:13:36
38,FLL+1,E-Beve,1:14:01
39,【Product】Wild Grouper (850g) - S$16.00 | Keyword: WG16,E-Beve,1:14:31
41,【Product】Wild Grouper (850g) - S$16.00 | Keyword: WG16,E-Beve,1:14:39
42,【Product】Wild Grouper Head (Half) - S$5.00 | Keyword: WGH,E-Beve,1:14:52


In [12]:
#check the comments to have a gauge
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' Congratulations', '4Kg多少钱', '<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 k7cz35w2 bsnbvmp4"><img alt="👏🏼" height="32" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/t13/2/32/1f44f_1f3fc.png" width="32"/></span><span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 k7cz35w2 bsnbvmp4"><img alt="👏🏼" height="32" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/t13/2/32/1f44f_1f3fc.png" width="32"/></span>', '<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 k7cz35w2 bsnbvmp4"><img alt="🦀" height="32" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/te2/2/32/1f980.png" width="32"/></span>', 'After tis can see sotong..need to rest not feeling well..thks', 'All seafoods are fresh ', 'All your supporters ', 'Any small grouper or seabass?', 'Any snap

## Data Cleaning

### Convert the emojis to text for easy cleaning

We noticed that the emojis have html parsers attached to it. Hence, we will convert the images of the emojis to text first, to remove the html parsers to the emojis, while retaining the emoji's text. We will convert it back to emoji afterwards.

In [13]:
df['postComment'] = df['postComment'].apply(emoji.demojize)

In [14]:
postComment_unique = df['postComment'].unique()
print(sorted(postComment_unique))

[' Congratulations', '4Kg多少钱', '<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 k7cz35w2 bsnbvmp4"><img alt=":clapping_hands_medium-light_skin_tone:" height="32" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/t13/2/32/1f44f_1f3fc.png" width="32"/></span><span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 k7cz35w2 bsnbvmp4"><img alt=":clapping_hands_medium-light_skin_tone:" height="32" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/t13/2/32/1f44f_1f3fc.png" width="32"/></span>', '<span class="pq6dq46d tbxw36s4 knj5qynh kvgmc6g5 ditlmg2l oygrvhab nvdbi5me sf5mxxl7 k7cz35w2 bsnbvmp4"><img alt=":crab:" height="32" referrerpolicy="origin-when-cross-origin" src="https://static.xx.fbcdn.net/images/emoji.php/v9/te2/2/32/1f980.png" width="32"/></span>', 'After tis can see sotong..need to rest not feeling well..thks', 'All seafoods 

### Clean the comments using Regex

From the above unique values of the comments, there are several cleaning issues that needs to be addressed.

    1. Removal of links or URLs. 
        - URLs are present when the Facebook users manually type a link in the comment. 
        - Additionally, when users are tagged, their Facebook profile URL is printed as a result.
        - Similarly, there is a unique URL linked to each emoji as well. 
        
    2. Removal of HTML special entities
        - Examples of HTML special entities are '&amp' and '&gt'. 
        
    3. Removal of other HTML special terms
        - Examples of other HTML special entities are whitespace HTML special entities like '#x200B' and '#xa0'.
        
    4. Removal of HTML Parsers to the Emojis
        - Previously, the emojis have been demojized to text already. However, the HTML parsers to the mojis remain. Hence, they are required to be removed as well.
        
    5. Removal of HTML Parcers to Whitespaces
        - When a next line is entered in the same comment, there will be a HTML parser to the this whitespace. Hence, this are to be removed as well.
        
    6. Removal of HTML Parsers to tagged names
        - When users are tagged in the comments, in addition to their Facebook profile URL, there will be HTML parsers to the tagged names as well. Hence, this are to be removed as well.
        
    7. Removsl of other HTML Parsers   

In [15]:
def clean(row):
    
    # Remove links or URLs
    row['postComment'] = re.sub(
        pattern=r'https?:\/\/.*\.png', 
        repl='', 
        string=row['postComment'],
        flags=re.M)
    
    # Remove HTML special entities (e.g.. &amp, &gt;)
    row['postComment'] = re.sub(
        pattern=r'\&\w*;',
        repl='',
        string=row['postComment'])    
    
    # Remove emoji html parsers
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'<span\sclass=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\">',
        repl='', 
        string=row['postComment'],
        flags=re.M)
 
    # Remove emoji html parsers
    row['postComment'] = re.sub(
        pattern = r'<img\salt=\"(\:\w*)(\-?)(\w*\:)\"\s[a-z]{6}\=\"\d\d\"\s[a-z]{14}\=.{26}\s[a-z]{3}\=\"\"\s[a-z]{5}\=\"\d\d\"\/>(<\/span>)?',
        repl=r'\1\2\3',
        string=row['postComment'],
        flags=re.M)
    
    
    # Remove whitespaces
    row['postComment'] = re.sub(
        pattern=r'<\/div><div\sdir\=\"auto\"\sstyle\=\"text\-align\:\sstart\;\">',
        repl=' ',
        string=row['postComment'],
        flags=re.M)
    
    # Remove tagged names
    row['postComment'] = re.sub(
        pattern=r'<a\sclass=\"([a-z0-9]{8}\s)+[a-z0-9]{8}\"\shref=\">',
        repl='',
        string=row['postComment'],
        flags=re.M)
    

    # Remove consecutive non-ASCII characters
    # This will remove the chinese comments
    #https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454
    row['postComment'] = re.sub(
        pattern=r'[^\x00-\x7F]+', 
        repl=' ', 
        string=row['postComment'],
        flags=re.M)
    
    return row

In [16]:
df2 = df.apply(clean, axis=1)

In [17]:
postComment_unique2 = df2['postComment'].unique()
print(sorted(postComment_unique2))

[' ', ' :two_hearts:', ' Congratulations', ' Product 2 pc Live mudcrabs (500g-600g each) - S$32.00 | Keyword: MC32', ' Product Baby Ang Go Li (1kg) - S$10.00 | Keyword: BBAG', ' Product Brown Stripe Snapper  - S$10.00 | Keyword: BSS', ' Product Brown Stripe Snapper  - S$10.00 | Keyword: BSS  Buyer Wong Chow Ching +1, Lukie Neo +1, Eileen Fok +1', ' Product Fish Eggs (500g) - S$10.00 | Keyword: FE', ' Product Flower Clams (Indonesia) - S$8.00 | Keyword: FLL', ' Product Fresh Flower Crabs (3-4pc) - S$20.00 | Keyword: FC', ' Product Gonggong - S$6.00 | Keyword: GG', ' Product Jiu Lor/Javelin Grunther - S$7.00 | Keyword: JL', ' Product Kukup Golden Pomfret (400g) - S$6.50 | Keyword: GOL', ' Product Kukup Red Snapper (500g-600g) - S$9.00 | Keyword: RS', ' Product Kukup Seabass (600g) - S$5.00 | Keyword: SB', ' Product LIVE PRAWN - S$20.00 | Keyword: LIVE', ' Product Lai Man Fish - S$5.00 | Keyword: LMF', ' Product Live mudcrabs (800g-900g each) - S$30.00 | Keyword: MCL', ' Product MUSSEL (O

**Convert encoded emoji text back to emojis**

In [18]:
df2['postComment'] = df2['postComment'].apply(emoji.emojize)

In [19]:
postComment_unique2 = df2['postComment'].unique()
print(sorted(postComment_unique2))

[' ', ' Congratulations', ' Product 2 pc Live mudcrabs (500g-600g each) - S$32.00 | Keyword: MC32', ' Product Baby Ang Go Li (1kg) - S$10.00 | Keyword: BBAG', ' Product Brown Stripe Snapper  - S$10.00 | Keyword: BSS', ' Product Brown Stripe Snapper  - S$10.00 | Keyword: BSS  Buyer Wong Chow Ching +1, Lukie Neo +1, Eileen Fok +1', ' Product Fish Eggs (500g) - S$10.00 | Keyword: FE', ' Product Flower Clams (Indonesia) - S$8.00 | Keyword: FLL', ' Product Fresh Flower Crabs (3-4pc) - S$20.00 | Keyword: FC', ' Product Gonggong - S$6.00 | Keyword: GG', ' Product Jiu Lor/Javelin Grunther - S$7.00 | Keyword: JL', ' Product Kukup Golden Pomfret (400g) - S$6.50 | Keyword: GOL', ' Product Kukup Red Snapper (500g-600g) - S$9.00 | Keyword: RS', ' Product Kukup Seabass (600g) - S$5.00 | Keyword: SB', ' Product LIVE PRAWN - S$20.00 | Keyword: LIVE', ' Product Lai Man Fish - S$5.00 | Keyword: LMF', ' Product Live mudcrabs (800g-900g each) - S$30.00 | Keyword: MCL', ' Product MUSSEL (ORANGE MEAT) - S$4

### Drop Empty Posts

Blank comments are dropped to ensure that unique comments are obtained.

In [20]:
#drop empty posts
df2 = df2.loc[((df2['postComment'] != ' ')),:]

In [21]:
postComment_unique_2 = df2['postComment'].unique()
print(sorted(postComment_unique_2))

[' Congratulations', ' Product 2 pc Live mudcrabs (500g-600g each) - S$32.00 | Keyword: MC32', ' Product Baby Ang Go Li (1kg) - S$10.00 | Keyword: BBAG', ' Product Brown Stripe Snapper  - S$10.00 | Keyword: BSS', ' Product Brown Stripe Snapper  - S$10.00 | Keyword: BSS  Buyer Wong Chow Ching +1, Lukie Neo +1, Eileen Fok +1', ' Product Fish Eggs (500g) - S$10.00 | Keyword: FE', ' Product Flower Clams (Indonesia) - S$8.00 | Keyword: FLL', ' Product Fresh Flower Crabs (3-4pc) - S$20.00 | Keyword: FC', ' Product Gonggong - S$6.00 | Keyword: GG', ' Product Jiu Lor/Javelin Grunther - S$7.00 | Keyword: JL', ' Product Kukup Golden Pomfret (400g) - S$6.50 | Keyword: GOL', ' Product Kukup Red Snapper (500g-600g) - S$9.00 | Keyword: RS', ' Product Kukup Seabass (600g) - S$5.00 | Keyword: SB', ' Product LIVE PRAWN - S$20.00 | Keyword: LIVE', ' Product Lai Man Fish - S$5.00 | Keyword: LMF', ' Product Live mudcrabs (800g-900g each) - S$30.00 | Keyword: MCL', ' Product MUSSEL (ORANGE MEAT) - S$4.00 |

### Reindexing the dataframe 
**New Column to reindex the dataframe in accordance to time**

From the data, we can tell that there is an inconsistent timestamp being used in the column 'postCommentTime'. For example, there are times like '0:57' and '1:00:14' which indicates 0 hour 0 mins 57 secs, and 1 hour 0 mins 14 secs. 

Hence, a new column 'postCommentTime_final' is created to ensure that a timestamp of HH:MM:SS is being used consistently throughout. However, we note that the number of days is being included for TimeDeltaIndex.

As a result, we will read the timestamp, by excluding the number of days.  

In [22]:
#TimedeltaIndex
#https://stackoverflow.com/questions/54877467/pandas-convert-hhmm-and-hhmmss-to-standard-hhmmss-in-python
# for example, time of '0:57' will then be 00:00:57; 0 hours 0 mins 57 secs
df2['postCommentTime_final'] = pd.to_timedelta(np.where(df2['postCommentTime'].str.count(':') == 1, '00:' + df2['postCommentTime'], df2['postCommentTime']))

In [23]:
df2.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Arigato,き リーサン,0:00,0 days 00:00:00
1,Goodnite,き リーサン,0:00,0 days 00:00:00
2,Live+1,Tan Poh Kim Irene,1:00:23,0 days 01:00:23
3,Code For ang kar hia pls!,E-Beve,1:00:27,0 days 01:00:27
4,Monsoon season,Richard Ling,1:00:37,0 days 01:00:37


In [24]:
df2['postCommentTime_final'] = df2['postCommentTime_final'].astype(str).map(lambda x: x[7:])

In [25]:
df2

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Arigato,き リーサン,0:00,00:00:00
1,Goodnite,き リーサン,0:00,00:00:00
2,Live+1,Tan Poh Kim Irene,1:00:23,01:00:23
3,Code For ang kar hia pls!,E-Beve,1:00:27,01:00:27
4,Monsoon season,Richard Ling,1:00:37,01:00:37
5,WAK+1,Lily Koh,1:00:42,01:00:42
6,China quality,Eaden Peh,1:01:02,01:01:02
7,Any small grouper or seabass?,Tan Poh Kim Irene,1:02:01,01:02:01
8,Product UNAGI - S$16.00 | Keyword: UNA,Eileen Fok,1:02:08,01:02:08
9,Flower crab can help to chop and clean. Thanks.,E-Beve,1:02:22,01:02:22


In [26]:
#reindex according to postCommentTime_final
#previous natsort in data collection didnt take into account the different timestamp format
df3 = df2.reindex(index=order_by_index(df2.index, index_natsorted(df2.postCommentTime_final)))

In [27]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Arigato,き リーサン,0:00,00:00:00
1,Goodnite,き リーサン,0:00,00:00:00
64,Helloooo ❤️,Ernest Tan,1:20,00:01:20
104,Hi got crab today??,Veon Veon,1:40,00:01:40
105,Halo beve,き リーサン,1:42,00:01:42
106,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,1:45,00:01:45
107,Hello,き リーサン,1:48,00:01:48
108,Hello,Jeffrey Ng,2:41,00:02:41
109,Hello,Shuganya Devi,2:45,00:02:45
110,I am here,Eugene Tan,2:47,00:02:47


In [28]:
#reset the index for the dataframe
#https://stackoverflow.com/questions/20490274/how-to-reset-index-in-a-pandas-dataframe

df3 = df3.reset_index(drop=True)

In [29]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final
0,Arigato,き リーサン,0:00,00:00:00
1,Goodnite,き リーサン,0:00,00:00:00
2,Helloooo ❤️,Ernest Tan,1:20,00:01:20
3,Hi got crab today??,Veon Veon,1:40,00:01:40
4,Halo beve,き リーサン,1:42,00:01:42
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,1:45,00:01:45
6,Hello,き リーサン,1:48,00:01:48
7,Hello,Jeffrey Ng,2:41,00:02:41
8,Hello,Shuganya Devi,2:45,00:02:45
9,I am here,Eugene Tan,2:47,00:02:47


**Obtain the length of the video**

Assuming that the last comment of the video is the total length of the video, we will input the length of the video from the comments dataframe into the video attributes dataframe.

In [30]:
#retrieve last comment to obtain the length of the video
df3['postCommentTime_final'].iloc[-1]

'01:35:46'

In [31]:
#https://stackoverflow.com/questions/6402812/how-to-convert-an-hmmss-time-string-to-seconds-in-python
def get_sec(time_str):
    """Get Seconds from time."""
    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

In [32]:
get_sec(df3['postCommentTime_final'].iloc[-1])

5746

In [33]:
#retrieve last comment to obtain the length of the video in seconds
va['videoLength']= get_sec(df3['postCommentTime_final'].iloc[-1])

In [34]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength
0,ebeveadmin/videos/241945361307367,46,1800,5746


## Feature Engineering

### Comments made by the Seller

**New Column to identify the number of comments made by the seller in the video**

From the total sum of comments made by the seller, we will input it into the video attributes dataframe.

In [35]:
(df3['postCommentAuthor']=='E-Beve').sum()

50

In [36]:
va['numSellerComments'] = (df3['postCommentAuthor']=='E-Beve').sum()

In [37]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments
0,ebeveadmin/videos/241945361307367,46,1800,5746,50


**New Column to identify if the comment is made by the Seller or not**

In [38]:
#create a new column to show if the comment is made by the seller or not
df3['isSeller'] = df3['postCommentAuthor'].map(lambda x:1 if x =='E-Beve' else 0)

In [39]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller
0,Arigato,き リーサン,0:00,00:00:00,0
1,Goodnite,き リーサン,0:00,00:00:00,0
2,Helloooo ❤️,Ernest Tan,1:20,00:01:20,0
3,Hi got crab today??,Veon Veon,1:40,00:01:40,0
4,Halo beve,き リーサン,1:42,00:01:42,0


In [40]:
df3['isSeller'].value_counts()

0    268
1     50
Name: isSeller, dtype: int64

In [41]:
#show all the seller's comments
df3.loc[df3['isSeller'] == 1]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller
21,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,E-Beve,6:14,00:06:14,1
22,Hi sis ❤️,E-Beve,6:14,00:06:14,1
33,Kam heong crabs,E-Beve,11:39,00:11:39,1
43,All seafoods are fresh,E-Beve,14:59,00:14:59,1
57,4Kg,E-Beve,18:39,00:18:39,1
68,PM+1,E-Beve,21:29,00:21:29,1
78,Deep fry sambal sauce,E-Beve,23:28,00:23:28,1
85,PM+2,E-Beve,25:31,00:25:31,1
96,FE+1,E-Beve,27:42,00:27:42,1
111,How much is the delivery charges ?,E-Beve,30:37,00:30:37,1


### Length of comments

**New Column to identify the length of each comment**

From the comments, we take the length of each comment as the total number of words each comment has.

In [42]:
#length of each comment
#https://stackoverflow.com/questions/37483470/how-to-calculate-number-of-words-in-a-string-in-dataframe
df3['postCommentLength'] = df3['postComment'].str.split().str.len()

In [43]:
df3.head(10)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength
0,Arigato,き リーサン,0:00,00:00:00,0,1
1,Goodnite,き リーサン,0:00,00:00:00,0,1
2,Helloooo ❤️,Ernest Tan,1:20,00:01:20,0,2
3,Hi got crab today??,Veon Veon,1:40,00:01:40,0,4
4,Halo beve,き リーサン,1:42,00:01:42,0,2
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,1:45,00:01:45,0,9
6,Hello,き リーサン,1:48,00:01:48,0,1
7,Hello,Jeffrey Ng,2:41,00:02:41,0,1
8,Hello,Shuganya Devi,2:45,00:02:45,0,1
9,I am here,Eugene Tan,2:47,00:02:47,0,3


**New Column to identify the total number of comments in the video**

From the total number of comments in the video, we will input it into the video attributes dataframe.

In [44]:
#total number of comments
df3['postCommentLength'].sum()

1255

In [45]:
va['numComments'] = df3['postCommentLength'].sum()

In [46]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments
0,ebeveadmin/videos/241945361307367,46,1800,5746,50,1255


### LNS

LNS is an acronym that stands for 'like and share'. It is a form of customer engagement as it indicates by the customers to the sellers that they have liked and shared the video on their Facebook wall. 

**New Column to identify if Customers are engaging in liking and sharing the video**

In [47]:
#if the customer has commented 'lns' or 'ls' which stands for 'like & shared' & 'like shared' respectively
def lns(comment):
    if re.search(r'(l)(n?)(s)', comment, re.IGNORECASE):
        return int(1)
    else:
        return int(0)

In [48]:
df3['lns'] = df3['postComment'].map(lambda x:lns(x))

In [49]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns
0,Arigato,き リーサン,0:00,00:00:00,0,1,0
1,Goodnite,き リーサン,0:00,00:00:00,0,1,0
2,Helloooo ❤️,Ernest Tan,1:20,00:01:20,0,2,0
3,Hi got crab today??,Veon Veon,1:40,00:01:40,0,4,0
4,Halo beve,き リーサン,1:42,00:01:42,0,2,0


**New Column to identify if the number of Customers who explicitly inform the sellers that they are engaging in liking and sharing the video**

In [50]:
#range of customer's engagement for LNS
df3['lns'].value_counts()

0    294
1     24
Name: lns, dtype: int64

In [51]:
(df3['lns']==1).sum()

24

In [52]:
va['lnsQuantity'] = (df3['lns']==1).sum()

In [53]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity
0,ebeveadmin/videos/241945361307367,46,1800,5746,50,1255,24


## Sales Quantity

**New Columns to identify the quantity of sales made**

From the comments, using regex, we first see an overview of the comments that are related to the sale of the products. 

In [54]:
#products offered by the seller
df3[df3['postComment'].str.contains('(Keyword: )(\w*)', regex=True)]

  df3[df3['postComment'].str.contains('(Keyword: )(\w*)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns
21,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,E-Beve,6:14,00:06:14,1,11,0
27,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,Richard Ling,8:44,00:08:44,0,11,0
39,Product Live mudcrabs (800g-900g each) - S$30.00 | Keyword: MCL,Yap Yip,12:46,00:12:46,0,10,0
48,Product MUSSEL (ORANGE MEAT) - S$4.00 | Keyword: MUS,Lily Koh,15:42,00:15:42,0,9,0
62,Product Prawn Maw (1pkt) - S$2.50 | Keyword: PM,Zamzarina Hashim,20:21,00:20:21,0,9,0
73,Product Fish Eggs (500g) - S$10.00 | Keyword: FE,Justina Tan,22:27,00:22:27,0,9,0
81,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,Tan Poh Kim Irene,24:02,00:24:02,0,11,0
90,Product Wild Black Pomfret (400g) - S$6.50 | Keyword: BP,Eaden Peh,27:15,00:27:15,0,10,0
101,Product Lai Man Fish - S$5.00 | Keyword: LMF,Richard Ling,28:39,00:28:39,0,9,0
115,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,Veon Veon,33:18,00:33:18,0,11,0


In [55]:
#overview of the sales
df3[df3['postComment'].str.contains('(\w*)(\s)*(\+)(\s)*(\d*)', regex=True)]

  df3[df3['postComment'].str.contains('(\w*)(\s)*(\+)(\s)*(\d*)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns
41,MCL+2,Richard Ling,14:27,00:14:27,0,1,0
42,MCL +2,Zamzarina Hashim,14:44,00:14:44,0,2,0
54,MUS+1,Ellie Lee,17:49,00:17:49,0,1,0
61,PM+1,Lily Koh,20:16,00:20:16,0,1,0
63,PM+1,Lily Koh,20:28,00:20:28,0,1,0
66,PM+1,Eaden Peh,20:56,00:20:56,0,1,0
68,PM+1,E-Beve,21:29,00:21:29,1,1,0
76,FE+1,Tan Poh Kim Irene,23:17,00:23:17,0,1,0
79,FE +1,Eaden Peh,23:53,00:23:53,0,2,0
83,FE+1,梁文斌,24:17,00:24:17,0,1,0


In [56]:
def sale(comment):
    if re.findall(r'(\+)(\s)?(\d)', comment):
        results = re.findall(r'\+\s?\d', comment)
        total = 0
        for r in results:
            total += int(r[-1])
        return total
    else:
        return int(0)

In [57]:
df3['salesQuantity'] = df3['postComment'].apply(lambda x:sale(x))

In [58]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity
0,Arigato,き リーサン,0:00,00:00:00,0,1,0,0
1,Goodnite,き リーサン,0:00,00:00:00,0,1,0,0
2,Helloooo ❤️,Ernest Tan,1:20,00:01:20,0,2,0,0
3,Hi got crab today??,Veon Veon,1:40,00:01:40,0,4,0,0
4,Halo beve,き リーサン,1:42,00:01:42,0,2,0,0
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,1:45,00:01:45,0,9,0,0
6,Hello,き リーサン,1:48,00:01:48,0,1,0,0
7,Hello,Jeffrey Ng,2:41,00:02:41,0,1,0,0
8,Hello,Shuganya Devi,2:45,00:02:45,0,1,0,0
9,I am here,Eugene Tan,2:47,00:02:47,0,3,0,0


The cells at row 58, 108, 177 and 248 are ordered without the '+' to the product code. For products code mentioned without the quantity, we assume that the product is ordered for a quantity of 1. Hence, the sales quantity are manually filled in.

In [59]:
df3.loc[58, 'salesQuantity'] = int(3)
df3.loc[109, 'salesQuantity'] = int(1)
df3.loc[179, 'salesQuantity'] = int(1)
df3.loc[250, 'salesQuantity'] = int(1)

In [60]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity
0,Arigato,き リーサン,0:00,00:00:00,0,1,0,0
1,Goodnite,き リーサン,0:00,00:00:00,0,1,0,0
2,Helloooo ❤️,Ernest Tan,1:20,00:01:20,0,2,0,0
3,Hi got crab today??,Veon Veon,1:40,00:01:40,0,4,0,0
4,Halo beve,き リーサン,1:42,00:01:42,0,2,0,0
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,1:45,00:01:45,0,9,0,0
6,Hello,き リーサン,1:48,00:01:48,0,1,0,0
7,Hello,Jeffrey Ng,2:41,00:02:41,0,1,0,0
8,Hello,Shuganya Devi,2:45,00:02:45,0,1,0,0
9,I am here,Eugene Tan,2:47,00:02:47,0,3,0,0


In [61]:
#range of sales quantity
df3['salesQuantity'].value_counts()

0    230
1     66
2     18
3      4
Name: salesQuantity, dtype: int64

In [62]:
#total number of orders made
df3['salesQuantity'].sum()

114

In [63]:
va['salesQuantity'] = df3['salesQuantity'].sum()

In [64]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity
0,ebeveadmin/videos/241945361307367,46,1800,5746,50,1255,24,114


## Products 

The seller will comment and post the unique product codes for each product. Thereafter, customers who are keen on purchasing the products will explicitly comment out the specific & unique product codes, in addition to the quanityt of each product that they wish to purchase.

**New Columns to identify the products purchased by the Customers**

Regex is used to identify the products being offered and the products being purchased as well.

In [65]:
#function to identify the code of the product bought
def sale2(comment):
    if re.search(r'(\w*)(\s?)(\+)(\s?)(\d)', comment):
        return str(re.search(r'(\w*)(\s?)(\+)(\s?)(\d)', comment).group(0)[:-2])
    else:
        return int(0)

In [66]:
#identifies all comments that have the codes of the products purchased by the Customers
#this column will be dropped afterwards.
df3['productBought'] = df3['postComment'].apply(lambda x:sale2(x))

In [67]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought
0,Arigato,き リーサン,0:00,00:00:00,0,1,0,0,0
1,Goodnite,き リーサン,0:00,00:00:00,0,1,0,0,0
2,Helloooo ❤️,Ernest Tan,1:20,00:01:20,0,2,0,0,0
3,Hi got crab today??,Veon Veon,1:40,00:01:40,0,4,0,0,0
4,Halo beve,き リーサン,1:42,00:01:42,0,2,0,0,0
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,1:45,00:01:45,0,9,0,0,0
6,Hello,き リーサン,1:48,00:01:48,0,1,0,0,0
7,Hello,Jeffrey Ng,2:41,00:02:41,0,1,0,0,0
8,Hello,Shuganya Devi,2:45,00:02:45,0,1,0,0,0
9,I am here,Eugene Tan,2:47,00:02:47,0,3,0,0,0


In [68]:
df3['productBought'].unique()

array([0, 'MCL', 'MCL ', 'MUS', 'PM', 'FE', 'FE ', 'Wak', 'BP', 'Bp',
       'LMF', 'LMF ', 'WAK', 'TRB', 'BSS', 'BSS+', 'Ching ', 'TK',
       'MC32 ', 'RE', 'Sca', 'SCA', 'Fc', 'FC', 'ST', 'Tp', 'TP', 'Live',
       'UNA', 'Gg', 'GG', 'WAK ', 'FLL', 'BBAG', 'GOL', 'WGH', 'WG18',
       'WG19', 'WGS', 'WGB', 'RS', 'SBH', 'SBF7', 'SB'], dtype=object)

The cells at row 135 & 162 for the column 'product' are erroneous as the values for the column 'postComment' had multiple orders in 1 comment. 

Additionally, the cells at rows 58, 109, 179 & 250 are ordered without the '+' to the product code. 

Hence, they are manually filled in.

In [69]:
#https://stackoverflow.com/questions/13842088/set-value-for-particular-cell-in-pandas-dataframe-using-index
df3.loc[135, 'productBought'] = 'BSS'
df3.loc[162, 'productBought'] = 'MC32 PM'
df3.loc[58, 'productBought'] = 'MUS'
df3.loc[109, 'productBought'] = 'LMF'
df3.loc[179, 'productBought'] = 'FC'
df3.loc[250, 'productBought'] = 'WAK'

Notwithstanding the above, we noticed that cell at row 132 has the product code for the column 'productBought' has a '+'. Hence, we manually remove the special character.

In [70]:
df3.loc[132, 'productBought'] = 'BSS'

In [71]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought
0,Arigato,き リーサン,0:00,00:00:00,0,1,0,0,0
1,Goodnite,き リーサン,0:00,00:00:00,0,1,0,0,0
2,Helloooo ❤️,Ernest Tan,1:20,00:01:20,0,2,0,0,0
3,Hi got crab today??,Veon Veon,1:40,00:01:40,0,4,0,0,0
4,Halo beve,き リーサン,1:42,00:01:42,0,2,0,0,0
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,1:45,00:01:45,0,9,0,0,0
6,Hello,き リーサン,1:48,00:01:48,0,1,0,0,0
7,Hello,Jeffrey Ng,2:41,00:02:41,0,1,0,0,0
8,Hello,Shuganya Devi,2:45,00:02:45,0,1,0,0,0
9,I am here,Eugene Tan,2:47,00:02:47,0,3,0,0,0


In [72]:
df3['productBought'].unique()

array([0, 'MCL', 'MCL ', 'MUS', 'PM', 'FE', 'FE ', 'Wak', 'BP', 'Bp',
       'LMF', 'LMF ', 'WAK', 'TRB', 'BSS', 'TK', 'MC32 PM', 'RE', 'Sca',
       'SCA', 'Fc', 'FC', 'ST', 'Tp', 'TP', 'Live', 'UNA', 'Gg', 'GG',
       'WAK ', 'FLL', 'BBAG', 'GOL', 'WGH', 'WG18', 'WG19', 'WGS', 'WGB',
       'RS', 'SBH', 'SBF7', 'SB'], dtype=object)

Change the produce codes to be uppercase for consistency

In [73]:
#change the produce codes to be uppercase for consistency, and since python is case sensitive.
#https://stackoverflow.com/questions/39512002/convert-whole-dataframe-from-lower-case-to-upper-case-with-pandas
df3['productBought'] = df3['productBought'].astype(str).str.upper()

In [74]:
df3['productBought'].unique()

array(['0', 'MCL', 'MCL ', 'MUS', 'PM', 'FE', 'FE ', 'WAK', 'BP', 'LMF',
       'LMF ', 'TRB', 'BSS', 'TK', 'MC32 PM', 'RE', 'SCA', 'FC', 'ST',
       'TP', 'LIVE', 'UNA', 'GG', 'WAK ', 'FLL', 'BBAG', 'GOL', 'WGH',
       'WG18', 'WG19', 'WGS', 'WGB', 'RS', 'SBH', 'SBF7', 'SB'],
      dtype=object)

Remove whitespaces at the end of the string

In [75]:
df3['productBought'] = df3['productBought'].str.rstrip()

In [76]:
df3['productBought'].unique()

array(['0', 'MCL', 'MUS', 'PM', 'FE', 'WAK', 'BP', 'LMF', 'TRB', 'BSS',
       'TK', 'MC32 PM', 'RE', 'SCA', 'FC', 'ST', 'TP', 'LIVE', 'UNA',
       'GG', 'FLL', 'BBAG', 'GOL', 'WGH', 'WG18', 'WG19', 'WGS', 'WGB',
       'RS', 'SBH', 'SBF7', 'SB'], dtype=object)

In [77]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought
0,Arigato,き リーサン,0:00,00:00:00,0,1,0,0,0
1,Goodnite,き リーサン,0:00,00:00:00,0,1,0,0,0
2,Helloooo ❤️,Ernest Tan,1:20,00:01:20,0,2,0,0,0
3,Hi got crab today??,Veon Veon,1:40,00:01:40,0,4,0,0,0
4,Halo beve,き リーサン,1:42,00:01:42,0,2,0,0,0
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,1:45,00:01:45,0,9,0,0,0
6,Hello,き リーサン,1:48,00:01:48,0,1,0,0,0
7,Hello,Jeffrey Ng,2:41,00:02:41,0,1,0,0,0
8,Hello,Shuganya Devi,2:45,00:02:45,0,1,0,0,0
9,I am here,Eugene Tan,2:47,00:02:47,0,3,0,0,0


### Price of Products

**New Column to identify the price of the products**

This new column is created to identify the regex of the unique product codes and their corresponding prices, as adviced by the seller.

In [78]:
#products offered by the seller
df3[df3['postComment'].str.contains('(Keyword: )(\w*)', regex=True)]

  df3[df3['postComment'].str.contains('(Keyword: )(\w*)', regex=True)]


Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought
21,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,E-Beve,6:14,00:06:14,1,11,0,0,0
27,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,Richard Ling,8:44,00:08:44,0,11,0,0,0
39,Product Live mudcrabs (800g-900g each) - S$30.00 | Keyword: MCL,Yap Yip,12:46,00:12:46,0,10,0,0,0
48,Product MUSSEL (ORANGE MEAT) - S$4.00 | Keyword: MUS,Lily Koh,15:42,00:15:42,0,9,0,0,0
62,Product Prawn Maw (1pkt) - S$2.50 | Keyword: PM,Zamzarina Hashim,20:21,00:20:21,0,9,0,0,0
73,Product Fish Eggs (500g) - S$10.00 | Keyword: FE,Justina Tan,22:27,00:22:27,0,9,0,0,0
81,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,Tan Poh Kim Irene,24:02,00:24:02,0,11,0,0,0
90,Product Wild Black Pomfret (400g) - S$6.50 | Keyword: BP,Eaden Peh,27:15,00:27:15,0,10,0,0,0
101,Product Lai Man Fish - S$5.00 | Keyword: LMF,Richard Ling,28:39,00:28:39,0,9,0,0,0
115,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,Veon Veon,33:18,00:33:18,0,11,0,0,0


In [79]:
def price(comment):
    if re.search(r'(S)(\$)(\d*)(.*)(\s|\s)(.*)(\:)(\s*)(.*)', comment):
        return str(re.search(r'(\$)(\d*)(.*)(\s|\s)(.*)(\:)(\s*)(.*)', comment).group(0))
    else:
        return int(0)

In [80]:
df3['productPrice'] = df3['postComment'].apply(lambda x:price(x))

In [81]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought,productPrice
0,Arigato,き リーサン,0:00,00:00:00,0,1,0,0,0,0
1,Goodnite,き リーサン,0:00,00:00:00,0,1,0,0,0,0
2,Helloooo ❤️,Ernest Tan,1:20,00:01:20,0,2,0,0,0,0
3,Hi got crab today??,Veon Veon,1:40,00:01:40,0,4,0,0,0,0
4,Halo beve,き リーサン,1:42,00:01:42,0,2,0,0,0,0
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,1:45,00:01:45,0,9,0,0,0,0
6,Hello,き リーサン,1:48,00:01:48,0,1,0,0,0,0
7,Hello,Jeffrey Ng,2:41,00:02:41,0,1,0,0,0,0
8,Hello,Shuganya Devi,2:45,00:02:45,0,1,0,0,0,0
9,I am here,Eugene Tan,2:47,00:02:47,0,3,0,0,0,0


We noticed that each comment has the regex "\s\|\s\w*\:", where "\w*" is equivalent to the word 'Keyword' in the middle of the extracted string of comments for the column 'productPrice'. Hence, we will remove the mentioned regex.

In [82]:
#https://stackoverflow.com/questions/28986489/how-to-replace-text-in-a-column-of-a-pandas-dataframe
df3['productPrice'] = df3['productPrice'].str.replace("\s\|\s\w*\:", "", regex=True)

In [83]:
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought,productPrice
0,Arigato,き リーサン,0:00,00:00:00,0,1,0,0,0,
1,Goodnite,き リーサン,0:00,00:00:00,0,1,0,0,0,
2,Helloooo ❤️,Ernest Tan,1:20,00:01:20,0,2,0,0,0,
3,Hi got crab today??,Veon Veon,1:40,00:01:40,0,4,0,0,0,
4,Halo beve,き リーサン,1:42,00:01:42,0,2,0,0,0,
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,1:45,00:01:45,0,9,0,0,0,
6,Hello,き リーサン,1:48,00:01:48,0,1,0,0,0,
7,Hello,Jeffrey Ng,2:41,00:02:41,0,1,0,0,0,
8,Hello,Shuganya Devi,2:45,00:02:45,0,1,0,0,0,
9,I am here,Eugene Tan,2:47,00:02:47,0,3,0,0,0,


In [84]:
df3['productPrice'].unique()

array([nan, '$20.00 WAK', '$30.00 MCL', '$4.00 MUS', '$2.50 PM',
       '$10.00 FE', '$6.50 BP', '$5.00 LMF', '$25.00 TRB', '$10.00 BSS',
       '$10.00 BSS  Buyer Wong Chow Ching +1, Lukie Neo +1, Eileen Fok +1',
       '$7.00 JL', '$14.00 TK', '$9.00 RS', '$32.00 MC32', '$18.00 RE',
       '$20.00 SCA', '$20.00 FC', '$18.00 ST', '$18.00 TP', '$20.00 LIVE',
       '$16.00 UNA', '$6.00 GG', '$8.00 FLL', '$10.00 BBAG', '$6.50 GOL',
       '$16.00 WG16', '$5.00 WGH', '$23.00 WG23', '$28.00 WG28',
       '$19.00 WG19', '$11.50 WGS', '$18.00 WG18', '$3.00 WGB',
       '$33.00 WRG', '$5.00 SBH', '$8.50 SBT8', '$7.00 SBF7', '$5.00 SB'],
      dtype=object)

Replace NaN values with an integer '0'

In [85]:
df3['productPrice'] = df3['productPrice'].fillna(0)

In [86]:
df3['productPrice'].unique()

array([0, '$20.00 WAK', '$30.00 MCL', '$4.00 MUS', '$2.50 PM',
       '$10.00 FE', '$6.50 BP', '$5.00 LMF', '$25.00 TRB', '$10.00 BSS',
       '$10.00 BSS  Buyer Wong Chow Ching +1, Lukie Neo +1, Eileen Fok +1',
       '$7.00 JL', '$14.00 TK', '$9.00 RS', '$32.00 MC32', '$18.00 RE',
       '$20.00 SCA', '$20.00 FC', '$18.00 ST', '$18.00 TP', '$20.00 LIVE',
       '$16.00 UNA', '$6.00 GG', '$8.00 FLL', '$10.00 BBAG', '$6.50 GOL',
       '$16.00 WG16', '$5.00 WGH', '$23.00 WG23', '$28.00 WG28',
       '$19.00 WG19', '$11.50 WGS', '$18.00 WG18', '$3.00 WGB',
       '$33.00 WRG', '$5.00 SBH', '$8.50 SBT8', '$7.00 SBF7', '$5.00 SB'],
      dtype=object)

Since there were multiple orders in row 135 and the orders did not follow the exact sequence for ordering, we will have to manually edit this cell.

In [87]:
df3.loc[135, 'productPrice'] = 'S$10.00 BSS'

In [88]:
df3['productPrice'].unique()

array([0, '$20.00 WAK', '$30.00 MCL', '$4.00 MUS', '$2.50 PM',
       '$10.00 FE', '$6.50 BP', '$5.00 LMF', '$25.00 TRB', '$10.00 BSS',
       'S$10.00 BSS', '$7.00 JL', '$14.00 TK', '$9.00 RS', '$32.00 MC32',
       '$18.00 RE', '$20.00 SCA', '$20.00 FC', '$18.00 ST', '$18.00 TP',
       '$20.00 LIVE', '$16.00 UNA', '$6.00 GG', '$8.00 FLL',
       '$10.00 BBAG', '$6.50 GOL', '$16.00 WG16', '$5.00 WGH',
       '$23.00 WG23', '$28.00 WG28', '$19.00 WG19', '$11.50 WGS',
       '$18.00 WG18', '$3.00 WGB', '$33.00 WRG', '$5.00 SBH',
       '$8.50 SBT8', '$7.00 SBF7', '$5.00 SB'], dtype=object)

Excluding the comments where there was no product codes as adviced by the seller, we find the number of unique products offered by the seller.

In [89]:
#number of unique products offered by the seller
int(df3['productPrice'].nunique()) - int(1)

38

In [90]:
#total number of products offered
va['numProducts'] = int(df3['productPrice'].nunique()) - int(1)

In [91]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts
0,ebeveadmin/videos/241945361307367,46,1800,5746,50,1255,24,114,38


**Drop irrelevant columns**

The following column was dropped for the following reasons:

1. 'postCommentTime'
- Since a new column 'postCommentTime_final' was created to ensure that a consistent timestamp of HH:MM:SS is used consistently throughout the dataframe, and the dataframe has been thereafter reindex and sorted in accordance to time in ascending order, we dropped the original inconsistent time column 'postCommentTime' as it had varying timestamp formats of HH:MM:SS, MM:SS and M:SS.

In [92]:
#drop unwanted columns
df3.drop(['postCommentTime'], axis=1, inplace=True)

In [93]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productBought,productPrice
0,Arigato,き リーサン,00:00:00,0,1,0,0,0,0
1,Goodnite,き リーサン,00:00:00,0,1,0,0,0,0
2,Helloooo ❤️,Ernest Tan,00:01:20,0,2,0,0,0,0
3,Hi got crab today??,Veon Veon,00:01:40,0,4,0,0,0,0
4,Halo beve,き リーサン,00:01:42,0,2,0,0,0,0


### Revenue from the sale of the products

**Dummify the products bought to find the revenue**

In [94]:
#getdummies the products bought
df3 = pd.get_dummies(df3, columns = ['productBought'], drop_first = True)

In [95]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_TK,productBought_TP,productBought_TRB,productBought_UNA,productBought_WAK,productBought_WG18,productBought_WG19,productBought_WGB,productBought_WGH,productBought_WGS
0,Arigato,き リーサン,00:00:00,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Goodnite,き リーサン,00:00:00,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Helloooo ❤️,Ernest Tan,00:01:20,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Hi got crab today??,Veon Veon,00:01:40,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Halo beve,き リーサン,00:01:42,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
def clean_columns(col):
    return col.replace(' ', '_')

In [97]:
df3.columns = [clean_columns(col) for col in df3.columns]
df3.head(1)

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_TK,productBought_TP,productBought_TRB,productBought_UNA,productBought_WAK,productBought_WG18,productBought_WG19,productBought_WGB,productBought_WGH,productBought_WGS
0,Arigato,き リーサン,00:00:00,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
# iterating the columns
for col in df3.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_BBAG
productBought_BP
productBought_BSS
productBought_FC
productBought_FE
productBought_FLL
productBought_GG
productBought_GOL
productBought_LIVE
productBought_LMF
productBought_MC32_PM
productBought_MCL
productBought_MUS
productBought_PM
productBought_RE
productBought_RS
productBought_SB
productBought_SBF7
productBought_SBH
productBought_SCA
productBought_ST
productBought_TK
productBought_TP
productBought_TRB
productBought_UNA
productBought_WAK
productBought_WG18
productBought_WG19
productBought_WGB
productBought_WGH
productBought_WGS


After the column 'productBought' has been dummified, the cells will return a '1' if that particular item is bought, and a '0' if it is not. Hence, we will replace all the '1' with the price of the product.

Then, we will create a new revenue column which is a multiplication of the column 'salesQuantity' against the price of the product (i.e. the dummified 'productBought' columnns).

Product BBAG

In [99]:
df3[df3['postComment'].str.contains('BBAG', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_TK,productBought_TP,productBought_TRB,productBought_UNA,productBought_WAK,productBought_WG18,productBought_WG19,productBought_WGB,productBought_WGH,productBought_WGS
244,Product Baby Ang Go Li (1kg) - S$10.00 | Keyword: BBAG,Snowy Sue,01:09:45,0,11,0,0,$10.00 BBAG,0,0,...,0,0,0,0,0,0,0,0,0,0
246,BBAG+1,Lily Koh,01:11:19,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
df3['productBought_BBAG'] = df3['productBought_BBAG'].map(lambda x:float(10.00) if x == int(1) else 0)

In [101]:
df3['revenue_BBAG'] = np.multiply(df3['productBought_BBAG'], df3['salesQuantity'])

In [102]:
revenue_BBAG = "The total revenue from the sale of the product {} is ${}". format ("BBAG", format(df3['revenue_BBAG'].sum(), '.2f'))
print(revenue_BBAG)


The total revenue from the sale of the product BBAG is $10.00


Product BP

In [103]:
df3[df3['postComment'].str.contains('BP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_TP,productBought_TRB,productBought_UNA,productBought_WAK,productBought_WG18,productBought_WG19,productBought_WGB,productBought_WGH,productBought_WGS,revenue_BBAG
90,Product Wild Black Pomfret (400g) - S$6.50 | Keyword: BP,Eaden Peh,00:27:15,0,10,0,0,$6.50 BP,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
91,BP+2,Zamzarina Hashim,00:27:17,0,1,0,2,0,0.0,1,...,0,0,0,0,0,0,0,0,0,0.0
94,BP+1,Lukie Neo,00:27:32,0,1,0,1,0,0.0,1,...,0,0,0,0,0,0,0,0,0,0.0
95,BP+2,Veon Veon,00:27:37,0,1,0,2,0,0.0,1,...,0,0,0,0,0,0,0,0,0,0.0


In [104]:
df3['productBought_BP'] = df3['productBought_BP'].map(lambda x:float(6.50) if x == int(1) else 0)

In [105]:
df3['revenue_BP'] = np.multiply(df3['productBought_BP'], df3['salesQuantity'])

In [106]:
revenue_BP = "The total revenue from the sale of the product {} is ${}". format ("BP", format(df3['revenue_BP'].sum(), '.2f'))
print(revenue_BP)

The total revenue from the sale of the product BP is $52.00


Product BSS

In [107]:
df3[df3['postComment'].str.contains('BSS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_TRB,productBought_UNA,productBought_WAK,productBought_WG18,productBought_WG19,productBought_WGB,productBought_WGH,productBought_WGS,revenue_BBAG,revenue_BP
121,Product Brown Stripe Snapper - S$10.00 | Keyword: BSS,Justina Tan,00:35:21,0,9,0,0,$10.00 BSS,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
126,Product Brown Stripe Snapper - S$10.00 | Keyword: BSS,Eaden Peh,00:36:33,0,9,0,0,$10.00 BSS,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
129,BSS+1,Wong Chow Ching,00:37:38,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
131,BSS+1,E-Beve,00:37:40,1,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
132,BSS+ 1,Justina Tan,00:37:40,0,2,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
133,BSS+1,E-Beve,00:38:02,1,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0
135,"Product Brown Stripe Snapper - S$10.00 | Keyword: BSS Buyer Wong Chow Ching +1, Lukie Neo +1, Eileen Fok +1",Richard Ling,00:38:45,0,20,0,3,S$10.00 BSS,0.0,0.0,...,0,0,0,0,0,0,0,0,0.0,0.0


In [108]:
df3['productBought_BSS'] = df3['productBought_BSS'].map(lambda x:float(10.00) if x == int(1) else 0)

In [109]:
df3['revenue_BSS'] = np.multiply(df3['productBought_BSS'], df3['salesQuantity'])

In [110]:
revenue_BSS = "The total revenue from the sale of the product {} is ${}". format ("BSS", format(df3['revenue_BSS'].sum(), '.2f'))
print(revenue_BSS)

The total revenue from the sale of the product BSS is $70.00


Product FC

In [111]:
df3[df3['postComment'].str.contains('FC', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_UNA,productBought_WAK,productBought_WG18,productBought_WG19,productBought_WGB,productBought_WGH,productBought_WGS,revenue_BBAG,revenue_BP,revenue_BSS
176,Product Fresh Flower Crabs (3-4pc) - S$20.00 | Keyword: FC,Justina Tan,00:53:08,0,10,0,0,$20.00 FC,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
178,FC+1,Kham N Ash Koh,00:53:37,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
179,FC,Richard Ling,00:53:53,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
183,FC+1,Eaden Peh,00:54:50,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
186,FC+1,Chelsia Lee,00:55:20,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
187,FC+2,Richard Ling,00:55:24,0,1,0,2,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
189,FC+2,Eaden Peh,00:55:41,0,1,0,2,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
199,FC+1,Richard Ling,00:56:34,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
203,FC+1,Eaden Peh,00:57:06,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [112]:
df3['productBought_FC'] = df3['productBought_FC'].map(lambda x:float(20.00) if x == int(1) else 0)

In [113]:
df3['revenue_FC'] = np.multiply(df3['productBought_FC'], df3['salesQuantity'])

In [114]:
revenue_FC = "The total revenue from the sale of the product {} is ${}". format ("FC", format(df3['revenue_FC'].sum(), '.2f'))
print(revenue_FC)

The total revenue from the sale of the product FC is $220.00


Product FE

In [115]:
df3[df3['postComment'].str.contains('FE', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_WAK,productBought_WG18,productBought_WG19,productBought_WGB,productBought_WGH,productBought_WGS,revenue_BBAG,revenue_BP,revenue_BSS,revenue_FC
73,Product Fish Eggs (500g) - S$10.00 | Keyword: FE,Justina Tan,00:22:27,0,9,0,0,$10.00 FE,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
76,FE+1,Tan Poh Kim Irene,00:23:17,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
79,FE +1,Eaden Peh,00:23:53,0,2,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
83,FE+1,梁文斌,00:24:17,0,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
84,for FE can I have all Tengiri egg,Tan Poh Kim Irene,00:25:06,0,8,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
96,FE+1,E-Beve,00:27:42,1,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [116]:
df3['productBought_FE'] = df3['productBought_FE'].map(lambda x:float(10.00) if x == int(1) else 0)

In [117]:
df3['revenue_FE'] = np.multiply(df3['productBought_FE'], df3['salesQuantity'])

In [118]:
revenue_FE = "The total revenue from the sale of the product {} is ${}". format ("FE", format(df3['revenue_FE'].sum(), '.2f'))
print(revenue_FE)

The total revenue from the sale of the product FE is $40.00


Product FLL

In [119]:
df3[df3['postComment'].str.contains('FLL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_WG18,productBought_WG19,productBought_WGB,productBought_WGH,productBought_WGS,revenue_BBAG,revenue_BP,revenue_BSS,revenue_FC,revenue_FE
240,Product Flower Clams (Indonesia) - S$8.00 | Keyword: FLL,Richard Ling,01:07:07,0,9,0,0,$8.00 FLL,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
241,FLL+2,E-Beve,01:08:47,1,1,0,2,0,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
253,FLL+1,E-Beve,01:14:01,1,1,0,1,0,0.0,0.0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [120]:
df3['productBought_FLL'] = df3['productBought_FLL'].map(lambda x:float(8.00) if x == int(1) else 0)

In [121]:
df3['revenue_FLL'] = np.multiply(df3['productBought_FLL'], df3['salesQuantity'])

In [122]:
revenue_FLL = "The total revenue from the sale of the product {} is ${}". format ("FLL", format(df3['revenue_FLL'].sum(), '.2f'))
print(revenue_FLL)

The total revenue from the sale of the product FLL is $24.00


Product GG

In [123]:
df3[df3['postComment'].str.contains('GG', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_WG19,productBought_WGB,productBought_WGH,productBought_WGS,revenue_BBAG,revenue_BP,revenue_BSS,revenue_FC,revenue_FE,revenue_FLL
231,Product Gonggong - S$6.00 | Keyword: GG,Jess Lim,01:03:53,0,7,0,0,$6.00 GG,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
234,GG+3,Justina Tan,01:04:38,0,1,0,3,0,0.0,0.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [124]:
df3['productBought_GG'] = df3['productBought_GG'].map(lambda x:float(6.00) if x == int(1) else 0)

In [125]:
df3['revenue_GG'] = np.multiply(df3['productBought_GG'], df3['salesQuantity'])

In [126]:
revenue_GG = "The total revenue from the sale of the product {} is ${}". format ("GG", format(df3['revenue_GG'].sum(), '.2f'))
print(revenue_GG)

The total revenue from the sale of the product GG is $24.00


Product GOL

In [127]:
df3[df3['postComment'].str.contains('GOL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_WGB,productBought_WGH,productBought_WGS,revenue_BBAG,revenue_BP,revenue_BSS,revenue_FC,revenue_FE,revenue_FLL,revenue_GG
252,Product Kukup Golden Pomfret (400g) - S$6.50 | Keyword: GOL,E-Beve,01:13:36,1,10,0,0,$6.50 GOL,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
255,GOL+1,Mike Tan,01:14:33,0,1,0,1,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
267,GOL+2,N'Ridz Kayla,01:18:29,0,1,0,2,0,0.0,0.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
df3['productBought_GOL'] = df3['productBought_GOL'].map(lambda x:float(6.50) if x == int(1) else 0)

In [129]:
df3['revenue_GOL'] = np.multiply(df3['productBought_GOL'], df3['salesQuantity'])

In [130]:
revenue_GOL = "The total revenue from the sale of the product {} is ${}". format ("GOL", format(df3['revenue_GOL'].sum(), '.2f'))
print(revenue_GOL)

The total revenue from the sale of the product GOL is $19.50


Product LIVE

In [131]:
df3[df3['postComment'].str.contains('LIVE', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_WGH,productBought_WGS,revenue_BBAG,revenue_BP,revenue_BSS,revenue_FC,revenue_FE,revenue_FLL,revenue_GG,revenue_GOL
216,Product LIVE PRAWN - S$20.00 | Keyword: LIVE,Irene Kan,00:59:55,0,8,0,0,$20.00 LIVE,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [132]:
df3['productBought_LIVE'] = df3['productBought_LIVE'].map(lambda x:float(20.00) if x == int(1) else 0)

In [133]:
df3['revenue_LIVE'] = np.multiply(df3['productBought_LIVE'], df3['salesQuantity'])

In [134]:
revenue_LIVE = "The total revenue from the sale of the product {} is ${}". format ("LIVE", format(df3['revenue_LIVE'].sum(), '.2f'))
print(revenue_LIVE)

The total revenue from the sale of the product LIVE is $20.00


Product LMF

In [135]:
df3[df3['postComment'].str.contains('LMF', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,productBought_WGS,revenue_BBAG,revenue_BP,revenue_BSS,revenue_FC,revenue_FE,revenue_FLL,revenue_GG,revenue_GOL,revenue_LIVE
101,Product Lai Man Fish - S$5.00 | Keyword: LMF,Richard Ling,00:28:39,0,9,0,0,$5.00 LMF,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,LMF+1,Justina Tan,00:29:01,0,1,0,1,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
107,LMF+1,Jess Lim,00:29:34,0,1,0,1,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109,LMF,Eileen Fok,00:30:01,0,1,0,1,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110,LMF +2,Jess Lim,00:30:26,0,2,0,2,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112,LMF+2,Eileen Fok,00:30:45,0,1,0,2,0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
df3['productBought_LMF'] = df3['productBought_LMF'].map(lambda x:float(5.00) if x == int(1) else 0)

In [137]:
df3['revenue_LMF'] = np.multiply(df3['productBought_LMF'], df3['salesQuantity'])

In [138]:
revenue_LMF = "The total revenue from the sale of the product {} is ${}". format ("LMF", format(df3['revenue_LMF'].sum(), '.2f'))
print(revenue_LMF)

The total revenue from the sale of the product LMF is $35.00


Product MC32_PM

In [139]:
df3[df3['postComment'].str.contains('MC32', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_BBAG,revenue_BP,revenue_BSS,revenue_FC,revenue_FE,revenue_FLL,revenue_GG,revenue_GOL,revenue_LIVE,revenue_LMF
158,Product 2 pc Live mudcrabs (500g-600g each) - S$32.00 | Keyword: MC32,Richard Ling,00:44:21,0,12,0,0,$32.00 MC32,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162,MC32 +1 PM +2,Alex Ong,00:46:43,0,4,0,3,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [140]:
df3[df3['postComment'].str.contains('PM', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_BBAG,revenue_BP,revenue_BSS,revenue_FC,revenue_FE,revenue_FLL,revenue_GG,revenue_GOL,revenue_LIVE,revenue_LMF
61,PM+1,Lily Koh,00:20:16,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62,Product Prawn Maw (1pkt) - S$2.50 | Keyword: PM,Zamzarina Hashim,00:20:21,0,9,0,0,$2.50 PM,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63,PM+1,Lily Koh,00:20:28,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,PM+1,Eaden Peh,00:20:56,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68,PM+1,E-Beve,00:21:29,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85,PM+2,E-Beve,00:25:31,1,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140,PM+1,E-Beve,00:40:26,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162,MC32 +1 PM +2,Alex Ong,00:46:43,0,4,0,3,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:
# ($32*1)+($2.5*2)= $37
df3['revenue_MC32_PM'] = np.multiply(df3['productBought_MC32_PM'], float(37.00))

In [142]:
revenue_MC32_PM = "The total revenue from the sale of the product {} is ${}". format ("MC32_PM", format(df3['revenue_MC32_PM'].sum(), '.2f'))
print(revenue_MC32_PM)

The total revenue from the sale of the product MC32_PM is $37.00


Product MCL

In [143]:
df3[df3['postComment'].str.contains('MCL', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_BP,revenue_BSS,revenue_FC,revenue_FE,revenue_FLL,revenue_GG,revenue_GOL,revenue_LIVE,revenue_LMF,revenue_MC32_PM
39,Product Live mudcrabs (800g-900g each) - S$30.00 | Keyword: MCL,Yap Yip,00:12:46,0,10,0,0,$30.00 MCL,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,MCL+2,Richard Ling,00:14:27,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,MCL +2,Zamzarina Hashim,00:14:44,0,2,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
df3['productBought_MCL'] = df3['productBought_MCL'].map(lambda x:float(30.00) if x == int(1) else 0)

In [145]:
df3['revenue_MCL'] = np.multiply(df3['productBought_MCL'], df3['salesQuantity'])

In [146]:
revenue_MCL = "The total revenue from the sale of the product {} is ${}". format ("MCL", format(df3['revenue_MCL'].sum(), '.2f'))
print(revenue_MCL)

The total revenue from the sale of the product MCL is $120.00


Product MUS

In [147]:
df3[df3['postComment'].str.contains('MUS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_BSS,revenue_FC,revenue_FE,revenue_FLL,revenue_GG,revenue_GOL,revenue_LIVE,revenue_LMF,revenue_MC32_PM,revenue_MCL
48,Product MUSSEL (ORANGE MEAT) - S$4.00 | Keyword: MUS,Lily Koh,00:15:42,0,9,0,0,$4.00 MUS,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,MUS+1,Ellie Lee,00:17:49,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
df3['productBought_MUS'] = df3['productBought_MUS'].map(lambda x:float(4.00) if x == int(1) else 0)

In [149]:
df3['revenue_MUS'] = np.multiply(df3['productBought_MUS'], df3['salesQuantity'])

In [150]:
revenue_MUS = "The total revenue from the sale of the product {} is ${}". format ("MUS", format(df3['revenue_MUS'].sum(), '.2f'))
print(revenue_MUS)


The total revenue from the sale of the product MUS is $16.00


Product PM

In [151]:
df3[df3['postComment'].str.contains('PM', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_FC,revenue_FE,revenue_FLL,revenue_GG,revenue_GOL,revenue_LIVE,revenue_LMF,revenue_MC32_PM,revenue_MCL,revenue_MUS
61,PM+1,Lily Koh,00:20:16,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62,Product Prawn Maw (1pkt) - S$2.50 | Keyword: PM,Zamzarina Hashim,00:20:21,0,9,0,0,$2.50 PM,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63,PM+1,Lily Koh,00:20:28,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,PM+1,Eaden Peh,00:20:56,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68,PM+1,E-Beve,00:21:29,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85,PM+2,E-Beve,00:25:31,1,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140,PM+1,E-Beve,00:40:26,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162,MC32 +1 PM +2,Alex Ong,00:46:43,0,4,0,3,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,0.0,0.0


In [152]:
df3['productBought_PM'] = df3['productBought_PM'].map(lambda x:float(2.50) if x == int(1) else 0)

In [153]:
df3['revenue_PM'] = np.multiply(df3['productBought_PM'], df3['salesQuantity'])

In [154]:
revenue_PM = "The total revenue from the sale of the product {} is ${}". format ("PM", format(df3['revenue_PM'].sum(), '.2f'))
print(revenue_PM)


The total revenue from the sale of the product PM is $17.50


Product RE

In [155]:
df3[df3['postComment'].str.contains('RE', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_FE,revenue_FLL,revenue_GG,revenue_GOL,revenue_LIVE,revenue_LMF,revenue_MC32_PM,revenue_MCL,revenue_MUS,revenue_PM
166,Product red emperor - S$18.00 | Keyword: RE,Richard Ling,00:48:26,0,8,0,0,$18.00 RE,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
167,RE+1,Snowy Sue,00:48:39,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
170,RE+1,Eaden Peh,00:50:59,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [156]:
df3['productBought_RE'] = df3['productBought_RE'].map(lambda x:float(18.00) if x == int(1) else 0)

In [157]:
df3['revenue_RE'] = np.multiply(df3['productBought_RE'], df3['salesQuantity'])

In [158]:
revenue_RE = "The total revenue from the sale of the product {} is ${}". format ("RE", format(df3['revenue_RE'].sum(), '.2f'))
print(revenue_RE)


The total revenue from the sale of the product RE is $36.00


Product RS

In [159]:
df3[df3['postComment'].str.contains('RS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_FLL,revenue_GG,revenue_GOL,revenue_LIVE,revenue_LMF,revenue_MC32_PM,revenue_MCL,revenue_MUS,revenue_PM,revenue_RE
153,Product Kukup Red Snapper (500g-600g) - S$9.00 | Keyword: RS,E-Beve,00:43:29,1,10,0,0,$9.00 RS,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
287,Product Kukup Red Snapper (500g-600g) - S$9.00 | Keyword: RS,Richard Ling,01:21:46,0,10,0,0,$9.00 RS,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
289,RS+1,Richard Ling,01:22:10,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,RS+1,E-Beve,01:23:06,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [160]:
df3['productBought_RS'] = df3['productBought_RS'].map(lambda x:float(9.00) if x == int(1) else 0)

In [161]:
df3['revenue_RS'] = np.multiply(df3['productBought_RS'], df3['salesQuantity'])

In [162]:
revenue_RS = "The total revenue from the sale of the product {} is ${}". format ("RS", format(df3['revenue_RS'].sum(), '.2f'))
print(revenue_RS)


The total revenue from the sale of the product RS is $18.00


Product SB

In [163]:
df3[df3['postComment'].str.contains('SB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_GG,revenue_GOL,revenue_LIVE,revenue_LMF,revenue_MC32_PM,revenue_MCL,revenue_MUS,revenue_PM,revenue_RE,revenue_RS
296,SBH+1,E-Beve,01:26:29,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
297,Product Seabass Head (Whole) - S$5.00 | Keyword: SBH,Richard Ling,01:26:30,0,9,0,0,$5.00 SBH,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
298,Product Seabass Fillet Tail (350g-380g) - S$8.50 | Keyword: SBT8,Richard Ling,01:28:06,0,10,0,0,$8.50 SBT8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
300,Product Seabass Fillet (250g-300g) - S$7.00 | Keyword: SBF7,Richard Ling,01:28:37,0,9,0,0,$7.00 SBF7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
302,SBF7+2,Richard Ling,01:29:59,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
304,Product Kukup Seabass (600g) - S$5.00 | Keyword: SB,Snowy Sue,01:30:21,0,9,0,0,$5.00 SB,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
306,SB+1,Eileen Fok,01:31:22,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307,SB+1,Richard Ling,01:31:37,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [164]:
df3['productBought_SB'] = df3['productBought_SB'].map(lambda x:float(5.00) if x == int(1) else 0)

In [165]:
df3['revenue_SB'] = np.multiply(df3['productBought_SB'], df3['salesQuantity'])

In [166]:
revenue_SB = "The total revenue from the sale of the product {} is ${}". format ("SB", format(df3['revenue_SB'].sum(), '.2f'))
print(revenue_SB)


The total revenue from the sale of the product SB is $10.00


Product SBF7

In [167]:
df3[df3['postComment'].str.contains('SBF7', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_GOL,revenue_LIVE,revenue_LMF,revenue_MC32_PM,revenue_MCL,revenue_MUS,revenue_PM,revenue_RE,revenue_RS,revenue_SB
300,Product Seabass Fillet (250g-300g) - S$7.00 | Keyword: SBF7,Richard Ling,01:28:37,0,9,0,0,$7.00 SBF7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
302,SBF7+2,Richard Ling,01:29:59,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
df3['productBought_SBF7'] = df3['productBought_SBF7'].map(lambda x:float(7.00) if x == int(1) else 0)

In [169]:
df3['revenue_SBF7'] = np.multiply(df3['productBought_SBF7'], df3['salesQuantity'])

In [170]:
revenue_SBF7 = "The total revenue from the sale of the product {} is ${}". format ("SBF7", format(df3['revenue_SBF7'].sum(), '.2f'))
print(revenue_SBF7)


The total revenue from the sale of the product SBF7 is $14.00


Product SBH

In [171]:
df3[df3['postComment'].str.contains('SBH', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_LIVE,revenue_LMF,revenue_MC32_PM,revenue_MCL,revenue_MUS,revenue_PM,revenue_RE,revenue_RS,revenue_SB,revenue_SBF7
296,SBH+1,E-Beve,01:26:29,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
297,Product Seabass Head (Whole) - S$5.00 | Keyword: SBH,Richard Ling,01:26:30,0,9,0,0,$5.00 SBH,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
df3['productBought_SBH'] = df3['productBought_SBH'].map(lambda x:float(5.00) if x == int(1) else 0)

In [173]:
df3['revenue_SBH'] = np.multiply(df3['productBought_SBH'], df3['salesQuantity'])

In [174]:
revenue_SBH = "The total revenue from the sale of the product {} is ${}". format ("SBH", format(df3['revenue_SBH'].sum(), '.2f'))
print(revenue_SBH)


The total revenue from the sale of the product SBH is $5.00


Product SCA

In [175]:
df3[df3['postComment'].str.contains('SCA', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_LMF,revenue_MC32_PM,revenue_MCL,revenue_MUS,revenue_PM,revenue_RE,revenue_RS,revenue_SB,revenue_SBF7,revenue_SBH
171,Product SCALLOP - S$20.00 | Keyword: SCA,Lukie Neo,00:51:39,0,7,0,0,$20.00 SCA,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
173,SCA+1,E-Beve,00:52:29,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [176]:
df3['productBought_SCA'] = df3['productBought_SCA'].map(lambda x:float(20.00) if x == int(1) else 0)

In [177]:
df3['revenue_SCA'] = np.multiply(df3['productBought_SCA'], df3['salesQuantity'])

In [178]:
revenue_SCA = "The total revenue from the sale of the product {} is ${}". format ("SCA", format(df3['revenue_SCA'].sum(), '.2f'))
print(revenue_SCA)


The total revenue from the sale of the product SCA is $40.00


Product ST

In [179]:
df3[df3['postComment'].str.contains('ST', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_MC32_PM,revenue_MCL,revenue_MUS,revenue_PM,revenue_RE,revenue_RS,revenue_SB,revenue_SBF7,revenue_SBH,revenue_SCA
190,Product Sotong/Squid - S$18.00 | Keyword: ST,Richard Ling,00:55:43,0,7,0,0,$18.00 ST,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193,Product Sotong/Squid - S$18.00 | Keyword: ST,Richard Ling,00:56:05,0,7,0,0,$18.00 ST,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
194,ST+1,Jess Lim,00:56:05,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
195,ST+1,Eileen Fok,00:56:10,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196,ST+1,Chelsia Lee,00:56:24,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,ST+1,Kham N Ash Koh,00:56:30,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
202,ST+1,E-Beve,00:57:04,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [180]:
df3['productBought_ST'] = df3['productBought_ST'].map(lambda x:float(18.00) if x == int(1) else 0)

In [181]:
df3['revenue_ST'] = np.multiply(df3['productBought_ST'], df3['salesQuantity'])

In [182]:
revenue_ST = "The total revenue from the sale of the product {} is ${}". format ("ST", format(df3['revenue_ST'].sum(), '.2f'))
print(revenue_ST)


The total revenue from the sale of the product ST is $90.00


Product TK

In [183]:
df3[df3['postComment'].str.contains('TK', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_MCL,revenue_MUS,revenue_PM,revenue_RE,revenue_RS,revenue_SB,revenue_SBF7,revenue_SBH,revenue_SCA,revenue_ST
143,Product Tuna/Tongkol (1.3-1.4kg) - S$14.00 | Keyword: TK,Wong Chow Ching,00:41:00,0,8,0,0,$14.00 TK,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144,TK+1,Richard Ling,00:41:11,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146,TK+1,Jeffrey Ng,00:41:22,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [184]:
df3['productBought_TK'] = df3['productBought_TK'].map(lambda x:float(14.00) if x == int(1) else 0)

In [185]:
df3['revenue_TK'] = np.multiply(df3['productBought_TK'], df3['salesQuantity'])

In [186]:
revenue_TK = "The total revenue from the sale of the product {} is ${}". format ("TK", format(df3['revenue_TK'].sum(), '.2f'))
print(revenue_TK)


The total revenue from the sale of the product TK is $28.00


Product TP

In [187]:
df3[df3['postComment'].str.contains('TP', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_MUS,revenue_PM,revenue_RE,revenue_RS,revenue_SB,revenue_SBF7,revenue_SBH,revenue_SCA,revenue_ST,revenue_TK
206,Product Tiger Prawn (25pc) - S$18.00 | Keyword: TP,Richard Ling,00:57:42,0,9,0,0,$18.00 TP,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207,TP+1,Richard Ling,00:58:36,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
227,TP+1,Tan Poh Kim Irene,01:02:35,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
249,TP+1,E-Beve,01:11:45,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [188]:
df3['productBought_TP'] = df3['productBought_TP'].map(lambda x:float(18.00) if x == int(1) else 0)

In [189]:
df3['revenue_TP'] = np.multiply(df3['productBought_TP'], df3['salesQuantity'])

In [190]:
revenue_TP = "The total revenue from the sale of the product {} is ${}". format ("TP", format(df3['revenue_TP'].sum(), '.2f'))
print(revenue_TP)


The total revenue from the sale of the product TP is $90.00


Product TRB

In [191]:
df3[df3['postComment'].str.contains('TRB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_PM,revenue_RE,revenue_RS,revenue_SB,revenue_SBF7,revenue_SBH,revenue_SCA,revenue_ST,revenue_TK,revenue_TP
117,TRB+1,E-Beve,00:34:29,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118,Product Terubok/Toli Shad - S$25.00 | Keyword: TRB,N'Ridz Kayla,00:34:29,0,8,0,0,$25.00 TRB,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,TRB+2,Jimmy Chang,00:35:17,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [192]:
df3['productBought_TRB'] = df3['productBought_TRB'].map(lambda x:float(25.00) if x == int(1) else 0)

In [193]:
df3['revenue_TRB'] = np.multiply(df3['productBought_TRB'], df3['salesQuantity'])

In [194]:
revenue_TRB = "The total revenue from the sale of the product {} is ${}". format ("TRB", format(df3['revenue_TRB'].sum(), '.2f'))
print(revenue_TRB)


The total revenue from the sale of the product TRB is $75.00


Product UNA

In [195]:
df3[df3['postComment'].str.contains('UNA', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_RE,revenue_RS,revenue_SB,revenue_SBF7,revenue_SBH,revenue_SCA,revenue_ST,revenue_TK,revenue_TP,revenue_TRB
223,Product UNAGI - S$16.00 | Keyword: UNA,Eileen Fok,01:02:08,0,7,0,0,$16.00 UNA,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
228,UNA+1,Lukie Neo,01:02:35,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
229,UNA+1,Richard Ling,01:03:26,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [196]:
df3['productBought_UNA'] = df3['productBought_UNA'].map(lambda x:float(16.00) if x == int(1) else 0)

In [197]:
df3['revenue_UNA'] = np.multiply(df3['productBought_UNA'], df3['salesQuantity'])

In [198]:
revenue_UNA = "The total revenue from the sale of the product {} is ${}". format ("UNA", format(df3['revenue_UNA'].sum(), '.2f'))
print(revenue_UNA)


The total revenue from the sale of the product UNA is $32.00


Product WAK

In [199]:
df3[df3['postComment'].str.contains('WAK', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_RS,revenue_SB,revenue_SBF7,revenue_SBH,revenue_SCA,revenue_ST,revenue_TK,revenue_TP,revenue_TRB,revenue_UNA
21,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,E-Beve,00:06:14,1,11,0,0,$20.00 WAK,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,Richard Ling,00:08:44,0,11,0,0,$20.00 WAK,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,Tan Poh Kim Irene,00:24:02,0,11,0,0,$20.00 WAK,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115,Product Wild Angkah Prawns (1kg/mix sizes) - S$20.00 | Keyword: WAK,Veon Veon,00:33:18,0,11,0,0,$20.00 WAK,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116,WAK+1,Kham N Ash Koh,00:34:03,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119,WAK+1,Kham N Ash Koh,00:34:40,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
220,WAK+1,Lily Koh,01:00:42,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238,WAK +1,Chelsia Lee,01:06:38,0,2,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
250,WAK,Tan Poh Kim Irene,01:12:43,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [200]:
df3['productBought_WAK'] = df3['productBought_WAK'].map(lambda x:float(20.00) if x == int(1) else 0)

In [201]:
df3['revenue_WAK'] = np.multiply(df3['productBought_WAK'], df3['salesQuantity'])

In [202]:
revenue_WAK = "The total revenue from the sale of the product {} is ${}". format ("WAK", format(df3['revenue_WAK'].sum(), '.2f'))
print(revenue_WAK)


The total revenue from the sale of the product WAK is $160.00


Product WG18

In [203]:
df3[df3['postComment'].str.contains('WG18', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_SB,revenue_SBF7,revenue_SBH,revenue_SCA,revenue_ST,revenue_TK,revenue_TP,revenue_TRB,revenue_UNA,revenue_WAK
266,Product Wild Grouper (1kg) - S$18.00 | Keyword: WG18,Snowy Sue,01:17:59,0,9,0,0,$18.00 WG18,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
268,WG18+1,Mike Tan,01:18:33,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [204]:
df3['productBought_WG18'] = df3['productBought_WG18'].map(lambda x:float(18.00) if x == int(1) else 0)

In [205]:
df3['revenue_WG18'] = np.multiply(df3['productBought_WG18'], df3['salesQuantity'])

In [206]:
revenue_WG18 = "The total revenue from the sale of the product {} is ${}". format ("WG18", format(df3['revenue_WG18'].sum(), '.2f'))
print(revenue_WG18)


The total revenue from the sale of the product WG18 is $18.00


Product WG19

In [207]:
df3[df3['postComment'].str.contains('WG19', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_SBF7,revenue_SBH,revenue_SCA,revenue_ST,revenue_TK,revenue_TP,revenue_TRB,revenue_UNA,revenue_WAK,revenue_WG18
262,Product Wild Grouper (1.1kg) - S$19.00 | Keyword: WG19,E-Beve,01:16:08,1,9,0,0,$19.00 WG19,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
269,WG19+1,E-Beve,01:18:35,1,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [208]:
df3['productBought_WG19'] = df3['productBought_WG19'].map(lambda x:float(19.00) if x == int(1) else 0)

In [209]:
df3['revenue_WG19'] = np.multiply(df3['productBought_WG19'], df3['salesQuantity'])

In [210]:
revenue_WG19 = "The total revenue from the sale of the product {} is ${}". format ("WG19", format(df3['revenue_WG19'].sum(), '.2f'))
print(revenue_WG19)


The total revenue from the sale of the product WG19 is $19.00


Product WGB

In [211]:
df3[df3['postComment'].str.contains('WGB', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_SBH,revenue_SCA,revenue_ST,revenue_TK,revenue_TP,revenue_TRB,revenue_UNA,revenue_WAK,revenue_WG18,revenue_WG19
281,Product Wild Grouper Bones - S$3.00 | Keyword: WGB,Lily Koh,01:20:44,0,9,0,0,$3.00 WGB,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
284,WGB+1,Mike Tan,01:21:19,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [212]:
df3['productBought_WGB'] = df3['productBought_WGB'].map(lambda x:float(3.00) if x == int(1) else 0)

In [213]:
df3['revenue_WGB'] = np.multiply(df3['productBought_WGB'], df3['salesQuantity'])

In [214]:
revenue_WGB = "The total revenue from the sale of the product {} is ${}". format ("WGB", format(df3['revenue_WGB'].sum(), '.2f'))
print(revenue_WGB)


The total revenue from the sale of the product WGB is $3.00


Product WGH

In [215]:
df3[df3['postComment'].str.contains('WGH', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_SCA,revenue_ST,revenue_TK,revenue_TP,revenue_TRB,revenue_UNA,revenue_WAK,revenue_WG18,revenue_WG19,revenue_WGB
257,Product Wild Grouper Head (Half) - S$5.00 | Keyword: WGH,E-Beve,01:14:52,1,10,0,0,$5.00 WGH,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
260,WGH+2,E-Beve,01:15:30,1,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
288,WGH+2,Chelsia Lee,01:21:48,0,1,0,2,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [216]:
df3['productBought_WGH'] = df3['productBought_WGH'].map(lambda x:float(5.00) if x == int(1) else 0)

In [217]:
df3['revenue_WGH'] = np.multiply(df3['productBought_WGH'], df3['salesQuantity'])

In [218]:
revenue_WGH = "The total revenue from the sale of the product {} is ${}". format ("WGH", format(df3['revenue_WGH'].sum(), '.2f'))
print(revenue_WGH)


The total revenue from the sale of the product WGH is $20.00


Product WGS

In [219]:
df3[df3['postComment'].str.contains('WGS', na = False, regex = False)]

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_ST,revenue_TK,revenue_TP,revenue_TRB,revenue_UNA,revenue_WAK,revenue_WG18,revenue_WG19,revenue_WGB,revenue_WGH
264,Product Wild Grouper Slices (300g) - S$11.50 | Keyword: WGS,Wong Chow Ching,01:16:52,0,10,0,0,$11.50 WGS,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
275,Product Wild Grouper Slices (300g) - S$11.50 | Keyword: WGS,Lily Koh,01:19:17,0,10,0,0,$11.50 WGS,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276,WGS+1,Richard Ling,01:19:27,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278,WGS+1,Lukie Neo,01:19:31,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
279,WGS+1,Eileen Fok,01:20:07,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
282,WGS+1,Richard Ling,01:20:56,0,1,0,1,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [220]:
df3['productBought_WGS'] = df3['productBought_WGS'].map(lambda x:float(11.50) if x == int(1) else 0)

In [221]:
df3['revenue_WGS'] = np.multiply(df3['productBought_WGS'], df3['salesQuantity'])

In [222]:
revenue_WGS = "The total revenue from the sale of the product {} is ${}". format ("WGS", format(df3['revenue_WGS'].sum(), '.2f'))
print(revenue_WGS)


The total revenue from the sale of the product WGS is $46.00


In [223]:
# iterating the columns
for col in df3.columns:
    print(col)

postComment
postCommentAuthor
postCommentTime_final
isSeller
postCommentLength
lns
salesQuantity
productPrice
productBought_BBAG
productBought_BP
productBought_BSS
productBought_FC
productBought_FE
productBought_FLL
productBought_GG
productBought_GOL
productBought_LIVE
productBought_LMF
productBought_MC32_PM
productBought_MCL
productBought_MUS
productBought_PM
productBought_RE
productBought_RS
productBought_SB
productBought_SBF7
productBought_SBH
productBought_SCA
productBought_ST
productBought_TK
productBought_TP
productBought_TRB
productBought_UNA
productBought_WAK
productBought_WG18
productBought_WG19
productBought_WGB
productBought_WGH
productBought_WGS
revenue_BBAG
revenue_BP
revenue_BSS
revenue_FC
revenue_FE
revenue_FLL
revenue_GG
revenue_GOL
revenue_LIVE
revenue_LMF
revenue_MC32_PM
revenue_MCL
revenue_MUS
revenue_PM
revenue_RE
revenue_RS
revenue_SB
revenue_SBF7
revenue_SBH
revenue_SCA
revenue_ST
revenue_TK
revenue_TP
revenue_TRB
revenue_UNA
revenue_WAK
revenue_WG18
revenue_WG19


**Sum of total revenue from the video**

In [224]:
#total revenue from the video
total_revenue = df3.loc[:, 'revenue_BBAG': 'revenue_WGS'].values.sum()

#round the total revenue to 2 decimals places
total_revenue_rounded = format(total_revenue, '.2f')

print(f"The total revenue for the sale of products for the video is ${total_revenue_rounded}")


The total revenue for the sale of products for the video is $1409.00


In [225]:
va['totalRevenue'] = total_revenue_rounded

In [226]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue
0,ebeveadmin/videos/241945361307367,46,1800,5746,50,1255,24,114,38,1409.0


**New Column for the total revenue at that comment**

In [227]:
#https://stackoverflow.com/questions/42063716/pandas-sum-up-multiple-columns-into-one-column-without-last-column
df3['revenue'] = df3.loc[:, 'revenue_BBAG': 'revenue_WGS'].sum(axis=1)
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,productPrice,productBought_BBAG,productBought_BP,...,revenue_TP,revenue_TRB,revenue_UNA,revenue_WAK,revenue_WG18,revenue_WG19,revenue_WGB,revenue_WGH,revenue_WGS,revenue
0,Arigato,き リーサン,00:00:00,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Goodnite,き リーサン,00:00:00,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Helloooo ❤️,Ernest Tan,00:01:20,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Hi got crab today??,Veon Veon,00:01:40,0,4,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Halo beve,き リーサン,00:01:42,0,2,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,00:01:45,0,9,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Hello,き リーサン,00:01:48,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hello,Jeffrey Ng,00:02:41,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Hello,Shuganya Devi,00:02:45,0,1,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,I am here,Eugene Tan,00:02:47,0,3,0,0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [228]:
#https://www.geeksforgeeks.org/how-to-move-a-column-to-first-position-in-pandas-dataframe/
#shift the revenue column to be the 8th column, i.e at position 7
eighth_column = df3.pop('revenue')

In [229]:
# insert column using insert(position,column_name,ninth_column) function
df3.insert(7, 'revenue', eighth_column)
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,productPrice,productBought_BBAG,...,revenue_TK,revenue_TP,revenue_TRB,revenue_UNA,revenue_WAK,revenue_WG18,revenue_WG19,revenue_WGB,revenue_WGH,revenue_WGS
0,Arigato,き リーサン,00:00:00,0,1,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Goodnite,き リーサン,00:00:00,0,1,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Helloooo ❤️,Ernest Tan,00:01:20,0,2,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Hi got crab today??,Veon Veon,00:01:40,0,4,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Halo beve,き リーサン,00:01:42,0,2,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,00:01:45,0,9,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Hello,き リーサン,00:01:48,0,1,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hello,Jeffrey Ng,00:02:41,0,1,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Hello,Shuganya Devi,00:02:45,0,1,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,I am here,Eugene Tan,00:02:47,0,3,0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


After the revenue of the sales have been calculated, all the dummified product columns and the column 'productPrice' will be dropped as they are no longer required to be checked against to identify the price of the product.

In [230]:
df3 = df3.loc[: ,'postComment':'revenue']
df3

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue
0,Arigato,き リーサン,00:00:00,0,1,0,0,0.0
1,Goodnite,き リーサン,00:00:00,0,1,0,0,0.0
2,Helloooo ❤️,Ernest Tan,00:01:20,0,2,0,0,0.0
3,Hi got crab today??,Veon Veon,00:01:40,0,4,0,0,0.0
4,Halo beve,き リーサン,00:01:42,0,2,0,0,0.0
5,Hello everyone! Please remember to check out your cart!,Jeffrey Ng,00:01:45,0,9,0,0,0.0
6,Hello,き リーサン,00:01:48,0,1,0,0,0.0
7,Hello,Jeffrey Ng,00:02:41,0,1,0,0,0.0
8,Hello,Shuganya Devi,00:02:45,0,1,0,0,0.0
9,I am here,Eugene Tan,00:02:47,0,3,0,0,0.0


**New Column to identify the frequency of the seller's comments in the video**

In [231]:
#frequency of seller's comments
va['frequencySeller']= np.divide((va['videoLength'].iloc[0]),va['numSellerComments'])
#seller's comment appears on average of every 114 seconds

In [232]:
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller
0,ebeveadmin/videos/241945361307367,46,1800,5746,50,1255,24,114,38,1409.0,114.92


**New Column to identify the seller**

In [233]:
df3['seller'] = 'ebeveadmin'

In [234]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Arigato,き リーサン,00:00:00,0,1,0,0,0.0,ebeveadmin
1,Goodnite,き リーサン,00:00:00,0,1,0,0,0.0,ebeveadmin
2,Helloooo ❤️,Ernest Tan,00:01:20,0,2,0,0,0.0,ebeveadmin
3,Hi got crab today??,Veon Veon,00:01:40,0,4,0,0,0.0,ebeveadmin
4,Halo beve,き リーサン,00:01:42,0,2,0,0,0.0,ebeveadmin


**New Column for the Average Compound Score from Sentiment Analysis on raw & uncleaned comments for video**

In [235]:
# Instantiate Sentiment Intensity Analyzer
sent = SentimentIntensityAnalyzer()

In [236]:
df3['sentiment_score'] = df3['postComment'].apply(sent.polarity_scores)
df3['compound'] = [sent.polarity_scores(x)['compound'] for x in df3['postComment']]
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller,sentiment_score,compound
0,Arigato,き リーサン,00:00:00,0,1,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
1,Goodnite,き リーサン,00:00:00,0,1,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
2,Helloooo ❤️,Ernest Tan,00:01:20,0,2,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
3,Hi got crab today??,Veon Veon,00:01:40,0,4,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
4,Halo beve,き リーサン,00:01:42,0,2,0,0,0.0,ebeveadmin,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0


In [237]:
#average compound scores for the video
#df3.shape[0] calculates the total number of rows in the dataframe
va['averageCompound']= (df3['compound'].sum())/(df3['compound'].sum())/df3.shape[0]
va

Unnamed: 0,video_for,totalEmojiReaction,views,videoLength,numSellerComments,numComments,lnsQuantity,salesQuantity,numProducts,totalRevenue,frequencySeller,averageCompound
0,ebeveadmin/videos/241945361307367,46,1800,5746,50,1255,24,114,38,1409.0,114.92,0.003145


In [238]:
#drop the columns with regarding to the sentiment analysis on the raw & uncleaned comments
#as we will perform sentiment analysis at the comment level on processed comments
df3 = df3.loc[: ,'postComment':'seller']

In [239]:
df3.head()

Unnamed: 0,postComment,postCommentAuthor,postCommentTime_final,isSeller,postCommentLength,lns,salesQuantity,revenue,seller
0,Arigato,き リーサン,00:00:00,0,1,0,0,0.0,ebeveadmin
1,Goodnite,き リーサン,00:00:00,0,1,0,0,0.0,ebeveadmin
2,Helloooo ❤️,Ernest Tan,00:01:20,0,2,0,0,0.0,ebeveadmin
3,Hi got crab today??,Veon Veon,00:01:40,0,4,0,0,0.0,ebeveadmin
4,Halo beve,き リーサン,00:01:42,0,2,0,0,0.0,ebeveadmin


### Saving the cleaned dataframes

In [240]:
# export to csv - change the name of the data file for each video
va.to_csv('../../data/cleaned_data/cleaned_va_ebeveadmin_241945361307367.csv', index=False)

In [241]:
#check for nulls
#displaying only the columns with nulls and their sum
df3[df3.columns[df3.isnull().any()]].isnull().sum()

Series([], dtype: float64)

In [242]:
# export to csv - change the name of the data file for each video
df3.to_csv('../../data/cleaned_data/cleaned_ebeveadmin_241945361307367.csv', index=False)