# Sentiment Analytics - Exploratory Data Analysis

# 1/ Import Libraries

In [None]:
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter

# 2/ Load Data

In [None]:
# Mount Google Drive (Colab-only) so that the files under /content/drive
# used by the cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 3/ Get the Data

In [None]:
# Load the cleaned sales-conversation export from the mounted Drive.
# No index_col is passed, so pandas creates a default integer index; later
# cells read a column called 'Unnamed: 0' from this frame.
df_sales_main = pd.read_csv('/content/drive/My Drive/Colab Notebooks/CoTAI/Data Science Internship CoTAI 2021/Sales Analysis/all_clean_data.csv')

In [None]:
# Preview the first rows of the raw table.
df_sales_main.head()

In [None]:
# Number of rows in the raw table.
df_sales_main.shape[0]

314543

Set the first column as the Index and print out the table again

Add the name 'ID' for the Index column

In [None]:
# Label the row index 'ID' so it is named in displays and in exported CSVs,
# then echo the label to confirm it was applied.
df_sales_main = df_sales_main.rename_axis('ID')
df_sales_main.index.name

'ID'

In [None]:
# Preview again: the index header now reads 'ID'.
df_sales_main.head()

From this stage, I will split the data into separate data frames: Conversation, Conversation_Information, Customer and Fan Page.

# 4/ Check duplicates

Reference:

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html

https://datatofish.com/count-duplicates-pandas/

Now I will create a pivot table keyed on Fanpage, PSID and Message (this order can be changed at will). In this step I will then check for duplicates and drop them, if any.

Printing dups shows internal conversations between the customers and the Sales team, so I will print only the first message, which does not contain any sensitive data or information.

In [None]:
# Count how many rows share each (Fanpage, PSID, Message) combination;
# any count above 1 marks a duplicated conversation.
# NOTE(review): the printed keys include message text - clear this output
# before sharing the notebook.
dups = df_sales_main.pivot_table(columns=['Fanpage', 'PSID', 'Message'], aggfunc='size')
print(dups)

In [None]:
# Boolean mask: True for every repeat of a (Fanpage, PSID, Message) key
# except its first occurrence (the default keep='first').
df_sales_main.duplicated(subset=['Fanpage', 'PSID', 'Message'])

ID
0         False
1         False
2         False
3         False
4         False
          ...  
314538    False
314539    False
314540    False
314541    False
314542    False
Length: 314543, dtype: bool

In [None]:
# Same check, but the last occurrence of each key is the one left unmarked.
df_sales_main.duplicated(subset=['Fanpage', 'PSID', 'Message'], keep='last')

ID
0         False
1         False
2         False
3         False
4         False
          ...  
314538    False
314539    False
314540    False
314541    False
314542    False
Length: 314543, dtype: bool

In [None]:
# Demonstration only: drop_duplicates returns a new frame, and the result is
# displayed but not assigned, so df_sales_main itself is left unchanged.
df_sales_main.drop_duplicates(subset=['Fanpage', 'PSID', 'Message'], keep='last')

Currently, there are no duplicates in those two data frames. Therefore, dropping duplicates is unnecessary. I just want to show you a way to drop them.

# 5/ Work with Conversation Data Frame

This data frame will contain only the following variables.

## 5.1/ Create a new table

In [None]:
# Labels to carry over into the Conversation data frame.
headers_Conversation = [
    'ID',
    'Unnamed: 0',
    'Fanpage',
    'PSID',
    'FanpageName',
    'CusName',
    'Message',
]
print("Headers of the Data Frame 'Conversation' \n", headers_Conversation)

Headers of the Data Frame 'Conversation' 
 ['ID', 'Unnamed: 0', 'Fanpage', 'PSID', 'FanpageName', 'CusName', 'Message']


In [None]:
# Keep only the requested columns. filter() silently skips labels that are
# not column names, so 'ID' (the index name, not a column) has no effect.
df_Conversation = df_sales_main.filter(items=headers_Conversation, axis='columns')
df_Conversation.head(n=10)

In [None]:
# Give the CSV's unnamed first column a meaningful name.
df_Conversation = df_Conversation.rename(
    {"Unnamed: 0": "Conversation_ID"},
    axis='columns',
)
df_Conversation.head()

We need to create 2 new columns for df_Conversation, named 'Sender' and 'Message_ID'. 'Sender' is 0 for 'Sales' and 1 for 'Customer'. The column 'Message_ID' holds the order index of each chat line within a conversation.

## 5.3 Split the message of each conversation into multiple rows.

In [None]:
%%time
# Split every conversation on newlines and emit one row per chat line.
# explode() replaces the former .apply(pd.Series, 1).stack(): that version
# built a frame padded to the longest conversation (minutes of runtime), and
# its positional `1` was bound to Series.apply's deprecated `convert_dtype`.
temp_Conversation = df_Conversation['Message'].str.split('\n').explode().dropna()
# Rebuild the two-level (ID, position-within-conversation) index that stack()
# produced; the following cells read the second level as the chat-line order.
temp_Conversation.index = pd.MultiIndex.from_arrays(
    [
        temp_Conversation.index,
        temp_Conversation.groupby(level=0).cumcount().to_numpy(),
    ],
    names=[temp_Conversation.index.name, None],
)

CPU times: user 3min 13s, sys: 20.5 s, total: 3min 34s
Wall time: 3min 33s


In [None]:
# First chat lines, indexed by (ID, position within the conversation).
temp_Conversation.head()

In [None]:
# Last chat lines of the split result.
temp_Conversation.tail()

In [None]:
# Frequency of each distinct chat line across all conversations.
temp_Conversation.value_counts()

## 5.4 Create a list of indices based on different conversations

In [None]:
# Take the innermost index level: the position of each chat line inside its
# conversation. This must run before the cell that strips that level.
order_ids = temp_Conversation.index.get_level_values(-1)
order_ids

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
            ...
            16, 17, 18, 19,  0,  0,  1,  0,  0,  1],
           dtype='int64', length=4024005)

In [None]:
# Plain-text rendering of the same index.
print(order_ids)

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
            ...
            16, 17, 18, 19,  0,  0,  1,  0,  0,  1],
           dtype='int64', length=4024005)


In [None]:
# How many conversations reach each chat-line position (position 0 occurs
# once per conversation).
order_ids.value_counts()

0      314543
1      270499
2      243356
3      215953
4      192974
        ...  
610         1
609         1
608         1
607         1
658         1
Length: 659, dtype: int64

## 5.5 Assign those indices to corresponding chat lines

In [None]:
# Strip the position level so the series is indexed by conversation ID only,
# which is what the join onto df_Conversation needs.
temp_Conversation = temp_Conversation.droplevel(-1)
temp_Conversation.head()

## 5.6 Rename the temp_Conversation

In [None]:
# Name the series so it becomes the 'Message' column when joined.
temp_Conversation = temp_Conversation.rename('Message')
temp_Conversation.head()

## 5.7 Count number of chat lines of each conversation

In [None]:
# Length in characters of every chat line.
# NOTE(review): this is not the number of chat lines per conversation; that
# would be temp_Conversation.groupby(level=0).size().
temp_Conversation.str.len()

ID
0          8
0         40
0         23
0          9
0         43
          ..
314540    16
314540    30
314541     9
314542    23
314542     9
Name: Message, Length: 4024005, dtype: int64

## 5.8/ Join the temp_Conversation with the Conversation data frame correspondingly

In [None]:
# Swap the one-row-per-conversation 'Message' column for the exploded chat
# lines: joining on the index repeats each conversation row once per line.
df_Conversation = (
    df_Conversation
    .drop(columns=['Message'])
    .join(temp_Conversation)
)
# order_ids is assigned positionally, so it relies on the join returning rows
# in the same order as temp_Conversation.
df_Conversation['Message_ID'] = order_ids
df_Conversation.head()

In [None]:
# Inspect the first 20 chat lines after the join.
df_Conversation.head(20)

In [None]:
# Inspect the last 20 chat lines after the join.
df_Conversation.tail(20)

In [None]:
# Row count after the join; it should equal the number of chat lines.
df_Conversation.shape[0]

4024005

## Remove the old Conversation_ID column

In [None]:
# Discard the ID inherited from the CSV's unnamed column.
df_Conversation = df_Conversation.drop(columns=['Conversation_ID'])

In [None]:
# Confirm the old Conversation_ID column is gone.
df_Conversation.head()

## Copy the current ID to a new Conversation_ID column




In [None]:
# Persist the current index ('ID', one value per conversation) as a regular
# column before the index is replaced.
df_Conversation = df_Conversation.assign(Conversation_ID=df_Conversation.index)
df_Conversation.head()

## Set the Message_ID as the main ID column for df_Conversation

In [None]:
# Index the frame by Message_ID. The values restart at 0 for every
# conversation, so this index is not unique on its own.
df_Conversation = df_Conversation.set_index('Message_ID')
df_Conversation.head()

## 5.9/ Check null values

In [None]:
# True if any cell in the frame is null.
df_Conversation.isnull().values.any()

False

In [None]:
# Null count per column.
df_Conversation.isnull().sum()

Fanpage            0
PSID               0
FanpageName        0
CusName            0
Message            0
Conversation_ID    0
dtype: int64

In [None]:
# Row count, to confirm nothing was lost during the index changes.
df_Conversation.shape[0]

4024005

There are no null values in the Conversation data frame, so I can move on.

## 5.10/ Delete [KH] & [SALES] from chat lines & Convert Sender categories into 0: Sales, 1: Customer

My approach is to build a list holding every value of the Message column, then loop over it and check whether each chat line starts with [KH] or [SALES]. If it does, I replace that tag with "". The reason is that going through a data frame column row by row takes much more time and memory; a plain list is much easier to work with in this case.

In [None]:
# Strip the leading sender tag from every chat line and record who sent it.
# Sender encoding: 1 = customer (line starts with '[KH]'); 0 = everything else
# (lines starting with '[SALES]' as well as lines with no tag at all).
# Changes from the earlier version of this cell:
#   * only the leading tag is cut off; str.replace() also deleted any later
#     occurrence of '[KH]' / '[SALES]' inside the message text;
#   * .tolist() gives an independent Python list, so the loop no longer writes
#     through the array returned by .values (which can alias the frame's data
#     and is read-only when pandas Copy-on-Write is enabled).
customer_tag, sales_tag = '[KH]', '[SALES]'
messages = df_Conversation['Message'].tolist()
temp_array = [0] * len(messages)
for i, chat_line in enumerate(messages):
    if chat_line.startswith(customer_tag):
        messages[i] = chat_line[len(customer_tag):]
        temp_array[i] = 1
    elif chat_line.startswith(sales_tag):
        messages[i] = chat_line[len(sales_tag):]
# Both lists are in row order, so they are assigned back positionally.
df_Conversation['Sender'] = temp_array
df_Conversation['Message'] = messages

In [None]:
# Check the first 20 rows: tags removed and the Sender flag filled in.
df_Conversation.head(20)

In [None]:
# Check the last 20 rows as well.
df_Conversation.tail(20)

## 5.11/ Filter the df_Conversation by Customer only (Sender = 1) and having only 3 columns: ID, Conversation_ID, Message, Sender = 1

In [None]:
# Keep the conversation key together with the text and the sender flag.
# Conversation_ID is included because the index (Message_ID) restarts for
# every conversation, so without it a chat line cannot be traced back to the
# conversation it belongs to.
selected_df_Conversation = df_Conversation[["Conversation_ID", "Message", "Sender"]]
selected_df_Conversation.head(10)

In [None]:
# Last 10 rows of the reduced frame.
selected_df_Conversation.tail(10)

In [None]:
# Customer-authored chat lines only (Sender flag equal to 1).
customer_filtered_df_Conversation = selected_df_Conversation.loc[
    selected_df_Conversation['Sender'].eq(1)
]
customer_filtered_df_Conversation.shape

(1643453, 2)

In [None]:
# First 50 customer lines, for a manual spot check.
customer_filtered_df_Conversation.head(50)

In [None]:
# Last 50 customer lines, for a manual spot check.
customer_filtered_df_Conversation.tail(50)

At this stage, I have to manually check the first 50 rows and the last 50 rows of this new data frame and compare them with the rows of the original Conversation data frame to make sure the Conversation IDs are correct for each chat.

# 6/ Work with Conversation Information data frame

This other new data frame will contain the variables as below.

## 6.1/ Create a new table

In [None]:
# Labels to carry over into the Conversation_Information data frame.
headers_Conversation_Information = [
    'ID',
    'Unnamed: 0',
    'CustomerCount',
    'SalesCount',
    'StartTime',
    'EndTime',
]
print("Headers of the Data Frame 'Conversation_Information' \n", headers_Conversation_Information)

Headers of the Data Frame 'Conversation_Information' 
 ['ID', 'Unnamed: 0', 'CustomerCount', 'SalesCount', 'StartTime', 'EndTime']


In [None]:
# Keep only the requested columns. filter() silently skips labels that are
# not column names, so 'ID' (the index name, not a column) has no effect.
df_Conversation_Information = df_sales_main.filter(
    items=headers_Conversation_Information,
    axis='columns',
)
df_Conversation_Information.head(n=5)

In [None]:
# Give the CSV's unnamed first column a meaningful name.
df_Conversation_Information = df_Conversation_Information.rename(
    {"Unnamed: 0": "Conversation_ID"},
    axis='columns',
)
df_Conversation_Information.head()

In [None]:
# Row count of the Conversation_Information frame.
df_Conversation_Information.shape[0]

314543

In [None]:
# True when at least one Conversation_ID value occurs more than once.
boolean = df_Conversation_Information['Conversation_ID'].duplicated().any() 
boolean

True

In [None]:
# drop_duplicates returns a new frame; the result is only displayed here and
# is not assigned, so df_Conversation_Information keeps all of its rows.
# NOTE(review): assigning it would remove every row whose Conversation_ID
# repeats an earlier one - confirm those rows really are the same
# conversation before doing so.
df_Conversation_Information.drop_duplicates(subset=['Conversation_ID'])

In [None]:
# Re-run the duplicate-key check on df_Conversation_Information.
boolean = df_Conversation_Information['Conversation_ID'].duplicated().any() 
boolean

True

In [None]:
# Rows that repeat an earlier row in every column (not just Conversation_ID).
duplicate = df_Conversation_Information.loc[
    df_Conversation_Information.duplicated(keep='first')
]
print(duplicate)

Empty DataFrame
Columns: [Conversation_ID, CustomerCount, SalesCount, StartTime, EndTime]
Index: []


Customer Count seems to be longer as they talked more to ask questions for their requirements. 

## 6.2/ Check null values

In [None]:
# True if any cell in the frame is null.
df_Conversation_Information.isnull().values.any()

False

In [None]:
# Null count per column.
df_Conversation_Information.isnull().sum()

Conversation_ID    0
CustomerCount      0
SalesCount         0
StartTime          0
EndTime            0
dtype: int64

In [None]:
# Row count, to confirm no rows were removed by the checks above.
df_Conversation_Information.shape[0]

314543

There are no null values in the Conversation_Information data frame, so I can save the data frames as CSV files now.

# 7/ Work with Customer data frame

## 7.1/ Create a new table

In [None]:
# Create headers list
# Labels to carry over into the Customer data frame.
headers_Customer = [
    'PSID',
    'CusName',
]
print("Headers of the Data Frame 'Customer' \n", headers_Customer)

Headers of the Data Frame 'Customer' 
 ['PSID', 'CusName']


In [None]:
# Project the customer columns out of the main table.
# NOTE(review): rows are not de-duplicated, so this keeps one row per row of
# df_sales_main rather than one per customer - confirm that is intended.
df_Customer = df_sales_main.filter(items=headers_Customer, axis='columns')
df_Customer.head(n=5)

# 8/ Work with Fan Page data frame


## 8.1/ Create a new table

In [None]:
# Labels to carry over into the Fan Page data frame.
headers_Fan_Page = [
    'Fanpage',
    'FanpageName',
]
print("Headers of the Data Frame 'Fan Page' \n", headers_Fan_Page)

Headers of the Data Frame 'Fan Page' 
 ['Fanpage', 'FanpageName']


In [None]:
# Project the fan-page columns out of the main table.
# NOTE(review): rows are not de-duplicated, so this keeps one row per row of
# df_sales_main rather than one per fan page - confirm that is intended.
df_Fan_Page = df_sales_main.filter(items=headers_Fan_Page, axis='columns')
df_Fan_Page.head(n=5)

# 9/ Save data frames into files

In [None]:
# Export every chat line; the Message_ID index is written as the first column.
df_Conversation.to_csv('/content/drive/My Drive/Colab Notebooks/CoTAI/Data Science Internship CoTAI 2021/SQL Alchemy/Conversation.csv', encoding='utf-8')

In [None]:
# Export the customer-only chat lines (Sender == 1).
customer_filtered_df_Conversation.to_csv('/content/drive/My Drive/Colab Notebooks/CoTAI/Data Science Internship CoTAI 2021/SQL Alchemy/customer_filtered_Conversation.csv', encoding='utf-8')

In [None]:
# Export the per-conversation counts and timestamps; the 'ID' index is
# written as the first column.
df_Conversation_Information.to_csv('/content/drive/My Drive/Colab Notebooks/CoTAI/Data Science Internship CoTAI 2021/SQL Alchemy/Conversation_Information.csv', encoding='utf-8')

In [None]:
# Export the customer columns (PSID, CusName).
df_Customer.to_csv('/content/drive/My Drive/Colab Notebooks/CoTAI/Data Science Internship CoTAI 2021/SQL Alchemy/Customer.csv', encoding='utf-8')

In [None]:
# Export the fan-page columns (Fanpage, FanpageName).
df_Fan_Page.to_csv('/content/drive/My Drive/Colab Notebooks/CoTAI/Data Science Internship CoTAI 2021/SQL Alchemy/Fan_Page.csv', encoding='utf-8')