# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

sns.set_context("talk")
sns.set_style("whitegrid")
%matplotlib inline

# Loading the Data

In [2]:
# Read the three raw tables in one pass; the unpacking order matches the file order below.
reaction, reaction_types, content = (
    pd.read_csv(csv_name)
    for csv_name in ("Reactions.csv", "ReactionTypes.csv", "Content.csv")
)

In [3]:
reaction.head()

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Type,Datetime
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,2021-04-22 15:17:15
1,1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,2020-11-07 09:43:50
2,2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,2021-06-17 12:22:51
3,3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,2021-04-18 05:13:58
4,4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,2021-01-06 19:13:01


In [4]:
reaction_types.head()

Unnamed: 0.1,Unnamed: 0,Type,Sentiment,Score
0,0,heart,positive,60
1,1,want,positive,70
2,2,disgust,negative,0
3,3,hate,negative,5
4,4,interested,positive,30


In [5]:
content.head()

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Type,Category,URL
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...
2,2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/230...
3,3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology,https://socialbuzz.cdn.com/content/storage/356...
4,4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food,https://socialbuzz.cdn.com/content/storage/01a...


# Data Cleaning

In [6]:
from string import punctuation
def remove_quotation(text):
    """Return ``text`` lower-cased, with every double-quote character removed."""
    # Splitting on the quote character and re-joining removes all occurrences of it.
    unquoted = "".join(text.split('"'))
    return unquoted.lower()
    
def get_row_with_null(df):
    '''
    Return the positions of the rows that contain at least one null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan for missing values.

    Returns
    -------
    list of int
        Zero-based row positions, in ascending order. These are positions,
        not index labels: the two only coincide while ``df`` still has its
        default 0..n-1 index.
    '''
    # A single vectorised null mask replaces building one Series per row via
    # df.iloc[i, :], which is very slow on frames with tens of thousands of rows.
    row_has_null = df.isnull().any(axis=1)
    return [position for position, has_null in enumerate(row_has_null) if has_null]

def rename_column__by_replacing_white_space(df):
    """Build a rename mapping for the columns of ``df``.

    Every column name is mapped to its lower-cased form with spaces replaced
    by underscores. The frame itself is not modified; the returned dict is
    meant to be passed to ``DataFrame.rename(columns=...)``.
    """
    return {
        original_name: original_name.lower().replace(" ", "_")
        for original_name in df.columns
    }

def identify_unique_columns(df):
    """Find the columns of ``df`` whose values are all distinct.

    A column qualifies when its number of distinct values equals the number
    of rows in the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    str or list
        The column name itself when exactly one column qualifies, a list of
        names when several qualify, and an empty list when none does.
    """
    n_rows = df.shape[0]
    unique_cols = [col for col in df.columns if len(df[col].unique()) == n_rows]

    # Only a single match is unwrapped. The previous `len(...) < 2` test also
    # caught the empty case and raised IndexError on `unique_cols[0]`.
    if len(unique_cols) == 1:
        return unique_cols[0]
    return unique_cols

In [7]:
reaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25553 entries, 0 to 25552
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  25553 non-null  int64 
 1   Content ID  25553 non-null  object
 2   User ID     22534 non-null  object
 3   Type        24573 non-null  object
 4   Datetime    25553 non-null  object
dtypes: int64(1), object(4)
memory usage: 998.3+ KB


In [8]:
# The CSVs load with a leading "Unnamed: 0" column (presumably an index written by an
# earlier to_csv). Drop it by name instead of by position: dropping "whatever is
# first" in place removes a real data column every time this cell is re-run.
reaction = reaction.drop(
    columns=[col for col in reaction.columns if str(col).startswith("Unnamed")]
)
reaction_types = reaction_types.drop(
    columns=[col for col in reaction_types.columns if str(col).startswith("Unnamed")]
)
content = content.drop(
    columns=[col for col in content.columns if str(col).startswith("Unnamed")]
)

In [9]:
# Give all three tables lower-case, underscore-separated column names.
for frame in (reaction, reaction_types, content):
    frame.rename(columns=rename_column__by_replacing_white_space(frame), inplace=True)

In [10]:
reaction.head()

Unnamed: 0,content_id,user_id,type,datetime
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,2021-04-22 15:17:15
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,2020-11-07 09:43:50
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,2021-06-17 12:22:51
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,2021-04-18 05:13:58
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,2021-01-06 19:13:01


In [11]:
# errors='ignore' keeps the cell re-runnable: the in-place version raised KeyError
# on a second execution because 'user_id' was already gone.
reaction = reaction.drop(columns='user_id', errors='ignore')
reaction.shape

(25553, 3)

In [12]:
null_row_inds = get_row_with_null(reaction)
# get_row_with_null returns row *positions*, whereas drop(index=...) expects index
# *labels*. Translate positions to labels so the right rows are removed even when
# the index is no longer the default 0..n-1 range.
reaction = reaction.drop(index=reaction.index[null_row_inds])
reaction.shape

(24573, 3)

In [13]:
# Count how often each reaction type occurs within each sentiment; a count of 1
# everywhere means no reaction type is listed twice in the lookup table.
reaction_types.groupby(['sentiment','type'])['type'].count()

sentiment  type       
negative   disgust        1
           dislike        1
           hate           1
           scared         1
           worried        1
neutral    indifferent    1
           peeking        1
positive   adore          1
           cherish        1
           heart          1
           interested     1
           intrigued      1
           like           1
           love           1
           super love     1
           want           1
Name: type, dtype: int64

In [14]:
content.head()

Unnamed: 0,content_id,user_id,type,category,url
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/230...
3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology,https://socialbuzz.cdn.com/content/storage/356...
4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food,https://socialbuzz.cdn.com/content/storage/01a...


In [15]:
reaction.head()

Unnamed: 0,content_id,type,datetime
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,2021-06-17 12:22:51
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,2021-04-18 05:13:58
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01
5,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,2020-08-23 12:25:58


In [16]:
# Rename 'type' so it does not collide with the reaction 'type' column after joining,
# then drop the columns that are not used further on. errors='ignore' makes the cell
# safe to re-run; the in-place drop raised KeyError the second time.
content = (
    content
    .rename(columns={'type': 'content_type'})
    .drop(columns=['user_id', 'url'], errors='ignore')
)
content.shape

(1000, 3)

In [17]:
content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   content_id    1000 non-null   object
 1   content_type  1000 non-null   object
 2   category      1000 non-null   object
dtypes: object(3)
memory usage: 23.6+ KB


In [18]:
content['category'].value_counts()

category
technology           71
animals              67
travel               67
culture              63
science              63
fitness              61
food                 61
healthy eating       61
cooking              60
soccer               58
tennis               58
education            57
dogs                 56
studying             55
veganism             48
public speaking      48
Fitness               5
Animals               4
Science               4
"soccer"              3
"culture"             3
Soccer                3
"dogs"                2
Education             2
Studying              2
Travel                2
Food                  2
"veganism"            1
"public speaking"     1
Public Speaking       1
"technology"          1
"cooking"             1
Healthy Eating        1
"studying"            1
"food"                1
Culture               1
"tennis"              1
Technology            1
"animals"             1
Veganism              1
"science"             1
Name: c

In [19]:
# Collapse the quoted and differently-cased spellings of each category into one label.
content['category'] = content['category'].map(remove_quotation)
content['category'].value_counts()

category
technology         73
animals            72
travel             69
science            68
culture            67
fitness            66
food               64
soccer             64
healthy eating     62
cooking            61
tennis             59
education          59
studying           58
dogs               58
public speaking    50
veganism           50
Name: count, dtype: int64

In [20]:
content

Unnamed: 0,content_id,content_type,category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,studying
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,photo,healthy eating
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,photo,healthy eating
3,356fff80-da4d-4785-9f43-bc1261031dc6,photo,technology
4,01ab84dd-6364-4236-abbb-3f237db77180,video,food
...,...,...,...
995,b4cef9ef-627b-41d7-a051-5961b0204ebb,video,public speaking
996,7a79f4e4-3b7d-44dc-bdef-bc990740252c,GIF,technology
997,435007a5-6261-4d8b-b0a4-55fdc189754b,audio,veganism
998,4e4c9690-c013-4ee7-9e66-943d8cbd27b7,GIF,culture


In [21]:
identify_unique_columns(content)


'content_id'

In [22]:
reaction.head()

Unnamed: 0,content_id,type,datetime
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,2021-06-17 12:22:51
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,2021-04-18 05:13:58
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01
5,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,2020-08-23 12:25:58


In [23]:
reaction.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24573 entries, 1 to 25552
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   content_id  24573 non-null  object
 1   type        24573 non-null  object
 2   datetime    24573 non-null  object
dtypes: object(3)
memory usage: 767.9+ KB


In [24]:
# Convert the timestamp strings to a real datetime column.
reaction['datetime'] = reaction['datetime'].pipe(pd.to_datetime)
reaction['datetime']

1       2020-11-07 09:43:50
2       2021-06-17 12:22:51
3       2021-04-18 05:13:58
4       2021-01-06 19:13:01
5       2020-08-23 12:25:58
                ...        
25548   2020-06-27 09:46:48
25549   2021-02-16 17:17:02
25550   2020-09-12 03:54:58
25551   2020-11-04 20:08:31
25552   2021-01-04 04:55:11
Name: datetime, Length: 24573, dtype: datetime64[ns]

In [25]:
reaction.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24573 entries, 1 to 25552
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   content_id  24573 non-null  object        
 1   type        24573 non-null  object        
 2   datetime    24573 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 767.9+ KB


In [26]:
reaction_types

Unnamed: 0,type,sentiment,score
0,heart,positive,60
1,want,positive,70
2,disgust,negative,0
3,hate,negative,5
4,interested,positive,30
5,indifferent,neutral,20
6,love,positive,65
7,super love,positive,75
8,cherish,positive,70
9,adore,positive,72


In [27]:
identify_unique_columns(reaction_types)

'type'

In [28]:
reaction

Unnamed: 0,content_id,type,datetime
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,2021-06-17 12:22:51
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,2021-04-18 05:13:58
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01
5,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,2020-08-23 12:25:58
...,...,...,...
25548,75d6b589-7fae-4a6d-b0d0-752845150e56,dislike,2020-06-27 09:46:48
25549,75d6b589-7fae-4a6d-b0d0-752845150e56,intrigued,2021-02-16 17:17:02
25550,75d6b589-7fae-4a6d-b0d0-752845150e56,interested,2020-09-12 03:54:58
25551,75d6b589-7fae-4a6d-b0d0-752845150e56,worried,2020-11-04 20:08:31


In [29]:
reaction.isnull().sum()

content_id    0
type          0
datetime      0
dtype: int64

In [30]:
reaction_types.isnull().sum()

type         0
sentiment    0
score        0
dtype: int64

In [31]:
content.isnull().sum()

content_id      0
content_type    0
category        0
dtype: int64

In [32]:
reaction

Unnamed: 0,content_id,type,datetime
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,2021-06-17 12:22:51
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,2021-04-18 05:13:58
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01
5,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,2020-08-23 12:25:58
...,...,...,...
25548,75d6b589-7fae-4a6d-b0d0-752845150e56,dislike,2020-06-27 09:46:48
25549,75d6b589-7fae-4a6d-b0d0-752845150e56,intrigued,2021-02-16 17:17:02
25550,75d6b589-7fae-4a6d-b0d0-752845150e56,interested,2020-09-12 03:54:58
25551,75d6b589-7fae-4a6d-b0d0-752845150e56,worried,2020-11-04 20:08:31


In [33]:
reaction_types

Unnamed: 0,type,sentiment,score
0,heart,positive,60
1,want,positive,70
2,disgust,negative,0
3,hate,negative,5
4,interested,positive,30
5,indifferent,neutral,20
6,love,positive,65
7,super love,positive,75
8,cherish,positive,70
9,adore,positive,72


In [34]:
content

Unnamed: 0,content_id,content_type,category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,studying
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,photo,healthy eating
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,photo,healthy eating
3,356fff80-da4d-4785-9f43-bc1261031dc6,photo,technology
4,01ab84dd-6364-4236-abbb-3f237db77180,video,food
...,...,...,...
995,b4cef9ef-627b-41d7-a051-5961b0204ebb,video,public speaking
996,7a79f4e4-3b7d-44dc-bdef-bc990740252c,GIF,technology
997,435007a5-6261-4d8b-b0a4-55fdc189754b,audio,veganism
998,4e4c9690-c013-4ee7-9e66-943d8cbd27b7,GIF,culture


In [35]:
from pandasql import sqldf

In [36]:
# Attach sentiment/score to every reaction, then the content metadata.
# validate='m:1' makes pandas raise if a lookup key ('type' / 'content_id') is
# duplicated, instead of silently multiplying reaction rows in the inner join.
reaction_types_join = reaction.join(
    reaction_types.set_index('type'), on='type', how='inner', validate='m:1'
)
df = reaction_types_join.join(
    content.set_index('content_id'), on='content_id', how='inner', validate='m:1'
)
df.head()

Unnamed: 0,content_id,type,datetime,sentiment,score,content_type,category
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50,negative,0,photo,studying
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01,negative,0,photo,studying
35,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-04-09 02:46:20,negative,0,photo,studying
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,2021-06-17 12:22:51,negative,10,photo,studying
38,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,2020-11-09 02:49:59,negative,10,photo,studying


In [37]:
df.tail()

Unnamed: 0,content_id,type,datetime,sentiment,score,content_type,category
17352,99fcc8a6-550f-47a1-b312-aba8a031f6c7,heart,2021-02-26 15:33:26,positive,60,video,travel
20400,004e820e-49c3-4ba2-9d02-62db0065410c,heart,2021-03-09 08:50:44,positive,60,audio,tennis
25200,a7849ef3-5930-4ba9-9cbe-e215811e713e,heart,2020-09-26 22:01:17,positive,60,photo,studying
3968,e5f1a4c6-2b27-4c8b-ac9a-21bb6ef7c946,want,2021-02-12 11:54:33,positive,70,audio,soccer
1507,c59e27e9-0439-4699-8ea0-5e93f662a05d,adore,2021-02-02 15:29:19,positive,72,GIF,fitness


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24573 entries, 1 to 1507
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   content_id    24573 non-null  object        
 1   type          24573 non-null  object        
 2   datetime      24573 non-null  datetime64[ns]
 3   sentiment     24573 non-null  object        
 4   score         24573 non-null  int64         
 5   content_type  24573 non-null  object        
 6   category      24573 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 1.5+ MB


In [39]:
df.isnull().sum()

content_id      0
type            0
datetime        0
sentiment       0
score           0
content_type    0
category        0
dtype: int64

In [40]:
# reset_index() returns a new frame; the bare call discarded it, so the CSV was written
# with the out-of-order row labels left over from the joins. Rebind so the reset takes
# effect. The (now sequential) index is still written as the first CSV column.
df = df.reset_index(drop=True)
df.to_csv('socialbuzz_data.csv')

In [45]:
# Write each cleaned table to its own workbook under SQL/.
# NOTE(review): assumes the SQL/ directory already exists - confirm before a fresh run.
excel_exports = [
    (reaction, 'SQL/reaction.xlsx'),
    (reaction_types, 'SQL/reaction_type.xlsx'),
    (content, 'SQL/content.xlsx'),
]
for frame, excel_path in excel_exports:
    frame.to_excel(excel_path)

In [46]:
reaction

Unnamed: 0,content_id,type,datetime
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,2021-06-17 12:22:51
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,2021-04-18 05:13:58
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01
5,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,2020-08-23 12:25:58
...,...,...,...
25548,75d6b589-7fae-4a6d-b0d0-752845150e56,dislike,2020-06-27 09:46:48
25549,75d6b589-7fae-4a6d-b0d0-752845150e56,intrigued,2021-02-16 17:17:02
25550,75d6b589-7fae-4a6d-b0d0-752845150e56,interested,2020-09-12 03:54:58
25551,75d6b589-7fae-4a6d-b0d0-752845150e56,worried,2020-11-04 20:08:31


In [47]:
reaction_types

Unnamed: 0,type,sentiment,score
0,heart,positive,60
1,want,positive,70
2,disgust,negative,0
3,hate,negative,5
4,interested,positive,30
5,indifferent,neutral,20
6,love,positive,65
7,super love,positive,75
8,cherish,positive,70
9,adore,positive,72


In [48]:
content

Unnamed: 0,content_id,content_type,category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,studying
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,photo,healthy eating
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,photo,healthy eating
3,356fff80-da4d-4785-9f43-bc1261031dc6,photo,technology
4,01ab84dd-6364-4236-abbb-3f237db77180,video,food
...,...,...,...
995,b4cef9ef-627b-41d7-a051-5961b0204ebb,video,public speaking
996,7a79f4e4-3b7d-44dc-bdef-bc990740252c,GIF,technology
997,435007a5-6261-4d8b-b0a4-55fdc189754b,audio,veganism
998,4e4c9690-c013-4ee7-9e66-943d8cbd27b7,GIF,culture
