In [1]:
import os 
import json
import pandas as pd 

## Extracting the data into a dataframe

In [2]:
def extract():
    """ Returns a dataframe of name, timestamp, content, participants, group chat name. 
    """
    cwd = os.getcwd() 
    inbox_dir = cwd + "/data/inbox"
    # go into inbox 
    os.chdir(inbox_dir)

    main_df = pd.DataFrame(columns = ['sender_name', 'timestamp_ms', 'content', 'num_participants', 'chat_name'])

    # iterate through each chat
    for chat_name in os.listdir(): 
        if chat_name == ".DS_Store":
            continue
        os.chdir(chat_name)
        # Get the json files in this chat 
        json_files = [i for i in os.listdir() if i[-4:] == 'json']
        path_to_json = os.getcwd()
        # iterate through the json files in this chat
        for js in json_files:
            with open(js) as json_file:
                # clean and extract data
                json_text = json.load(json_file)
                participants = json_text["participants"]
                msg_lst = json_text["messages"]
                df = pd.DataFrame.from_dict(msg_lst)
                ppl = pd.DataFrame.from_dict(participants)
                # adding in columns
                ppl_lst = list(ppl['name'])
                df["num_participants"] = len(ppl_lst)
                df["chat_name"] = chat_name
                # append this json file data to main df
                main_df = main_df.append(df, ignore_index = True)
        os.chdir(inbox_dir)

    # select columns of interest
    main_df = main_df[['sender_name', 'timestamp_ms', 'content', 'num_participants', 'chat_name']]

    return main_df

In [3]:
df = extract()

In [4]:
df.head()

Unnamed: 0,sender_name,timestamp_ms,content,num_participants,chat_name
0,Julie Kong,1591732603564,Ohh I understand! Sounds good haha,2,christinesellingfurnituresmostalmostgoodasnew_...
1,Christine Nguyen,1591728738392,The couch is kinda out of my housemates and my...,2,christinesellingfurnituresmostalmostgoodasnew_...
2,Julie Kong,1591723901775,If you are still interested and use either of ...,2,christinesellingfurnituresmostalmostgoodasnew_...
3,Julie Kong,1591723842964,Hi Christine! I actually found out about lugg/...,2,christinesellingfurnituresmostalmostgoodasnew_...
4,Julie Kong,1591720681234,Julie changed the listing description.,2,christinesellingfurnituresmostalmostgoodasnew_...


In [5]:
df.shape

(268713, 5)

## Checking missing data

In [6]:
df.sort_values(by='sender_name')['sender_name'].iloc[0]

''

We can see that there are sender names that are empty strings. Probably users who deleted their accounts.

## Converting timestamp to a pd.Timestamp object 

In [7]:
df['timestamp'] = pd.to_datetime(df['timestamp_ms'], unit = 'ms')
df.head()

Unnamed: 0,sender_name,timestamp_ms,content,num_participants,chat_name,timestamp
0,Julie Kong,1591732603564,Ohh I understand! Sounds good haha,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 19:56:43.564
1,Christine Nguyen,1591728738392,The couch is kinda out of my housemates and my...,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 18:52:18.392
2,Julie Kong,1591723901775,If you are still interested and use either of ...,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 17:31:41.775
3,Julie Kong,1591723842964,Hi Christine! I actually found out about lugg/...,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 17:30:42.964
4,Julie Kong,1591720681234,Julie changed the listing description.,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 16:38:01.234


## Add a month, day, year columns

In [8]:
df['year'] = pd.DatetimeIndex(df['timestamp']).year
df['month'] = pd.DatetimeIndex(df['timestamp']).month
df['day'] = pd.DatetimeIndex(df['timestamp']).day
df.head()

Unnamed: 0,sender_name,timestamp_ms,content,num_participants,chat_name,timestamp,year,month,day
0,Julie Kong,1591732603564,Ohh I understand! Sounds good haha,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 19:56:43.564,2020,6,9
1,Christine Nguyen,1591728738392,The couch is kinda out of my housemates and my...,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 18:52:18.392,2020,6,9
2,Julie Kong,1591723901775,If you are still interested and use either of ...,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 17:31:41.775,2020,6,9
3,Julie Kong,1591723842964,Hi Christine! I actually found out about lugg/...,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 17:30:42.964,2020,6,9
4,Julie Kong,1591720681234,Julie changed the listing description.,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 16:38:01.234,2020,6,9


## Total messages sent and received


In [9]:
total_exchanged = df.shape[0]
total_exchanged

268713

## Total messages sent

In [10]:
msg_sent_df = df.groupby(['sender_name']).count().reset_index()[['sender_name', 'chat_name']].rename(columns = {'chat_name': 'count'})
total_sent = msg_sent_df[msg_sent_df['sender_name'] == 'Christine Nguyen']['count'].iloc[0]
total_sent

126617

## Total messages received 

In [11]:
total_received = total_exchanged - total_sent
total_received

142096

## Top Senders
Who sent you the most messages. Not counting group chats

In [12]:
indv_chats_df = df[df['num_participants'] == 2]
indv_chats_df.head()

Unnamed: 0,sender_name,timestamp_ms,content,num_participants,chat_name,timestamp,year,month,day
0,Julie Kong,1591732603564,Ohh I understand! Sounds good haha,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 19:56:43.564,2020,6,9
1,Christine Nguyen,1591728738392,The couch is kinda out of my housemates and my...,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 18:52:18.392,2020,6,9
2,Julie Kong,1591723901775,If you are still interested and use either of ...,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 17:31:41.775,2020,6,9
3,Julie Kong,1591723842964,Hi Christine! I actually found out about lugg/...,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 17:30:42.964,2020,6,9
4,Julie Kong,1591720681234,Julie changed the listing description.,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 16:38:01.234,2020,6,9


In [13]:
indv_chats_count_df = indv_chats_df.groupby(['sender_name']).count().reset_index()[['sender_name', 'content']].rename(columns = {'content' : 'count'})
indv_chats_count_df = indv_chats_count_df[indv_chats_count_df['sender_name'] != 'Christine Nguyen'].sort_values(by = ['count'], ascending = False)
indv_chats_count_df.head()

Unnamed: 0,sender_name,count
186,Oscar Hu,89413
189,Paul Cao,2147
102,Ivy Ta,1950
105,Jake Kim,1650
215,Simon Zirui Guo,1490


## Who you sent the most messages to 
Not counting group chats.

In [14]:
sent_df = indv_chats_df[indv_chats_df['sender_name'] == 'Christine Nguyen']
sent_df.head()

Unnamed: 0,sender_name,timestamp_ms,content,num_participants,chat_name,timestamp,year,month,day
1,Christine Nguyen,1591728738392,The couch is kinda out of my housemates and my...,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 18:52:18.392,2020,6,9
6,Christine Nguyen,1591673849330,hmm that is a little far : // i am good thank ...,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 03:37:29.330,2020,6,9
8,Christine Nguyen,1591673655720,let me talk to my housemates about the couch,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 03:34:15.720,2020,6,9
9,Christine Nguyen,1591673637613,omg :00 i am very interested,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 03:33:57.613,2020,6,9
11,Christine Nguyen,1591673553000,hello! is the couch avail,2,christinesellingfurnituresmostalmostgoodasnew_...,2020-06-09 03:32:33.000,2020,6,9


In [15]:
top_sent = sent_df.groupby(['chat_name']).count().reset_index().rename(columns = {'timestamp' : 'count'})[['chat_name', 'count']].sort_values(by = ['count'], ascending = False)
top_sent['chat_name'] = top_sent['chat_name'].apply(lambda row : row[:row.find("_")] if row.find("_") != -1 else row)
top_sent.head()

Unnamed: 0,chat_name,count
235,oscarhu_eqoyerggqw,100451
192,jakekim_2y2xpksoxq,1705
236,paulcao_kkf2phq9cg,1630
191,ivyta_awzam7da2w,1544
222,longho_7385tgqitg,1462


## Time to start graphing!

In [16]:
import plotly 
import plotly.graph_objs as go

## Plot bar chart of how many messages you sent per month 

In [40]:
fig = go.Figure()

dates = pd.date_range('2018-01', '2018-12', freq = 'MS')
counts_per_month_df = df[df['sender_name'] == 'Christine Nguyen'].groupby(by = ['month']).count().reset_index().rename(columns = {'day':'count'})[['month','count']]
fig.add_bar(x = dates, y = counts_per_month_df['count'])
fig.layout.xaxis.tickvals = pd.date_range('2018-01', '2018-12', freq = 'MS')
fig.layout.xaxis.tickformat = '%b'
fig.update_layout(
    title = 'Messages You Sent per Month',
    xaxis_title = 'Month',
    yaxis_title = 'Count'
)
fig.show()

## Plot bar chart of how many messages you sent per month

In [41]:
fig = go.Figure()

dates = pd.date_range('2018-01', '2018-12', freq = 'MS')
msgs_received_per_month_df = df[df['sender_name'] != 'Christine Nguyen'].groupby(by = ['month']).count().reset_index().rename(columns = {'year' : 'count'})[['month', 'count']]
fig.add_bar(x = dates, y = msgs_received_per_month_df['count'])
fig.layout.xaxis.tickvals = pd.date_range('2018-01', '2018-12', freq = 'MS')
fig.layout.xaxis.tickformat = '%b'
fig.update_layout(
    title = 'Messages You Received per Month',
    xaxis_title = 'Month',
    yaxis_title = 'Count'
)
fig.show()

## Combining these two barcharts into one figure

In [42]:
fig = go.Figure()

dates = pd.date_range('2018-01', '2018-12', freq = 'MS')
fig.add_bar(x = dates, y = msgs_received_per_month_df['count'], name = 'messages received')
fig.add_bar(x = dates, y = counts_per_month_df['count'], name = 'messages sent')
fig.layout.xaxis.tickvals = pd.date_range('2018-01', '2018-12', freq = 'MS')
fig.layout.xaxis.tickformat = '%b'
fig.update_layout(
    title = 'Messages Exchanged per Month',
    xaxis_title = 'Month',
    yaxis_title = 'Count'
)
fig.show()