In [None]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned chat export. JSON text is UTF-8 by specification, so the
# encoding is given explicitly rather than inherited from the platform default
# (which is not UTF-8 everywhere and would garble emoji / non-ASCII names).
# NOTE(review): if the sender literal used in the rename cell was copied from
# data decoded with another codec, re-check that it still matches.
with open('clean1.json', encoding='utf-8') as f:
    telegramJson = json.load(f)

In [None]:
# Number of top-level entries in the loaded JSON.
len(telegramJson)

In [None]:
# Look at the entry at index 2 to see what a single record contains.
telegramJson[2]

In [None]:
# Check which container type json.load returned before building a DataFrame.
type(telegramJson)

In [None]:
# Tabulate the loaded JSON records so they can be analysed with pandas.
df = pd.DataFrame(data=telegramJson)

In [None]:
# List the columns pandas created from the JSON keys.
df.columns

In [None]:
# Preview the first five rows (5 is head()'s default) of the raw frame.
df.head()

In [None]:
#renaming partner's name to proper naming for easy analysis (without emoji)
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form warns on recent pandas and does
# not modify `df` at all once Copy-on-Write is enabled.
# NOTE(review): the literal below looks mis-encoded; confirm it matches the
# sender name exactly as it appears in df['from'], otherwise nothing is renamed.
df['from'] = df['from'].replace('BBChengğŸ˜˜ğŸ’“', 'Jiayin')

In [None]:
#removing unnecessary features from raw json
# The drop is written so the cell can be re-run safely: the result is assigned
# back (no inplace mutation) and errors='ignore' skips columns that are already
# gone or that never appeared in this export, instead of raising KeyError.
unused_columns = [
    'reply_to_message_id', 'edited', 'photo', 'width', 'height', 'file', 'thumbnail',
    'media_type', 'sticker_emoji', 'mime_type', 'duration_seconds',
    'forwarded_from', 'via_bot', 'contact_information', 'actor', 'actor_id',
    'action', 'discard_reason', 'location_information',
    'live_location_period_seconds', 'contact_vcard', 'poll',
]
df = df.drop(columns=unused_columns, errors='ignore')

In [None]:
# Preview the frame after the unused columns were dropped.
df.head(5)

In [None]:
# Total number of rows (messages) in the frame.
total = df.shape[0]
print(total)

# EDA on counts over the Days, Months, Years

In [None]:
# Derive a 'year' column: the first four characters of the date rendered as
# text (kept as a string, e.g. '2018').
date_text = df['date'].astype(str)
df['year'] = date_text.str[:4]

In [None]:
# Preview the frame with the new 'year' column.
df.head(5)

In [None]:
# Message count per sender, with the exact count printed above each bar.
fig, ax = plt.subplots(figsize=(8, 5))
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally
# (the keyword form also works on older releases).
sns.countplot(x='from', data=df, ax=ax)

for rect in ax.patches:
    height = rect.get_height()
    # Skip NaN / zero-height patches so empty slots do not get a stray label.
    if not height > 0:
        continue
    # Centre the label on the bar and print the count as an integer.
    ax.text(rect.get_x() + rect.get_width() / 2, height, '{}'.format(int(height)),
            ha='center', va='bottom', fontsize=10)

# This chart is grouped by sender, so the title and x-label say so
# (they previously read "over the years" / "Year").
ax.set_title('How many messages has each of us sent?', fontsize=12)
ax.set_xlabel("Sender", labelpad=14)
ax.set_ylabel("Messages", labelpad=14)
plt.show()

In [None]:
# Message count per year, with the exact count printed above each bar.
fig, ax = plt.subplots(figsize=(8, 5))
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally
# (the keyword form also works on older releases).
sns.countplot(x='year', data=df, ax=ax)

for rect in ax.patches:
    height = rect.get_height()
    # Skip NaN / zero-height patches so empty slots do not get a stray label.
    if not height > 0:
        continue
    # Centre the label on the bar and print the count as an integer.
    ax.text(rect.get_x() + rect.get_width() / 2, height, '{}'.format(int(height)),
            ha='center', va='bottom', fontsize=10)

ax.set_title('How many messages have we sent over the years?', fontsize=12)
ax.set_xlabel("Year", labelpad=14)
ax.set_ylabel("Messages", labelpad=14)
plt.show()

In [None]:
# Message count per year split by sender, with the count above each bar.
fig, ax = plt.subplots(figsize=(8, 5))
# `x` is passed by keyword: seaborn >= 0.12 no longer accepts it positionally
# (the keyword form also works on older releases).
sns.countplot(x='year', hue='from', data=df, ax=ax)

for rect in ax.patches:
    height = rect.get_height()
    # A year/sender combination without messages can yield a NaN or
    # zero-height patch; skip those instead of labelling them.
    if not height > 0:
        continue
    # Centre the label on the (narrower, hue-split) bar and print an integer.
    ax.text(rect.get_x() + rect.get_width() / 2, height, '{}'.format(int(height)),
            ha='center', va='bottom', fontsize=10)

ax.set_title('How many messages have we sent over the years?', fontsize=12)
ax.set_xlabel("Year", labelpad=14)
ax.set_ylabel("Messages", labelpad=14)
plt.show()

# EDA deep dive by analysing Months and Years

In [None]:
# Derive a 'month' column: characters 5-6 of the date rendered as text
# (kept as a zero-padded string, e.g. '04').
date_text = df['date'].astype(str)
df['month'] = date_text.str[5:7]

In [None]:
# Preview the frame with the new 'month' column.
df.head(5)

In [None]:
# Disabled on purpose: optional relabelling of 'month' as Jan/Feb/... names.
# The cell used to be wrapped in four-quote delimiters; the closing one left a
# dangling quote, so the cell was a SyntaxError. It is kept as plain comments.
# Before re-enabling, note that it overwrites the numeric 'month' strings, and
# any groupby on 'month' would then sort the names alphabetically rather than
# in calendar order.
#
# #creating new feature (Month2) for easy analysis and reading - eg. Apr, May, Jun
# monthMap = {'01':'Jan','02':'Feb','03':'Mar','04':'Apr','05':'May','06':'Jun','07':'Jul','08':'Aug', '09':'Sep','10':'Oct','11':'Nov','12':'Dec'}
# df['month2'] = df['month'].map(monthMap)
# df.drop(['month'], axis=1, inplace=True)
# df.rename(columns={'month2':'month'}, inplace=True)

In [None]:
# Monthly message counts for each of the three years in the data. Each result
# is a two-column frame: 'month' and 'id', where 'id' holds the number of
# non-null message ids in that month.
year2017bymonth, year2018bymonth, year2019bymonth = (
    df.loc[df['year'] == yr].groupby('month')['id'].count().reset_index()
    for yr in ('2017', '2018', '2019')
)

In [None]:
# Messages per month in 2017.
ax = sns.barplot(x="month", y="id", data=year2017bymonth)
# Label the axes like the 2018/2019 charts do; without this the y-axis shows
# the raw column name "id" even though the bars are message counts.
ax.set_xlabel("Months", labelpad=14)
ax.set_ylabel("Messages", labelpad=14)
ax.set_title('Total number of messages sent by months in 2017', fontsize=12)

In [None]:
# Messages per month in 2018, drawn with the reversed "Blues" palette.
ax = sns.barplot(data=year2018bymonth, x="month", y="id", palette="Blues_r")
ax.set_xlabel("Months", labelpad=14)
ax.set_ylabel("Messages", labelpad=14)
ax.set_title('Total number of messages sent by months in 2018', fontsize=12)

In [None]:
# Messages per month in 2019 on a 6x5 inch figure.
fig, ax = plt.subplots(figsize=(6, 5))
sns.barplot(data=year2019bymonth, x="month", y="id", ax=ax)
ax.set_xlabel("Months", labelpad=14)
ax.set_ylabel("Messages", labelpad=14)
ax.set_title('Total number of messages sent by months in 2019', fontsize=12)

# EDA deep dive by analysing Hours/Time of the day

In [None]:
# Derive an 'hour' column: characters 11-12 of the date rendered as text
# (kept as a zero-padded string, e.g. '09').
date_text = df['date'].astype(str)
df['hour'] = date_text.str[11:13]

In [None]:
# Preview the frame with the new 'hour' column.
df.head(5)

In [None]:
#creating method to determine if text was sent in morning, afternoon, night
def impute_period(hr):
    """Classify an hour of the day as 'Morning', 'Afternoon' or 'Night'.

    Previously this definition sat inside a string literal (a stray fourth
    quote on the opening delimiter), so the cell only displayed text. It is now
    a real function; nothing calls it yet, so defining it has no other effect.

    Args:
        hr: Hour of the day (0-23) as an int or a numeric string such as '09'.

    Returns:
        'Morning' for hours 7-12, 'Afternoon' for hours 13-19, 'Night' otherwise.

    Raises:
        ValueError: If ``hr`` cannot be converted with ``int()``.
    """
    hour = int(hr)

    if 6 < hour <= 12:
        return 'Morning'
    # The old bound was `hour > 13`, which sent hour 13 to 'Night'.
    if 12 < hour <= 19:
        return 'Afternoon'
    return 'Night'

In [None]:
# Preview the frame again before aggregating by hour.
df.head(5)

In [None]:
# Message count per hour of day as a two-column frame: 'hour' and 'id', where
# 'id' holds the number of non-null message ids.
hourly_counts = df.groupby('hour')['id'].count()
hourPeriod = hourly_counts.reset_index()

In [None]:
# Bar chart of message counts for each hour of the day.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=hourPeriod, x="hour", y="id", ax=ax)
ax.set_title('Total number of messages sent by hours', fontsize=12)
ax.set_xlabel("Hour", labelpad=14)
ax.set_ylabel("Messages", labelpad=14)

In [None]:
# Display the per-hour counts table that backs the bar chart.
hourPeriod

In [None]:
# Message count for every (hour, year) pair as a three-column frame:
# 'hour', 'year' and 'id' (the number of non-null message ids).
hour_year_counts = df.groupby(['hour', 'year'])['id'].count()
yearHour = hour_year_counts.reset_index()

In [None]:
# Display the per-hour, per-year counts table that feeds the heatmap.
yearHour

In [None]:
# Heatmap of message counts with years as rows and hours of day as columns.
# pivot_table aggregates with its default (mean); yearHour has one row per
# (hour, year) pair, so each cell is simply that pair's count.
heatmap_data = pd.pivot_table(data=yearHour, index=['year'], columns='hour', values='id')

fig, ax = plt.subplots(figsize=(20, 6))
sns.heatmap(heatmap_data, ax=ax, linewidths=1, cmap='Blues')

ax.set_xlabel("Hour")
ax.set_ylabel("Year")
plt.show()