# Whatsapp Chat analysis

#### Export the chats you want to analyse as a .txt file
Every line in this txt file consists of a message sent by a user. Each line has the form {date}, {time} - {sender}: {Message}

### Importing Libraries

In [None]:
import re
import numpy as np
import pandas as pd
import emoji

### We will detect the date and time at which a message was sent

In [None]:
def getdate(ip):
    """Tell whether a line begins with a chat-export timestamp.

    The expected prefix has the shape ``<d>/<m>/<yy>, <h>:<mm> am|pm - ``,
    for example ``01/02/20, 1:44 pm - ``. Matching is anchored at the start
    of the string only; anything may follow the prefix.

    Args:
        ip: One line of text from the exported chat file.

    Returns:
        True if the line starts with a timestamp prefix, False otherwise.
    """
    # Raw string so the regex escapes reach the regex engine untouched
    # (a plain string literal with "\/" triggers an invalid-escape warning).
    date = r'^([0-9]+)(\/)([0-9]+)(\/)([0-9])([0-9]), ([0-9]+):([0-9][0-9]) (am|pm) - '
    return re.match(date, ip) is not None
# Just to verify our function implementation is correct
print(getdate("01/02/20, 1:44 pm - "))

### Fetch the sender of a message

In [None]:
def sender(ip):
    """Tell whether a string begins with a ``<sender>:`` prefix.

    Accepted sender shapes, each immediately followed by a colon:
      * one, two or three words separated by whitespace,
      * a phone number written as ``+CC DDDDD DDDDD`` (Indian mobile format),
      * a single word followed by one or more characters in the range
        U+263A to U+1F999 (a name decorated with emoji).

    Args:
        ip: Text that follows the `` - `` separator of a chat line.

    Returns:
        True if the text starts with one of the sender shapes, else False.
    """
    # Each shape is written without nested quantifiers such as ``(\w+)+``;
    # those accept the same strings but backtrack exponentially on a long
    # word that is not followed by a colon.
    user = [
        r"\w+(?:\s+\w+){0,2}",         # one to three words
        r"[+]\d{2} \d{5} \d{5}",       # mobile number from India
        r"\w+[\u263a-\U0001f999]+",    # a name followed by emoji
    ]

    user = "^(?:" + "|".join(user) + "):"
    return re.match(user, ip) is not None
# Check it's implementation
print(sender("ABC: "))

### Putting all together

In [None]:
# Get message and data by putting all together
def data(l):
    """Split one dated chat line into its four fields.

    Args:
        l: A line of the form ``{date}, {time} - {sender}: {message}``; the
            ``{sender}: `` part may be absent.

    Returns:
        A tuple ``(date, time, user, message)``. ``user`` is None when the
        text after `` - `` does not start with a sender prefix, in which
        case ``message`` is all of that text.

    Raises:
        ValueError: If the part before `` - `` does not split into exactly
            a date and a time on ``", "``.
    """
    # Split only at the FIRST " - " so a message that itself contains
    # " - " keeps its text intact.
    date_time, _, txt = l.partition(" - ")    # txt consists author and message
    dt, time = date_time.split(", ")    # dt has date and time has time when msg was sent

    if sender(txt):
        # Likewise split only at the first ": " so colons inside the
        # message body are preserved.
        user, _, message = txt.partition(": ")
    else:
        user, message = None, txt

    return dt, time, user, message

dt, time, user, me = data("01/02/20, 1:56 pm - ABC: Hey, every one I welcome you all")
print(dt)
print(time)
print(user)
print(me)

## Generating dataframe from our txt file

### Creating lists from raw data

In [None]:
df_data = []
txt_file = "chat_m.txt"

# Parallel lists: index k of each list describes the k-th message.
messages = []
date, time, user = [], [], []

with open(txt_file, encoding="utf-8") as f:
    # skipping the first encryption line
    f.readline()

    # Stream the file line by line instead of loading it all at once.
    for raw_line in f:
        if not getdate(raw_line):
            # No timestamp: this line continues the previous message.
            # Ignore it if no message has been seen yet, rather than
            # failing on an empty list.
            if messages:
                messages[-1] += raw_line
            continue

        # Split at the first " - " only, so the body keeps any later " - ".
        stamp, body = raw_line.split(" - ", 1)
        if sender(body):
            dt, t, u, mess = data(raw_line)
        else:
            # Dated line without a "<sender>:" prefix: store it with no user.
            dt, t = stamp.split(", ")
            u, mess = None, body

        messages.append(mess)
        date.append(dt)
        time.append(t)
        user.append(u)
print("Total number of messages including media content",len(messages))

### Removing media content
Also remove the unnecessary newline character, i.e. "\n".

In [None]:
clean_msg, clean_u, clean_dt, clean_t = [], [], [], []

# Walk the four parallel lists together and keep every entry that is not a
# media placeholder.
for body, who, day, clock in zip(messages, user, date, time):
    # Remove the line terminator only when it is really there: the last
    # message of the file may have no trailing newline, and slicing off one
    # character blindly would eat real text.
    if body.endswith("\n"):
        body = body[:-1]
    if body == '<Media omitted>':
        continue
    # Newlines left inside a multi-line message become single spaces.
    clean_msg.append(body.replace("\n", " "))
    clean_u.append(who)
    clean_dt.append(day)
    clean_t.append(clock)

print(len(clean_t))
print(len(clean_dt))
print(len(clean_u))
print("After cleaning total number of messages",len(clean_msg))

### Generating Dataframe

In [None]:
# One row per cleaned message, built from the four parallel lists.
column_names = ["Date", "Time", "Sender", "Message"]
rows = list(zip(clean_dt, clean_t, clean_u, clean_msg))
df = pd.DataFrame(rows, columns=column_names)
print(df.shape)
df.sample(10)

In [None]:
# Summarise the columns of the message DataFrame.
df.describe()

### Finding emojis in a message

In [None]:
emoji_list = []


def _is_emoji_char(ch):
    """Tell whether a single character is listed as an emoji by the ``emoji`` package."""
    # NOTE(review): which attributes exist depends on the installed ``emoji``
    # version -- confirm against the version pinned for this notebook.
    if hasattr(emoji, "is_emoji"):
        return emoji.is_emoji(ch)
    table = emoji.UNICODE_EMOJI
    # Some releases nest the table under a language code; fall back to the
    # table itself when there is no "en" entry.
    table = table.get("en", table)
    return ch in table


def emoji_count(text, emoji_list=None):
    """Collect the emoji characters found in ``text``.

    The text is scanned one character at a time, so an emoji made of
    several code points contributes only those code points that are emoji
    on their own.

    Args:
        text: The message to scan.
        emoji_list: Optional list to append to. It is modified in place,
            which lets a caller accumulate emoji over many messages. When
            omitted, a fresh list is created for each call.

    Returns:
        The list holding the emoji found (the same object as ``emoji_list``
        when one was supplied).
    """
    # A ``[]`` default would be shared between calls and keep growing.
    if emoji_list is None:
        emoji_list = []
    emoji_list.extend(ch for ch in text if _is_emoji_char(ch))
    return emoji_list

print(emoji_count('😍 BFF ❤️😍❤️ BFF 😍"',[]))

In [None]:
# Gather the emojis of every cleaned message into one flat list.
emo = []
for message_text in clean_msg:
    emo.extend(emoji_count(message_text, []))

## Working on Emoji

In [None]:
# Single-column frame with one row per emoji occurrence.
emoji_df = pd.DataFrame({"Emoji": emo})
emoji_df.describe()

In [None]:
# Distinct emoji values collected from the chat.
emoji_df["Emoji"].unique()

## Short Chat Summary

In [None]:
# Media messages are the ones dropped between `messages` and `clean_msg`.
total_messages = len(messages)
media_messages = total_messages - len(clean_msg)
total_emojis = len(emo)

print("Total Messages", total_messages)
print("Total Media messages", media_messages)
print("Total Number of Emojis used", total_emojis)

### We will add columns for the character count, the word count and the emojis used

In [None]:
# Number of characters in each message.
df['Char_Count'] = df['Message'].apply(len)
# Number of words in each message. Splitting on any run of whitespace
# (rather than on single spaces) keeps empty strings from being counted as
# words, so an empty message has 0 words and double spaces add nothing.
df['Word_Count'] = df['Message'].apply(lambda s : len(s.split()))
# List of emojis found in each message; a fresh list is passed per row so
# rows do not share results.
df['Emoji Used'] = df['Message'].apply(lambda s: emoji_count(s,[]))

In [None]:
# Show five sampled rows of the message DataFrame.
df.sample(5)

In [None]:
# Summarise the DataFrame again at this point in the notebook.
df.describe()

### Sender Stats

In [None]:
# Distinct values of the Sender column, also kept as a plain list in `p`.
senders = df["Sender"].unique()
p = senders.tolist()
print(p)

In [None]:
# Show every message of the DataFrame as a plain Python list.
list(df["Message"])

In [None]:
# Print message and emoji statistics for every participant.
for name in p:
    # Rows without a sender have no participant to report on; comparing the
    # column with None selects nothing and the averages would be NaN.
    if pd.isna(name):
        continue

    s_df = df.loc[df["Sender"] == name]
    print("Stats of ", name)
    print("Total Messages sent by {} is {}".format(name, s_df.shape[0]))

    # Read the averages from the named columns instead of the position in
    # DataFrame.mean(), which depends on column order and fails on text
    # columns in recent pandas versions.
    avg_c = s_df["Char_Count"].mean()
    avg_w = s_df["Word_Count"].mean()
    print("Average size of message sent by {} is {} words or {} characters".format(name, int(avg_w), int(avg_c)))

    # finding total number of emoji sent by individual participant
    e = []
    for j in s_df["Message"]:
        e = emoji_count(j, e)

    print("Total Number of Emoji sent by {} is {}".format(name, len(e)))

    edf = pd.DataFrame(e, columns=["Emoji"])
    # value_counts() is sorted by frequency, so its first entry is the most
    # used emoji; skip that report when the participant sent none.
    emoji_counts = edf["Emoji"].value_counts()
    if not emoji_counts.empty:
        freq_em = emoji_counts.index[0]
        freq = emoji_counts.iloc[0]
        print("Most frequently used emoji by {} is {}.\n{} used {} for {} times".format(name, freq_em, name, freq_em, freq))
    print(emoji_counts)

    print()

In [None]:
# Concatenate every message into one space-separated string.
text = " ".join(df["Message"])

In [None]:
# Display the combined text of all messages.
text

In [None]:
import matplotlib.pyplot as plt
from wordcloud import STOPWORDS, WordCloud
# Words to leave out of the cloud. The library's built-in STOPWORDS
# collection is imported but this hand-written list is used instead.
stopwords = ["ha", "Hu", "tu", "na", "ne", "eni", "ma", "to","toh", "Ok", "su", "nai", "thi", "che", "This", "deleted", "message", "Aa", "ni"]

# Build the word cloud from the combined chat text.
cloud_options = dict(stopwords=stopwords, background_color="white")
wordcloud = WordCloud(**cloud_options).generate(text)

# Render the cloud with matplotlib and hide the axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()