In [1]:
#!pip install pandas

In [118]:
import pandas as pd
import re
import pandas as pd
import plotly.express as px
import plotly.express as px
import spacy
import numpy as np
import gensim.corpora as corpora
import gensim
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis
import os
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def import_chat(file, skip_rows=3):
    """Parse an exported chat text file into a DataFrame.

    Every line is matched against ``[<timestamp>] <author>: <message>``.
    Lines that do not match (for example continuation lines of a multi-line
    message) are ignored.

    Parameters
    ----------
    file : str or path-like
        Path to the UTF-8 encoded chat export.
    skip_rows : int, default 3
        Number of leading matched rows to discard before timestamps are
        parsed. The default reproduces the previous behaviour of dropping
        the first three rows (presumably group/system notices - confirm
        against the export).

    Returns
    -------
    pandas.DataFrame
        Columns ``DateTime`` (the original timestamp string), ``Author``,
        ``Message``, ``Date`` (``datetime.date``) and ``Time``
        (``datetime.time``), indexed 0..n-1. Rows whose timestamp does not
        match ``%m/%d/%y, %I:%M:%S %p`` are reported on stdout and skipped.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    line_pattern = re.compile(r'\[(.*?)\] (.*?): (.*)')

    data = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            match = line_pattern.search(line.strip())
            if match:
                data.append(match.groups())

    df = pd.DataFrame(data, columns=['DateTime', 'Author', 'Message'])
    # Positional slicing keeps the original row labels (used in the error
    # messages below) and, unlike drop([0, 1, 2]), does not raise when the
    # file has fewer matched rows than skip_rows.
    df = df.iloc[skip_rows:]

    valid_rows = []
    for i, stamp, author, message in df.itertuples(name=None):
        try:
            date_time = pd.to_datetime(stamp, format='%m/%d/%y, %I:%M:%S %p')
        except ValueError as e:
            print(f"Error on row {i}: {e}")
            continue
        valid_rows.append({
            'DateTime': stamp,
            'Author': author,
            'Message': message,
            'Date': date_time.date(),
            'Time': date_time.time(),
        })

    # Explicit columns so an empty result still has the expected schema.
    return pd.DataFrame(
        valid_rows, columns=['DateTime', 'Author', 'Message', 'Date', 'Time']
    )

def plot_message(file):
    """Show a horizontal bar chart of the ten authors who sent the most messages.

    Parameters
    ----------
    file : str or path-like
        Chat export path, handed to ``import_chat``.
    """
    chat = import_chat(file)

    # value_counts() is already sorted in descending order, so the first ten
    # entries are the ten most active authors.
    leaders = chat['Author'].value_counts().head(10)
    bar_frame = pd.DataFrame({
        'Author': leaders.index,
        'Message Count': leaders.values,
    })

    axis_labels = {'Message Count': 'Message Count', 'Author': 'Author'}
    chart = px.bar(
        bar_frame,
        x='Message Count',
        y='Author',
        orientation='h',
        labels=axis_labels,
        title='Top 10 Authors by Message Count',
    )
    chart.show()

def character_count(data):
    """Add per-message character counts and plot the top ten authors by characters.

    Parameters
    ----------
    data : pandas.DataFrame or str or path-like
        Either a chat DataFrame with ``Author`` and ``Message`` columns (as
        returned by ``import_chat``) or a path to a chat export, which is
        loaded with ``import_chat``. Previously the argument was ignored and
        the notebook-global ``file`` was always re-read.

    Returns
    -------
    pandas.DataFrame
        A copy of the chat data with an added ``Message Character Count``
        column. A DataFrame passed in by the caller is not modified.
    """
    if isinstance(data, pd.DataFrame):
        # Work on a copy so the caller's frame does not gain a column.
        data = data.copy()
    else:
        data = import_chat(data)

    data['Message Character Count'] = data['Message'].apply(len)

    # Total characters per author, largest first, limited to the top ten.
    sorted_authors = (
        data.groupby('Author')['Message Character Count']
        .sum()
        .sort_values(ascending=False)
        .iloc[:10]
    )
    data_character = pd.DataFrame({
        'Author': sorted_authors.index,
        'Character Count': sorted_authors.values,
    })

    # The title previously read "Message Count" although the bars show characters.
    fig = px.bar(data_character, x='Character Count', y='Author', orientation='h',
                 labels={'Character Count': 'Character Count', 'Author': 'Author'},
                 title='Top 10 Authors by Character Count')
    fig.show()

    return data

def pie_chart(data):
    """Show each top-ten author's share of all characters, plus an 'Others' slice.

    Parameters
    ----------
    data
        Forwarded unchanged to ``character_count``, which supplies the frame
        with the ``Message Character Count`` column (and also renders its
        own bar chart as a side effect).
    """
    data = character_count(data)

    # Total characters per author, largest first.
    sorted_authors = (
        data.groupby('Author')['Message Character Count']
        .sum()
        .sort_values(ascending=False)
    )
    top_authors = sorted_authors.iloc[:10]
    other_character_count = sorted_authors.iloc[10:].sum()

    # Use the total over ALL authors as the denominator. Dividing by the
    # top-ten total only (as before) made the "Percentage" values add up to
    # more than 100 once the 'Others' slice was included.
    total_character_count = sorted_authors.sum()

    shares = pd.DataFrame({
        'Author': list(top_authors.index) + ['Others'],
        'Percentage': list(top_authors.values) + [other_character_count],
    })
    shares['Percentage'] = shares['Percentage'] / total_character_count * 100

    fig = px.pie(shares, values='Percentage', names='Author',
                 title='Top 10 Authors by Character Count (including Others)')
    fig.show()

def line_by_hour(data):
    """Plot the number of messages sent in each hour of the day.

    Parameters
    ----------
    data
        Forwarded unchanged to ``character_count``, whose returned frame
        provides the ``DateTime`` strings and ``Message`` column (it also
        renders its own bar chart as a side effect).
    """
    data = character_count(data)
    hours = pd.to_datetime(data['DateTime'], format='%m/%d/%y, %I:%M:%S %p').dt.hour

    # Count messages per hour, then reindex to all 24 hours so that hours
    # without any message appear as 0 instead of being silently bridged by
    # the line.
    hourly_counts = (
        data['Message']
        .groupby(hours)
        .count()
        .reindex(pd.RangeIndex(24, name='DateTime'), fill_value=0)
        .reset_index(name='Message Count')
    )

    # 0 -> '12AM', 1 -> '1AM', ..., 12 -> '12PM', ..., 23 -> '11PM'
    hour_labels = {
        hour: f"{hour % 12 or 12}{'AM' if hour < 12 else 'PM'}"
        for hour in range(24)
    }
    hourly_counts['DateTime'] = hourly_counts['DateTime'].map(hour_labels)

    fig = px.line(hourly_counts, x='DateTime', y='Message Count', title='Messages by Hour')
    fig.show()


def pre_process(dataframe):
    """Clean chat messages and keep only long, content-heavy ones.

    Steps:
      1. Drop rows whose ``Message`` contains 'omitted' or 'deleted'.
      2. Replace every non-letter character, isolated single letter and
         whitespace run with a space, then lowercase.
      3. Keep only words of at least 6 characters.
      4. Drop rows whose cleaned text is empty and rows with any missing value.
      5. Keep rows whose cleaned text contains at least 10 spaces
         (i.e. at least 11 remaining words).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a string ``Message`` column. It is not modified;
        previously a ``New Message`` column was added to it in place.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with an added ``New Message`` column and a fresh
        0..n-1 index.
    """
    pattern = re.compile(r'[^a-zA-Z\s]|\s+[a-zA-Z]\s+|\s+')

    def _clean(text):
        # Strip punctuation/emojis/digits, lowercase, keep words of >= 6 chars.
        lowered = pattern.sub(' ', text).lower()
        return ' '.join(word for word in lowered.split() if len(word) >= 6)

    # Filter first and take an explicit copy so later column assignment
    # neither touches the caller's frame nor triggers chained-assignment
    # warnings on a slice.
    keep = ~dataframe['Message'].str.contains('omitted|deleted')
    cleaned = dataframe.loc[keep].copy()
    cleaned['New Message'] = cleaned['Message'].apply(_clean)

    # A plain comparison replaces replace('', nan, regex=True), which relied
    # on how pandas treats an empty regular expression.
    cleaned = cleaned.loc[cleaned['New Message'] != ''].dropna()

    data_new = cleaned.loc[cleaned['New Message'].str.count(' ') >= 10]
    data_new = data_new.reset_index(drop=True)
    print('Preprocessing done')
    return data_new

def topic_modelling(dataframe=None, num_topics=10, random_state=None):
    """Fit an LDA topic model on cleaned messages and prepare a pyLDAvis view.

    Parameters
    ----------
    dataframe : pandas.DataFrame, optional
        Frame with a ``New Message`` column (as produced by ``pre_process``).
        When omitted, the notebook-global ``df`` is used, which is the
        previous behaviour.
    num_topics : int, default 10
        Number of topics passed to ``gensim.models.LdaMulticore``.
    random_state : optional
        Forwarded to ``LdaMulticore``'s ``random_state`` argument. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The object returned by ``pyLDAvis.gensim_models.prepare`` for the fitted
    model, corpus and dictionary.
    """
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'dey', 'go', 'na', 'people', 'even', 'make', 'know', 'one', 'still', 'like', 'say', 'time',
                       'sha', 'ooo', 'sure', 'first', 'person', 'want', 'wey', 'take', 'give', 'ni', 'day', 'problem',
                       'Abeg',    'Oya',    'Nawa',    'Chop',    'Shishi',    'Wahala',    'Sabi',
                       'Jenifa',    'Soro',    'Kai',    'Banga',    'Wetin',    'Gbege',    'Kolo',    'Belle',    'Pikin',    'Gist',    'Ehen',
                       'Shine',    'Oyinbo',    'Katakata',    'Hustle',    'Padi',    'Ikebe',    'Naija',    'Ojoro',    'Jollof',    'Jasi',
                       'Waka',    'Ogbonge',    'Kpatakpata',    'Mumu',    'Orobo',    'Skelewu',    'Amebo',    'Aproko',    'Sisi',    'Yawa',
                       'Chai',    'Abi',    'Gbese',    'Gbera',    'Go-slow',    'Gbe body e',    'Kpanlogo',    'Lai-lai',
                       'Palava',    'Suo',    'Totori',    'Zobo'])
    # simple_preprocess lowercases every token, so capitalised entries such as
    # 'Wahala' could never match. Lowercase the list and use a set for O(1)
    # lookups. Hyphenated / multi-word entries still cannot match a single
    # token and are kept only for fidelity with the original list.
    stop_word_set = {word.lower() for word in stop_words}

    source = df if dataframe is None else dataframe
    documents = source['New Message'].tolist()

    # Tokenise once (deacc=True strips accents/punctuation) and drop stop words.
    data_words = [
        [token
         for token in simple_preprocess(str(document), deacc=True)
         if token not in stop_word_set]
        for document in documents
    ]

    # Token <-> id mapping and bag-of-words corpus.
    id2word = corpora.Dictionary(data_words)
    corpus = [id2word.doc2bow(text) for text in data_words]

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=random_state)

    # Visualize the topics
    pyLDAvis.enable_notebook()
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    return LDAvis_prepared



invalid escape sequence \[


invalid escape sequence \[


invalid escape sequence \[

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [137]:
# Location of the exported chat text file.
# NOTE(review): this is an absolute, machine-specific path; the cell will not
# run elsewhere without editing it.
file = "C:/Users/Dee/Desktop/Portfolio/Chat Analysis/WhatsApp Chat - Engr.2021 Alumni(FET)/_chat.txt"
# Parse the export into a DataFrame; rows with unparseable timestamps are
# reported by import_chat and skipped.
data = import_chat(file)


Error on row 12670: time data 'Breaking' does not match format '%m/%d/%y, %I:%M:%S %p' (match)


In [138]:
# Display the parsed chat DataFrame.
# NOTE(review): the rendered output contains author names and phone numbers;
# consider clearing it before sharing the notebook.
data

Unnamed: 0,DateTime,Author,Message,Date,Time
0,"5/4/22, 10:28:20 AM",‪+234 903 410 3691‬,"NYSC REGISTRATION FOR 2022 Batch B, STREAM 1 W...",2022-05-04,10:28:20
1,"5/4/22, 10:30:15 AM",Samuel Alabo | Makyr,001!!!!,2022-05-04,10:30:15
2,"5/4/22, 10:31:27 AM",‪+234 903 410 3691‬,How far,2022-05-04,10:31:27
3,"5/4/22, 10:33:17 AM",‪+234 903 410 3691‬,"You know my name guy, everytime shouldn't be a...",2022-05-04,10:33:17
4,"5/4/22, 10:33:54 AM",Quadri,‎sticker omitted,2022-05-04,10:33:54
...,...,...,...,...,...
20379,"2/19/23, 11:16:50 AM",Gbogo,We can't blame God. We can only blame ourselves.,2023-02-19,11:16:50
20380,"2/19/23, 11:16:50 AM",Dee 🏡Aero Dee,Just go to settings and click chat then export,2023-02-19,11:16:50
20381,"2/19/23, 11:17:08 AM",‪+234 803 887 9196‬,Ú can blame Buhari and also his advisers,2023-02-19,11:17:08
20382,"2/19/23, 11:17:09 AM",‪+234 810 413 0059‬,Blame humans simple,2023-02-19,11:17:09


In [140]:
# Normalise author labels: strip every character that is neither a word
# character nor whitespace, and remove all whitespace, then force str dtype.
# The pattern is a raw string so that \w and \s are not treated as (invalid)
# string escape sequences.
data['Author'] = (
    data['Author']
    .str.replace(r'[^\w\s]+|\s+', '', regex=True)
    .astype(str)
)
# Distinct normalised author labels, shown as a tuple.
radio_list = data['Author'].unique()
tuple(radio_list)


invalid escape sequence \w


invalid escape sequence \w


invalid escape sequence \w



('2349034103691',
 'SamuelAlaboMakyr',
 'Quadri',
 'JamoVillaBaba',
 '2348146864224',
 'AyoCivil',
 'Mama',
 'LateeYekini',
 'TeslimCyborg',
 '2348104130059',
 '2348125759082',
 'Asiyanbolatayo',
 'DVGlory',
 'DeeAeroDee',
 'Habeebulah',
 'Gold',
 '2348167564278',
 '2347039610666',
 '2349065425811',
 '2347036125979',
 'EgeOla',
 'Ridwan',
 'Kemi',
 'KyrexAERO',
 '2348139433878',
 '2349079605713',
 '2348026026177',
 '2348143768841',
 'BoluMSE',
 'Gbogo',
 'Abhey',
 '2348111966849',
 '2349077095333',
 'KunmiMech',
 'Winnifred2',
 '2349029946259',
 '2347068878478',
 '2347032737892',
 '2348139426301',
 'FrancisAdedeji',
 '2347036248418',
 '2349054703681',
 '2348156165537',
 '2348159206302',
 'BolajiSmall',
 'DurosinlorunAkeem',
 'Walex',
 '2348156119190',
 '2348100158195',
 '2347018069372',
 'BoogieMan',
 'OlaMse',
 '2348135426805',
 'Engr2021AlumniFET',
 '2348026610695',
 '2347035800917',
 'ALSHShenko',
 'Afronasent',
 'Taiwo',
 'SulaMania',
 '2348140851696',
 'Westbrown',
 'AyodejiA',
 '

In [7]:
# NOTE(review): `df` is not assigned in any cell visible here - presumably it
# comes from earlier kernel state; confirm before a Restart & Run All.
# Convert the timestamp strings to datetimes (overwrites df['DateTime'] in place).
df['DateTime'] = pd.to_datetime(df['DateTime'], format='%m/%d/%y, %I:%M:%S %p')

# Group the messages by hour
# (.size() counts rows per hour; hours without any row do not appear in the result)
hourly_counts = df.groupby(df['DateTime'].dt.hour).size().reset_index(name='Message Count')

# Plot the chart using plotly.express
fig = px.line(hourly_counts, x='DateTime', y='Message Count', title='Messages by Hour')
fig.show()

In [9]:
#!pip install spacy

In [10]:
#!python -m spacy download en_core_web_sm

Preprocessing done


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [75]:
# Run the topic-modelling pipeline; as the last expression of the cell, the
# returned pyLDAvis object is what the notebook displays.
topic_modelling()

['wallohi', 'islamic', 'country', 'judgement', 'carried', 'muslim', 'leader', 'country', 'admonish', 'islamic', 'country', 'report', 'ignorance', 'criminal', 'extent', 'burning', 'forbidden', 'anybody', 'purnish', 'someone', 'except', 'interest', 'defense', 'ignorance']
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 3), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]
[(0,
  '0.012*"nigeria" + 0.012*"protest" + 0.012*"buhari" + 0.010*"government" + '
  '0.010*"country" + 0.010*"enough" + 0.008*"fulani" + 0.008*"remain" + '
  '0.008*"arrested" + 0.008*"nirsal"'),
 (1,
  '0.013*"system" + 0.012*"matter" + 0.012*"mistake" + 0.012*"someone" + '
  '0.012*"country" + 0.008*"checks" + 0.008*"dollars" + 0.008*"studies" + '
  '0.008*"memory" + 0.008*"months"'),
 (2,
  '0.029*"actually" + 0.021*"tinubu" + 0.013*"concerned" + 0.013*"nigeria" + '
  '0.009*"saying" + 0.009*"campaign" + 0.009*"normal" + 0.009*"finished" + '
  '0.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.



In [45]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words plus chat-specific filler words.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'dey', 'go', 'na', 'people', 'even', 'make', 'know', 'one', 'still', 'like', 'say', 'time',
                   'sha', 'ooo', 'sure', 'first', 'person', 'want', 'wey', 'take', 'give', 'ni', 'day', 'problem',
                   'Abeg',    'Oya',    'Nawa',    'Chop',    'Shishi',    'Wahala',    'Sabi',
                   'Jenifa',    'Soro',    'Kai',    'Banga',    'Wetin',    'Gbege',    'Kolo',    'Belle',    'Pikin',    'Gist',    'Ehen',
                   'Shine',    'Oyinbo',    'Katakata',    'Hustle',    'Padi',    'Ikebe',    'Naija',    'Ojoro',    'Jollof',    'Jasi',
                   'Waka',    'Ogbonge',    'Kpatakpata',    'Mumu',    'Orobo',    'Skelewu',    'Amebo',    'Aproko',    'Sisi',    'Yawa',
                   'Chai',    'Abi',    'Gbese',    'Gbera',    'Go-slow',    'Gbe body e',    'Kpanlogo',    'Lai-lai',
                   'Palava',    'Suo',    'Totori',    'Zobo'])
# simple_preprocess lowercases every token, so capitalised entries such as
# 'Wahala' could never match; store the list lowercased. Hyphenated and
# multi-word entries still cannot match a single token.
stop_words = [word.lower() for word in stop_words]


def sent_to_words(sentences):
    """Yield one token list per sentence (lowercased, accents removed)."""
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))


def remove_stopwords(texts):
    """Return the tokens of every document with stop words filtered out."""
    blocked = set(stop_words)  # set membership instead of scanning the list per token
    return [[word for word in simple_preprocess(str(doc))
             if word not in blocked] for doc in texts]


# NOTE(review): `df` must already hold the pre-processed frame with a
# 'New Message' column, and this rebinds the name `data` to a plain list.
data = df['New Message'].tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

['wallohi', 'islamic', 'country', 'judgement', 'carried', 'muslim', 'leader', 'country', 'admonish', 'islamic', 'country', 'report', 'ignorance', 'criminal', 'extent', 'burning', 'forbidden', 'anybody', 'purnish', 'someone', 'except', 'interest', 'defense', 'ignorance']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
import gensim.corpora as corpora
# Create Dictionary
# (built from the token lists in `data_words`, which must exist from an earlier cell)
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
# One bag-of-words per document: a list of (token_id, count) pairs.
corpus = [id2word.doc2bow(text) for text in texts]
# View
# First 30 (token_id, count) pairs of the first document.
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 3), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]


In [69]:
from pprint import pprint
# number of topics
num_topics = 10
# Build LDA model
# Uses `corpus` and `id2word` from an earlier cell.
# NOTE(review): no random_state is passed, so the topics presumably differ
# between runs - confirm against the gensim documentation.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the keywords of each topic
pprint(lda_model.print_topics())

# Apply the fitted model to the corpus; `doc_lda` is not used again in the
# cells visible here.
doc_lda = lda_model[corpus]

def get_topic_keywords(lda_model):
    """Collect, print and return the keyword list of every topic.

    Parameters
    ----------
    lda_model
        Object exposing ``num_topics`` and ``show_topic(topic_id)``, where
        ``show_topic`` yields ``(word, probability)`` pairs.

    Returns
    -------
    dict
        Maps each topic id in ``range(num_topics)`` to its list of words,
        in the order given by ``show_topic``.
    """
    keywords_by_topic = {
        topic_id: [term for term, _weight in lda_model.show_topic(topic_id)]
        for topic_id in range(lda_model.num_topics)
    }
    print(keywords_by_topic)
    return keywords_by_topic

# Visualize the topics


[(0,
  '0.018*"enough" + 0.014*"grammar" + 0.011*"interest" + 0.009*"government" + '
  '0.009*"believe" + 0.009*"senate" + 0.009*"country" + 0.009*"comprehensive" '
  '+ 0.009*"purpose" + 0.009*"nigerians"'),
 (1,
  '0.028*"tinubu" + 0.026*"country" + 0.013*"system" + 0.012*"buhari" + '
  '0.011*"individual" + 0.010*"corruption" + 0.008*"except" + '
  '0.008*"judgement" + 0.008*"politics" + 0.008*"leaders"'),
 (2,
  '0.022*"buhari" + 0.019*"nigeria" + 0.016*"nobody" + 0.015*"sacked" + '
  '0.010*"president" + 0.010*"senior" + 0.010*"manifesto" + 0.010*"commission" '
  '+ 0.010*"paying" + 0.010*"tinubu"'),
 (3,
  '0.013*"president" + 0.011*"buhari" + 0.011*"friend" + 0.010*"transfers" + '
  '0.009*"general" + 0.009*"better" + 0.009*"anything" + 0.009*"surveillance" '
  '+ 0.009*"savings" + 0.009*"ikiootnomk"'),
 (4,
  '0.012*"business" + 0.012*"shepherd" + 0.012*"counterfeit" + '
  '0.012*"governor" + 0.006*"hiding" + 0.006*"million" + 0.006*"culture" + '
  '0.006*"almost" + 0.006*"amou

In [48]:
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis
import os
# NOTE(review): `pickle` and `os` are imported but not used in this cell.
# Visualize the topics
pyLDAvis.enable_notebook()
# Uses `lda_model`, `corpus` and `id2word` from earlier cells.
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
# Last expression of the cell, so the notebook renders the prepared object.
LDAvis_prepared


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.



In [None]:
# Environment check: print the installed numpy version.
import numpy
print(numpy.__version__)

In [None]:
# Summary statistics of `df`.
# NOTE(review): `df` is not assigned in any cell visible here - confirm it
# exists after a fresh kernel start.
df.describe()