# Exploratory data analysis on message history

# Preliminary notes

The area of study is any single text message coming from or going to my phone. Each one requires intent, either mine or that of one of my contacts, to send a text message (be it an iMessage or an SMS text).

# Setup

## Installs

In [None]:
! pip install pandas numpy seaborn pyarrow datetime

## Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import timedelta

## Options

In [None]:
pd.options.display.max_rows=300

# Open file and look at general characteristics

In [None]:
sms_orig = pd.read_csv('../../../data/allmessages.csv',  date_format='ISO8601')

In [None]:
sms_orig.info()

In [None]:
sms_orig.tail()

# EDA to filter out least contacted senders

## Create config class to store specific configuration options

In [None]:
class Config:
    """Tunable thresholds used by the analysis cells."""

    # Minimum number of messages a chat session needs in order to be kept.
    MESSAGE_COUNT_CUTOFF = 100

    # Expressed in minutes. Presumably the idle gap that separates two
    # sessions -- confirm once the session-creation code exists.
    SESSION_TIMEOUT = 15

## Inspect the frequency of senders in the SMS data

In [None]:
message_count = sms_orig['Chat Session'].value_counts()

In [None]:
message_count.hist(bins=200)

The distribution of messages per sender is very long-tailed. To give a more useful view, we switch the x axis to a log scale.

In [None]:
# Geometrically spaced edges: the bins come out equally wide once the x axis is logarithmic.
logbins = np.geomspace(message_count.min(), message_count.max(), num=50)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(message_count, bins=logbins)
ax.set_xscale('log')
ax.set_xlabel('Number of messages by sender (log-transformed)')
ax.set_ylabel('Frequency')

In [None]:
'In  an attempt to make the EDA less sensitive to senders I barely interacted with, let\'s cut all senders having sent less than {} messages.'.format(Config.MESSAGE_COUNT_CUTOFF)

In [None]:
all_senders  = message_count.index
selected_senders = message_count[message_count >= Config.MESSAGE_COUNT_CUTOFF].index
sms = sms_orig[sms_orig['Chat Session'].isin(selected_senders)]


In [None]:
# The `%` format spec already renders the percent sign, so the sentence must not
# spell out "percent" a second time.
'Setting the cutoff at {0} messages yields {1:.1%} of messages and {2:.1%} of senders'.format(
    Config.MESSAGE_COUNT_CUTOFF,
    len(sms) / len(sms_orig),
    selected_senders.size / all_senders.size,
)

## Inspect the message type

In [None]:
sms['Type'].value_counts()

In [None]:
sms[sms['Type']=='Notification'].head(1)

Notifications are not in the area of study, thus they will be discarded for the rest of the EDA

In [None]:
# Take an explicit copy: later cells add columns to `sms`, and assigning into a
# filtered slice of another frame triggers SettingWithCopyWarning.
sms = sms[sms['Type'] != 'Notification'].copy()

## Inspect time series by month of SMS activity

In [None]:
sms['date'] = pd.to_datetime(sms['Message Date'])

# Extract the month from the 'date' column
sms['month'] = sms['date'].dt.to_period('M')

# Group the data by month and count the number of messages for each month
messages_by_month = sms.groupby('month').size()

# Plot the number of messages exchanged by month
plt.figure(figsize=(20, 6))
messages_by_month.plot(kind='bar', xlabel='Month', ylabel='Number of Messages', title='Number of Messages Exchanged by Month')

plt.show()

## Create a few dimensions, measures

In [None]:
pd.to_datetime(sms['Message Date']).apply(lambda x: x.strftime('%Y%m'))

In [None]:
# Parse the timestamp once and derive every calendar feature from it.
message_ts = pd.to_datetime(sms['Message Date'])

sms['Text Length'] = sms['Text'].str.len()
# floor, not round: rounding to the nearest day pushes every message sent after
# noon onto the following day (and Dec 31 afternoons into the next year).
sms['Message Day'] = message_ts.dt.floor('D')
sms['Message Hour'] = message_ts.dt.to_period('h')
sms['Message Month'] = message_ts.dt.to_period('M')
# Vectorised strftime also tolerates missing timestamps, unlike a per-row lambda.
sms['Message Yearmo'] = message_ts.dt.strftime('%Y%m')


# str() keeps missing texts (NaN) from raising; they simply never match.
sms['Message Has Heart Kiss'] = sms['Text'].map(lambda text: '😘' in str(text))
sms['Message Has Happy Kiss'] = sms['Text'].map(lambda text: '😚' in str(text))

# What month do I tend to text most in?

In [None]:
pd.to_datetime(sms['Message Date']).dt.month_name().value_counts()

In [None]:
messages_by_month

In [None]:


# Extract the month from the 'date' column

# Group the data by month and count the number of messages for each month
def groupandplotstacked100(data=None, by_col='Type', agg='sum', extracttopnvalues=3):
    """Plot a 100%-stacked bar chart of monthly message share per ``by_col`` value.

    For every month the bars show which fraction of that month's total
    (summed text length, or message count) belongs to each value of
    ``by_col``. A grey line marks the 50% level.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to plot. It must contain the columns ``'month'``,
        ``'Text Length'`` and ``by_col``. When omitted, the notebook-level
        ``sms`` frame is looked up at call time.
    by_col : str
        Column whose values become the stacked series.
    agg : {'sum', 'count'}
        ``'sum'`` adds up ``'Text Length'``; ``'count'`` counts the rows with
        a non-null ``'Text Length'``.
    extracttopnvalues : int or None
        Number of most frequent ``by_col`` values that keep their own series;
        all remaining values are pooled into ``'other'``. ``None`` keeps
        every value.

    Raises
    ------
    ValueError
        If ``agg`` is neither ``'sum'`` nor ``'count'``.
    """
    # Validate first so that a bad argument fails before any work is done.
    if agg not in ('sum', 'count'):
        raise ValueError("Err: agg is not within expected parameters.")

    if data is None:
        # Resolved at call time, so the current `sms` is used rather than
        # whatever object the name pointed to when the function was defined.
        data = sms

    # Work on a copy: pooling rare values must not rewrite the caller's frame.
    frame = data.copy()

    if extracttopnvalues is not None:
        # value_counts() is sorted by descending frequency, so everything
        # past the first N entries is what gets pooled.
        pooled_values = frame[by_col].value_counts().index[extracttopnvalues:]
        if len(pooled_values) > 0:
            # Cast to object so the 'other' label also fits bool/categorical columns.
            labels = frame[by_col].astype(object)
            frame[by_col] = labels.mask(labels.isin(pooled_values), 'other')

    grouped = frame.groupby(['month', by_col])['Text Length']
    totals = grouped.sum() if agg == 'sum' else grouped.count()

    # One row per month, one column per by_col value, each row scaled to sum to 1.
    shares = totals.unstack().fillna(0)
    shares = shares.divide(shares.sum(axis=1), axis=0)

    # DataFrame.plot opens its own figure unless it is handed an axes, so
    # create the axes explicitly to make the requested size take effect.
    _, ax = plt.subplots(figsize=(10, 6))
    shares.plot(
        kind='bar',
        stacked=True,
        ax=ax,
        xlabel='Month',
        ylabel='Aggregate Message Length',
        title='Aggregate Message Length sent by Month',
    )

    # axhline spans the axes without stretching the x limits.
    ax.axhline(0.5, color='grey')

    plt.show()

TODO fix x axis labels
TODO make this proportional to 100% for each month

In [None]:
groupandplotstacked100(data=sms, by_col='Type')

In [None]:
groupandplotstacked100(data=sms, by_col='Service', agg='count')

In [None]:
groupandplotstacked100(data=sms, by_col='Service', agg='sum')

In [None]:
groupandplotstacked100(data=sms[sms['Chat Session']=='Sara Stilleke'], by_col='Message Has Happy Kiss', agg='count')

In [None]:
groupandplotstacked100(data=sms[sms['Chat Session']=='Sara Stilleke'], by_col='Message Has Happy Kiss', agg='count')

In [None]:
groupandplotstacked100(data=sms, by_col='Chat Session', agg='count', extracttopnvalues=4)

# Feature engineering at the message level

In [None]:
sms.groupby(['Chat Session'])['Message Has Heart Kiss'].sum().sort_values(ascending=False).head(10)

In [None]:
sms['Text'].map(lambda x : '😘' in str(x)).value_counts()

In [None]:


sms['Text'].map(lambda x : '😘' in str(x)).value_counts()

In [None]:
sms.loc[sms['Chat Session']=='Chris Swim & Darryl Pierce & Jarrod Amsterdam & James Perry & +19405773150 & Chris Farris & +14155677107 & John Vochatzer & David Young', 'Chat Session']='TWATs'
sms['Chat Session'].value_counts().head(25)

In [None]:
sms.head()

In [None]:
sms[sms['Text Length'] < 400.0]['Text Length'].hist(bins=200)

In [None]:
sms['Service'].value_counts(dropna=False)

TODO plot this, also answer the question am I less likely to answer an SMS than an iMessage

In [None]:
# Share of incoming vs. outgoing messages within each service -- a first cut at
# the "am I less likely to answer an SMS than an iMessage" question.
sms.groupby('Service')['Type'].value_counts(normalize=True).unstack('Type')

In [None]:
sms[sms['Chat Session']=='Sara Stilleke']['Type'].value_counts()

TODO look at the breakdown of incoming/outgoing by Chat Session

In [None]:
sms.info()

In [None]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,3)


last_year_contacts = sms[sms['date'].dt.year >= 2023]['Chat Session'].unique()

sms[sms['Chat Session'].isin(last_year_contacts)].groupby(['Chat Session'])['Type']\
.value_counts(normalize=True).unstack('Type').sort_values(by='Incoming').plot.bar(stacked=True)

In [None]:
sms.groupby([    'Chat Session', sms['Message Day'].dt.year ])['Text Length'].sum().sort_values().tail(20)

In [None]:
smsday = sms.groupby([
    sms['Message Day'].dt.date,
    'Chat Session']
).agg(
     {'Text Length' : ['count', 'sum']}
     
).reset_index()

In [None]:
pd.options.display.max_rows = 100

sms.groupby(['Chat Session'])['Text Length'].count().reset_index().sort_values('Text Length').tail(100)


In [None]:
sms.groupby(['Chat Session'])['Text Length'].mean().reset_index().sort_values('Text Length').head(100)


TODO break this down by sent/received and add count

In [None]:
sms[sms['Chat Session']=='Kevin Oswald'][['Sender Name', 'Text']].reset_index()

In [None]:
sms.groupby(['Chat Session'])['Text Length'].count().reset_index().sort_values('Text Length').tail(10)


In [None]:
sms[sms['Message Day'].dt.year > 2020].groupby(['Chat Session'])['Text Length'].count().reset_index().sort_values('Text Length').tail(10)


# Remove all text sent by anyone not in the rough top 30

In [None]:
sms['Chat Session'].value_counts().head(40)

# Look at only people I texted in the past year

In [None]:
sms.head()

In [None]:
last_year_contacts = sms[sms['date'].dt.year == 2024]['Chat Session'].unique()

In [None]:
last_year_contacts

In [None]:
messages_by_month = sms[sms['Chat Session'].isin(last_year_contacts)].groupby('month').size()

# Plot the number of messages exchanged by month
plt.figure(figsize=(20, 6))
messages_by_month.plot(kind='bar', xlabel='Month', ylabel='Number of Messages', title='Number of Messages Exchanged by Month')

plt.show()

# Session creation

Order SMS by datetime sent, group by sender, and lag the datetime; then create an indicator for gaps greater than 15 minutes.

In [None]:
pd.to_datetime(sms['Message Date'])

# Carissa EDA love live

In [None]:
sms['Message Has love'] = sms['Text'].map(lambda x : 'love' in str(x).lower())
sms['Message Has live'] = sms['Text'].map(lambda x : 'live' in str(x).lower())

In [None]:
car = sms[sms['Chat Session']=='Carissa Brown']

In [None]:
cari = car[car['Type']=='Incoming']

In [None]:
cari.shape

In [None]:
cari['Message Has love'].value_counts(normalize=True)

In [None]:
cari['Message Has live'].value_counts(normalize=True)

# Search for pens to get David for his birthday

In [None]:
pd.options.display.max_colwidth = 160


In [None]:
# Texts from the 'David Young' chat session whose message date falls in month 7.
# NOTE(review): the variable name says June, but month == 7 selects July --
# confirm which of the two is intended.
dave_texts_june = sms[np.logical_and(sms['Chat Session']=='David Young', pd.DatetimeIndex(sms['Message Date']).month==7)]['Text'].reset_index()\
    
dave_texts_june

In [None]:
pd.DatetimeIndex(sms['Message Date']).month