In [1]:
# see full details @ https://gracejiang.github.io/cis545/

In [None]:
import pandas as pd 
import json
from pandas import json_normalize
import requests
import glob
import datetime
import matplotlib.pyplot as plt
import nltk
import os
import fnmatch
from wordcloud import WordCloud, STOPWORDS
import sklearn
import scipy.stats as st
from textblob import TextBlob

In [None]:
#
#
# loading in all msgs into the dataframe all_msgs
#
#
files_path = 'all-messages/inbox/'

# Collect one frame per JSON file and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously loaded row each time.
msg_frames = []

for root, _dirs, files in os.walk(files_path):
    for json_file in fnmatch.filter(files, "*.json"):
        file_url = os.path.join(root, json_file)
        # skip every path that contains the substring 'file'
        if 'file' in file_url:
            continue
        with open(file_url, encoding='utf-8') as json_data:
            data = json.load(json_data)
        print(file_url)
        # one row per entry of the file's top-level 'messages' list
        msg_frames.append(json_normalize(data, 'messages'))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# file matched; ignore_index gives every message a unique row label
all_msgs = pd.concat(msg_frames, ignore_index=True) if msg_frames else pd.DataFrame()

all_msgs.head(10)

In [17]:
#
#
# make a copy of all_msgs, since i don't want to have to rerun previous step (takes a long time) in the case that i accidentally override data
#
# comment out lines when appropriate
#
#

# all_msgs_copy = all_msgs.copy()
# all_msgs = all_msgs_copy.copy()

In [None]:
# total number of convos
files_path = 'all-messages/inbox/'

# Store the folder, not the file path: a folder that holds several JSON files
# would otherwise be counted once per file.
# NOTE(review): assumes each conversation lives in its own folder - confirm
# against the export layout.
all_roots = set()
for root, _dirs, files in os.walk(files_path):
    for json_file in fnmatch.filter(files, "*.json"):
        file_url = os.path.join(root, json_file)
        # same filter as the loading cell: ignore paths containing 'file'
        if 'file' not in file_url:
            all_roots.add(root)

len(all_roots)

In [None]:
#
#
# convert columns into correct data type
#
#

# drop messages without any text content
all_msgs = all_msgs.dropna(subset=['content'])

# convert the millisecond timestamp to a datetime (datetime.fromtimestamp
# returns local time). Mapping over the one column avoids building a row
# object per message, which is what apply(axis=1) does.
all_msgs['datetime'] = pd.to_datetime(
    all_msgs['timestamp_ms'].map(
        lambda ms: datetime.datetime.fromtimestamp(int(ms) * 0.001)
    )
)

# separate date and time from datetime (vectorized .dt accessors)
all_msgs['date'] = all_msgs['datetime'].dt.date
all_msgs['month'] = all_msgs['datetime'].dt.month
all_msgs['year'] = all_msgs['datetime'].dt.year
all_msgs['time'] = all_msgs['datetime'].dt.time

# select only certain columns
all_msgs = all_msgs[['sender_name', 'date', 'month', 'year', 'time', 'content', 'reactions', 'datetime']]

# rename column sender_name to name
all_msgs = all_msgs.rename(columns={'sender_name': 'name'})

# sort by datetime
all_msgs = all_msgs.sort_values(by=['datetime'])

# only msgs from 2012 and later
all_msgs = all_msgs[all_msgs['year'] >= 2012]

In [None]:
# preview the first 5 rows of all_msgs
all_msgs.head(5)

In [None]:
# total number of msgs exchanged on facebook (row count of all_msgs)
print(len(all_msgs))

In [None]:
# split all_msgs into the messages i sent and the messages everyone else sent

sent_by_me = all_msgs['name'] == 'Grace Jiang'
other_people_df = all_msgs[~sent_by_me]
my_msgs_df = all_msgs[sent_by_me]

# how many msgs other people sent me
print(other_people_df.shape[0])

# how many msgs i sent to other people
print(my_msgs_df.shape[0])

In [None]:
# line plot of how many messages were exchanged on each calendar day
msgs_per_date = all_msgs.groupby('date')
counts = msgs_per_date['date'].count()
counts.plot.line()
plt.xticks(rotation=90)

plt.show()

# the daily series is too jumpy to read, so the next step breaks it down per year to smooth it out


In [None]:
# yearly message counts (total, sent by me, received by me) drawn on one axis
fig, ax = plt.subplots()

# total number of messages sent & received
plot_all_msgs_df = all_msgs.groupby(['year'], as_index=False).agg(content=('content', 'count'))

# number of msgs i sent
plot_sent_msgs_df = my_msgs_df.groupby(['year'], as_index=False).agg(content=('content', 'count'))

# number of msgs i received
plot_received_msgs_df = other_people_df.groupby(['year'], as_index=False).agg(content=('content', 'count'))

# one line per series, in the same order as the frames were built
for yearly_df, series_label in (
    (plot_all_msgs_df, 'total messages'),
    (plot_sent_msgs_df, 'messages sent'),
    (plot_received_msgs_df, 'messages received'),
):
    ax.plot(yearly_df['year'], yearly_df['content'], label=series_label)

plt.title("messages sent & received over time", loc='center', fontsize=14, fontweight=0, color='black')
ax.set_xlabel("year")
ax.set_ylabel("number of messages")
ax.legend(loc='best')



In [None]:
#
#
# number of messages received from each person, largest count first
#
#

msgs_per_person = other_people_df['name'].value_counts()

# head() defaults to the first 5 entries
msgs_per_person.head()

In [None]:
# count messages per person per month & year
# only include people with at least 25k messages received (~50k total messages, indicating significant talking)

has_enough_msgs = msgs_per_person.ge(25000)
close_friends_series = msgs_per_person.loc[has_enough_msgs]

# names of everyone above the threshold
close_friends = set(close_friends_series.index)
for friend in close_friends:
    print(friend)

#
# keep only the messages that were sent by one of those close friends
#

from_close_friend = other_people_df['name'].isin(close_friends)
close_friends_df = other_people_df[from_close_friend]

In [None]:
# yearly number of messages received from each close friend, one line per friend

fig, ax = plt.subplots()

plot_cf_df = close_friends_df.groupby(['year', 'name'], as_index=False).agg(content=('content', 'count'))

for name in close_friends:
    # rows of the yearly table that belong to this friend
    friend_rows = plot_cf_df[plot_cf_df['name'] == name]
    ax.plot(friend_rows['year'], friend_rows['content'], label=name)

plt.title("messages received from close friends over time", loc='center', fontsize=14, fontweight=0, color='black')
ax.set_xlabel("year")
ax.set_ylabel("number of messages")
ax.legend(loc='best')


In [None]:
# graph excluding outlier
outlier_name = 'Name Here' #edit this to contain your own outlier

# discard() instead of remove(): when outlier_name is not in close_friends
# (e.g. the placeholder above was left unedited) remove() aborts the cell
# with a KeyError, while discard() simply leaves the set unchanged.
excluding_outliers = close_friends.copy()
excluding_outliers.discard(outlier_name)

excluding_outliers_df = other_people_df[other_people_df['name'].isin(excluding_outliers)]

fig, ax = plt.subplots()

# yearly message count per remaining friend (overwrites plot_cf_df from the previous plot)
plot_cf_df = excluding_outliers_df.groupby(['year', 'name'], as_index=False).agg({'content': 'count'})

for name in excluding_outliers:
    friend_rows = plot_cf_df[plot_cf_df.name == name]
    ax.plot(friend_rows.year, friend_rows.content, label=name)

plt.title("messages received from close friends excluding outliers", loc='center', fontsize=14, fontweight=0, color='black')
ax.set_xlabel("year")
ax.set_ylabel("number of messages")
ax.legend(loc='best')

In [29]:
# analyzing how my language changes over time
# word clouds!

In [None]:
# stopwords
# words excluded when looking for the most commonly used words

from collections import Counter

nltk.download('stopwords')

# chat filler words removed in addition to nltk's english stopword list
extra_stopwords = ['ur', 'u', 'like', 'ok', 'im', 'yea', 'â', 'dont', 'oh', 'yeah', 'idk', 'also', 'thats', 'i', 'and', 'the', 'a', 'but', 'so', 'then', 'bc', 'cuz']
stopwords = nltk.corpus.stopwords.words('english') + extra_stopwords


In [16]:
# one frame of my messages per year, 2012 through 2020 (used for the word clouds)

(
    my_msgs_2012,
    my_msgs_2013,
    my_msgs_2014,
    my_msgs_2015,
    my_msgs_2016,
    my_msgs_2017,
    my_msgs_2018,
    my_msgs_2019,
    my_msgs_2020,
) = (my_msgs_df[my_msgs_df['year'] == year] for year in range(2012, 2021))

In [18]:
# generate all words in a list

def generate_words_list(df, stop_words=None):
    """Return every non-stopword word of df['content'] as one flat list.

    Each message is lower-cased and split on whitespace; words are kept in
    message order. Entries of 'content' that are not text (e.g. NaN) are
    skipped.

    Parameters
    ----------
    df : DataFrame with a 'content' column of message texts.
    stop_words : optional iterable of words to drop. Defaults to the
        notebook-level `stopwords` list.

    Returns
    -------
    list of str

    Tip: collections.Counter(result).most_common(15) shows the most popular
    words and their counts.
    """
    if stop_words is None:
        stop_words = stopwords
    # set membership is a hash lookup; the list lookup scanned every stopword per word
    excluded = set(stop_words)

    # str.split() yields a list per message, or NaN where content was not a string
    words_per_msg = df['content'].str.lower().str.split()

    return [
        word
        for msg_words in words_per_msg
        if isinstance(msg_words, list)
        for word in msg_words
        if word not in excluded
    ]

# generate wordcloud
def generate_wordcloud(df, title):
    """Draw a word cloud of the non-stopword words in df['content'].

    Parameters
    ----------
    df : DataFrame with a 'content' column of message texts.
    title : str shown above the image.

    The figure is displayed with plt.show(); nothing is returned.
    """
    all_words_list = generate_words_list(df)

    # Join the words with spaces. str(all_words_list) would hand WordCloud the
    # list's repr, i.e. the words plus brackets, quotes and commas.
    words_text = ' '.join(all_words_list)

    wordcloud = WordCloud(
        width = 1500,
        height = 1000,
        background_color = 'black',
        stopwords = STOPWORDS).generate(words_text)

    plt.figure(
        figsize = (40, 30),
        facecolor = 'k',
        edgecolor = 'k')
    plt.imshow(wordcloud, interpolation = 'bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.title(title, loc='center', fontsize=80, color='white')
    plt.show()


In [None]:
# word cloud built from my_msgs_2020; the second argument is the figure title
generate_wordcloud(my_msgs_2020, "Most Common Words I Used in 2020")

In [None]:
# word cloud built from my_msgs_2019; the second argument is the figure title
generate_wordcloud(my_msgs_2019, "Most Common Words I Used in 2019")

In [None]:
# word cloud built from my_msgs_2018; the second argument is the figure title
generate_wordcloud(my_msgs_2018, "Most Common Words I Used in 2018")

In [None]:
# word cloud built from my_msgs_2017; the second argument is the figure title
generate_wordcloud(my_msgs_2017, "Most Common Words I Used in 2017")

In [None]:
# word cloud built from my_msgs_2016; the second argument is the figure title
generate_wordcloud(my_msgs_2016, "Most Common Words I Used in 2016")

In [None]:
# word cloud built from my_msgs_2015; the second argument is the figure title
generate_wordcloud(my_msgs_2015, "Most Common Words I Used in 2015")

In [None]:
# word cloud built from my_msgs_2014; the second argument is the figure title
generate_wordcloud(my_msgs_2014, "Most Common Words I Used in 2014")

In [None]:
# word cloud built from my_msgs_2013; the second argument is the figure title
generate_wordcloud(my_msgs_2013, "Most Common Words I Used in 2013")

In [None]:
# word cloud built from my_msgs_2012; the second argument is the figure title
generate_wordcloud(my_msgs_2012, "Most Common Words I Used in 2012")

In [None]:
# sentiment analysis for all my messages over the years
from textblob import TextBlob

def find_sentiment_analysis(df):
    """Return the mean TextBlob polarity of df['content'], scaled by 100.

    Every message is lower-cased before it is scored.

    Parameters
    ----------
    df : DataFrame with a 'content' column of message texts.

    Returns
    -------
    float : mean polarity * 100, or NaN when df holds no messages (the
        original divided by zero in that case).
    """
    sentiment = 0.0
    num_msgs = 0
    for text in df.content.str.lower():
        sentiment += TextBlob(text).sentiment.polarity
        num_msgs += 1

    # a year without messages has no average; NaN plots as a gap instead of crashing
    if num_msgs == 0:
        return float('nan')
    return sentiment / num_msgs * 100.0

# sentiment score of my messages for each year, in chronological order
sentiment_analysis = [
    find_sentiment_analysis(year_df)
    for year_df in (
        my_msgs_2012,
        my_msgs_2013,
        my_msgs_2014,
        my_msgs_2015,
        my_msgs_2016,
        my_msgs_2017,
        my_msgs_2018,
        my_msgs_2019,
        my_msgs_2020,
    )
]

# index the scores by year (2012..2020) and draw them as a line
my_sentiments_df = pd.DataFrame({'sentiment': sentiment_analysis}, index=list(range(2012, 2021)))
my_sentiments_df.plot.line()

In [None]:
# finding what kind of regression to use to predict my future 2020 messaging trends with my friend

path_to_user = 'messages/inbox/acbubba_wggssp40wq/'
json_files = os.path.join(path_to_user, '*.json')
files = glob.glob(json_files)

# read in all jsons in a person's message folder.
# DataFrame.append was removed in pandas 2.0, so gather one frame per file
# and concatenate them once.
ac_frames = []
for file_url in files:
    with open(file_url, encoding='utf-8') as json_data:
        data = json.load(json_data)
    ac_frames.append(json_normalize(data, 'messages'))

# pd.concat raises on an empty list; keep the empty-frame result when nothing matched
ac_df = pd.concat(ac_frames, ignore_index=True) if ac_frames else pd.DataFrame()

# drop nans
ac_df = ac_df.dropna(subset=['content'])

# convert the millisecond timestamp to a datetime (datetime.fromtimestamp
# returns local time); mapping over the one column avoids a row-wise apply
ac_df['datetime'] = pd.to_datetime(
    ac_df['timestamp_ms'].map(
        lambda ms: datetime.datetime.fromtimestamp(int(ms) * 0.001)
    )
)

# separate date and time from datetime
ac_df['date'] = ac_df['datetime'].dt.date
ac_df['month'] = ac_df['datetime'].dt.month
ac_df['year'] = ac_df['datetime'].dt.year
ac_df['time'] = ac_df['datetime'].dt.time

# select only certain columns
ac_df = ac_df[['sender_name', 'date', 'month', 'year', 'time', 'content', 'reactions', 'datetime']]

# proper year month
def convert_month_str(month):
    """Return `month` as text, prefixed with '0' when it is below 10 (3 -> '03')."""
    padding = '0' if month < 10 else ''
    return padding + str(month)
# zero-padded month text, then a 'YYYY-MM' key built from year and month
ac_df['month'] = ac_df['month'].map(convert_month_str)
year_text = ac_df['year'].astype(str)
month_text = ac_df['month'].astype(str)
ac_df['year-month'] = year_text + '-' + month_text

# rename column sender_name to name, then order the messages chronologically
ac_df = (
    ac_df
    .rename(columns={'sender_name': 'name'})
    .sort_values(by=['datetime'])
)


In [None]:
# preview the first 6 rows of ac_df
ac_df.head(6)

In [None]:
# plot number of messages sent & received to/from ac over time
fig, ax = plt.subplots()

# i sent to ac / ac sent to me:
ac_to_me_df = ac_df[ac_df['name'] != 'Grace Jiang']
me_to_ac_df = ac_df[ac_df['name'] == 'Grace Jiang']

# total number of messages sent & received between us, per month
plot_ac_df = ac_df.groupby(['year-month'], as_index=False).agg({'content': 'count'})

# number of msgs i sent, per month
plot_sent_msgs_df = me_to_ac_df.groupby(['year-month'], as_index=False).agg({'content': 'count'})

# number of msgs i received, per month
plot_received_msgs_df = ac_to_me_df.groupby(['year-month'], as_index=False).agg({'content': 'count'})

# plotting: each series is drawn exactly once. The original drew the total a
# second time without a label, which added an extra line with no legend entry.
ax.plot(plot_ac_df['year-month'], plot_ac_df['content'], label='total messages exchanged')
ax.plot(plot_sent_msgs_df['year-month'], plot_sent_msgs_df['content'], label='messages i sent')
ax.plot(plot_received_msgs_df['year-month'], plot_received_msgs_df['content'], label='messages i received')

plt.title("messages sent & received to/from ac over time", loc='center', fontsize=14, fontweight=0, color='black')
ax.set_xlabel("year and month")
ax.set_ylabel("number of messages")
plt.xticks(rotation=90)
ax.legend(loc='best')

In [None]:
# nlp sentiment analysis for each person (how positive they are)

# messages ac sent me: lower-cased texts, summed polarity, message count
ac_phrases = ac_to_me_df['content'].str.lower().tolist()
ac_sentiment = 0.0
for phrase in ac_phrases:
    ac_sentiment += TextBlob(phrase).sentiment.polarity
ac_msgs = float(len(ac_phrases))

print("AC Sentiment Analysis Ratio: ")
print(ac_sentiment / ac_msgs * 100.0)

# messages i sent ac: same three quantities
grac_phrases = me_to_ac_df['content'].str.lower().tolist()
grac_sentiment = 0.0
for phrase in grac_phrases:
    grac_sentiment += TextBlob(phrase).sentiment.polarity
grac_msgs = float(len(grac_phrases))

print("Grac Sentiment Analysis Ratio: ")
print(grac_sentiment / grac_msgs * 100.0)



In [None]:
# linear regression: simple

from sklearn.linear_model import LinearRegression

# number of messages per (year-month, sender) pair
new_lr_ac_df = (
    ac_df
    .groupby(['year-month', 'name'], as_index=False)
    .agg(content=('content', 'count'))
)

def year_in_num(year_month):
    """Return the number of months between a 'YYYY-MM' string and 2019-07.

    The year is read from the first four characters and the month from the
    sixth character onward; 2019-07 maps to 0, earlier months are negative.
    """
    year_text, month_text = year_month[:4], year_month[5:]
    month_index = 12 * int(year_text) + int(month_text)
    baseline_index = 12 * 2019 + 7
    return month_index - baseline_index

# .assign() builds a new frame, so the 'time' column is never written into a
# filtered slice of new_lr_ac_df (the original assignment triggered
# SettingWithCopyWarning).
ac_lr_df = new_lr_ac_df[new_lr_ac_df['name'] == 'AC Bubba'].assign(
    time=lambda frame: frame['year-month'].apply(year_in_num)
)
ac_lr_df = ac_lr_df.dropna(subset=['year-month'])

grac_lr_df = new_lr_ac_df[new_lr_ac_df['name'] == 'Grace Jiang'].assign(
    time=lambda frame: frame['year-month'].apply(year_in_num)
)
grac_lr_df = grac_lr_df.dropna(subset=['year-month'])

# sklearn expects 2-D inputs: one column holding the month offset / message count
ac_X = ac_lr_df['time'].values.reshape(-1, 1)
ac_Y = ac_lr_df['content'].values.reshape(-1, 1)

# straight-line fit of monthly message count against the month offset
lr = LinearRegression()
lr.fit(ac_X, ac_Y)
ac_Y_pred = lr.predict(ac_X)

plt.scatter(ac_X, ac_Y)
plt.title("AC's predicted monthly messages", loc='center', fontsize=14, fontweight=0, color='black')
plt.plot(ac_X, ac_Y_pred, color='red')
plt.show()

grac_X = grac_lr_df['time'].values.reshape(-1, 1)
grac_Y = grac_lr_df['content'].values.reshape(-1, 1)

# a fresh model for my own messages; `lr` now refers to this second fit
lr = LinearRegression()
lr.fit(grac_X, grac_Y)
grac_Y_pred = lr.predict(grac_X)

plt.scatter(grac_X, grac_Y)
plt.title("Grace's predicted monthly messages", loc='center', fontsize=14, fontweight=0, color='black')
plt.plot(grac_X, grac_Y_pred, color='blue')
plt.show()


In [None]:
# linear regression: more modelling

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import numpy as np
from sklearn.decomposition import PCA
import sklearn
from matplotlib import pyplot as plt 

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# monthly message count per sender, with year and month as numeric features
lr_ac_df = ac_df.groupby(['year-month', 'name'], as_index=False).agg({'content': 'count'})
lr_ac_df['name'] = lr_ac_df['name'].astype('category')
lr_ac_df['year'] = lr_ac_df['year-month'].apply(lambda x: x[:4]).astype(int)
lr_ac_df['month'] = lr_ac_df['year-month'].apply(lambda x: x[5:]).astype(int)

lr_ac_df = lr_ac_df.drop(columns=['year-month'])

# one indicator column per sender name
lr_ac_df = pd.get_dummies(lr_ac_df, columns=['name'])

label = lr_ac_df['content']
features = lr_ac_df.drop(columns=['content'])

# fixed random_state so the train/test split, and every score derived from
# it, is the same on each run
x_train, x_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=1000)

lin_regressor = LinearRegression()
lin_regressor.fit(x_train, y_train)
y_pred = lin_regressor.predict(x_test)

# last expression of the cell, so the test MSE is displayed rather than discarded
mse_test = mean_squared_error(y_test, y_pred)
mse_test


In [None]:
# Dimensionality reduction with PCA
# explicit import: `import sklearn` alone does not guarantee that the
# sklearn.preprocessing submodule is loaded
from sklearn.preprocessing import StandardScaler

x_df = pd.DataFrame(x_train)

# exploratory PCA on the standardized training features
pca = PCA()
to_train_pca = StandardScaler().fit_transform(x_df)
trained_pca = pca.fit_transform(to_train_pca)

plt.plot(trained_pca)
plt.show()

# cumulative share of variance explained by the first k components
evr = pca.explained_variance_ratio_
components = pca.components_
ratio_plot = np.cumsum(evr)
plt.plot(ratio_plot)

# NOTE(review): n_components=4 presumably equals the number of feature
# columns (year, month, one indicator per sender) - confirm; if so this
# rotates the data without reducing it.
new_pca = PCA(n_components=4)
x_train = new_pca.fit_transform(x_train)

rfr = RandomForestRegressor(random_state=4)
parameters = {
    'max_depth': [2, 4], 'n_estimators': [1, 2, 3]
}

grid_search = GridSearchCV(estimator=rfr, param_grid=parameters)

grid_search.fit(x_train, y_train)

# transform(), not fit_transform(): the test rows must be projected with the
# components learned from the training rows. Re-fitting on the test set gives
# the model inputs in a different basis than the one it was trained on.
x_test = new_pca.transform(x_test)
y_pred = grid_search.best_estimator_.predict(x_test)

# root mean squared error on the held-out rows
print(np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [None]:
# machine learning to categorize who sent the message based off language analysis


# lower-cased message texts: everything ac sent me first, then everything i sent ac
data = [
    *ac_to_me_df['content'].str.lower(),
    *me_to_ac_df['content'].str.lower(),
]

# one label per text, in the same order as `data`
data_labels = ['ac'] * len(ac_to_me_df) + ['grac'] * len(me_to_ac_df)

In [None]:
# bag-of-words features for the sender classifier, plus an 80/20 train/test split

# lowercase=False: the vectorizer is told not to lower-case the texts itself
vectorizer = CountVectorizer(analyzer='word', lowercase=False)

features = vectorizer.fit_transform(data)

# dense array for easy usage
# NOTE(review): one row per message and one column per vocabulary word, so
# the dense copy can be very large - confirm it fits in memory
features_nd = features.toarray()

X_train, X_test, y_train, y_test = train_test_split(
    features_nd,
    data_labels,
    train_size=0.80,
    random_state=1000,
)

In [None]:
# building linear classifier

# fit() returns the fitted estimator, so construction and training chain into one step
log_model = LogisticRegression().fit(X_train, y_train)
y_pred = log_model.predict(X_test)

# share of held-out messages whose sender was predicted correctly
accuracy_score(y_test, y_pred)