In [1]:
from datetime import datetime
import os
import sys
dirname = os.path.dirname('__file__')

sys.path.append(os.path.join(dirname, "pkg"))
sys.path.append(os.path.join(dirname, "pkg", "preproc"))
from pkg.preproc import base

#### Objectives

1. Identify number of potential speakers (clustering)
    1.1 Use Factors: Readability (F-K), doc_length, n_mentions, n_hashtags, time_of_day??, avg_word_size, avg_num_syllables
2. Attempt to identify Trump (with 'known' data)

In [2]:
from pkg.mod import cluster
import numpy as np
from collections import Counter
from pandas import DataFrame
fields = ["n_hashtags", "n_mentions", "avg_syllables",
          "avg_word_length", "fk", 'n_sents',
          "n_ents", "n_uppers", "platform_id",'amplifier',
          'analneg', 'attribadj', 'auxdo', 'bemv',
          'bracket', 'caps', 'cconj', 'cntrstconj',
          'colon', 'comma', 'defart', 'detquan',
          'exclam', 'fstpp', 'fulstop', 'gerund',
          'havemv', 'imperative', 'indefart',
          'infinitive', 'it', 'mdnec',
          'mdposs', 'mdpred', 'multiwvb',
          'nomin', 'numdet', 'numnoun', 'objpro',
          'otheradv', 'othrintj', 'othrnoun',
          'othrverb', 'passive', 'past', 'perceptvb',
          'perfect', 'posesprpn', 'possdet', 'predadj',
          'prep', 'procontract', 'progressive',
          'proquan', 'provdo', 'prpn', 'prvv', 'pubv',
          'ques', 'relclausesubgap', 'sinflect',
          'sndpp', 'stancevb', 'subjpro', 'superlative',
          'thrdpp', 'timeadv', 'whw', 'initialmention']
dat = base.Data()
prep = base.Preprocessor()
dat.get_data(load_local=True)

prep.tokenize(data=dat.data)

ids_ = [d.ID for d in prep.docs]
arrs = [d.feature_array for d in prep.docs]



100%|██████████| 17433/17433 [06:24<00:00, 45.32it/s]


Length Error: 
4746) Iran humiliated the United States with the capture of our 10 sailors. Horrible pictures and images. We are weak. I will NOT forget!
Length Error: 
4748) Iran humiliated the United States with the capture of our 10 sailors. Horrible pictures and images. We are weak. I will NOT forget!


In [3]:
import time
df = DataFrame(arrs, index=ids_, columns=fields)
tod_map = {}
dates, times, tokens = [], [], []
platforms = []
tstamps = []
for d in prep.docs:
    datefmt = datetime.strptime(d.local_date, "%Y-%m-%d")
    dates.append(datefmt)
    tstamps.append(datefmt.timestamp())
    tm = d.local_time
    times.append(time.mktime(tm))
    platforms.append(d.platform)
    tokens.append(",".join(d.get_tokens_merged(lowercase=False)))
df.insert(0, "tokens", tokens)
df.insert(0, 'date', dates)
df.insert(0, 'time', times)
df.insert(0, 'platform', platforms)
df.insert(0, 'date_ts', tstamps)

TypeError: Tuple or struct_time argument required

In [None]:
from pkg.utils import make_tokens_list
from tabulate import tabulate

all_tokens = make_tokens_list(df['tokens'])
android_tokens_series = df.loc[[i for i in df.index
                                if df.loc[i]['platform']=='android']]['tokens']
android_tokens = make_tokens_list(android_tokens_series)
iphone_tokens_series = df.loc[[i for i in df.index
                               if df.loc[i]['platform']=='iphone']]['tokens']
iphone_tokens = make_tokens_list(iphone_tokens_series)


words_counter = dict(Counter(all_tokens).most_common())
android_counter = dict(Counter(android_tokens).most_common())
iphone_counter = dict(Counter(iphone_tokens).most_common())


android_words_unique = [(k, android_counter[k]) for k in android_counter.keys()
                        if k not in iphone_counter]

iphone_words_unique = [(kx, iphone_counter[kx]) for kx in iphone_counter.keys()
                       if kx not in android_counter]

sidexside = [(*android_words_unique[ix], *iphone_words_unique[ix]) for ix in range(20)]
combined_table = tabulate(sidexside, headers=("android_token", "android_count", 'iphone_token', 'iphone_count'))

print(combined_table)

### Difference Between IPhone and Android

Its clear just from looking at the most common tokens that
the posts originating from Android are more "incendiary" in terms
of the insulting tone of many keywords. While this is certainly a characteristic
one might associate with Trump it is not enough by itself to support the
contention that Trump wrote all of these himself.

Ironically, it is the tokens from the IPhone group which discuss the "fake news"
in great detail. Though this makes sense if you consider that Trump switched to an
IPhone after taking office. So we expand our condition to include certain times of the day
in 2018 when he was suspected of tweeting and compare the counts again.

In [None]:
import time
from datetime import datetime

trump = df.loc[[ix for ix in df.index if (13 >= time.strptime(df.loc[ix]['time'], "%H:%M:%S").tm_hour >= 10
                                           and df.loc[ix]['date'].year == 2018
                                          and df.loc[ix]['platform']=='iphone')
                or df.loc[ix]['platform'] == 'android']]['tokens']

trump_tokens = make_tokens_list(trump)
not_trump = df.loc[[ix for ix in df.index if (time.strptime(df.loc[ix]['time'], "%H:%M:%S").tm_hour > 13 or time.strptime(df.loc[ix]['time'], "%H:%M:%S").tm_hour < 10)
                                           and df.loc[ix]['date'].year == 2018
                                          and df.loc[ix]['platform']=='iphone']]['tokens']

not_trump_tokens = make_tokens_list(not_trump)

trump_counter = dict(Counter(trump_tokens).most_common())
not_trump_counter = dict(Counter(not_trump_tokens).most_common())
trump_unique = [(tk, trump_counter[tk]) for tk in trump_counter if tk not in not_trump_counter]
not_trump_unique = [(ntk, not_trump_counter[ntk]) for ntk in not_trump_counter if ntk not in trump_counter]



sidexside2 = [(*trump_unique[ix], *not_trump_unique[ix]) for ix in range(min(len(trump_unique), len(not_trump_unique), 20))]
combined_table2 = tabulate(sidexside2, headers=("trump_token", "trump_count", 'nottrump_token', 'nottrump_count'))

print(combined_table2)


There were several things I noticed from the updated tokens lists:

    (1) The 'likely' Trump tokens are shorter on average.
    (2) The 'likely' Trump tokens are more sentiment oriented.
    (3) The 'unlikely' Trump tokens have more numeric values.


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


### Linguistic Feature Groups

In [None]:
mod0 = cluster(data=arrs, cluster_meth='optics', xi=.00075,
               metric='l2', min_cluster_size=300)

mod0_labs = mod0.labels_
print(len(np.unique(mod0_labs)))
print(dict(Counter(mod0_labs)))


df.insert(len(df.columns), "cluster", mod0_labs)

In [None]:
tod_map = {}
dates, times, tokens = [], [], []


clusts = df['cluster']

outlier_filt = clusts == -1
clust_filt = clusts != -1

df_outliers = df.where(outlier_filt)
df_clusts = df.where(clust_filt)
df_outliers = df_outliers.dropna()
df_clusts = df_clusts.dropna()
grps = df_clusts.groupby(['cluster'])


for grp in grps:
    grpdf = grp[1]
    grptokens = make_tokens_list(grpdf['tokens'])
    grptoken_cnt = dict(Counter(grptokens).most_common(20)).items()
    print(f"\n------------------\nGroup #{int(grp[0])}\n====================\n")
    grptable = tabulate(grptoken_cnt, headers=('token', 'count'))

    print(grptable)


### Combining filters

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from time import strptime
sns.set_theme(style="darkgrid")

pal = sns.color_palette("hls", len(np.unique(mod0_labs))-1)

from matplotlib.ticker import FuncFormatter


#fieldsx = fields.copy()
#fieldsx.append('cluster')
#fig = sns.pairplot(df[fieldsx], hue='cluster')

#fig.savefig('plots/lang_attrs.png')

# maybe_trump_df = df.loc[[idx for idx in df.index
#                          if (df.loc[idx]['platform'] == 'android'
#                              or df.loc[idx]['platform'] == 'web client')
#                          or df.loc[idx]['date'] <=  datetime(2017, 1, 20)
#                          or (df.loc[idx]['date'] > datetime(2017, 1, 20) and
#                              13 > strptime(df.loc[idx]['time'], "%H:%M:%S").tm_hour > 10)
#                          ]]

### Conclusion

Ultimately, it is evident that text features alone cannot serve as a substitute
for observed patterns. However, it is interesting that the text features did segment
the posts into what could be considered rough categories where each group seems to be
centered on a theme. This would seem to indicate that, at least in this case, the linguistic
patterns of posts share features when they are about the topic or related to a theme.

Lastly, I have to give credit to ___ who wrote the article identifying the conditions
of Trumps tweets. I find there is a significant degree of merit in their observations.

In [None]:
#pal = ['green','orange','dodgerblue']

df_clusts['cluster'] = [int(x) for x in df_clusts['cluster']]
funcfmt_date = lambda x, pos: datetime.fromtimestamp(x).strftime("%Y-%m")


fig, axes = plt.subplots(2, 1, figsize=(10, 40), sharex=True, sharey=True)
fig.suptitle("Tweet Clusters By Date")
sns.violinplot(ax=axes[0], x='date_ts', y='cluster', orient='h',
               data=df_clusts, palette=pal, inner=None)

sns.scatterplot(ax=axes[1], x='date_ts', y='cluster', data=df_clusts,
                hue='cluster', palette=pal)


axes[0].xaxis.set_major_formatter(funcfmt_date)
axes[1].xaxis.set_major_formatter(funcfmt_date)

plt.show()

In [None]:
df_clusts['time'] = [x for x in df_clusts['time']]
funcfmt_time = lambda x, pos: datetime.fromtimestamp(x).strftime('%H:%M')


fig0, axes0 = plt.subplots(2, 1, figsize=(10, 40), sharex=True, sharey=True)
fig0.suptitle("Tweet Clusters By Time")
sns.violinplot(ax=axes0[0], x='time', y='cluster', orient='h',
               data=df_clusts, palette=pal, inner=None)

sns.scatterplot(ax=axes0[1], x='time', y='cluster', data=df_clusts,
                hue='cluster', palette=pal)
axes0[0].xaxis.set_major_formatter(funcfmt_time)
axes0[1].xaxis.set_major_formatter(funcfmt_time)

plt.show()