In [65]:
from relevance import load_lda_model
import pandas, numpy

In [70]:
lda, feature_sets = load_lda_model()
# Transpose so that every topic becomes one column ("topic0", "topic1", ...)
# and each row holds one entry from every topic's list.
df = pandas.DataFrame(
    data=numpy.array(lda.topics).T,
    columns=["topic{}".format(idx) for idx in range(len(lda.topics))],
)
df

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
0,"be rare,",go navy!,close on,cheer,md,"@navyfederal, i'm",country,boycott,union in,@navyfederal
1,out bit.ly/2nlpavi,@navyfederal go,hell be,cheer loud,union in,"hey @navyfederal,",@mflynnjr,on hannity,@navyfederal credit,@navyfederalhelp
2,struggle to,go,night,i'm cheer,@navyfederal credit,"@navyfederal,",resource,#firehannity,credit union,get
3,@huntington_bank @usaa,where's my,"in odenton,",loud and,at @navyfederal,hey,@trumpgirlstrong,@firehannity,i'm at,go
4,bit.ly/2nlpavi,where's,"odenton,",and proud,i'm at,rooting,their head,hannity,union,deposit
5,marketer,irish,"odenton, md",proud for,credit union,i'm rooting,they should,because they,at @navyfederal,bank
6,bit.ly/2nlpavi @forrester,@canada,online banking?,loud,union,rooting for,head to,@firehannity #firehannity,credit,app
7,"rare, financial",a proud,have issues.,proud,credit,cards.twitter.com/cards/2egq5x/2…,@basedmonitored,firehannity.org,va,account
8,@huntington_bank,@usaa @navyfederalhelp,@thaisaidit @navyfederal,@navyfederal i'm,"rockville,",i'm,@basedmonitored @rambobiggs,boycott @navyfederal,"norfolk,",@usaa
9,marketer struggle,run me,@thaisaidit,midshipmen!,"rockville, md",navy!,@vfl2013 @jamss3468,#firehannity firehannity.org,"in norfolk,",??


Note that some of the above topics represent very standardized social media posts that we can easily filter out with our LDA topic scores:

- topic 1: Go Navy!
- topic 3: Loud and Proud
- topic 4: I'm at NFCU in Rockville, MD
- topic 5: Hey NFCU, I'm Rooting For...
- topic 7: #FireHannity
- topic 8: I'm at NFCU in Norfolk, VA

Let's find topic-score thresholds that will let us filter those posts out.

In [21]:
def analyze_topic(i, thresholds, features=None, topics=None):
    """Print how many documents reach each weight threshold for topic ``i``.

    Args:
        i: Index of the topic to inspect.
        thresholds: Iterable of weight cut-offs; one count line is printed
            per cut-off, in the order given.
        features: Optional sequence of documents where ``doc[1][i]`` is the
            document's weight for topic ``i``. Defaults to the notebook-level
            ``feature_sets`` so existing calls keep working.
        topics: Optional sequence whose ``i``-th entry is printed as the
            topic's most common tokens. Defaults to ``lda.topics``.

    Returns:
        None. All results are printed.
    """
    # Fall back to the notebook globals only when nothing is passed in, so the
    # function can also be run against explicit data.
    if features is None:
        features = feature_sets
    if topics is None:
        topics = lda.topics

    print("Analyzing topic " + str(i) +"...")
    print("Most common tokens:", topics[i])
    print("Total documents: " + str(len(features)))

    # Pull the topic weight out of every document once instead of once per
    # threshold.
    weights = [doc[1][i] for doc in features]
    for t in thresholds:
        matching = sum(1 for w in weights if w >= t)
        print("Number of documents whose topic " + str(i) + " weight is >="+str(t)+" is " + str(matching))
        
def filter_features(topic_idx, lower_bound, upper_bound=1, desc=False, features=None):
    """Return the documents whose weight for one topic lies within a range.

    Args:
        topic_idx: Index of the topic whose weight is tested.
        lower_bound: Smallest weight to keep (inclusive).
        upper_bound: Largest weight to keep (inclusive). Defaults to 1.
        desc: When True, sort by weight from highest to lowest; otherwise
            from lowest to highest.
        features: Optional sequence of documents where ``doc[1][topic_idx]``
            is the document's weight for the topic. Defaults to the
            notebook-level ``feature_sets`` so existing calls keep working.

    Returns:
        A new list of the matching documents, sorted by their weight for
        ``topic_idx``. The input sequence is not modified.
    """
    # Fall back to the notebook global only when no data is passed in.
    if features is None:
        features = feature_sets

    selected = [
        doc for doc in features
        if lower_bound <= doc[1][topic_idx] <= upper_bound
    ]
    selected.sort(key=lambda doc: doc[1][topic_idx], reverse=desc)
    return selected

In [33]:
# Count the documents at or above each candidate cut-off for topic 1.
analyze_topic(i=1, thresholds=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95])

Analyzing topic 1...
Most common tokens: ['go navy!', '@navyfederal go', 'go', "where's my", "where's", 'irish', '@canada', 'a proud', '@usaa @navyfederalhelp', 'run me']
Total documents: 22540
Number of documents whose topic 1 weight is >=0.1 is 1427
Number of documents whose topic 1 weight is >=0.2 is 1296
Number of documents whose topic 1 weight is >=0.3 is 1096
Number of documents whose topic 1 weight is >=0.4 is 638
Number of documents whose topic 1 weight is >=0.5 is 233
Number of documents whose topic 1 weight is >=0.6 is 57
Number of documents whose topic 1 weight is >=0.7 is 12
Number of documents whose topic 1 weight is >=0.8 is 2
Number of documents whose topic 1 weight is >=0.9 is 0
Number of documents whose topic 1 weight is >=0.95 is 0


In [35]:
# Ascending sort (desc=False) puts the documents nearest the 0.65 cut-off
# first, so the slice below shows the borderline cases.
f = filter_features(topic_idx=1, lower_bound=0.65, desc=False)
f[:10]

[(['@vwcares',
   '@cardinalewayvol',
   'non_alpha_numeric',
   'my',
   'new',
   'rep',
   'says',
   'hes',
   'off',
   'tomorrow',
   'so',
   'maybe',
   'weds-im',
   'done',
   'waiting!!',
   'wth?rate',
   'with',
   '',
   '@navyfederal',
   'may',
   'go',
   'up!',
   'twitter.com/elicitations/s…'],
  array([0.01418264, 0.65208469, 0.0141837 , 0.01418295, 0.01418297,
         0.01418276, 0.01418275, 0.01418309, 0.01418912, 0.23444533])),
 (['@navyfederal',
   'hello.',
   'our',
   'firm',
   'in',
   'korea',
   '(phone',
   '070-****-',
   '8495)',
   'receives',
   'calls',
   'for',
   'navy',
   'federal',
   'everyday.',
   'you',
   'have',
   'the',
   'wrong',
   'non_alpha_numeric',
   'posted.',
   'next?'],
  array([0.0143914 , 0.65895798, 0.01439635, 0.01439336, 0.01439567,
         0.0143932 , 0.01439946, 0.01439392, 0.01439478, 0.22588388])),
 (['@seansean252',
   '@brandondonkey',
   '@sayachi2010',
   '??',
   "i'm",
   'so',
   'fussy',
   'i',
   "can't

Topic 1 seems to be a group that includes themes of nationalism/patriotism/military, and it seems to be infrequent enough that it isn't worth breaking down until we see whether another iteration, with some of the noise removed, fixes this topic.

In [27]:
# Count the documents at or above each candidate cut-off for topic 3.
analyze_topic(i=3, thresholds=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95])

Analyzing topic 3...
Most common tokens: ['cheer', 'cheer loud', "i'm cheer", 'loud and', 'and proud', 'proud for', 'loud', 'proud', "@navyfederal i'm", 'midshipmen!']
Total documents: 22540
Number of documents whose topic 3 weight is >=0.1 is 4694
Number of documents whose topic 3 weight is >=0.2 is 4562
Number of documents whose topic 3 weight is >=0.3 is 4245
Number of documents whose topic 3 weight is >=0.4 is 3789
Number of documents whose topic 3 weight is >=0.5 is 3420
Number of documents whose topic 3 weight is >=0.6 is 3266
Number of documents whose topic 3 weight is >=0.7 is 3231
Number of documents whose topic 3 weight is >=0.8 is 3222
Number of documents whose topic 3 weight is >=0.9 is 0
Number of documents whose topic 3 weight is >=0.95 is 0


In [32]:
# Ascending sort (desc=False) puts the documents nearest the 0.7 cut-off
# first, so the slice below shows the borderline cases.
f = filter_features(topic_idx=3, lower_bound=0.7, desc=False)
f[:10]

[(['#goirish',
   '@navyfederal',
   "i'm",
   'cheering',
   'loud',
   'and',
   'proud',
   'for',
   'notre',
   'dame!',
   '#goirish'],
  array([0.0205337 , 0.0205337 , 0.0205337 , 0.70208431, 0.02053371,
         0.02053428, 0.13364532, 0.02053371, 0.02053372, 0.02053384])),
 (['#goirish.',
   '@navyfederal',
   "i'm",
   'cheering',
   'loud',
   'and',
   'proud',
   'for',
   'notre',
   'dame!',
   '#goirish'],
  array([0.12610558, 0.02020274, 0.02020274, 0.7122718 , 0.02020282,
         0.02020325, 0.02020274, 0.02020274, 0.02020275, 0.02020286])),
 (['sorry',
   '@navyfederal',
   "i'm",
   'cheering',
   'loud',
   'and',
   'proud',
   'for',
   'notre',
   'dame!',
   '#goirish'],
  array([0.01956947, 0.01956853, 0.01956853, 0.72162047, 0.12182067,
         0.01956902, 0.01956928, 0.01956853, 0.01956854, 0.01957696])),
 (['i',
   'love',
   'sea',
   'food.',
   '@navyfederal',
   "i'm",
   'cheering',
   'loud',
   'and',
   'proud',
   'for',
   'the',
   'midshipmen!

We can follow this example and assume any post with a topic 3 score of >=0.7 is a loud & proud campaign post.

In [34]:
# Count the documents at or above each candidate cut-off for topic 4.
analyze_topic(i=4, thresholds=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95])

Analyzing topic 4...
Most common tokens: ['md', 'union in', '@navyfederal credit', 'at @navyfederal', "i'm at", 'credit union', 'union', 'credit', 'rockville,', 'rockville, md']
Total documents: 22540
Number of documents whose topic 4 weight is >=0.1 is 1618
Number of documents whose topic 4 weight is >=0.2 is 1472
Number of documents whose topic 4 weight is >=0.3 is 1231
Number of documents whose topic 4 weight is >=0.4 is 765
Number of documents whose topic 4 weight is >=0.5 is 304
Number of documents whose topic 4 weight is >=0.6 is 113
Number of documents whose topic 4 weight is >=0.7 is 26
Number of documents whose topic 4 weight is >=0.8 is 22
Number of documents whose topic 4 weight is >=0.9 is 0
Number of documents whose topic 4 weight is >=0.95 is 0


In [38]:
# Ascending sort (desc=False) puts the documents nearest the 0.6 cut-off
# first, so the slice below shows the borderline cases.
f = filter_features(topic_idx=4, lower_bound=0.6, desc=False)
f[:10]

[(['bad',
   'decision',
   '@usaa',
   '',
   "i'll",
   'take',
   'my',
   'free',
   'speech',
   'and',
   'banking,',
   'insurance',
   'non_alpha_numeric',
   'investment',
   'needs',
   'elsewhere.',
   'way',
   '2',
   'cave',
   'to',
   'pressure',
   'hello',
   '@navyfederal'],
  array([0.0141962 , 0.01419612, 0.01420207, 0.01419701, 0.60025164,
         0.01419646, 0.01419858, 0.01419648, 0.01420094, 0.2861645 ])),
 (['@navyfederal',
   'applause',
   'to',
   'valerie',
   '(supervisor)',
   'at',
   'navyfed',
   'for',
   'providing',
   'great',
   'customer',
   'service.'],
  array([0.01838301, 0.01838212, 0.01838212, 0.01838619, 0.60155297,
         0.01838512, 0.01838213, 0.01838213, 0.01838235, 0.25138185])),
 (['@navyfederal',
   'joe',
   "y'all",
   'owe',
   'me',
   '$50',
   "i'm",
   'tryna',
   'see',
   'where',
   "it's",
   'at',
   'b,',
   'i',
   'need',
   'that.'],
  array([0.01620935, 0.01621122, 0.0162166 , 0.01621181, 0.60183462,
         0.