In [65]:
from relevance import load_lda_model
import pandas, numpy

In [70]:
# Load the fitted LDA model together with the per-document feature sets.
lda, feature_sets = load_lda_model()

# Lay the topics out side by side: transposing lda.topics gives one column per
# topic, labelled "topic0", "topic1", ...
df = pandas.DataFrame(
    data=numpy.array(lda.topics).T,
    columns=["topic" + str(k) for k, _ in enumerate(lda.topics)],
)
df

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
0,"be rare,",go navy!,close on,cheer,md,"@navyfederal, i'm",country,boycott,union in,@navyfederal
1,out bit.ly/2nlpavi,@navyfederal go,hell be,cheer loud,union in,"hey @navyfederal,",@mflynnjr,on hannity,@navyfederal credit,@navyfederalhelp
2,struggle to,go,night,i'm cheer,@navyfederal credit,"@navyfederal,",resource,#firehannity,credit union,get
3,@huntington_bank @usaa,where's my,"in odenton,",loud and,at @navyfederal,hey,@trumpgirlstrong,@firehannity,i'm at,go
4,bit.ly/2nlpavi,where's,"odenton,",and proud,i'm at,rooting,their head,hannity,union,deposit
5,marketer,irish,"odenton, md",proud for,credit union,i'm rooting,they should,because they,at @navyfederal,bank
6,bit.ly/2nlpavi @forrester,@canada,online banking?,loud,union,rooting for,head to,@firehannity #firehannity,credit,app
7,"rare, financial",a proud,have issues.,proud,credit,cards.twitter.com/cards/2egq5x/2…,@basedmonitored,firehannity.org,va,account
8,@huntington_bank,@usaa @navyfederalhelp,@thaisaidit @navyfederal,@navyfederal i'm,"rockville,",i'm,@basedmonitored @rambobiggs,boycott @navyfederal,"norfolk,",@usaa
9,marketer struggle,run me,@thaisaidit,midshipmen!,"rockville, md",navy!,@vfl2013 @jamss3468,#firehannity firehannity.org,"in norfolk,",??


Note that some of the above topics represent very standardized social media posts that we can easily filter out with our LDA topic scores:

- topic 1: Go Navy!
- topic 3: Loud and Proud
- topic 4: I'm at NFCU in Rockville, MD
- topic 5: Hey NFCU, I'm Rooting For...
- topic 7: #FireHannity
- topic 8: I'm at NFCU in Norfolk, VA

Let's find topic-weight thresholds that will let us filter those posts out.

In [7]:
def analyze_topic(i, thresholds):
    """Print a short report on how strongly documents load on topic ``i``.

    Reads the notebook-level globals ``lda`` and ``feature_sets``.

    Args:
        i: Index of the topic; used to index ``lda.topics`` and the second
            element of every ``feature_sets`` entry.
        thresholds: Iterable of cut-off values. For each one, the number of
            entries whose topic-``i`` weight is at least that value is printed.

    Returns:
        None. The report is written to stdout.
    """
    label = str(i)
    print(f"Analyzing topic {label}...")
    print("Most common tokens:", lda.topics[i])
    print(f"Total documents: {len(feature_sets)!s}")
    for cutoff in thresholds:
        # Count the entries whose weight for this topic reaches the cut-off.
        hits = sum(1 for doc in feature_sets if doc[1][i] >= cutoff)
        print(f"Number of documents whose topic {label} weight is >={cutoff!s} is {hits!s}")
        
def filter_features(topic_idx, lower_bound, upper_bound=1, descending=False):
    """Select the entries of ``feature_sets`` whose topic weight is in a range.

    Reads the notebook-level global ``feature_sets``; the global list itself is
    not modified.

    Args:
        topic_idx: Index into the second element of each entry, i.e. which
            topic's weight to look at.
        lower_bound: Smallest weight to keep (inclusive).
        upper_bound: Largest weight to keep (inclusive). Defaults to 1.
        descending: If True, order results from highest to lowest weight;
            otherwise lowest to highest.

    Returns:
        A new list of the matching entries, ordered by their ``topic_idx``
        weight.
    """
    def weight_of(entry):
        return entry[1][topic_idx]

    kept = []
    for entry in feature_sets:
        weight = weight_of(entry)
        if weight >= lower_bound and weight <= upper_bound:
            kept.append(entry)
    return sorted(kept, key=weight_of, reverse=descending)

In [13]:
# Count how many documents clear each candidate cut-off for topic 8.
analyze_topic(8, thresholds=[0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95])

Analyzing topic 8...
Most common tokens: ['union in', '@navyfederal credit', 'credit union', "i'm at", 'union', 'at @navyfederal', 'credit', 'va', 'norfolk,', 'in norfolk,']
Total documents: 22540
Number of documents whose topic 8 weight is >=0.3 is 957
Number of documents whose topic 8 weight is >=0.4 is 621
Number of documents whose topic 8 weight is >=0.5 is 294
Number of documents whose topic 8 weight is >=0.6 is 134
Number of documents whose topic 8 weight is >=0.7 is 41
Number of documents whose topic 8 weight is >=0.8 is 37
Number of documents whose topic 8 weight is >=0.9 is 0
Number of documents whose topic 8 weight is >=0.95 is 0


In [12]:
# Documents whose topic 0 weight is at least 0.7, ordered by ascending weight.
f = filter_features(topic_idx=0, lower_bound=0.7)
f

[(['navy',
   'federal',
   'will\xa0cover\xa0pay',
   'for\xa0its',
   'eligible',
   'members\xa0during',
   'partial-federal',
   'government',
   'shutdown\xa0via',
   'interest-free',
   'loans.',
   'once',
   'direct',
   'deposit',
   'of\xa0pay',
   'resumes,',
   'the',
   'loan',
   'amount\xa0will\xa0be',
   'automatically',
   'deducted',
   'by\xa0#navyfederal\xa0as',
   'repayment.\r\nthank',
   'you',
   '@navyfederal\r\n#bordersecurity',
   '@potus'],
  array([0.71325418, 0.01269978, 0.01269981, 0.01270013, 0.01269978,
         0.01270012, 0.01269978, 0.01270225, 0.01269978, 0.18514438])),
 (['@brianbeutler',
   'between',
   'this,',
   'not',
   'disclosing',
   'his',
   'conflicts',
   'w/cohen',
   'as',
   'his',
   'lawyer',
   'while',
   'interviewing',
   'him,',
   'causing',
   'harm',
   'to',
   'seth',
   "rich's",
   'family',
   'while',
   'they',
   'mourned,',
   'his',
   'nightly',
   'pillow',
   'talks',
   'coordination',
   'w/trump',
   'whil

In [10]:
# Show at most the first 100 entries of the current filter result `f`.
# NOTE(review): this cell's execution count (10) precedes the cell that binds
# `f` (12), so the stored output may come from a different `f` - confirm by
# re-running the notebook top to bottom.
f[0:100]

[(['@navyfederal',
   'navy',
   'federal',
   'has',
   'the',
   'worst',
   'security',
   'pactices.'],
  array([0.0222153 , 0.0222153 , 0.0222153 , 0.02221612, 0.0222153 ,
         0.02221723, 0.0222153 , 0.0222153 , 0.02221531, 0.80005956])),
 (['shoutout',
   'to',
   '@navyfederal',
   'their',
   'customer',
   'service',
   'is',
   'a1'],
  array([0.02217557, 0.02217557, 0.02217557, 0.02217584, 0.02217557,
         0.02217557, 0.02217559, 0.02217557, 0.02217558, 0.80041958])),
 (['@navyfederal', 'has', 'now', 'been', 'down', 'over', 'five', 'hours!'],
  array([0.02211727, 0.02211727, 0.02211727, 0.02211751, 0.02211727,
         0.02211727, 0.02211727, 0.02211727, 0.02211728, 0.80094432])),
 (['@urbangem',
   '@navyfederal',
   '@usaa',
   'i',
   'just',
   'got',
   'navy',
   'federal'],
  array([0.02211145, 0.0221114 , 0.0221114 , 0.02211171, 0.02211141,
         0.02211141, 0.02211144, 0.02211141, 0.02211141, 0.80099696])),
 (['@navyfederalhelp',
   '',
   'what',
   '',