# Automatic Topic Labeling (ATL) for English editorial articles

In [1]:
import sys
sys.path.append('../..')
import src.Automatic_Topic_Labeling.lable_topic as tl
import src.Automatic_Topic_Labeling.helper_functions as hp
import src.data.data_loader as dl
from src.features.vectorizer import Vectorizer
from src.models.topic_models import TopicModel

INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


### 1-Load tags and documents

In [2]:
# POS-tag pairs later passed to tl.get_topic_lables as its third argument.
# NOTE(review): presumably the noun-noun and adjective-noun bigram patterns
# allowed as label candidates -- confirm against lable_topic.
pos = [('NN', 'NN'), ('JJ', 'NN')]

# Load the English editorial articles in their 'tagged' variant, tokens unmerged.
data = dl.get_articles_by_type(
    "english",
    "editorial",
    merge_tokens=False,
    kind='tagged'
)

# The loader result exposes the article texts and their POS information by key.
article_texts = data["article_texts"]
article_pos = data["article_pos"]

### 2-Load Vectorizer and Topic model

In [3]:
# Restore the persisted vectorizer and LDA topic model from the 'tagged' artefact folders.
# NOTE(review): the "_130" suffix presumably is the number of topics -- confirm.
vec = Vectorizer.load(
    'tagged/vectorizer/lda_english_article_texts_pos.pkl'
)
tm = TopicModel.load(
    'tagged/topic_models/lda/lda_english_article_texts_tagged_130.pkl'
)

### 3-Generate Topic Labels with POS-Tags

In [4]:
# Generate topic labels using the POS information and the POS patterns in `pos`.
# NOTE(review): n_cand_lables / lable_min_df look like the candidate-label count
# and a minimum document frequency for labels -- confirm in lable_topic.
labels_pos = tl.get_topic_lables(
    article_pos,
    article_texts,
    pos,
    vec,
    tm,
    n_cand_lables=100,
    lable_min_df=5,
    n_labels=8
)

shape topic model
(130, 6365)
pmishape
(6365, 100)


### 4-Print topics and their labels

In [5]:
# Build the overview table (topic words plus a label column) and render it.
label_df_pos = hp.print_label_df(tm, labels_pos)
display(label_df_pos)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label
Topic 0,say,safety,fda,recall,product,outbreak,company,people,california,illness,nausea vomiting
Topic 1,monsanto,industry,public,company,safety,call,court,group,u.s.,write,bacillus thuringiensis
Topic 2,store,say,grocery,new,will,market,customer,chain,more,year,cookie cutter
Topic 3,campaign,marketing,medium,site,this,hospital,web,video,article,â,presidential election
Topic 4,compost,waste,almond,scrap,facility,city,material,will,yard,like,vacuum cleaner
Topic 5,ddt,study,ban,allen,faculty,ph.d.,expose,studies,mouse,exposure,metabolic syndrome
Topic 6,restaurant,fast,chain,say,menu,ingredient,burger,chipotle,mcdonald,meal,music festival
Topic 7,grain,wheat,whole,flour,rice,myth,gluten,bread,white,brown,irritable bowel
Topic 8,farmer,farm,program,policy,would,government,subsidy,who,president,obama,presidential election
Topic 9,chemical,exposure,health,toxic,safe,use,cancer,can,these,about,premature aging


### All generated labels per topic

In [6]:
# Print every generated label for each topic, one topic per paragraph.
print("\nTopical labels:")
print("-" * 20)
for topic_idx, topic_labels in enumerate(labels_pos):
    # Each label is a sequence of tokens: join tokens with spaces, labels with commas.
    joined_labels = ', '.join(' '.join(tokens) for tokens in topic_labels)
    print(u"Topic {}: {}\n".format(topic_idx, joined_labels))


Topical labels:
--------------------
Topic 0: nausea vomiting, cautionary tale, staphylococcus aureus, bacillus thuringiensis, irritable bowel, density lipoprotein, anorexia nervosa, lauryl sulfate

Topic 1: bacillus thuringiensis, mechanical tillage, density lipoprotein, avian flu, ozone layer, presidential election, cautionary tale, nausea vomiting

Topic 2: cookie cutter, cautionary tale, avian flu, music festival, sport utility, bacillus thuringiensis, ozone layer, vacuum cleaner

Topic 3: presidential election, density lipoprotein, avian flu, cookie cutter, ozone layer, bacillus thuringiensis, anorexia nervosa, sport utility

Topic 4: vacuum cleaner, ozone layer, sport utility, bacillus thuringiensis, cognitive dissonance, anaerobic digestion, irritable bowel, gene splicing

Topic 5: metabolic syndrome, premature aging, thyroid gland, irritable bowel, bacillus thuringiensis, mechanical tillage, density lipoprotein, ozone layer

Topic 6: music festival, cookie cutter, ozone layer,

## Labels without POS-Tags

In [7]:
# Same labeling run as before, but with an empty POS list and no POS patterns (None).
labels = tl.get_topic_lables(
    [],
    article_texts,
    None,
    vec,
    tm,
    n_cand_lables=100,
    lable_min_df=5,
    n_labels=8
)

# Render the overview table for the labels generated without POS filtering.
label_df_plain = hp.print_label_df(tm, labels)
display(label_df_plain)

shape topic model
(130, 6365)
pmishape
(6365, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label
Topic 0,say,safety,fda,recall,product,outbreak,company,people,california,illness,earl butz
Topic 1,monsanto,industry,public,company,safety,call,court,group,u.s.,write,carey gillam
Topic 2,store,say,grocery,new,will,market,customer,chain,more,year,a.c. gallo
Topic 3,campaign,marketing,medium,site,this,hospital,web,video,article,â,frosted flakes
Topic 4,compost,waste,almond,scrap,facility,city,material,will,yard,like,bon appétit
Topic 5,ddt,study,ban,allen,faculty,ph.d.,expose,studies,mouse,exposure,kirsten brandt
Topic 6,restaurant,fast,chain,say,menu,ingredient,burger,chipotle,mcdonald,meal,dunkin donuts
Topic 7,grain,wheat,whole,flour,rice,myth,gluten,bread,white,brown,cara rosenbloom
Topic 8,farmer,farm,program,policy,would,government,subsidy,who,president,obama,rahm emanuel
Topic 9,chemical,exposure,health,toxic,safe,use,cancer,can,these,about,polyvinyl chloride


## Labels with Bigrams

In [8]:
# Load the English editorial articles in their "with_2bigrams" variant, tokens unmerged.
article_data_bi = dl.get_articles_by_type(
    "english",
    "editorial",
    merge_tokens=False,
    kind="with_2bigrams"
)

# Only the article texts are needed for the bigram run.
article_texts_bi = article_data_bi["article_texts"]

In [9]:
# Restore the persisted vectorizer and LDA topic model used for the bigram run.
vec_bi = Vectorizer.load(
    'vectorizer/ENED_lda_english_editorial.pkl'
)
tm_bi = TopicModel.load(
    'topic_models/lda/ENED_lda_english_editorial_articles_130.pkl'
)

In [10]:
# Label the bigram topic model without POS patterns; this run asks for n_labels=5.
labels_bi = tl.get_topic_lables(
    [],
    article_texts_bi,
    None,
    vec_bi,
    tm_bi,
    n_cand_lables=100,
    lable_min_df=5,
    n_labels=5
)

shape topic model
(130, 6254)
pmishape
(6254, 100)


In [11]:
# Build the overview table for the bigram model's labels and render it.
label_df_bi = hp.print_label_df(tm_bi, labels_bi)
display(label_df_bi)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label
Topic 0,walmart,online,company,grocery,customer,delivery,store,service,business,retailer,wedbush securities
Topic 1,program,usda,agriculture,year,national,farmer,$,farm,rule,federal,mcevoy deputy_administrator
Topic 2,book,people,life,come,community,generation,movement,new,know,write,jose andres
Topic 3,yogurt,low,bacteria,carb,fermented,probiotic,fat,diet,eat,ancient,irritable_bowel syndrome
Topic 4,$,cup,q,soy,slice,serve,water,oz,foam,healthy,cara rosenbloom
Topic 5,outbreak,bacteria,safety,salmonella,e._coli,illness,produce,safe,contamination,people,irritable_bowel syndrome
Topic 6,cotton,clothing,make,use,grow,accord,powder,benefit,health,treat,eczema psoriasis
Topic 7,hen,soy,egg,operation,allow,disease,porch,large,barn,confine,irritable_bowel syndrome
Topic 8,labeling,label,gmo,law,consumer,genetically,state,ingredient,modify,company,sen._pat roberts_r
Topic 9,company,product,claim,campaign,honest,lawsuit,marketing,advertising,market,ad,stacy malkan
