# Auto Labeller
Applied to a news dataset to evaluate model performance.

* Applied to 500 rows
* Applied using normal labels

In [1]:
# Library imports
import pandas as pd
import numpy as np
import json

from src.toolkit.autolabel import Preprocessor, AutoLabeller, check_labels
from src.toolkit.autolabel import recommend_words, Evaluator

from sklearn.naive_bayes import MultinomialNB

In [2]:
# Path to the CSV holding the news text
text_path = "data/news/news500.csv"
# Destinations for the predictions and scores written further down the notebook
labelled_path = "data/news/news500_labelled.csv"  # INPUT YOUR PREFERRED OUTPUT PATH
score_path = "data/news/news500_score.csv"
labels_path = "data/news/news_labels.csv"  # INPUT PATH TO LABELS DICTIONARY

stopwords_path = "data/stopwords.csv"  ## ADJUST IF YOU HAVE CUSTOM STOPWORDS
text_column_name = "content"

news = pd.read_csv(text_path)
# Take an explicit copy of the single-column selection: `data` has its text
# column reassigned later, and assigning into a slice of `news` triggers
# pandas' SettingWithCopyWarning.
data = news[[text_column_name]].copy()

In [3]:
# Preview the first five rows of the selected text column
data.head(n=5)

Unnamed: 0,content
0,Unions representing workers at Turner Newall...
1,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,AP - A company founded by a chemistry research...
3,AP - It's barely dawn when Mike Fitzpatrick st...
4,AP - Southern California's smog-fighting agenc...


In [4]:
# Text column as currently held in `data`; also reused by later cells
corpus = data[text_column_name]

preprocessor = Preprocessor()

# Text preprocessing
preprocessed_corpus = preprocessor.corpus_preprocess(corpus=corpus, stopwords_path=stopwords_path)

# Replace bigrams
bigram_corpus = preprocessor.corpus_replace_bigrams(corpus=preprocessed_corpus, min_df=50, max_df=500)

# Rebind `data` to a new frame carrying the processed text. `assign` works on
# a fresh copy, so it avoids the SettingWithCopyWarning that in-place column
# assignment raises when `data` is a slice of `news`.
data = data.assign(**{text_column_name: bigram_corpus})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


## Recommended themes and words

In [5]:
n_words = 20  # CHANGE THE NUMBER OF WORDS RECOMMENDED (IF YOU WANT TO)

# recommend_words returns three values; all of them are handed to show_topics,
# whose result is the table displayed as this cell's output
topic_model, dtm, best_n = recommend_words(corpus)
topic_model.show_topics(
    dtm=dtm,
    best_n=best_n,
    n_words=n_words,
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,say,president,official,two,would,one,government,worker,plan,security,yesterday,afp,people,wednesday,monday,new,internet,could,result,service
1,lt,gt,http,href,wwwinvestorreuterscomfullquoteaspx,targetstocksquickinfofullquote,reuters,inc,say,product,new,quarterly,use,co,chicago,company,demand,cut,forecast,would
2,athens,olympic,gold,medal,win,american,reuters,woman,greece,second,game,meter,men,thursday,become,olympics,first,individual,time,wednesday
3,quot,go,aug,come,see,night,team,easy,friday,last,day,could,today,like,water,leave,breathe,evidence,area,nasa
4,ap,night,sunday,inning,get,win,game,hit,run,ninth,beat,take,atlanta,boston,homer,last,another,second,pitch,victory
5,city,najaf,iraq,holy,cleric,shiite,iraqi,end,shrine,alsadr,us,baghdad,radical,fighter,militia,troop,reuters,appear,leader,saturday
6,price,new,oil,reuters,us,year,high,rise,energy,dollar,percent,investor,london,tuesday,public,share,record,cost,monday,consumer


In [6]:
# Load the label dictionary, then pass it through check_labels together with
# the text column of `news`.
# NOTE(review): check_labels receives the text from `news` as loaded, not the
# preprocessed `data`; the recorded output reports terms such as `iraq`,
# `olympic` and `oil_price` as missing from the corpus. Confirm which frame
# check_labels is meant to be given.
raw_labels = pd.read_csv(labels_path)
labels = check_labels(news[[text_column_name]], raw_labels)
labels.head()

olympic is not in the input corpus. It is removed from dictionary
iraq is not in the input corpus. It is removed from dictionary
soldier is not in the input corpus. It is removed from dictionary
oil_price is not in the input corpus. It is removed from dictionary
washington is not in the input corpus. It is removed from dictionary
official_say is not in the input corpus. It is removed from dictionary
microsoft is not in the input corpus. It is removed from dictionary


Unnamed: 0,Business,SciTech,Sports,World
0,stock,software,,
1,price,internet,champion,
2,,network,basketball,
3,sale,technology,champion,international
4,percent,research,silver,


In [7]:
# Show the first seven rows of the checked label dictionary
labels.head(n=7)

Unnamed: 0,Business,SciTech,Sports,World
0,stock,software,,
1,price,internet,champion,
2,,network,basketball,
3,sale,technology,champion,international
4,percent,research,silver,
5,business,computer,medal,competition
6,revenue,,season,


In [8]:
# Only the first six rows of `labels` are passed to the labeller
labels_subset = labels.head(6)
autoLabeller = AutoLabeller(labels_subset, corpus, data)

# train() returns the enriched labels, displayed below
enriched_labels = autoLabeller.train()

enriched_labels  ## Enriched suggested labels

  return bound(*args, **kwds)


Unnamed: 0,Business,SciTech,Sports,World
0,investor,tool,bronze,large
1,cost,phone,basketball,retailer
2,company,company,sport,engage
3,initial,service,preliminary,practice
4,share,research,allaround,francisco
5,drop,network,game,web
6,rise,delay,meter,analyst
7,business,release,gold,business
8,pace,fact,greece,international
9,offering,ability,defend,intel


In [9]:
# Apply the labeller with a multinomial naive Bayes classifier and save the
# predictions to `labelled_path`.
mnb = MultinomialNB()
# Use the configured column name rather than a hard-coded 'content', so the
# cell keeps working when `text_column_name` is changed in the config cell.
ypred = autoLabeller.apply(mnb, text_column_name)
ypred.to_csv(labelled_path)

  return bound(*args, **kwds)


In [10]:
evaluator = Evaluator()

# Score the predictions against `news` and `labels`
score = evaluator.evaluate_predictions(
    news,
    ypred,
    labels,
    split=0.2,
    random_state=42,
)

# NOTE(review): the displayed table appears to carry the metric names
# (Precision / Recall / F1-score) as its index, which index=False leaves out
# of the CSV. Confirm that is intended.
score.to_csv(score_path, index=False)
score

Unnamed: 0,Business,SciTech,Sports,World
Precision,0.1071,0.2759,0.5862,0.0968
Recall,0.1429,0.3636,0.6538,0.0968
F1-score,0.1224,0.3137,0.6182,0.0968


In [11]:
# Display the comparison table produced from the scores computed above,
# together with `news` and `labels`
evaluator.compare_to_other_models(score, news, labels)

Unnamed: 0,Automatic Labeling,MLP Neural Network,Gradient Boosted Trees,Random Forest
Precision,0.266,0.45,0.289,0.65
Recall,0.314,0.273,0.215,0.101
F1-score,0.288,0.332,0.243,0.17


In [12]:
# Join the original text with its predicted labels for side-by-side inspection.
# The text column is selected via `text_column_name` instead of a hard-coded
# "content", so this cell follows the setting in the config cell.
demo = news[[text_column_name]].join(ypred)

demo

Unnamed: 0,content,Business,SciTech,Sports,World
0,Unions representing workers at Turner Newall...,1.0,0.0,0.0,1.0
1,"SPACE.com - TORONTO, Canada -- A second\team o...",0.0,0.0,0.0,0.0
2,AP - A company founded by a chemistry research...,0.0,0.0,1.0,1.0
3,AP - It's barely dawn when Mike Fitzpatrick st...,0.0,0.0,1.0,0.0
4,AP - Southern California's smog-fighting agenc...,0.0,0.0,0.0,0.0
...,...,...,...,...,...
495,ATHENS -- The booing went on for nearly 10 min...,0.0,0.0,1.0,0.0
496,WASHINGTON -- Hundreds of workers rallied on t...,0.0,0.0,0.0,0.0
497,A Bosnian Serb general accused of organising t...,1.0,0.0,0.0,0.0
498,NBC's prime-time Olympic coverage is taped and...,0.0,0.0,1.0,1.0
