# Preprocessing and Feature Extraction

## Imports

In [1]:
import json
import nltk, string
import numpy as np
import pandas as pd
import itertools 

In [2]:
import PyDictionary
from vocabulary.vocabulary import Vocabulary 
from nltk.corpus import wordnet as wn # I had to use this instead PyDictionary is too bad!

## Read files

There are two different files. The first one, called "instances" contains all the information for a given post. The second, "truth", contains the labels of each instance. The following are the schemas of these files:

`""" Fields in instances.jsonl: <br/>
 { <br/>
    "id": "<instance id>", <br/>
    "postTimestamp": "<weekday> <month> <day> <hour>:<minute>:<second> <time_offset> <year>", <br/>
    "postText": ["<text of the post with links removed>"], <br/>
    "postMedia": ["<path to a file in the media archive>"], <br/>
    "targetTitle": "<title of target article>", <br/>
    "targetDescription": "<description tag of target article>", <br/>
    "targetKeywords": "<keywords tag of target article>", <br/>
    "targetParagraphs": ["<text of the ith paragraph in the target article>"], <br/>
    "targetCaptions": ["<caption of the ith image in the target article>"] <br/>
  } """`


`""" Fields in truth.jsonl:
  {
    "id": "<instance id>",
    "truthJudgments": [<number in [0,1]>],
    "truthMean": <number in [0,1]>,
    "truthMedian": <number in [0,1]>,
    "truthMode": <number in [0,1]>,
    "truthClass": "clickbait | no-clickbait"
  } """`

In [3]:
def loadDataset(size):
    instances = []
    labels = []
    fileName = 'trainSmall' if size == 'small' else 'trainLarge'
    with open('data/'+fileName+'/instances.jsonl') as file:
        for line in file:
            instances.append(json.loads(line))
    with open('data/'+fileName+'/truth.jsonl') as file:
        for line in file:
            labels.append(json.loads(line))
    return instances, labels

In [4]:
dataset_size = 'large'

In [5]:
instances, labels = loadDataset(dataset_size)

In [6]:
instancesDF = pd.DataFrame(instances)
labelsDF = pd.DataFrame(labels)

## Preprocessing

The followed steps are:
* tokenizing
* removal of stopwords
* stemming

In [10]:
# These are the tools we are using, we can easily switch them here
tokenizer = nltk.word_tokenize
stemmer = nltk.stem.PorterStemmer().stem
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

In [11]:
from nltk.corpus import stopwords

In [12]:
instancesDF.iloc[0,:]

id                                                  858462320779026433
postMedia                                                           []
postText             [UK’s response to modern slavery leaving victi...
postTimestamp                           Sat Apr 29 23:25:41 +0000 2017
targetCaptions                                [modern-slavery-rex.jpg]
targetDescription    “Inexcusable” failures in the UK’s system for ...
targetKeywords       modern slavery, Department For Work And Pensio...
targetParagraphs     [Thousands of modern slavery victims have not ...
targetTitle          ‘Inexcusable’ failures in UK’s response to mod...
Name: 0, dtype: object

In [13]:
# In this function the text variable is a list of length 1
def preprocess(text):
    sw = set(stopwords.words('english'))    
    filtered_sentence = []
    if isinstance(text, list):      
        for t in text:            
            word_tokens = tokenizer(t.translate(remove_punctuation_map))
            filtered_sentence = filtered_sentence + [w for w in word_tokens if not w in sw] 
    else:
        word_tokens = tokenizer(text.translate(remove_punctuation_map))
        filtered_sentence = filtered_sentence + [w for w in word_tokens if not w in sw] 
    return filtered_sentence

In [14]:
instancesDF['postTextTokens'] = instancesDF.postText.apply(preprocess)
instancesDF['postTextClean'] = instancesDF.postTextTokens.apply(' '.join)
instancesDF['postTextPOSTags'] = instancesDF.postTextTokens.apply(nltk.pos_tag)

instancesDF['targetCaptionsTokens'] = instancesDF.targetCaptions.apply(preprocess)
instancesDF['targetCaptionsClean'] = instancesDF.targetCaptionsTokens.apply(' '.join)

instancesDF['targetDescriptionTokens'] = instancesDF.targetDescription.apply(preprocess)
instancesDF['targetDescriptionClean'] = instancesDF.targetDescriptionTokens.apply(' '.join)

instancesDF['targetKeywordsTokens'] = instancesDF.targetKeywords.apply(preprocess)
instancesDF['targetKeywordsClean'] = instancesDF.targetKeywordsTokens.apply(' '.join)

instancesDF['targetParagraphsTokens'] = instancesDF.targetParagraphs.apply(preprocess)
instancesDF['targetParagraphsClean'] = instancesDF.targetParagraphsTokens.apply(' '.join)

instancesDF['targetTitleTokens'] = instancesDF.targetTitle.apply(preprocess)
instancesDF['targetTitleClean'] = instancesDF.targetTitleTokens.apply(' '.join)

In [15]:
instancesDF.head()

Unnamed: 0,id,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle,postTextTokens,...,targetCaptionsTokens,targetCaptionsClean,targetDescriptionTokens,targetDescriptionClean,targetKeywordsTokens,targetKeywordsClean,targetParagraphsTokens,targetParagraphsClean,targetTitleTokens,targetTitleClean
0,858462320779026433,[],[UK’s response to modern slavery leaving victi...,Sat Apr 29 23:25:41 +0000 2017,[modern-slavery-rex.jpg],“Inexcusable” failures in the UK’s system for ...,"modern slavery, Department For Work And Pensio...",[Thousands of modern slavery victims have not ...,‘Inexcusable’ failures in UK’s response to mod...,"[UK, ’, response, modern, slavery, leaving, vi...",...,[modernslaveryrexjpg],modernslaveryrexjpg,"[“, Inexcusable, ”, failures, UK, ’, system, d...",“ Inexcusable ” failures UK ’ system dealing m...,"[modern, slavery, Department, For, Work, And, ...",modern slavery Department For Work And Pension...,"[Thousands, modern, slavery, victims, come, fo...",Thousands modern slavery victims come forward ...,"[‘, Inexcusable, ’, failures, UK, ’, response,...",‘ Inexcusable ’ failures UK ’ response modern ...
1,858421020331560960,[],[this is good],Sat Apr 29 20:41:34 +0000 2017,"[In this July 1, 2010 file photo, Dr. Charmain...",President Donald Trump has appointed pro-life ...,"Americans United for Life, Dr. Charmaine Yoest...",[President Donald Trump has appointed the pro-...,Donald Trump Appoints Pro-Life Advocate as Ass...,[good],...,"[In, July, 1, 2010, file, photo, Dr, Charmaine...",In July 1 2010 file photo Dr Charmaine Yoest t...,"[President, Donald, Trump, appointed, prolife,...",President Donald Trump appointed prolife advoc...,"[Americans, United, Life, Dr, Charmaine, Yoest...",Americans United Life Dr Charmaine Yoest Plann...,"[President, Donald, Trump, appointed, prolife,...",President Donald Trump appointed prolife advoc...,"[Donald, Trump, Appoints, ProLife, Advocate, A...",Donald Trump Appoints ProLife Advocate Assista...
2,858368123753435136,[],"[The ""forgotten"" Trump roast: Relive his bruta...",Sat Apr 29 17:11:23 +0000 2017,[President Trump will not attend this year's W...,President Trump won't be at this year's White ...,"trump whcd, whcd, white house correspondents d...",[When the White House correspondents’ dinner i...,The ‘forgotten’ Trump roast: Relive his brutal...,"[The, forgotten, Trump, roast, Relive, brutal,...",...,"[President, Trump, attend, years, White, House...",President Trump attend years White House corre...,"[President, Trump, wont, years, White, House, ...",President Trump wont years White House corresp...,"[trump, whcd, whcd, white, house, corresponden...",trump whcd whcd white house correspondents din...,"[When, White, House, correspondents, ’, dinner...",When White House correspondents ’ dinner enter...,"[The, ‘, forgotten, ’, Trump, roast, Relive, b...",The ‘ forgotten ’ Trump roast Relive brutal 20...
3,858323428260139008,[],[Meet the happiest #dog in the world!],Sat Apr 29 14:13:46 +0000 2017,"[Maru , Maru, Maru, Maru, Maru]","The article is about Maru, a husky dog who has...","Maru, husky, dogs, pandas, furball, instagram",[Adorable is probably an understatement. This ...,"Meet The Happiest Dog In The World, Maru The H...","[Meet, happiest, dog, world]",...,"[Maru, Maru, Maru, Maru, Maru]",Maru Maru Maru Maru Maru,"[The, article, Maru, husky, dog, uncanny, rese...",The article Maru husky dog uncanny resemblance...,"[Maru, husky, dogs, pandas, furball, instagram]",Maru husky dogs pandas furball instagram,"[Adorable, probably, understatement, This, ado...",Adorable probably understatement This adorable...,"[Meet, The, Happiest, Dog, In, The, World, Mar...",Meet The Happiest Dog In The World Maru The Hu...
4,858283602626347008,[],[Tokyo's subway is shut down amid fears over a...,Sat Apr 29 11:35:31 +0000 2017,[All nine lines of Tokyo's subway system were ...,"The temporary suspension, which lasted ten min...","Tokyo,subway,shut,fears,North,Korean,attack",[One of Tokyo's major subways systems says it ...,Tokyo's subway is shut down amid fears over an...,"[Tokyos, subway, shut, amid, fears, imminent, ...",...,"[All, nine, lines, Tokyos, subway, system, sus...",All nine lines Tokyos subway system suspended ...,"[The, temporary, suspension, lasted, ten, minu...",The temporary suspension lasted ten minutes af...,[TokyosubwayshutfearsNorthKoreanattack],TokyosubwayshutfearsNorthKoreanattack,"[One, Tokyos, major, subways, systems, says, s...",One Tokyos major subways systems says shut lin...,"[Tokyos, subway, shut, amid, fears, imminent, ...",Tokyos subway shut amid fears imminent North K...


## Define features

The following is the list of features to implement:
* numChar(TargetTitle, Post, TargetParagraphs)
* diffNumChar(TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* ratioNumChar(TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* numWords (TargetTitle, Post, TargetParagraphs)
* diffNumWords (TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* ratioNumWords (TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* numFormalInformal (TargetTitle, Post, TargetParagraphs)
* ratioFormalInformal (TargetTitle, Post, TargetParagraphs)

In [16]:
instancesDF.columns

Index(['id', 'postMedia', 'postText', 'postTimestamp', 'targetCaptions',
       'targetDescription', 'targetKeywords', 'targetParagraphs',
       'targetTitle', 'postTextTokens', 'postTextClean', 'postTextPOSTags',
       'targetCaptionsTokens', 'targetCaptionsClean',
       'targetDescriptionTokens', 'targetDescriptionClean',
       'targetKeywordsTokens', 'targetKeywordsClean', 'targetParagraphsTokens',
       'targetParagraphsClean', 'targetTitleTokens', 'targetTitleClean'],
      dtype='object')

In [17]:
ratio_corrector = lambda x: 0.0001 if x == 0 else x

In [18]:
# numChars
instancesDF['featNumCharPostText'] = instancesDF.postTextClean.apply(len)
instancesDF['featNumCharTargetTitle'] = instancesDF.targetTitleClean.apply(len)
instancesDF['featNumCharTargetDescription'] = instancesDF.targetDescriptionClean.apply(len)
instancesDF['featNumCharTargetKeywords'] = instancesDF.targetKeywordsClean.apply(len)
instancesDF['featNumCharTargetCaptions'] = instancesDF.targetCaptionsClean.apply(len)
instancesDF['featNumCharTargetParagraphs'] = instancesDF.targetParagraphsClean.apply(len)

In [19]:
# diffNumChars
base_text_orig = 'featNumChar'
base_text_new = 'featDiffChar'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f] - instancesDF[base_text_orig+s]
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [20]:
# ratioNumChars
base_text_orig = 'featNumChar'
base_text_new = 'featRatioChar'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f].apply(ratio_corrector) / instancesDF[base_text_orig+s].apply(ratio_corrector)
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [21]:
# numWords
instancesDF['featNumWordsPostText'] = instancesDF.postTextTokens.apply(len)
instancesDF['featNumWordsTargetCaptions'] = instancesDF.targetCaptionsTokens.apply(len)
instancesDF['featNumWordsTargetDescription'] = instancesDF.targetDescriptionTokens.apply(len)
instancesDF['featNumWordsTargetKeywords'] = instancesDF.targetKeywordsTokens.apply(len)
instancesDF['featNumWordsTargetParagraphs'] = instancesDF.targetParagraphsTokens.apply(len)
instancesDF['featNumWordsTargetTitle'] = instancesDF.targetTitleTokens.apply(len)

In [22]:
# diffNumWords
base_text_orig = 'featNumWords'
base_text_new = 'featDiffWords'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f] - instancesDF[base_text_orig+s]
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [23]:
# ratioNumWords
base_text_orig = 'featNumWords'
base_text_new = 'featRatioWords'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f].apply(ratio_corrector) / instancesDF[base_text_orig+s].apply(ratio_corrector)
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [24]:
# Number of formal and informal english words
def count_words(words_tokens, formal=False):
    num_elements = 0
    for word in words_tokens:
        output = len(wn.synsets(word))
        if formal and output != 0:
            num_elements += 1
        if not formal and output == 0:
            num_elements += 1
    return num_elements    

In [25]:
# count formal words
num_formal_words = lambda x: count_words(x, True)

instancesDF['featNumFormalWordsPostText'] = instancesDF.postTextTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetTitle'] = instancesDF.targetTitleTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetDescription'] = instancesDF.targetDescriptionTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetKeywords'] = instancesDF.targetKeywordsTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetCaptions'] = instancesDF.targetCaptionsTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetParagraphs'] = instancesDF.targetParagraphsTokens.apply(num_formal_words)

In [26]:
# count informal words
num_informal_words = lambda x: count_words(x, False)

instancesDF['featNumInformalWordsPostText'] = instancesDF.postTextTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetTitle'] = instancesDF.targetTitleTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetDescription'] = instancesDF.targetDescriptionTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetKeywords'] = instancesDF.targetKeywordsTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetCaptions'] = instancesDF.targetCaptionsTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetParagraphs'] = instancesDF.targetParagraphsTokens.apply(num_informal_words)

In [27]:
# percent of formal words
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for e in elements:    
    instancesDF['featPercentFormalWords' + e ] = instancesDF['featNumFormalWords'+e].apply(ratio_corrector) / instancesDF['featNumWords'+e].apply(ratio_corrector)

In [28]:
# percent of informal words
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for e in elements:    
    instancesDF['featPercentInformalWords' + e ] = instancesDF['featNumInformalWords'+e].apply(ratio_corrector) / instancesDF['featNumWords'+e].apply(ratio_corrector)

In [29]:
filter_col = ['id'] + [col for col in instancesDF if col.startswith('feat')]
instancesDF[filter_col].shape

(19538, 97)

## Our features

* NNPs (maybe POS)
* simmilarity
* readability
* POS 2-gram NNP NNP
* TF-iDF

In [30]:
instancesDF.head().postTextClean.map(print);

UK ’ response modern slavery leaving victims destitute abusers go free
good
The forgotten Trump roast Relive brutal 2004 thrashing New York Friars Club
Meet happiest dog world
Tokyos subway shut amid fears imminent North Korean missile attack Japan


In [31]:
labelsDF.head()

Unnamed: 0,id,truthClass,truthJudgments,truthMean,truthMedian,truthMode
0,858464162594172928,clickbait,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0
1,858462320779026433,no-clickbait,"[0.3333333333, 0.0, 0.3333333333, 0.0, 0.0]",0.133333,0.0,0.0
2,858460992073863168,no-clickbait,"[0.3333333333, 0.6666666666, 1.0, 0.0, 0.0]",0.4,0.333333,0.0
3,858459539296980995,no-clickbait,"[0.0, 0.6666666666, 0.0, 0.3333333333, 0.33333...",0.266667,0.333333,0.333333
4,858455355948384257,no-clickbait,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0,0.0


### - POS Features

Clickbait posts omit important information so they can exploit the curiosity gap in readers mind. An example of this can be observed in the following sentence: 

In [32]:
print(instancesDF.postText[4])
print(nltk.pos_tag(tokenizer(instancesDF.postText[4][0])))

["Tokyo's subway is shut down amid fears over an imminent North Korean missile attack on Japan"]
[('Tokyo', 'NNP'), ("'s", 'POS'), ('subway', 'NN'), ('is', 'VBZ'), ('shut', 'VBN'), ('down', 'RP'), ('amid', 'IN'), ('fears', 'NNS'), ('over', 'IN'), ('an', 'DT'), ('imminent', 'JJ'), ('North', 'JJ'), ('Korean', 'JJ'), ('missile', 'NN'), ('attack', 'NN'), ('on', 'IN'), ('Japan', 'NNP')]


As can be observed they don't explicitely state the company. Instead they say 'one company'. In POS terms, this is a succession of a CD (cardinal digit) and a NN (singular noun). Other examples of this bigrams are:
* This company
* This person 
* One famous actor 

As can be observed, the presence of different parts-of-speech can give us important hints regarding the status of a post text. Not only this, in the previous examples was shown how the interaction or sequence of certain POS tags is also indicative of a clickbait title. In this order of ideas, we compute POS features on three levels of granularity: 1-grams, 2-grams and 3-grams. 

In order to do this, a subset of interesting tags was choosen based on their effectiveness in previous experiments [CITE]. Then, we assembled a set that contained the permutations (because the order is important) of these elements. We ended up with 1464 possible features. After this, we computed the corresponding counts for the postText of each instance. We notices that most of the columns were filled with zeros and removed them. This process left us with 337 POS-related features. 

Given that different post texts can contain a different number of words and therefore different absolute total counts (CLARIFY THIS), we normalized each row by dividing the features by the total number of 1-grams, 2-grams and 3-grams, accordingly.

In [33]:
available_pos = {'CC': 'coordinating conjunction','CD': 'cardinal digit','DT': 'determiner','EX': 'existential there (like: “there is” … think of it like “there exists”)','FW': 'foreign word','IN': 'preposition/subordinating conjunction','JJ': 'adjective ‘big’','JJR': 'adjective, comparative ‘bigger’','JJS': 'adjective, superlative ‘biggest’','LS': 'list marker 1)','MD': 'modal could, will','NN': 'noun, singular ‘desk’','NNS': 'noun plural ‘desks’','NNP': 'proper noun, singular ‘Harrison’','NNPS': 'proper noun, plural ‘Americans’','PDT': 'predeterminer ‘all the kids’','POS': 'possessive ending parent’s','PRP': 'personal pronoun I, he, she','PRP$': 'possessive pronoun my, his, hers','RB': 'adverb very, silently,','RBR': 'adverb, comparative better','RBS': 'adverb, superlative best','RP': 'particle give up','TO': ', to go ‘to’ the store.','UH': 'interjection, errrrrrrrm','VB': 'verb, base form take','VBD': 'verb, past tense took','VBG': 'verb, gerund/present participle taking','VBN': 'verb, past participle taken','VBP': 'verb, sing. present, non-3d take','VBZ': 'verb, 3rd person sing. present takes','WDT': 'wh-determiner which','WP': 'wh-pronoun who, what','WP$': 'possessive wh-pronoun whose','WRB': 'wh-abverb where, when'}
available_pos 

{'CC': 'coordinating conjunction',
 'CD': 'cardinal digit',
 'DT': 'determiner',
 'EX': 'existential there (like: “there is” … think of it like “there exists”)',
 'FW': 'foreign word',
 'IN': 'preposition/subordinating conjunction',
 'JJ': 'adjective ‘big’',
 'JJR': 'adjective, comparative ‘bigger’',
 'JJS': 'adjective, superlative ‘biggest’',
 'LS': 'list marker 1)',
 'MD': 'modal could, will',
 'NN': 'noun, singular ‘desk’',
 'NNS': 'noun plural ‘desks’',
 'NNP': 'proper noun, singular ‘Harrison’',
 'NNPS': 'proper noun, plural ‘Americans’',
 'PDT': 'predeterminer ‘all the kids’',
 'POS': 'possessive ending parent’s',
 'PRP': 'personal pronoun I, he, she',
 'PRP$': 'possessive pronoun my, his, hers',
 'RB': 'adverb very, silently,',
 'RBR': 'adverb, comparative better',
 'RBS': 'adverb, superlative best',
 'RP': 'particle give up',
 'TO': ', to go ‘to’ the store.',
 'UH': 'interjection, errrrrrrrm',
 'VB': 'verb, base form take',
 'VBD': 'verb, past tense took',
 'VBG': 'verb, gerund

In [34]:
interesting_pos = ['NNP','IN','VBZ','WRB','NN','QM','PRP','WP','DT','JJ','NNS','RB', 'RBS']
def is_interesting(tag):
    tags = tag.split('_')
    interesting = False
    if len(tags) == 1 and tags[0] in interesting_pos:
        interesting = True
    elif len(tags) == 2 and (tags[0] in interesting_pos and tags[1] in interesting_pos):
        interesting = True
    elif len(tags) == 3 and (tags[0] in interesting_pos and tags[1] in interesting_pos and tags[2] in interesting_pos):
        interesting = True
    return interesting
    
pos_list = list(available_pos.keys())
pos_list = pos_list + ['_'.join(x) for x in list(itertools.permutations(available_pos, 2))] + ['_'.join(x) for x in list(itertools.permutations(available_pos, 3))]
pos_list = [x for x in pos_list if is_interesting(x)] 
len(pos_list)

1464

In [35]:
# Number of NNPs (actually in this part you can compute features that have to do with POS)
def get_pos_related_features(tags):
    pos_tags_count = dict.fromkeys(pos_list, 0)
    #tokens = tokenizer(cleaned_text)
    #tags = nltk.pos_tag(tokens)
        
    # unigrams
    num_1grams = len(tags)
    for i in range(len(tags)):        
        w1, t1 = tags[i]
        if t1 in pos_tags_count: pos_tags_count[t1] += 1#/ num_1grams # normalizer
    
    # bigrams
    num_2grams = len(tags) - 1
    for i in range(len(tags) - 1):
        w1, t1 = tags[i]
        w2, t2 = tags[i + 1] 
        if t1+'_'+t2 in pos_tags_count: pos_tags_count[t1+'_'+t2] += 1 #/num_2grams # normalizer

    # trigrams
    num_3grams = len(tags) - 2
    for i in range(len(tags) - 2):
        w1, t1 = tags[i]
        w2, t2 = tags[i + 1] 
        w3, t3 = tags[i + 2] 
        if t1+'_'+t2+'_'+t3 in pos_tags_count: pos_tags_count[t1+'_'+t2+'_'+t3] += 1 #/num_3grams # normalizer

    
    return list(pos_tags_count.values()) # these should always be in the same order

In [36]:
# https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns
temp = list(zip(*instancesDF['postTextPOSTags'].map(get_pos_related_features)))
for i, c in enumerate(pos_list): 
    instancesDF['featCountPOS_'+c] = temp[i]
instancesDF = instancesDF.loc[:, (instancesDF != 0).any(axis=0)]

In [37]:
#instancesDF.shape
instance_index = 4
temp = instancesDF[[col for col in instancesDF if col.startswith('featCountPOS_')]].iloc[instance_index]
print(instancesDF.postText[instance_index]); print()
print(nltk.pos_tag(tokenizer(instancesDF.postText[instance_index][0]))); print()
print(temp[temp != 0])

["Tokyo's subway is shut down amid fears over an imminent North Korean missile attack on Japan"]

[('Tokyo', 'NNP'), ("'s", 'POS'), ('subway', 'NN'), ('is', 'VBZ'), ('shut', 'VBN'), ('down', 'RP'), ('amid', 'IN'), ('fears', 'NNS'), ('over', 'IN'), ('an', 'DT'), ('imminent', 'JJ'), ('North', 'JJ'), ('Korean', 'JJ'), ('missile', 'NN'), ('attack', 'NN'), ('on', 'IN'), ('Japan', 'NNP')]

featCountPOS_IN           1
featCountPOS_JJ           3
featCountPOS_NN           4
featCountPOS_NNS          1
featCountPOS_NNP          2
featCountPOS_IN_NNS       1
featCountPOS_JJ_NN        1
featCountPOS_NN_IN        1
featCountPOS_NN_NNP       1
featCountPOS_NNS_JJ       1
featCountPOS_NNP_NN       1
featCountPOS_IN_NNS_JJ    1
featCountPOS_NN_IN_NNS    1
Name: 4, dtype: int64


### - Named Entity Recognition

As was mentioned before, the clickbait posts exploit the curiosity gap in the user. This includes leaving out names (e.g Trump) and instead using generic nouns (e.g. This president). Named entity recognition aims, as its name suggests, at finding out the named entities in a text. Our hypothesis regarding this feature is that if a potsText contains a named entity (NE) then it is more probable that it is not clickbait. 
To build this feature we used Spacy's entity tagger. We chose this option over the simpler NLTK named entity chunker given that the latter requires a training step on a corpus. Instead, the Spacy tagger that we used has already been trained on a medium sized corpus, it is a english multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl [CITE](https://spacy.io/models/en). We acknowlede there may be problems recognizing some entities given the difference in corpus (COMPLETE).



In [38]:
import spacy
from spacy import displacy # install with conda 
from collections import Counter
import en_core_web_md # download with python -m spacy download en_core_web_sm
nlp = en_core_web_md.load()

In [39]:
doc = nlp(instancesDF.postTextClean[35])
print('[' + labelsDF.truthClass[35] + '] ' + instancesDF.postTextClean[25])
print()
print([(X.text, X.label_) for X in doc.ents])
print(1 if len(doc.ents) > 0 else 0)

[no-clickbait] 14 strangely satisfying videos melting cheese

[('Paris', 'GPE')]
1


In [40]:
instancesDF['featIsNEPresent'] = instancesDF.postTextClean.apply(lambda x: 1 if len((nlp(x)).ents) > 0 else 0)

### - Sentiment

In order to grab the attention of the reader clickbait posts can use words that exagerate the message. This can be quantified by using the VADER sentiment score. VADER is a lexicon and rule-based sentiment analysis tool that is tunned to work with social media text. It works by computing a valence score for each word in the lexicon and computing a 'compound' value by aggregating them and normalizing the resulting value between -1 and 1 ([CITE](https://github.com/cjhutto/vaderSentiment)). 

In [41]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer




In [42]:
sid = SentimentIntensityAnalyzer()
ss = sid.polarity_scores(instancesDF.postTextClean[0])
print(instancesDF.postTextClean[0])
print(ss)


UK ’ response modern slavery leaving victims destitute abusers go free
{'neg': 0.535, 'neu': 0.3, 'pos': 0.165, 'compound': -0.8126}


In [43]:
instancesDF['featSentiment'] = instancesDF.postTextClean.apply(lambda x: (sid.polarity_scores(x))['compound'])


### - Similarity

Another feature that we consider is the similarity between two documents. Specifically, we wanted to measure how close is the postText to the actual content of the target article. For this, we compute the similarity between: the postText and the targetTitle, the postText and the targetParagraphs, and the postText and the targetKeywords. For this we use the similarity function included in Spacy. The algorithm works by comparing word vector representation of the sentences (COMPLETE). 

In [44]:
text1 = instancesDF[['postTextClean']].iloc[3,0]; print(text1)
text2 = instancesDF[['targetTitle']].iloc[3,0]; print(text2)
sim = nlp(text1).similarity(nlp(text2)); print(sim)

Meet happiest dog world
Meet The Happiest Dog In The World, Maru The Husky Who Also Looks Like A Panda!
0.8453118005693266


In [45]:
# https://stackoverflow.com/questions/12182744/python-pandas-apply-a-function-with-arguments-to-a-series
compute_similarity = lambda x, col1, col2: nlp(x[col1]).similarity(nlp(x[col2]))
instancesDF['featSimilarityPostTextTargetTitle'] = instancesDF.apply(compute_similarity, axis=1, args=('postTextClean','targetTitleClean'))
instancesDF['featSimilarityPostTextTargetParagraphs'] = instancesDF.apply(compute_similarity, axis=1, args=('postTextClean','targetParagraphsClean'))
instancesDF['featSimilarityPostTextTargetKeywords'] = instancesDF.apply(compute_similarity, axis=1, args=('postTextClean','targetKeywordsClean'))

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mo

## Save feature sets

In [46]:
filter_col = ['id'] + [col for col in instancesDF if col.startswith('feat')]
featureSet = instancesDF[filter_col]
print(featureSet.shape)
featureSet.head()


(19538, 764)


Unnamed: 0,id,featNumCharPostText,featNumCharTargetTitle,featNumCharTargetDescription,featNumCharTargetKeywords,featNumCharTargetCaptions,featNumCharTargetParagraphs,featDiffCharPostText_TargetCaptions,featDiffCharPostText_TargetDescription,featDiffCharPostText_TargetKeywords,...,featCountPOS_WRB_RB_JJ,featCountPOS_WRB_RB_NNP,featCountPOS_WRB_VBZ_NN,featCountPOS_WRB_VBZ_NNS,featCountPOS_WRB_VBZ_NNP,featIsNEPresent,featSentiment,featSimilarityPostTextTargetTitle,featSimilarityPostTextTargetParagraphs,featSimilarityPostTextTargetKeywords
0,858462320779026433,70,108,163,96,19,4940,51,93,26,...,0,0,0,0,0,1,-0.8126,0.949143,0.867918,0.835194
1,858421020331560960,4,77,175,133,1248,1930,1244,171,129,...,0,0,0,0,0,0,0.4404,0.334389,0.593592,0.567879
2,858368123753435136,75,79,101,141,372,4151,297,26,66,...,0,0,0,0,0,1,-0.7184,0.978079,0.835426,0.781944
3,858323428260139008,23,77,52,40,24,441,1,29,17,...,0,0,0,0,0,0,0.6369,0.851503,0.752917,0.595358
4,858283602626347008,72,72,114,37,1893,2308,1821,42,35,...,0,0,0,0,0,1,-0.7096,1.0,0.828642,0.0


In [47]:
featureSet.to_csv('feature_set_'+dataset_size+'.csv')


## Save labels 

In [48]:
labelsDF.head()

Unnamed: 0,id,truthClass,truthJudgments,truthMean,truthMedian,truthMode
0,858464162594172928,clickbait,"[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,1.0,1.0
1,858462320779026433,no-clickbait,"[0.3333333333, 0.0, 0.3333333333, 0.0, 0.0]",0.133333,0.0,0.0
2,858460992073863168,no-clickbait,"[0.3333333333, 0.6666666666, 1.0, 0.0, 0.0]",0.4,0.333333,0.0
3,858459539296980995,no-clickbait,"[0.0, 0.6666666666, 0.0, 0.3333333333, 0.33333...",0.266667,0.333333,0.333333
4,858455355948384257,no-clickbait,"[0.0, 0.0, 0.0, 0.0, 0.0]",0.0,0.0,0.0


In [49]:
filter_col = ['id', 'truthClass', 'truthMean', 'truthJudgments']
labelsSet = labelsDF[filter_col]
labelsSet.head()
labelsSet.to_csv('labels_set_'+dataset_size+'.csv')