# Preprocessing and Feature Extraction

## Imports

In [1]:
import json
import nltk, string
import numpy as np
import pandas as pd
import itertools 

In [2]:
import PyDictionary
from vocabulary.vocabulary import Vocabulary 
from nltk.corpus import wordnet as wn # I had to use this instead PyDictionary is too bad!

## Read files

There are two different files. The first one, called "instances" contains all the information for a given post. The second, "truth", contains the labels of each instance. The following are the schemas of these files:

`""" Fields in instances.jsonl: <br/>
 { <br/>
    "id": "<instance id>", <br/>
    "postTimestamp": "<weekday> <month> <day> <hour>:<minute>:<second> <time_offset> <year>", <br/>
    "postText": ["<text of the post with links removed>"], <br/>
    "postMedia": ["<path to a file in the media archive>"], <br/>
    "targetTitle": "<title of target article>", <br/>
    "targetDescription": "<description tag of target article>", <br/>
    "targetKeywords": "<keywords tag of target article>", <br/>
    "targetParagraphs": ["<text of the ith paragraph in the target article>"], <br/>
    "targetCaptions": ["<caption of the ith image in the target article>"] <br/>
  } """`


`""" Fields in truth.jsonl:
  {
    "id": "<instance id>",
    "truthJudgments": [<number in [0,1]>],
    "truthMean": <number in [0,1]>,
    "truthMedian": <number in [0,1]>,
    "truthMode": <number in [0,1]>,
    "truthClass": "clickbait | no-clickbait"
  } """`

In [3]:
def loadDataset(size):
    instances = []
    labels = []
    fileName = 'trainSmall' if size == 'small' else 'trainLarge'
    with open('data/'+fileName+'/instances.jsonl') as file:
        for line in file:
            instances.append(json.loads(line))
    with open('data/'+fileName+'/truth.jsonl') as file:
        for line in file:
            labels.append(json.loads(line))
    return instances, labels

In [4]:
dataset_size = 'small'

In [5]:
instances, labels = loadDataset(dataset_size)

In [6]:
instancesDF = pd.DataFrame(instances)
labelsDF = pd.DataFrame(labels)

## Preprocessing

The followed steps are:
* tokenizing
* removal of stopwords
* stemming

In [7]:
# These are the tools we are using, we can easily switch them here
tokenizer = nltk.word_tokenize
stemmer = nltk.stem.PorterStemmer().stem
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

In [8]:
from nltk.corpus import stopwords

In [9]:
instancesDF.iloc[0,:]

id                                                  608310377143799810
postMedia                                                           []
postText             [Apple's iOS 9 'App thinning' feature will giv...
postTimestamp                           Tue Jun 09 16:31:10 +0000 2015
targetCaptions       ['App thinning' will be supported on Apple's i...
targetDescription    'App thinning' will be supported on Apple's iO...
targetKeywords       Apple,gives,gigabytes,iOS,9,app,thinning,featu...
targetParagraphs     [Paying for a 64GB phone only to discover that...
targetTitle          Apple gives back gigabytes: iOS 9 'app thinnin...
Name: 0, dtype: object

In [10]:
# In this function the text variable is a list of length 1
def preprocess(text):
    sw = set(stopwords.words('english'))    
    filtered_sentence = []
    if isinstance(text, list):      
        for t in text:            
            word_tokens = tokenizer(t.translate(remove_punctuation_map))
            filtered_sentence = filtered_sentence + [w for w in word_tokens if not w in sw] 
    else:
        word_tokens = tokenizer(text.translate(remove_punctuation_map))
        filtered_sentence = filtered_sentence + [w for w in word_tokens if not w in sw] 
    return filtered_sentence

In [11]:
instancesDF['postTextTokens'] = instancesDF.postText.apply(preprocess)
instancesDF['postTextClean'] = instancesDF.postTextTokens.apply(' '.join)
instancesDF['postTextPOSTags'] = instancesDF.postTextTokens.apply(nltk.pos_tag)

instancesDF['targetCaptionsTokens'] = instancesDF.targetCaptions.apply(preprocess)
instancesDF['targetCaptionsClean'] = instancesDF.targetCaptionsTokens.apply(' '.join)

instancesDF['targetDescriptionTokens'] = instancesDF.targetDescription.apply(preprocess)
instancesDF['targetDescriptionClean'] = instancesDF.targetDescriptionTokens.apply(' '.join)

instancesDF['targetKeywordsTokens'] = instancesDF.targetKeywords.apply(preprocess)
instancesDF['targetKeywordsClean'] = instancesDF.targetKeywordsTokens.apply(' '.join)

instancesDF['targetParagraphsTokens'] = instancesDF.targetParagraphs.apply(preprocess)
instancesDF['targetParagraphsClean'] = instancesDF.targetParagraphsTokens.apply(' '.join)

instancesDF['targetTitleTokens'] = instancesDF.targetTitle.apply(preprocess)
instancesDF['targetTitleClean'] = instancesDF.targetTitleTokens.apply(' '.join)

In [13]:
instancesDF.head()

Unnamed: 0,id,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle,postTextTokens,...,targetCaptionsTokens,targetCaptionsClean,targetDescriptionTokens,targetDescriptionClean,targetKeywordsTokens,targetKeywordsClean,targetParagraphsTokens,targetParagraphsClean,targetTitleTokens,targetTitleClean
0,608310377143799810,[],[Apple's iOS 9 'App thinning' feature will giv...,Tue Jun 09 16:31:10 +0000 2015,['App thinning' will be supported on Apple's i...,'App thinning' will be supported on Apple's iO...,"Apple,gives,gigabytes,iOS,9,app,thinning,featu...",[Paying for a 64GB phone only to discover that...,Apple gives back gigabytes: iOS 9 'app thinnin...,"[Apples, iOS, 9, App, thinning, feature, give,...",...,"[App, thinning, supported, Apples, iOS, 9, lat...",App thinning supported Apples iOS 9 later mode...,"[App, thinning, supported, Apples, iOS, 9, lat...",App thinning supported Apples iOS 9 later mode...,[ApplegivesgigabytesiOS9appthinningfeaturefina...,ApplegivesgigabytesiOS9appthinningfeaturefinal...,"[Paying, 64GB, phone, discover, significantly,...",Paying 64GB phone discover significantly reduc...,"[Apple, gives, back, gigabytes, iOS, 9, app, t...",Apple gives back gigabytes iOS 9 app thinning ...
1,609297109095972864,[media/609297109095972864.jpg],[RT @kenbrown12: Emerging market investors are...,Fri Jun 12 09:52:05 +0000 2015,"[Stocks Fall as Investors Watch Central Banks,...",Global investors have yanked $9.3 billion from...,"emerging market,emerging markets,em flows,em i...","[Emerging markets are out of favor., Global in...",Emerging Markets Suffer Largest Outflow in Sev...,"[RT, kenbrown12, Emerging, market, investors, ...",...,"[Stocks, Fall, Investors, Watch, Central, Bank...",Stocks Fall Investors Watch Central Banks Do T...,"[Global, investors, yanked, 93, billion, stock...",Global investors yanked 93 billion stocks deve...,"[emerging, marketemerging, marketsem, flowsem,...",emerging marketemerging marketsem flowsem infl...,"[Emerging, markets, favor, Global, investors, ...",Emerging markets favor Global investors yanked...,"[Emerging, Markets, Suffer, Largest, Outflow, ...",Emerging Markets Suffer Largest Outflow Seven ...
2,609504474621612032,[],[U.S. Soccer should start answering tough ques...,Fri Jun 12 23:36:05 +0000 2015,[US to vote for Ali in FIFA election and not B...,A U.S. Senator's scathing letter questioned U....,,"[WINNIPEG, Manitoba – The bubble U.S. Soccer i...",U.S. Soccer should start answering tough quest...,"[US, Soccer, start, answering, tough, question...",...,"[US, vote, Ali, FIFA, election, Blatter, US, v...",US vote Ali FIFA election Blatter US vote Ali ...,"[A, US, Senators, scathing, letter, questioned...",A US Senators scathing letter questioned US So...,[],,"[WINNIPEG, Manitoba, –, The, bubble, US, Socce...",WINNIPEG Manitoba – The bubble US Soccer putti...,"[US, Soccer, start, answering, tough, question...",US Soccer start answering tough questions Hope...
3,609748367049105409,[],[How theme parks like Disney World left the mi...,Sat Jun 13 15:45:13 +0000 2015,"[Some 1,000 persons turned out in Albuquerque,...","America's top family vacation spots, like the ...","disney, disney world, disney ticket prices, di...",[When Walt Disney World opened in an Orlando s...,How theme parks like Disney World left the mid...,"[How, theme, parks, like, Disney, World, left,...",...,"[Some, 1000, persons, turned, Albuquerque, New...",Some 1000 persons turned Albuquerque New Mexic...,"[Americas, top, family, vacation, spots, like,...",Americas top family vacation spots like happie...,"[disney, disney, world, disney, ticket, prices...",disney disney world disney ticket prices disne...,"[When, Walt, Disney, World, opened, Orlando, s...",When Walt Disney World opened Orlando swamp 19...,"[How, theme, parks, like, Disney, World, left,...",How theme parks like Disney World left middle ...
4,608688782821453825,[media/608688782821453825.jpg],[Could light bulbs hurt your health? One compa...,Wed Jun 10 17:34:49 +0000 2015,[Electric lights have made the world safer and...,One company will put a health notice on all th...,"health, Should there be warning labels on your...",[(CNN)The light bulb always makes the world's ...,Warning labels on your light bulbs,"[Could, light, bulbs, hurt, health, One, compa...",...,"[Electric, lights, made, world, safer, made, p...",Electric lights made world safer made people s...,"[One, company, put, health, notice, packages, ...",One company put health notice packages lightin...,"[health, Should, warning, labels, light, bulbs...",health Should warning labels light bulbs CNNcom,"[CNNThe, light, bulb, always, makes, worlds, t...",CNNThe light bulb always makes worlds top inve...,"[Warning, labels, light, bulbs]",Warning labels light bulbs


## Define features

The following is the list of features to implement:
* numChar(TargetTitle, Post, TargetParagraphs)
* diffNumChar(TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* ratioNumChar(TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* numWords (TargetTitle, Post, TargetParagraphs)
* diffNumWords (TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* ratioNumWords (TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* numFormalInformal (TargetTitle, Post, TargetParagraphs)
* ratioFormalInformal (TargetTitle, Post, TargetParagraphs)

In [14]:
instancesDF.columns

Index(['id', 'postMedia', 'postText', 'postTimestamp', 'targetCaptions',
       'targetDescription', 'targetKeywords', 'targetParagraphs',
       'targetTitle', 'postTextTokens', 'postTextClean', 'postTextPOSTags',
       'targetCaptionsTokens', 'targetCaptionsClean',
       'targetDescriptionTokens', 'targetDescriptionClean',
       'targetKeywordsTokens', 'targetKeywordsClean', 'targetParagraphsTokens',
       'targetParagraphsClean', 'targetTitleTokens', 'targetTitleClean'],
      dtype='object')

In [15]:
ratio_corrector = lambda x: 0.0001 if x == 0 else x

In [16]:
# numChars
instancesDF['featNumCharPostText'] = instancesDF.postTextClean.apply(len)
instancesDF['featNumCharTargetTitle'] = instancesDF.targetTitleClean.apply(len)
instancesDF['featNumCharTargetDescription'] = instancesDF.targetDescriptionClean.apply(len)
instancesDF['featNumCharTargetKeywords'] = instancesDF.targetKeywordsClean.apply(len)
instancesDF['featNumCharTargetCaptions'] = instancesDF.targetCaptionsClean.apply(len)
instancesDF['featNumCharTargetParagraphs'] = instancesDF.targetParagraphsClean.apply(len)

In [17]:
# diffNumChars
base_text_orig = 'featNumChar'
base_text_new = 'featDiffChar'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f] - instancesDF[base_text_orig+s]
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [18]:
# ratioNumChars
base_text_orig = 'featNumChar'
base_text_new = 'featRatioChar'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f].apply(ratio_corrector) / instancesDF[base_text_orig+s].apply(ratio_corrector)
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [19]:
# numWords
instancesDF['featNumWordsPostText'] = instancesDF.postTextTokens.apply(len)
instancesDF['featNumWordsTargetCaptions'] = instancesDF.targetCaptionsTokens.apply(len)
instancesDF['featNumWordsTargetDescription'] = instancesDF.targetDescriptionTokens.apply(len)
instancesDF['featNumWordsTargetKeywords'] = instancesDF.targetKeywordsTokens.apply(len)
instancesDF['featNumWordsTargetParagraphs'] = instancesDF.targetParagraphsTokens.apply(len)
instancesDF['featNumWordsTargetTitle'] = instancesDF.targetTitleTokens.apply(len)

In [20]:
# diffNumWords
base_text_orig = 'featNumWords'
base_text_new = 'featDiffWords'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f] - instancesDF[base_text_orig+s]
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [21]:
# ratioNumWords
base_text_orig = 'featNumWords'
base_text_new = 'featRatioWords'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f].apply(ratio_corrector) / instancesDF[base_text_orig+s].apply(ratio_corrector)
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [22]:
# Number of formal and informal english words
def count_words(words_tokens, formal=False):
    num_elements = 0
    for word in words_tokens:
        output = len(wn.synsets(word))
        if formal and output != 0:
            num_elements += 1
        if not formal and output == 0:
            num_elements += 1
    return num_elements    

In [23]:
# count formal words
num_formal_words = lambda x: count_words(x, True)

instancesDF['featNumFormalWordsPostText'] = instancesDF.postTextTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetTitle'] = instancesDF.targetTitleTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetDescription'] = instancesDF.targetDescriptionTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetKeywords'] = instancesDF.targetKeywordsTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetCaptions'] = instancesDF.targetCaptionsTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetParagraphs'] = instancesDF.targetParagraphsTokens.apply(num_formal_words)

In [24]:
# count informal words
num_informal_words = lambda x: count_words(x, False)

instancesDF['featNumInformalWordsPostText'] = instancesDF.postTextTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetTitle'] = instancesDF.targetTitleTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetDescription'] = instancesDF.targetDescriptionTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetKeywords'] = instancesDF.targetKeywordsTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetCaptions'] = instancesDF.targetCaptionsTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetParagraphs'] = instancesDF.targetParagraphsTokens.apply(num_informal_words)

In [25]:
# percent of formal words
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for e in elements:    
    instancesDF['featPercentFormalWords' + e ] = instancesDF['featNumFormalWords'+e].apply(ratio_corrector) / instancesDF['featNumWords'+e].apply(ratio_corrector)

In [26]:
# percent of informal words
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for e in elements:    
    instancesDF['featPercentInformalWords' + e ] = instancesDF['featNumInformalWords'+e].apply(ratio_corrector) / instancesDF['featNumWords'+e].apply(ratio_corrector)

## Our features

* NNPs (maybe POS)
* simmilarity
* readability
* POS 2-gram NNP NNP
* TF-iDF

In [29]:
instancesDF.head().postTextClean.map(print);

Apples iOS 9 App thinning feature give phones storage boost
RT kenbrown12 Emerging market investors best Monty PythonsRun away run away
US Soccer start answering tough questions Hope Solo ericadelson writes
How theme parks like Disney World left middle class behind


In [30]:
labelsDF.head()

Unnamed: 0,id,truthClass,truthJudgments,truthMean,truthMedian,truthMode
0,608310377143799810,no-clickbait,"[0.0, 0.6666667, 0.0, 0.33333334, 0.0]",0.2,0.0,0.0
1,609297109095972864,no-clickbait,"[0.6666667, 0.0, 0.0, 0.0, 0.0]",0.133333,0.0,0.0
2,609504474621612032,clickbait,"[0.33333334, 0.6666667, 1.0, 0.0, 0.6666667]",0.533333,0.666667,0.666667
3,609748367049105409,no-clickbait,"[1.0, 0.0, 0.33333334, 0.33333334, 0.6666667]",0.466667,0.333333,0.333333
4,608688782821453825,clickbait,"[1.0, 0.33333334, 0.6666667, 0.33333334, 1.0]",0.666667,0.666667,1.0


### - POS Features

Clickbait posts omit important information so they can exploit the curiosity gap in readers mind. An example of this can be observed in the following sentence: 

In [31]:
print(instancesDF.postText[4])
print(nltk.pos_tag(tokenizer(instancesDF.postText[4][0])))



As can be observed they don't explicitely state the company. Instead they say 'one company'. In POS terms, this is a succession of a CD (cardinal digit) and a NN (singular noun). Other examples of this bigrams are:
* This company
* This person 
* One famous actor 

As can be observed, the presence of different parts-of-speech can give us important hints regarding the status of a post text. Not only this, in the previous examples was shown how the interaction or sequence of certain POS tags is also indicative of a clickbait title. In this order of ideas, we compute POS features on three levels of granularity: 1-grams, 2-grams and 3-grams. 

In order to do this, a subset of interesting tags was choosen based on their effectiveness in previous experiments [CITE]. Then, we assembled a set that contained the permutations (because the order is important) of these elements. We ended up with 1464 possible features. After this, we computed the corresponding counts for the postText of each instance. We notices that most of the columns were filled with zeros and removed them. This process left us with 337 POS-related features. 

Given that different post texts can contain a different number of words and therefore different absolute total counts (CLARIFY THIS), we normalized each row by dividing the features by the total number of 1-grams, 2-grams and 3-grams, accordingly.

In [32]:
available_pos = {'CC': 'coordinating conjunction','CD': 'cardinal digit','DT': 'determiner','EX': 'existential there (like: “there is” … think of it like “there exists”)','FW': 'foreign word','IN': 'preposition/subordinating conjunction','JJ': 'adjective ‘big’','JJR': 'adjective, comparative ‘bigger’','JJS': 'adjective, superlative ‘biggest’','LS': 'list marker 1)','MD': 'modal could, will','NN': 'noun, singular ‘desk’','NNS': 'noun plural ‘desks’','NNP': 'proper noun, singular ‘Harrison’','NNPS': 'proper noun, plural ‘Americans’','PDT': 'predeterminer ‘all the kids’','POS': 'possessive ending parent’s','PRP': 'personal pronoun I, he, she','PRP$': 'possessive pronoun my, his, hers','RB': 'adverb very, silently,','RBR': 'adverb, comparative better','RBS': 'adverb, superlative best','RP': 'particle give up','TO': ', to go ‘to’ the store.','UH': 'interjection, errrrrrrrm','VB': 'verb, base form take','VBD': 'verb, past tense took','VBG': 'verb, gerund/present participle taking','VBN': 'verb, past participle taken','VBP': 'verb, sing. present, non-3d take','VBZ': 'verb, 3rd person sing. present takes','WDT': 'wh-determiner which','WP': 'wh-pronoun who, what','WP$': 'possessive wh-pronoun whose','WRB': 'wh-abverb where, when'}
available_pos 

{'CC': 'coordinating conjunction',
 'CD': 'cardinal digit',
 'DT': 'determiner',
 'EX': 'existential there (like: “there is” … think of it like “there exists”)',
 'FW': 'foreign word',
 'IN': 'preposition/subordinating conjunction',
 'JJ': 'adjective ‘big’',
 'JJR': 'adjective, comparative ‘bigger’',
 'JJS': 'adjective, superlative ‘biggest’',
 'LS': 'list marker 1)',
 'MD': 'modal could, will',
 'NN': 'noun, singular ‘desk’',
 'NNS': 'noun plural ‘desks’',
 'NNP': 'proper noun, singular ‘Harrison’',
 'NNPS': 'proper noun, plural ‘Americans’',
 'PDT': 'predeterminer ‘all the kids’',
 'POS': 'possessive ending parent’s',
 'PRP': 'personal pronoun I, he, she',
 'PRP$': 'possessive pronoun my, his, hers',
 'RB': 'adverb very, silently,',
 'RBR': 'adverb, comparative better',
 'RBS': 'adverb, superlative best',
 'RP': 'particle give up',
 'TO': ', to go ‘to’ the store.',
 'UH': 'interjection, errrrrrrrm',
 'VB': 'verb, base form take',
 'VBD': 'verb, past tense took',
 'VBG': 'verb, gerund

In [33]:
interesting_pos = ['NNP','IN','VBZ','WRB','NN','QM','PRP','WP','DT','JJ','NNS','RB', 'RBS']
def is_interesting(tag):
    tags = tag.split('_')
    interesting = False
    if len(tags) == 1 and tags[0] in interesting_pos:
        interesting = True
    elif len(tags) == 2 and (tags[0] in interesting_pos and tags[1] in interesting_pos):
        interesting = True
    elif len(tags) == 3 and (tags[0] in interesting_pos and tags[1] in interesting_pos and tags[2] in interesting_pos):
        interesting = True
    return interesting
    
pos_list = list(available_pos.keys())
pos_list = pos_list + ['_'.join(x) for x in list(itertools.permutations(available_pos, 2))] + ['_'.join(x) for x in list(itertools.permutations(available_pos, 3))]
pos_list = [x for x in pos_list if is_interesting(x)] 
len(pos_list)

1464

In [34]:
# Number of NNPs (actually in this part you can compute features that have to do with POS)
def get_pos_related_features(tags):
    pos_tags_count = dict.fromkeys(pos_list, 0)
    #tokens = tokenizer(cleaned_text)
    #tags = nltk.pos_tag(tokens)
        
    # unigrams
    num_1grams = len(tags)
    for i in range(len(tags)):        
        w1, t1 = tags[i]
        if t1 in pos_tags_count: pos_tags_count[t1] += 1#/ num_1grams # normalizer
    
    # bigrams
    num_2grams = len(tags) - 1
    for i in range(len(tags) - 1):
        w1, t1 = tags[i]
        w2, t2 = tags[i + 1] 
        if t1+'_'+t2 in pos_tags_count: pos_tags_count[t1+'_'+t2] += 1 #/num_2grams # normalizer

    # trigrams
    num_3grams = len(tags) - 2
    for i in range(len(tags) - 2):
        w1, t1 = tags[i]
        w2, t2 = tags[i + 1] 
        w3, t3 = tags[i + 2] 
        if t1+'_'+t2+'_'+t3 in pos_tags_count: pos_tags_count[t1+'_'+t2+'_'+t3] += 1 #/num_3grams # normalizer

    
    return list(pos_tags_count.values()) # these should always be in the same order

In [35]:
# https://stackoverflow.com/questions/16236684/apply-pandas-function-to-column-to-create-multiple-new-columns
temp = list(zip(*instancesDF['postTextPOSTags'].map(get_pos_related_features)))
for i, c in enumerate(pos_list): 
    instancesDF['featCountPOS_'+c] = temp[i]
instancesDF = instancesDF.loc[:, (instancesDF != 0).any(axis=0)]

In [36]:
#instancesDF.shape
instance_index = 4
temp = instancesDF[[col for col in instancesDF if col.startswith('featCountPOS_')]].iloc[instance_index]
print(instancesDF.postText[instance_index]); print()
print(nltk.pos_tag(tokenizer(instancesDF.postText[instance_index][0]))); print()
print(temp[temp != 0])



featCountPOS_JJ     1
featCountPOS_NN     2
featCountPOS_NNS    2
Name: 4, dtype: int64


### - Named Entity Recognition

As was mentioned before, the clickbait posts exploit the curiosity gap in the user. This includes leaving out names (e.g Trump) and instead using generic nouns (e.g. This president). Named entity recognition aims, as its name suggests, at finding out the named entities in a text. Our hypothesis regarding this feature is that if a potsText contains a named entity (NE) then it is more probable that it is not clickbait. 
To build this feature we used Spacy's entity tagger. We chose this option over the simpler NLTK named entity chunker given that the latter requires a training step on a corpus. Instead, the Spacy tagger that we used has already been trained on a medium sized corpus, it is a english multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl [CITE](https://spacy.io/models/en). We acknowlede there may be problems recognizing some entities given the difference in corpus (COMPLETE).



In [37]:
import spacy
from spacy import displacy # install with conda 
from collections import Counter
import en_core_web_md # download with python -m spacy download en_core_web_sm
nlp = en_core_web_md.load()

In [38]:
doc = nlp(instancesDF.postTextClean[35])
print('[' + labelsDF.truthClass[35] + '] ' + instancesDF.postTextClean[25])
print()
print([(X.text, X.label_) for X in doc.ents])
print(1 if len(doc.ents) > 0 else 0)

[clickbait] 5 inconsistencies Jurassic World drive scientists crazy BIVideo

[]
0


In [39]:
instancesDF['featIsNEPresent'] = instancesDF.postTextClean.apply(lambda x: 1 if len((nlp(x)).ents) > 0 else 0)

### - Sentiment

In order to grab the attention of the reader clickbait posts can use words that exagerate the message. This can be quantified by using the VADER sentiment score. VADER is a lexicon and rule-based sentiment analysis tool that is tunned to work with social media text. It works by computing a valence score for each word in the lexicon and computing a 'compound' value by aggregating them and normalizing the resulting value between -1 and 1 ([CITE](https://github.com/cjhutto/vaderSentiment)). 

In [40]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer




In [41]:
sid = SentimentIntensityAnalyzer()
ss = sid.polarity_scores(instancesDF.postTextClean[0])
print(instancesDF.postTextClean[0])
print(ss)


Apples iOS 9 App thinning feature give phones storage boost
{'neg': 0.0, 'neu': 0.748, 'pos': 0.252, 'compound': 0.4019}


In [42]:
instancesDF['featSentiment'] = instancesDF.postTextClean.apply(lambda x: (sid.polarity_scores(x))['compound'])


### - Similarity

Another feature that we consider is the similarity between two documents. Specifically, we wanted to measure how close is the postText to the actual content of the target article. For this, we compute the similarity between: the postText and the targetTitle, the postText and the targetParagraphs, and the postText and the targetKeywords. For this we use the similarity function included in Spacy. The algorithm works by comparing word vector representation of the sentences (COMPLETE). 

In [43]:
text1 = instancesDF[['postTextClean']].iloc[3,0]; print(text1)
text2 = instancesDF[['targetTitle']].iloc[3,0]; print(text2)
sim = nlp(text1).similarity(nlp(text2)); print(sim)

How theme parks like Disney World left middle class behind
How theme parks like Disney World left the middle class behind
0.9958699016922942


In [44]:
# https://stackoverflow.com/questions/12182744/python-pandas-apply-a-function-with-arguments-to-a-series
compute_similarity = lambda x, col1, col2: nlp(x[col1]).similarity(nlp(x[col2]))
instancesDF['featSimilarityPostTextTargetTitle'] = instancesDF.apply(compute_similarity, axis=1, args=('postTextClean','targetTitleClean'))
instancesDF['featSimilarityPostTextTargetParagraphs'] = instancesDF.apply(compute_similarity, axis=1, args=('postTextClean','targetParagraphsClean'))
instancesDF['featSimilarityPostTextTargetKeywords'] = instancesDF.apply(compute_similarity, axis=1, args=('postTextClean','targetKeywordsClean'))

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mo

## Save feature sets

In [46]:
filter_col = ['id'] + [col for col in instancesDF if col.startswith('feat')]
featureSet = instancesDF[filter_col]
print(featureSet.shape)
featureSet.head()


(2459, 501)


Unnamed: 0,id,featNumCharPostText,featNumCharTargetTitle,featNumCharTargetDescription,featNumCharTargetKeywords,featNumCharTargetCaptions,featNumCharTargetParagraphs,featDiffCharPostText_TargetCaptions,featDiffCharPostText_TargetDescription,featDiffCharPostText_TargetKeywords,...,featCountPOS_WRB_NNP_NN,featCountPOS_WRB_NNP_NNS,featCountPOS_WRB_NNP_VBZ,featCountPOS_WRB_RB_NN,featCountPOS_WRB_VBZ_NNS,featIsNEPresent,featSentiment,featSimilarityPostTextTargetTitle,featSimilarityPostTextTargetParagraphs,featSimilarityPostTextTargetKeywords
0,608310377143799810,59,87,141,66,2059,2861,2000,82,7,...,0,0,0,0,0,1,0.4019,0.960639,0.854196,0.0
1,609297109095972864,75,51,119,120,1875,309,1800,44,45,...,0,0,0,0,0,1,0.6369,0.746968,0.795202,0.746093
2,609504474621612032,70,51,133,0,337,2903,267,63,70,...,0,0,0,0,0,1,0.34,0.98548,0.875466,0.0
3,609748367049105409,58,58,94,120,846,8127,788,36,62,...,0,0,0,0,0,1,0.3612,1.0,0.892577,0.709239
4,608688782821453825,70,26,81,47,149,4006,79,11,23,...,0,0,0,0,0,1,-0.7003,0.867288,0.858466,0.942257


In [47]:
featureSet.to_csv('feature_set_'+dataset_size+'.csv')


## Save labels 

In [48]:
labelsDF.head()

Unnamed: 0,id,truthClass,truthJudgments,truthMean,truthMedian,truthMode
0,608310377143799810,no-clickbait,"[0.0, 0.6666667, 0.0, 0.33333334, 0.0]",0.2,0.0,0.0
1,609297109095972864,no-clickbait,"[0.6666667, 0.0, 0.0, 0.0, 0.0]",0.133333,0.0,0.0
2,609504474621612032,clickbait,"[0.33333334, 0.6666667, 1.0, 0.0, 0.6666667]",0.533333,0.666667,0.666667
3,609748367049105409,no-clickbait,"[1.0, 0.0, 0.33333334, 0.33333334, 0.6666667]",0.466667,0.333333,0.333333
4,608688782821453825,clickbait,"[1.0, 0.33333334, 0.6666667, 0.33333334, 1.0]",0.666667,0.666667,1.0


In [49]:
filter_col = ['id', 'truthClass', 'truthMean', 'truthJudgments']
labelsSet = labelsDF[filter_col]
labelsSet.head()
labelsSet.to_csv('labels_set_'+dataset_size+'.csv')