# Preprocessing and Feature Extraction

## Imports

In [1]:
import json
import nltk
import numpy as np
import pandas as pd
import itertools 

In [2]:
import PyDictionary
from vocabulary.vocabulary import Vocabulary 
from nltk.corpus import wordnet as wn # I had to use this instead PyDictionary is too bad!

## Read files

There are two different files. The first one, called "instances" contains all the information for a given post. The second, "truth", contains the labels of each instance. The following are the schemas of these files:

`""" Fields in instances.jsonl: <br/>
 { <br/>
    "id": "<instance id>", <br/>
    "postTimestamp": "<weekday> <month> <day> <hour>:<minute>:<second> <time_offset> <year>", <br/>
    "postText": ["<text of the post with links removed>"], <br/>
    "postMedia": ["<path to a file in the media archive>"], <br/>
    "targetTitle": "<title of target article>", <br/>
    "targetDescription": "<description tag of target article>", <br/>
    "targetKeywords": "<keywords tag of target article>", <br/>
    "targetParagraphs": ["<text of the ith paragraph in the target article>"], <br/>
    "targetCaptions": ["<caption of the ith image in the target article>"] <br/>
  } """`


`""" Fields in truth.jsonl:
  {
    "id": "<instance id>",
    "truthJudgments": [<number in [0,1]>],
    "truthMean": <number in [0,1]>,
    "truthMedian": <number in [0,1]>,
    "truthMode": <number in [0,1]>,
    "truthClass": "clickbait | no-clickbait"
  } """`

In [3]:
def loadDataset(size):
    instances = []
    labels = []
    fileName = 'trainSmall' if size == 'small' else 'trainLarge'
    with open('data/'+fileName+'/instances.jsonl') as file:
        for line in file:
            instances.append(json.loads(line))
    with open('data/'+fileName+'/truth.jsonl') as file:
        for line in file:
            labels.append(json.loads(line))
    return instances, labels

In [4]:
dataset_size = 'small'

In [5]:
instances, labels = loadDataset(dataset_size)

## Preprocessing

In [6]:
from nltk.corpus import stopwords

In [7]:
instancesDF = pd.DataFrame(instances)
instancesDF.iloc[0,:]

id                                                  608310377143799810
postMedia                                                           []
postText             [Apple's iOS 9 'App thinning' feature will giv...
postTimestamp                           Tue Jun 09 16:31:10 +0000 2015
targetCaptions       ['App thinning' will be supported on Apple's i...
targetDescription    'App thinning' will be supported on Apple's iO...
targetKeywords       Apple,gives,gigabytes,iOS,9,app,thinning,featu...
targetParagraphs     [Paying for a 64GB phone only to discover that...
targetTitle          Apple gives back gigabytes: iOS 9 'app thinnin...
Name: 0, dtype: object

In [8]:
# In this function the text variable is a list of length 1
def preprocess(text):
    sw = set(stopwords.words('english'))    
    filtered_sentence = []
    if isinstance(text, list):      
        for t in text:            
            word_tokens = nltk.word_tokenize(t)
            filtered_sentence = filtered_sentence + [w for w in word_tokens if not w in sw] 
    else:
        word_tokens = nltk.word_tokenize(text)
        filtered_sentence = filtered_sentence + [w for w in word_tokens if not w in sw] 
    return filtered_sentence

In [9]:
instancesDF['postTextTokens'] = instancesDF.postText.apply(preprocess)
instancesDF['postTextClean'] = instancesDF.postTextTokens.apply(' '.join)

instancesDF['targetCaptionsTokens'] = instancesDF.targetCaptions.apply(preprocess)
instancesDF['targetCaptionsClean'] = instancesDF.targetCaptionsTokens.apply(' '.join)

instancesDF['targetDescriptionTokens'] = instancesDF.targetDescription.apply(preprocess)
instancesDF['targetDescriptionClean'] = instancesDF.targetDescriptionTokens.apply(' '.join)

instancesDF['targetKeywordsTokens'] = instancesDF.targetKeywords.apply(preprocess)
instancesDF['targetKeywordsClean'] = instancesDF.targetKeywordsTokens.apply(' '.join)

instancesDF['targetParagraphsTokens'] = instancesDF.targetParagraphs.apply(preprocess)
instancesDF['targetParagraphsClean'] = instancesDF.targetParagraphsTokens.apply(' '.join)

instancesDF['targetTitleTokens'] = instancesDF.targetTitle.apply(preprocess)
instancesDF['targetTitleClean'] = instancesDF.targetTitleTokens.apply(' '.join)

In [10]:
instancesDF.head()

Unnamed: 0,id,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle,postTextTokens,...,targetCaptionsTokens,targetCaptionsClean,targetDescriptionTokens,targetDescriptionClean,targetKeywordsTokens,targetKeywordsClean,targetParagraphsTokens,targetParagraphsClean,targetTitleTokens,targetTitleClean
0,608310377143799810,[],[Apple's iOS 9 'App thinning' feature will giv...,Tue Jun 09 16:31:10 +0000 2015,['App thinning' will be supported on Apple's i...,'App thinning' will be supported on Apple's iO...,"Apple,gives,gigabytes,iOS,9,app,thinning,featu...",[Paying for a 64GB phone only to discover that...,Apple gives back gigabytes: iOS 9 'app thinnin...,"[Apple, 's, iOS, 9, 'App, thinning, ', feature...",...,"['App, thinning, ', supported, Apple, 's, iOS,...",'App thinning ' supported Apple 's iOS 9 later...,"['App, thinning, ', supported, Apple, 's, iOS,...",'App thinning ' supported Apple 's iOS 9 later...,"[Apple, ,, gives, ,, gigabytes, ,, iOS,9, ,, a...","Apple , gives , gigabytes , iOS,9 , app , thin...","[Paying, 64GB, phone, discover, significantly,...",Paying 64GB phone discover significantly reduc...,"[Apple, gives, back, gigabytes, :, iOS, 9, 'ap...",Apple gives back gigabytes : iOS 9 'app thinni...
1,609297109095972864,[media/609297109095972864.jpg],[RT @kenbrown12: Emerging market investors are...,Fri Jun 12 09:52:05 +0000 2015,"[Stocks Fall as Investors Watch Central Banks,...",Global investors have yanked $9.3 billion from...,"emerging market,emerging markets,em flows,em i...","[Emerging markets are out of favor., Global in...",Emerging Markets Suffer Largest Outflow in Sev...,"[RT, @, kenbrown12, :, Emerging, market, inves...",...,"[Stocks, Fall, Investors, Watch, Central, Bank...",Stocks Fall Investors Watch Central Banks Do T...,"[Global, investors, yanked, $, 9.3, billion, s...",Global investors yanked $ 9.3 billion stocks d...,"[emerging, market, ,, emerging, markets, ,, em...","emerging market , emerging markets , em flows ...","[Emerging, markets, favor, ., Global, investor...",Emerging markets favor . Global investors yank...,"[Emerging, Markets, Suffer, Largest, Outflow, ...",Emerging Markets Suffer Largest Outflow Seven ...
2,609504474621612032,[],[U.S. Soccer should start answering tough ques...,Fri Jun 12 23:36:05 +0000 2015,[US to vote for Ali in FIFA election and not B...,A U.S. Senator's scathing letter questioned U....,,"[WINNIPEG, Manitoba – The bubble U.S. Soccer i...",U.S. Soccer should start answering tough quest...,"[U.S., Soccer, start, answering, tough, questi...",...,"[US, vote, Ali, FIFA, election, Blatter, US, v...",US vote Ali FIFA election Blatter US vote Ali ...,"[A, U.S, ., Senator, 's, scathing, letter, que...",A U.S . Senator 's scathing letter questioned ...,[],,"[WINNIPEG, ,, Manitoba, –, The, bubble, U.S., ...","WINNIPEG , Manitoba – The bubble U.S. Soccer p...","[U.S., Soccer, start, answering, tough, questi...",U.S. Soccer start answering tough questions Ho...
3,609748367049105409,[],[How theme parks like Disney World left the mi...,Sat Jun 13 15:45:13 +0000 2015,"[Some 1,000 persons turned out in Albuquerque,...","America's top family vacation spots, like the ...","disney, disney world, disney ticket prices, di...",[When Walt Disney World opened in an Orlando s...,How theme parks like Disney World left the mid...,"[How, theme, parks, like, Disney, World, left,...",...,"[Some, 1,000, persons, turned, Albuquerque, ,,...","Some 1,000 persons turned Albuquerque , New Me...","[America, 's, top, family, vacation, spots, ,,...","America 's top family vacation spots , like ``...","[disney, ,, disney, world, ,, disney, ticket, ...","disney , disney world , disney ticket prices ,...","[When, Walt, Disney, World, opened, Orlando, s...",When Walt Disney World opened Orlando swamp 19...,"[How, theme, parks, like, Disney, World, left,...",How theme parks like Disney World left middle ...
4,608688782821453825,[media/608688782821453825.jpg],[Could light bulbs hurt your health? One compa...,Wed Jun 10 17:34:49 +0000 2015,[Electric lights have made the world safer and...,One company will put a health notice on all th...,"health, Should there be warning labels on your...",[(CNN)The light bulb always makes the world's ...,Warning labels on your light bulbs,"[Could, light, bulbs, hurt, health, ?, One, co...",...,"[Electric, lights, made, world, safer, made, p...",Electric lights made world safer made people s...,"[One, company, put, health, notice, packages, ...",One company put health notice packages lightin...,"[health, ,, Should, warning, labels, light, bu...","health , Should warning labels light bulbs ? -...","[(, CNN, ), The, light, bulb, always, makes, w...",( CNN ) The light bulb always makes world 's t...,"[Warning, labels, light, bulbs]",Warning labels light bulbs


## Define features

The following is the list of features to implement:
* numChar(TargetTitle, Post, TargetParagraphs)
* diffNumChar(TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* ratioNumChar(TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* numWords (TargetTitle, Post, TargetParagraphs)
* diffNumWords (TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* ratioNumWords (TargetTitleVsPost, TargetTitleVsTargetParagraphs, PostVsTargetParagraphs)
* numFormalInformal (TargetTitle, Post, TargetParagraphs)
* ratioFormalInformal (TargetTitle, Post, TargetParagraphs)

In [11]:
instancesDF.columns

Index(['id', 'postMedia', 'postText', 'postTimestamp', 'targetCaptions',
       'targetDescription', 'targetKeywords', 'targetParagraphs',
       'targetTitle', 'postTextTokens', 'postTextClean',
       'targetCaptionsTokens', 'targetCaptionsClean',
       'targetDescriptionTokens', 'targetDescriptionClean',
       'targetKeywordsTokens', 'targetKeywordsClean', 'targetParagraphsTokens',
       'targetParagraphsClean', 'targetTitleTokens', 'targetTitleClean'],
      dtype='object')

In [12]:
ratio_corrector = lambda x: 0.0001 if x == 0 else x

In [13]:
# numChars
instancesDF['featNumCharPostText'] = instancesDF.postTextClean.apply(len)
instancesDF['featNumCharTargetTitle'] = instancesDF.targetTitleClean.apply(len)
instancesDF['featNumCharTargetDescription'] = instancesDF.targetDescriptionClean.apply(len)
instancesDF['featNumCharTargetKeywords'] = instancesDF.targetKeywordsClean.apply(len)
instancesDF['featNumCharTargetCaptions'] = instancesDF.targetCaptionsClean.apply(len)
instancesDF['featNumCharTargetParagraphs'] = instancesDF.targetParagraphsClean.apply(len)

In [14]:
# diffNumChars
base_text_orig = 'featNumChar'
base_text_new = 'featDiffChar'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f] - instancesDF[base_text_orig+s]
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [15]:
# ratioNumChars
base_text_orig = 'featNumChar'
base_text_new = 'featRatioChar'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f].apply(ratio_corrector) / instancesDF[base_text_orig+s].apply(ratio_corrector)
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [16]:
# numWords
instancesDF['featNumWordsPostText'] = instancesDF.postTextTokens.apply(len)
instancesDF['featNumWordsTargetCaptions'] = instancesDF.targetCaptionsTokens.apply(len)
instancesDF['featNumWordsTargetDescription'] = instancesDF.targetDescriptionTokens.apply(len)
instancesDF['featNumWordsTargetKeywords'] = instancesDF.targetKeywordsTokens.apply(len)
instancesDF['featNumWordsTargetParagraphs'] = instancesDF.targetParagraphsTokens.apply(len)
instancesDF['featNumWordsTargetTitle'] = instancesDF.targetTitleTokens.apply(len)

In [17]:
# diffNumWords
base_text_orig = 'featNumWords'
base_text_new = 'featDiffWords'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f] - instancesDF[base_text_orig+s]
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [18]:
# ratioNumWords
base_text_orig = 'featNumWords'
base_text_new = 'featRatioWords'
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for f, s in list(itertools.combinations(elements, 2)):
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_orig+f].apply(ratio_corrector) / instancesDF[base_text_orig+s].apply(ratio_corrector)
    instancesDF[base_text_new + f + '_' + s] = instancesDF[base_text_new + f + '_' + s].apply(abs)

In [19]:
# Number of formal and informal english words
def count_words(words_tokens, formal=False):
    num_elements = 0
    for word in words_tokens:
        output = len(wn.synsets(word))
        if formal and output != 0:
            num_elements += 1
        if not formal and output == 0:
            num_elements += 1
    return num_elements    

In [20]:
# count formal words
num_formal_words = lambda x: count_words(x, True)

instancesDF['featNumFormalWordsPostText'] = instancesDF.postTextTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetTitle'] = instancesDF.targetTitleTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetDescription'] = instancesDF.targetDescriptionTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetKeywords'] = instancesDF.targetKeywordsTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetCaptions'] = instancesDF.targetCaptionsTokens.apply(num_formal_words)
instancesDF['featNumFormalWordsTargetParagraphs'] = instancesDF.targetParagraphsTokens.apply(num_formal_words)

In [21]:
# count informal words
num_informal_words = lambda x: count_words(x, False)

instancesDF['featNumInformalWordsPostText'] = instancesDF.postTextTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetTitle'] = instancesDF.targetTitleTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetDescription'] = instancesDF.targetDescriptionTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetKeywords'] = instancesDF.targetKeywordsTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetCaptions'] = instancesDF.targetCaptionsTokens.apply(num_informal_words)
instancesDF['featNumInformalWordsTargetParagraphs'] = instancesDF.targetParagraphsTokens.apply(num_informal_words)

In [22]:
# percent of formal words
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for e in elements:    
    instancesDF['featPercentFormalWords' + e ] = instancesDF['featNumFormalWords'+e].apply(ratio_corrector) / instancesDF['featNumWords'+e].apply(ratio_corrector)

In [23]:
# percent of informal words
elements = ['PostText', 'TargetCaptions', 'TargetDescription', 'TargetKeywords', 'TargetParagraphs', 'TargetTitle']
for e in elements:    
    instancesDF['featPercentInformalWords' + e ] = instancesDF['featNumInformalWords'+e].apply(ratio_corrector) / instancesDF['featNumWords'+e].apply(ratio_corrector)

## Our faetures

* readability
* NNPs (maybe POS)
* POS 2-gram NNP NNP
* TF-iDF

## Save feature sets

In [24]:
filter_col = ['id'] + [col for col in instancesDF if col.startswith('feat')]
featureSet = instancesDF[filter_col]
featureSet.head()

Unnamed: 0,id,featNumCharPostText,featNumCharTargetTitle,featNumCharTargetDescription,featNumCharTargetKeywords,featNumCharTargetCaptions,featNumCharTargetParagraphs,featDiffCharPostText_TargetCaptions,featDiffCharPostText_TargetDescription,featDiffCharPostText_TargetKeywords,...,featPercentFormalWordsTargetDescription,featPercentFormalWordsTargetKeywords,featPercentFormalWordsTargetParagraphs,featPercentFormalWordsTargetTitle,featPercentInformalWordsPostText,featPercentInformalWordsTargetCaptions,featPercentInformalWordsTargetDescription,featPercentInformalWordsTargetKeywords,featPercentInformalWordsTargetParagraphs,featPercentInformalWordsTargetTitle
0,608310377143799810,66,94,157,98,2199,3030,2133,91,32,...,0.642857,0.409091,0.636546,0.764706,0.307692,0.337017,0.357143,0.590909,0.363454,0.235294
1,609297109095972864,91,51,126,147,2075,331,1984,35,56,...,0.684211,0.642857,0.705882,1.0,0.5,0.361217,0.315789,0.357143,0.294118,1.4e-05
2,609504474621612032,79,53,150,0,368,3155,289,71,79,...,0.615385,1.0,0.682081,1.0,0.307692,0.470588,0.384615,1.0,0.317919,1.3e-05
3,609748367049105409,58,58,108,136,917,8665,859,50,78,...,0.684211,0.64,0.676351,0.9,0.1,0.423313,0.315789,0.36,0.323649,0.1
4,608688782821453825,74,26,85,54,157,4338,83,11,20,...,0.857143,0.5,0.724476,1.0,0.230769,0.185185,0.142857,0.5,0.275524,2.5e-05


In [25]:
featureSet.to_csv('feature_set_'+dataset_size+'.csv')

## Save labels 

In [70]:
labelsDF = pd.DataFrame(labels)
labelsDF.head()

Unnamed: 0,id,truthClass,truthJudgments,truthMean,truthMedian,truthMode
0,608310377143799810,no-clickbait,"[0.0, 0.6666667, 0.0, 0.33333334, 0.0]",0.2,0.0,0.0
1,609297109095972864,no-clickbait,"[0.6666667, 0.0, 0.0, 0.0, 0.0]",0.133333,0.0,0.0
2,609504474621612032,clickbait,"[0.33333334, 0.6666667, 1.0, 0.0, 0.6666667]",0.533333,0.666667,0.666667
3,609748367049105409,no-clickbait,"[1.0, 0.0, 0.33333334, 0.33333334, 0.6666667]",0.466667,0.333333,0.333333
4,608688782821453825,clickbait,"[1.0, 0.33333334, 0.6666667, 0.33333334, 1.0]",0.666667,0.666667,1.0


In [76]:
filter_col = ['id', 'truthClass', 'truthMean', 'truthJudgments']
labelsSet = labelsDF[filter_col]
labelsSet.head()
labelsSet.to_csv('labels_set_'+dataset_size+'.csv')