In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from analysis import get_site

import re

# Show full cell contents (headlines, URLs) without truncation.
# `None` is the supported "no limit" value (pandas >= 1.0); the old `-1`
# sentinel is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
# Regular-expression patterns for token tags. They are meant to be applied
# with `re.match`, i.e. anchored at the start of a tag only.
# NOTE(review): the tag names look like CLAWS part-of-speech tags and USAS
# semantic tags -- confirm against the tagger documentation.

# --- Part-of-speech based patterns ---
superlatives = r'DAT|JJT|RGT|RRT'
degree_adverbs = r'RG(QV?|R|T)?'
comparative_adverbs = r'RGR|RRR'
proper_nouns = r'NP[12]'
dates = r'NP(D[12]|M[12])'
numbers = r'M[CDF]\w*'
fp_pronouns = r'PPI\w+'
negations = r'XX'

# --- Semantic-tag based patterns ---
all_emotion = r'E[\w\.\+-]+'
emotion_general = r'E1[\+-]'
pos_emotion = r'E[2-6](\.[12])?\++'
neg_emotion = r'E[2-6](\.[12])?-+'
time_related = r'T[1-4][\.\d\+-]*'
sense_words = r'X3[\.\d\+-]*'
movement_words = r'M[1-6]'
relationships = r'S3.*'

# Mixes part-of-speech alternatives with the M7/M8 semantic alternatives.
spatial_words = r'RL|ND1|NNL[12]|M[78]'

# Imagination subject to change
inf_conjunctions = r'CC'
ima_conjunctions = r'CCB|CS'
conjunctions = r'C[A-Z]+'
inf_verb = r'VVN'
ima_verb = r'VV[^N]\w?|VM'
verbs = r'V[A-Z]+'
prepositions = r'I[A-Z]'
articles = r'AT\w*'
ima_determiners = r'DA1|DB\w?'
determiners = r'D[A-Z]\w*'
# Fixed: the previous pattern r'J\[A-Z]+' escaped the opening bracket, so it
# only matched the literal text "J[A-Z]" and never a real tag such as "JJ".
adjectives = r'J[A-Z]+'
nouns = r'N[A-Z]\w*'
pronouns = r'P[A-Z]\w*'
adverbs = r'R[A-Z]+'

exaggeration = r'A13\.[237]'    # Boosters, Maximisers, and Minimisers.
vague_degree = r'A13\.[145]'    # Non-specific, Approximators, Compromisers

# One or more consecutive punctuation / quotation characters.
re_punctuation = r'[\.\,\"\'\“\”\`\!\?]+'

In [3]:
# Article metadata: a headerless, semicolon-separated file, so the column
# names are supplied here and the frame is keyed by the 'Index' column.
meta_column_names = ["Index", "URL", "Headline", "Year", "Category", "April Fool", "Set"]
meta = pd.read_csv(
    "../Corpus Construction/meta.csv",
    delimiter=";",
    header=None,
    names=meta_column_names,
).set_index('Index')

# Feature tables, both keyed by their 'index' column.
feats = pd.read_csv("../System/feats_train.csv").set_index('index')
fakes = pd.read_csv("../System/fake_news_feats_train.csv").set_index('index')

# Attach the headline and the site (derived from the URL via `get_site`) to
# each feature row; pandas aligns the assigned Series on the index labels.
feats['headline'] = meta['Headline']
feats['URL'] = meta['URL'].apply(get_site)

In [4]:
# Name of the `feats` column that the ranking and summary cells below inspect.
feature = 'dates'

In [5]:
# Rank all articles by the selected feature, highest value first, and show
# the ten top-scoring articles with their headline, class label and site.
sorted_feats = feats.sort_values(feature, ascending=False)
sorted_feats[[feature, 'headline', 'class', 'URL']].head(10)

Unnamed: 0_level_0,dates,headline,class,URL
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
390,0.046729,Christmas Moved to July by Congressional Order,1,www.holistory.com
580,0.043257,US trade deficit rises to near 9½-year high,0,www.cnbc.com
265,0.038627,Puscifer Announce Australian Dates!,1,www.fourtheye.net
395,0.024465,Ant and Dec fall out; Dec reinvents self as ‘Dec’,1,metro.co.uk
484,0.024038,NFL Superstar Randy Moss Signs with the Charlotte Independence,1,www.charlotteindependence.com
647,0.023529,Oman donates $10m in aid to Japan quake efforts,0,www.arabianbusiness.com
344,0.023392,"Megadeth To Perform ""Peace Sells... but Who's Buying?"" In Its' Entirety At Download Festival",1,thegauntlet.com
788,0.022222,Sunday Sport newspaper owner in administration,0,www.business-sale.com
633,0.021505,Trainer Tony Martin aims for second Irish National win,0,news.bbc.co.uk
772,0.021277,Wrestler’s Widow Settles Suit With WWE,0,investorplace.com


In [6]:
# The ten lowest-ranked articles for the selected feature.
sorted_feats[[feature, 'headline', 'class', 'URL']].tail(10)

Unnamed: 0_level_0,dates,headline,class,URL
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
422,0.0,Rolling Stones cancel Summerfest show replaced by Elton John,1,onmilwaukee.com
411,0.0,Google Reader Rises From The Grave With New (Material) Update v2.0 [APK Download],1,www.androidpolice.com
412,0.0,Domino’s Rolls Out Driverless Delivery Vehicles,1,www.dominos.co.uk
414,0.0,RNIB launches guide cats for the blind scheme,1,newsthump.com
415,0.0,Nintendo Joins the Remaster Crowd with Superman: Remastered,1,www.the-gamers-lounge.com
417,0.0,Fighting the Good Fight: An Iconic Moustache Returns to the Stage,1,andrewolson.com
418,0.0,Apple iChip Rumors Surface,1,pc.net
420,0.0,Seals of the Week,1,uk.ign.com
421,0.0,Pro-Line Unveils Gas Powered Pro-2 Short Course Truck!,1,rcnewz.com
1038,0.0,Look under 25? You’ll need to show ID to buy alcohol at Costcutter,0,www.london-se1.co.uk


In [7]:
# Per-class median of the selected feature. The column is selected *before*
# aggregating so that the string columns (headline, URL) are never passed to
# `median`, which raises a TypeError for non-numeric data in pandas >= 2.0.
feats.groupby('class')[feature].median()

class
0    0.001594
1    0.000000
Name: dates, dtype: float64

In [8]:
# Per-class mean of the selected feature. The column is selected *before*
# aggregating so that the string columns (headline, URL) are never passed to
# `mean`, which raises a TypeError for non-numeric data in pandas >= 2.0.
feats.groupby('class')[feature].mean()

class
0    0.003405
1    0.002310
Name: dates, dtype: float64

In [9]:
# Articles whose value for the selected feature is exactly zero.
feats.loc[feats[feature] == 0.0, ['headline', 'URL', 'class']]

Unnamed: 0_level_0,headline,URL,class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Internet news guru Matt Drudge uncovers dating scandal,www.theregister.co.uk,1
2,Aspyr Announces New Sims Expansion Pack,www.insidemacgames.com,1
4,Buffett Buys Krispy Kreme,www.fool.com,1
9,Transparent Aluminum Used To Make Tinfoil Hats,www.technovelgy.com,1
13,"New York City Stunned By Seth Godin's ""Blogging Iceberg""",steveshu.typepad.com,1
14,Satellite Spam Scan,www.eridani.co.uk,1
15,World of Wordcraft,uk.ign.com,1
16,Apple founder Jobs joins IKEA,www.theregister.co.uk,1
17,Bush Cancels Space Shuttle Program,www.spacedaily.com,1
18,Click here for your custom-built Reg,www.theregister.co.uk,1


Print sample texts from the tagged corpus, highlighting the tokens that match a given tag.

In [10]:
def print_sample(index, tag):
    """Print one tagged document, one sentence per line, highlighting matches.

    Reads ``../Tagged_Corpus/<index>/<index>.txt.pos.sem`` and prints every
    token wrapped in ANSI colour codes: bright blue when the token's
    part-of-speech tag or any of its semantic tags matches ``tag``, bright
    yellow otherwise.

    Parameters
    ----------
    index
        Document identifier; used for both the directory and the file name.
    tag
        Regular expression applied with ``re.match`` semantics (anchored at
        the start of the tag only).

    Returns
    -------
    None
        The coloured sentences are written to standard output.

    Raises
    ------
    OSError
        If the tagged file cannot be opened.
    """
    highlighted = '\033[94m'
    plain = '\033[93m'
    end = '\033[0m'

    sem_path = '../Tagged_Corpus/{0}/{0}.txt.pos.sem'.format(index)
    with open(sem_path) as sem_file:
        doc = [line.strip() for line in sem_file]

    # Token lines: two numeric fields, POS tag, word, then the semantic tags.
    reg_line = re.compile(r'(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(.+)')
    # Four-field lines: sentence breaks, punctuation and markup.
    reg_oth = re.compile(r'(\d+)\s+(\d+)\s+(\S+)\s+(\S+)')
    reg_quote = re.compile(r'<\/?quote>')
    # Compile once instead of resolving the pattern for every token.
    reg_tag = re.compile(tag)

    def _render(tokens):
        # Join (word, colour) pairs into one coloured, space-separated string.
        return ''.join(colour + word + end + " " for word, colour in tokens)

    sentences = []
    curr_sent = []  # (word, colour) pairs of the sentence being built

    for line in doc:
        m = reg_line.match(line)
        if m:
            pos, word = m.group(3), m.group(4)
            # Only the first semantic field is used: strip any "[..." suffix
            # and split "/"-joined alternatives into individual tags.
            sems = m.group(5).split(" ")[0].split("[")[0].split("/")
            is_match = bool(reg_tag.match(pos)) or any(
                reg_tag.match(sem) for sem in sems
            )
            curr_sent.append((word, highlighted if is_match else plain))
            continue

        o = reg_oth.match(line)
        if not o:
            continue

        if o.group(3) == '-----':
            # End of a sentence: flush what has been collected so far.
            sentences.append(_render(curr_sent))
            curr_sent = []
        elif o.group(3) != 'NULL':
            # Punctuation or another untagged token.
            curr_sent.append((o.group(4), plain))
        elif reg_quote.match(o.group(4)):
            # Quote markup is shown as a plain double quote; any other NULL
            # line is dropped.
            curr_sent.append(('"', plain))

    # Flush a trailing sentence that was not closed by a sentence break.
    if curr_sent:
        sentences.append(_render(curr_sent))

    for sentence in sentences:
        print(sentence)

In [11]:
# NOTE: this rebinds `feature` from a `feats` column name to a tag regex for
# use with `print_sample`. The earlier cells that index `feats` by `feature`
# will not work if re-run after this cell without resetting `feature` first.
feature = time_related

In [12]:
# Article 580 is among the highest-ranked articles in the `dates` table above.
print_sample(580, feature)


[93mThe[0m [93mU.S.[0m [93mtrade[0m [93mdeficit[0m [93mincreased[0m [93mto[0m [93ma[0m [93mnear[0m [93m9-year[0m [93mhigh[0m [93min[0m [94mFebruary[0m [93mas[0m [93mboth[0m [93mexports[0m [93mand[0m [93mimports[0m [93mrose[0m [93mto[0m [93mrecord[0m [93mhighs[0m [93m,[0m [93mbut[0m [93mthe[0m [93mshortfall[0m [93mwith[0m [93mChina[0m [93mnarrowed[0m [93msharply[0m [93m.[0m 
[93mThe[0m [93mCommerce[0m [93mDepartment[0m [93msaid[0m [93mon[0m [94mThursday[0m [93mthe[0m [93mtrade[0m [93mgap[0m [93mrose[0m [93m1.6[0m [93mpercent[0m [93mto[0m [93m$57.6[0m [93mbillion[0m [93m.[0m 
[93mThat[0m [93mwas[0m [93mthe[0m [93mhighest[0m [93mlevel[0m [93msince[0m [94mOctober[0m [93m2008[0m [93mand[0m [93mfollowed[0m [93ma[0m [93mslightly[0m [93mdownwardly[0m [93mrevised[0m [93m$56.7[0m [93mbillion[0m [93mshortfall[0m [93min[0m [94mJanuary[0m [93m.[0m 
[93mThe[0m [93md

In [13]:
# Article 412 has a `dates` value of 0.0 in the ranking above.
print_sample(412, feature)


[93mOkay[0m [93m,[0m [93mso[0m [93mits[0m [93mbeen[0m [93mhard[0m [93mto[0m [93mkeep[0m [93mit[0m [93mquiet[0m [93mand[0m [93mthere[0m [93mhave[0m [93mbeen[0m [93ma[0m [93mfew[0m [93mnear-leaks[0m [93malong[0m [93mthe[0m [93mway[0m [93m,[0m [93mbut[0m [93mwe[0m [93mcan[0m [93mfinally[0m [93mconfirm[0m [93mthat[0m [93mthe[0m [93mrumours[0m [93mare[0m [93mtrue[0m [93mthe[0m [93mworlds[0m [93mfirst[0m [93mdriverless[0m [93mpizza[0m [93mdelivery[0m [93mvehicle[0m [93mis[0m [93mhere[0m [93m![0m 
[93mYouve[0m [93mseen[0m [93mGoogles[0m [93mself-driving[0m [93mcars[0m [93m,[0m [93myouve[0m [93mseen[0m [93mour[0m [93mdrone[0m [93mdeliveries[0m [93mwith[0m [93mthe[0m [93mDomiCopter[0m [93mand[0m [93mnow[0m [93myoure[0m [93mgoing[0m [93mto[0m [93msee[0m [93ma[0m [93mworld[0m [93mfirst[0m [93min[0m [93mpizza[0m [93mdelivery[0m [93mhistory[0m [93mas[0m [93mDom