In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from analysis import get_site

import re

# Show full cell contents (headlines, URLs) instead of truncating them.
# None is the supported "no limit" value; the old -1 sentinel is deprecated
# and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
# Regular-expression fragments that select tokens by tag.
# print_sample applies them with re.match, i.e. anchored at the start of the
# tag only, to a token's POS tag and to each of its semantic tags.
# NOTE(review): the tag names look like CLAWS POS tags and USAS semantic
# categories -- confirm against the tagger documentation.
superlatives = r'DAT|JJT|RGT|RRT'
degree_adverbs = r'RG(QV?|R|T)?'
comparative_adverbs = r'RGR|RRR'
proper_nouns = r'NP[12]'
dates = r'NP(D[12]|M[12])'
numbers = r'M[CDF]\w*'
fp_pronouns = r'PPI\w+'
negations = r'XX'

all_emotion = r'E[\w\.\+-]+'
emotion_general = r'E1[\+-]'
pos_emotion = r'E[2-6](\.[12])?\++'
neg_emotion = r'E[2-6](\.[12])?-+'
time_related = r'T[1-4][\.\d\+-]*'
sense_words = r'X3[\.\d\+-]*'
movement_words = r'M[1-6]'
relationships = r'S3.*'

spatial_words = r'RL|ND1|NNL[12]|M[78]'

# Imagination subject to change
inf_conjunctions = r'CC'
ima_conjunctions = r'CCB|CS'
conjunctions = r'C[A-Z]+'
inf_verb = r'VVN'
ima_verb = r'VV[^N]\w?|VM'
verbs = r'V[A-Z]+'
prepositions = r'I[A-Z]'
articles = r'AT\w*'
ima_determiners = r'DA1|DB\w?'
determiners = r'D[A-Z]\w*'
# Was r'J\[A-Z]+': the escaped bracket made the pattern match only the literal
# text "J[A-Z]", so no tag starting with J followed by letters could match.
adjectives = r'J[A-Z]+'
nouns = r'N[A-Z]\w*'
pronouns = r'P[A-Z]\w*'
adverbs = r'R[A-Z]+'

exaggeration = r'A13\.[237]'    # Boosters, Maximisers, and Minimisers.
vague_degree = r'A13\.[145]'    # Non-specific, Approximators, Compromisers

# One or more consecutive punctuation / quotation characters.
re_punctuation = r'[\.\,\"\'\“\”\`\!\?]+'

In [3]:
# Article metadata: a semicolon-delimited file without a header row, so the
# column names are supplied here and 'Index' becomes the row index.
meta_column_names = ["Index", "URL", "Headline", "Year", "Category", "April Fool", "Set"]
meta = pd.read_csv(
    "../Corpus Construction/meta.csv",
    delimiter=";",
    header=None,
    names=meta_column_names,
).set_index('Index')

# Feature tables, both keyed by their 'index' column.
feats = pd.read_csv("../System/feats_train.csv").set_index('index')
fakes = pd.read_csv("../System/fake_news_feats_train.csv").set_index('index')

# Attach the headline and source site to each feature row (aligned on the index).
# get_site comes from the project's analysis module; the displayed values
# suggest it reduces a URL to its host name -- confirm there.
feats = feats.assign(
    headline=meta['Headline'],
    URL=meta['URL'].apply(get_site),
)

In [17]:
# Name of the `feats` column that the ranking and summary cells below inspect.
feature = 'fp_pronouns'

In [5]:
# Rank every row by the selected feature, largest value first,
# then show the ten highest-scoring rows.
sorted_feats = feats.sort_values(feature, ascending=False)
sorted_feats[[feature, 'headline', 'class', 'URL']].head(10)

Unnamed: 0_level_0,fp_pronouns,headline,class,URL
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
421,1.0,Pro-Line Unveils Gas Powered Pro-2 Short Course Truck!,1,rcnewz.com
912,1.0,Corruption Watch: John Boehner's Abramoff Connection,0,www.theatlantic.com
913,1.0,Process That Converts Cotton to Boron Carbide Could Create Armored T-Shirts,0,www.popsci.com
345,1.0,Esperanto For Computational Fluid Dynamics,1,www.symscape.com
883,1.0,Cleveland Motion Controls acquires Kaliburn,0,www.automation.com
671,1.0,Uniguest Expands Portfolio of Guest-Facing Technology Solutions,0,www.businesswire.com
470,1.0,Hens lay rugby eggs as Germany aims for 7 Nations glory after footballing defeat to England,1,www.uk.diplo.de
612,1.0,Virgin America Tops Ranking of U.S. Airlines,0,time.com
521,1.0,Manchester City's parent company buys Uruguayan side Atletico Torque,0,www.bbc.co.uk
461,1.0,Leap years to be scrapped after Surrey scientists discover Earth’s orbit has slowed,1,www.surrey.ac.uk


In [19]:
# Write the complete ranking to sanity.csv (in the working directory).
(
    sorted_feats
    .loc[:, [feature, 'headline', 'class', 'URL']]
    .to_csv("sanity.csv")
)

In [6]:
# The ten lowest-scoring rows for the selected feature.
sorted_feats[[feature, 'headline', 'class', 'URL']].tail(10)

Unnamed: 0_level_0,fp_pronouns,headline,class,URL
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
752,0.0,Transgenic Cows Resist Mastitis-Causing Bacteria : USDA ARS,0,www.ars.usda.gov
271,0.0,"Vatican ""Porn"" Collection to go Online",1,www.tk421.net
739,0.0,Commentary: Financial Empowerment for the Emerging Market Consumer,0,www.usnews.com
738,0.0,Al Qaeda says 7 suicide bombers struck Iraqi prison,0,www.chinadaily.com.cn
280,0.0,Unusual bow-tie penguins discovered off southern Chile,1,earthsky.org
730,0.0,Study: Immigration Bureaucracy a $30 Billion Burden to America,0,www.newsmax.com
728,0.0,Jonah Hill and Leonardo DiCaprio to Play Good Guys This Time,0,www.cheatsheet.com
727,0.0,"This 24-karat gold Nintendo Entertainment System can be yours for only $5,000",0,www.digitaltrends.com
725,0.0,"If There’s a U.S.-China Trade War, China May Have Some ‘Unconventional Weapons’",0,www.nytimes.com
520,0.0,"Leading business students expect technology will boost, not threaten, their careers",0,www.prnewswire.com


In [7]:
# Per-class median of the selected feature.
# Select the column before aggregating: taking the median of the whole frame
# also touches the string columns ('headline', 'URL'), which is wasted work and
# raises a TypeError on pandas versions where numeric_only defaults to False.
feats.groupby('class')[feature].median()

class
0    0.148148
1    0.272727
Name: fp_pronouns, dtype: float64

In [8]:
# Per-class mean of the selected feature.
# Select the column before aggregating: averaging the whole frame also touches
# the string columns ('headline', 'URL'), which is wasted work and raises a
# TypeError on pandas versions where numeric_only defaults to False.
feats.groupby('class')[feature].mean()

class
0    0.199556
1    0.278652
Name: fp_pronouns, dtype: float64

In [9]:
# Rows whose selected feature is exactly zero, picked with a single
# row-mask / column-list lookup.
feats.loc[feats[feature] == 0.0, ['headline', 'URL', 'class']]

Unnamed: 0_level_0,headline,URL,class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,Buffett Buys Krispy Kreme,www.fool.com,1
9,Transparent Aluminum Used To Make Tinfoil Hats,www.technovelgy.com,1
19,Water On Mars,apod.nasa.gov,1
20,SMEGmail offers 1 terabyte storage,www.smh.com.au,1
24,AI Has Been Solved,www.advogato.org,1
34,Airlines ban 'foul' Swedish fish,news.bbc.co.uk,1
38,BREAKING NEWS: Anarchy in the U.S.,www.quizlaw.com,1
41,VOYAGER 1 received by AMSAT-DL group,www.southgatearc.org,1
44,No Duck Left Behind,cce.typepad.com,1
45,White House appoints american sculptor Jeff Koons as new garden designer,placebokatz.blogspot.co.uk,1


Print sample documents from the tagged corpus, colouring the tokens whose POS or semantic tag matches a given pattern.

In [10]:
def print_sample(index, tag):
    """Print one tagged document sentence by sentence, colouring matching tokens.

    Reads ``../Tagged_Corpus/<index>/<index>.txt.pos.sem``. Lines of interest
    start with two numeric fields, followed by a tag field, a word field and,
    for ordinary tokens, a trailing field holding the semantic tags.

    Args:
        index: Document identifier, used for both the directory and file name.
        tag: Regular expression applied with ``re.match`` (anchored at the
            start only) to the token's tag field and to each candidate in the
            first entry of the trailing field (text before any ``[``, split
            on ``/``).

    Every token is wrapped in ANSI colour codes: ``\\033[94m`` when the pattern
    matches, ``\\033[93m`` otherwise. A line whose tag field is ``-----`` ends
    the current sentence. Four-field lines tagged ``NULL`` are dropped unless
    the word is a ``<quote>``/``</quote>`` marker, which is shown as ``"``.

    Returns:
        None. The sentences are written to stdout, one per line.
    """
    hit_colour = '\033[94m'
    plain_colour = '\033[93m'
    reset = '\033[0m'
    quote_marker = r'<\/?quote>'

    # Five-field lines are ordinary tokens; four-field lines are sentence
    # breaks, punctuation or markup.
    token_line = re.compile(r'(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(.+)')
    short_line = re.compile(r'(\d+)\s+(\d+)\s+(\S+)\s+(\S+)')

    def render(tokens):
        # Wrap each word in its colour, reset, and add a trailing space.
        return ''.join(shade + text + reset + " " for text, shade in tokens)

    path = '../Tagged_Corpus/{0}/{0}.txt.pos.sem'.format(index)
    with open(path) as sem_file:
        rows = [raw.strip() for raw in sem_file]

    finished = []
    pending = []    # (word, colour) pairs of the sentence being built

    for row in rows:
        full = token_line.match(row)
        if full:
            pos_tag = full.group(3)
            first_entry = full.group(5).split(" ")[0]
            sem_tags = first_entry.split("[")[0].split("/")
            matched = any(re.match(tag, label) for label in [pos_tag] + sem_tags)
            pending.append((full.group(4), hit_colour if matched else plain_colour))
            continue

        short = short_line.match(row)
        if not short:
            continue

        kind, text = short.group(3), short.group(4)
        if kind == '-----':
            # Sentence boundary: emit what has been collected (possibly nothing).
            finished.append(render(pending))
            pending = []
        elif kind != 'NULL':
            # Punctuation and similar tokens are never highlighted.
            pending.append((text, plain_colour))
        elif re.match(quote_marker, text):
            pending.append(('"', plain_colour))

    # Flush a final sentence that was not closed by a boundary line.
    if pending:
        finished.append(render(pending))

    for sentence in finished:
        print(sentence)

In [11]:
# Switch `feature` to a tag regex for the print_sample calls below.
# NOTE(review): this rebinds `feature`, which the earlier cells use as a
# `feats` column name; re-running those cells after this one will presumably
# fail on the column lookup -- consider a separate variable name.
feature = time_related

In [12]:
# Show document 580 with tokens matching the current `feature` pattern highlighted.
print_sample(580, feature)


[93mThe[0m [93mU.S.[0m [93mtrade[0m [93mdeficit[0m [93mincreased[0m [93mto[0m [93ma[0m [93mnear[0m [93m9-year[0m [93mhigh[0m [93min[0m [94mFebruary[0m [93mas[0m [93mboth[0m [93mexports[0m [93mand[0m [93mimports[0m [93mrose[0m [93mto[0m [93mrecord[0m [93mhighs[0m [93m,[0m [93mbut[0m [93mthe[0m [93mshortfall[0m [93mwith[0m [93mChina[0m [93mnarrowed[0m [93msharply[0m [93m.[0m 
[93mThe[0m [93mCommerce[0m [93mDepartment[0m [93msaid[0m [93mon[0m [94mThursday[0m [93mthe[0m [93mtrade[0m [93mgap[0m [93mrose[0m [93m1.6[0m [93mpercent[0m [93mto[0m [93m$57.6[0m [93mbillion[0m [93m.[0m 
[93mThat[0m [93mwas[0m [93mthe[0m [93mhighest[0m [93mlevel[0m [93msince[0m [94mOctober[0m [94m2008[0m [93mand[0m [93mfollowed[0m [93ma[0m [93mslightly[0m [93mdownwardly[0m [93mrevised[0m [93m$56.7[0m [93mbillion[0m [93mshortfall[0m [93min[0m [94mJanuary[0m [93m.[0m 
[93mThe[0m [93md

In [13]:
# Show document 412 with tokens matching the current `feature` pattern highlighted.
print_sample(412, feature)


[93mOkay[0m [93m,[0m [93mso[0m [93mits[0m [93mbeen[0m [93mhard[0m [93mto[0m [93mkeep[0m [93mit[0m [93mquiet[0m [93mand[0m [93mthere[0m [93mhave[0m [93mbeen[0m [93ma[0m [93mfew[0m [93mnear-leaks[0m [93malong[0m [93mthe[0m [93mway[0m [93m,[0m [93mbut[0m [93mwe[0m [93mcan[0m [93mfinally[0m [93mconfirm[0m [93mthat[0m [93mthe[0m [93mrumours[0m [93mare[0m [93mtrue[0m [93mthe[0m [93mworlds[0m [93mfirst[0m [93mdriverless[0m [93mpizza[0m [93mdelivery[0m [93mvehicle[0m [93mis[0m [93mhere[0m [93m![0m 
[93mYouve[0m [93mseen[0m [93mGoogles[0m [93mself-driving[0m [93mcars[0m [93m,[0m [93myouve[0m [93mseen[0m [93mour[0m [93mdrone[0m [93mdeliveries[0m [93mwith[0m [93mthe[0m [93mDomiCopter[0m [93mand[0m [94mnow[0m [93myoure[0m [94mgoing[0m [94mto[0m [93msee[0m [93ma[0m [93mworld[0m [93mfirst[0m [93min[0m [93mpizza[0m [93mdelivery[0m [94mhistory[0m [93mas[0m [93mDom