**Note:** This code was run on google colab

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from textblob import TextBlob

In [None]:
# Folder that holds the feature CSV files read and written by this notebook.
FEATURES_FOLDER = 'news_features'

# True-news features.
df_True_text = pd.read_csv(
    filepath_or_buffer=os.path.join(FEATURES_FOLDER, 'df_True_text2.csv')
)

# Fake-news features.
df_Fake_text = pd.read_csv(
    filepath_or_buffer=os.path.join(FEATURES_FOLDER, 'df_Fake_text.csv')
)

### Politeness

We compute the politeness of each news article using the `politeness` package from PyPI, which is a port of the Stanford Politeness API: https://pypi.org/project/politeness/

In [None]:
#install the politeness library
!pip install politeness==0.1.2

In [None]:
# import the Natural Language Toolkit and download its data without the interactive prompt
import nltk
# 'all' is the collection that was selected by hand in the interactive downloader; passing it
# explicitly lets the notebook run top to bottom without waiting for keyboard input.
nltk.download('all')

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all
    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package brown to /root/nltk_data...
       |   Unzipping corpora/brown.zip.
       | Downloading package brown_tei to /root/nltk_data...
       |   Unzipping corpora/brown_tei.zip.
       | Downloading package cess_cat to /root/nltk_data...
       |   Unzipping corpora/cess_cat.zip.
       | Downloading package

True

In [None]:
# Only the Classifier of the politeness package is used in this notebook.
import politeness
from politeness.classifier import Classifier

# Instantiate the politeness classifier once; get_politeness reads it as the module-level `cls`.
cls = Classifier()

Note: The initial goal was to use the parses computed by CoreNLP in the previous step. However, since these computations take far too long, we are unable to perform them on the entire dataset. We therefore decided to compute the remaining parses using the TextBlob `parse` method, which is much faster and enables us to compute the politeness for the entire dataset.

In [None]:
def get_politeness(doc):
    """Return the average politeness of a news item and its number of sentences.

    The text is split into sentences with the NLTK sentence tokenizer. Each sentence is
    parsed with the ``parse`` method of TextBlob, and the sentence together with *its own*
    parse is passed to the politeness classifier ``cls``.

    Parameters
    ----------
    doc : mapping (for example a dataframe row)
        Must expose the news body as a string under the key ``'text'``.

    Returns
    -------
    tuple
        ``(average_politeness, number_of_sentences)``. The number of sentences is returned
        as well so that callers do not have to tokenize the text a second time. The average
        is ``nan`` when the text contains no sentence.
    """
    # split into sentences
    sentences = nltk.tokenize.sent_tokenize(doc['text'])
    scores = []
    for sentence in sentences:
        # parse the sentence being scored, so that sentence and parse always match
        parses = TextBlob(sentence).parse()
        # NOTE(review): the indexing below assumes predict() returns a sequence whose first
        # item is a dict, and that the first value of that dict is itself indexable; the
        # first entry is taken as the 'polite' score — confirm against politeness==0.1.2.
        prediction = cls.predict({'sentence': sentence, 'parses': parses})[0]
        scores.append(list(prediction.values())[0][0])

    # no sentence: same result as np.mean([]) but without the RuntimeWarning
    if not scores:
        return np.nan, 0

    # return the average politeness
    return np.mean(scores), len(sentences)



#### True news

we compute the politeness for the True news 

In [None]:
# Create (or reset) the 'politeness' column of the True news dataframe with None placeholders;
# the loop below fills it row by row.
df_True_text[['politeness']]=None

In [None]:
# Sanity check: 'politeness' should now be the last column and still be empty.
df_True_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness
0,the head of a conservative republican faction ...,31.0,730.0,63.0,2.032258,10.0,9.0,12.0,['(ROOT\n (S\n (NP\n (NP\n (NP...,
1,transgender people will be allowed for the fir...,22.0,614.0,38.0,1.727273,4.0,10.0,8.0,['(ROOT\n (S\n (S\n (NP (JJ transgend...,
2,the special counsel investigation of links bet...,19.0,452.0,39.0,2.052632,5.0,4.0,10.0,['(ROOT\n (S\n (S\n (NP\n (NP ...,
3,trump campaign adviser george papadopoulos tol...,16.0,374.0,30.0,1.875,4.0,6.0,6.0,['(ROOT\n (S\n (S\n (NP (NN trump) (N...,
4,president donald trump called on the u.s. post...,45.0,827.0,77.0,1.711111,8.0,21.0,16.0,['(ROOT\n (S\n (NP\n (NP (NN presiden...,


In [None]:
#compute the politeness for each True news and save the dataframe every 20 rows so that a
#runtime stop does not lose a lot of computations
#we also compute the number of sentences and number of words in each news for the talkativeness

# Column positions are resolved once. Writing through df.iloc[row, col] updates the dataframe
# itself, whereas the chained form df[col].iloc[row] = value may write to a temporary copy
# (SettingWithCopyWarning, and no effect at all under pandas Copy-on-Write).
politeness_col = df_True_text.columns.get_loc('politeness')
num_sentence_col = df_True_text.columns.get_loc('numSentence')
num_words_col = df_True_text.columns.get_loc('numWords')

for i in range(len(df_True_text)):
    row = df_True_text.iloc[i]
    # named politeness_score so the imported `politeness` module is not overwritten
    politeness_score, nb_sentences = get_politeness(row)
    df_True_text.iloc[i, politeness_col] = politeness_score
    df_True_text.iloc[i, num_sentence_col] = nb_sentences
    df_True_text.iloc[i, num_words_col] = len(row['text'].split())
    if i % 20 == 0:
        # checkpoint
        df_True_text.to_csv(os.path.join(FEATURES_FOLDER,'df_True_text2.csv'), index=False)

In [None]:
# Persist the True news dataframe once the politeness pass is finished.
# NOTE(review): the checkpoints written inside the loop and the file loaded at the top of the
# notebook use 'df_True_text2.csv'; confirm that 'df_True_text.csv' is the intended target here.
df_True_text.to_csv(
    path_or_buf=os.path.join(FEATURES_FOLDER, 'df_True_text.csv'),
    index=False,
)

#### Fake news

We compute the politeness of the Fake news

In [None]:
# Create (or reset) the 'politeness' column of the Fake news dataframe with None placeholders;
# the loop below fills it row by row.
df_Fake_text[['politeness']]=None

In [None]:
# Sanity check: 'politeness' should now be the last column and still be empty.
df_Fake_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness
0,donald trump just couldn t wish all americans ...,27.0,462.0,52.0,1.925926,7.0,10.0,10.0,['(ROOT\n (S\n (NP\n (NP (JJ donald) ...,
1,house intelligence committee chairman devin nu...,10.0,308.0,12.0,1.2,0.0,8.0,2.0,['(ROOT\n (S\n (NP (NN house) (NN intellig...,
2,"on friday, it was revealed that former milwauk...",25.0,544.0,41.0,1.64,3.0,12.0,10.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,
3,"on christmas day, donald trump announced that ...",17.0,411.0,26.0,1.529412,2.0,10.0,5.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,
4,pope francis used his annual christmas day mes...,19.0,420.0,33.0,1.736842,6.0,11.0,2.0,['(ROOT\n (S\n (NP (NN pope) (NNS francis)...,


In [None]:
# Persist the Fake news dataframe, including the placeholder 'politeness' column.
df_Fake_text.to_csv(
    path_or_buf=os.path.join(FEATURES_FOLDER, 'df_Fake_text.csv'),
    index=False,
)

In [None]:
# Spot-check a single row, selected by position (2933 is presumably an arbitrary pick);
# its 'politeness' is expected to be None until the loop below has run.
df_Fake_text.iloc[2933]

text                  with president elect donald trump s inaugurati...
numSentence                                                          23
numWords                                                            369
totSentiment                                                         29
avgSentiment                                                    1.26087
positive_Sentiment                                                    0
negative_Sentiment                                                   17
neutral_Sentiment                                                     6
parses                ['(ROOT\n  (S\n    (PP\n      (PP (IN with)\n ...
politeness                                                         None
Name: 2937, dtype: object

In [None]:
#compute the politeness for each Fake news and save the dataframe every 20 rows so that a
#runtime stop does not lose a lot of computations
#we also compute the number of sentences and number of words in each news for the talkativeness

# Column positions are resolved once. Writing through df.iloc[row, col] updates the dataframe
# itself, whereas the chained form df[col].iloc[row] = value may write to a temporary copy
# (SettingWithCopyWarning, and no effect at all under pandas Copy-on-Write).
politeness_col = df_Fake_text.columns.get_loc('politeness')
num_sentence_col = df_Fake_text.columns.get_loc('numSentence')
num_words_col = df_Fake_text.columns.get_loc('numWords')

for i in range(len(df_Fake_text)):
    row = df_Fake_text.iloc[i]
    # named politeness_score so the imported `politeness` module is not overwritten
    politeness_score, nb_sentences = get_politeness(row)
    df_Fake_text.iloc[i, politeness_col] = politeness_score
    df_Fake_text.iloc[i, num_sentence_col] = nb_sentences
    df_Fake_text.iloc[i, num_words_col] = len(row['text'].split())
    if i % 20 == 0:
        # checkpoint
        df_Fake_text.to_csv(os.path.join(FEATURES_FOLDER,'df_Fake_text.csv'), index=False)

In [None]:
# Persist the Fake news dataframe once the politeness pass is finished.
df_Fake_text.to_csv(
    path_or_buf=os.path.join(FEATURES_FOLDER, 'df_Fake_text.csv'),
    index=False,
)

In [None]:
# Inspect the first rows to check the computed politeness values.
df_Fake_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions,subjectivity,polarity
0,donald trump just couldn t wish all americans ...,27.0,462.0,52.0,1.925926,7.0,10.0,10.0,['(ROOT\n (S\n (NP\n (NP (JJ donald) ...,0.422506,18,0.59249,0.0824626
1,house intelligence committee chairman devin nu...,10.0,308.0,12.0,1.2,0.0,8.0,2.0,['(ROOT\n (S\n (NP (NN house) (NN intellig...,0.455562,21,0.334098,-0.00500448
2,"on friday, it was revealed that former milwauk...",25.0,544.0,41.0,1.64,3.0,12.0,10.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.44189,20,0.542352,0.00967144
3,"on christmas day, donald trump announced that ...",17.0,411.0,26.0,1.529412,2.0,10.0,5.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.470385,22,0.372581,0.00913978
4,pope francis used his annual christmas day mes...,19.0,420.0,33.0,1.736842,6.0,11.0,2.0,['(ROOT\n (S\n (NP (NN pope) (NNS francis)...,0.45624,25,0.495222,-0.0117222


### Premises and conclusions

To compute the number of premises and conclusions in each news article, we used the markers from the article "Premise, Conclusion and Conditional Indicators" by Claude Gratton (Department of Philosophy, University of Nevada at Las Vegas, Las Vegas, NV 89154-5028, U.S.A.). We saved those markers in the file "premises_conclusions.txt".

In [None]:
#Load the "premises_conclusions.txt" file, one raw line per list entry
#the context manager closes the file handle as soon as the lines have been read
with open("premises_conclusions.txt", "r") as text_file:
    premises_conclusions = text_file.readlines()

In [None]:
#extract the markers: drop the line break, lower-case, and discard every empty line
#filtering (instead of list.remove("")) also works when the file has no blank line or several
#of them, and keeps the cell re-runnable; an empty marker must never survive because
#str.count("") returns len(text) + 1 and would inflate every count
premises_conclusions = [
    marker
    for marker in (line.rstrip("\n").lower() for line in premises_conclusions)
    if marker != ""
]
premises_conclusions

['consequently',
 'as shown by the fact that',
 'as',
 'from this we can deduce that',
 'accordingly',
 'from this it follows tha',
 'entails that',
 'this is shown by',
 'shows that',
 'however',
 'i conclude that',
 'follows from',
 'i impersonal pronouns sometimes refer to earlier statements ',
 'moreover',
 'this proves that',
 'furthermore',
 'hence',
 'then',
 'that is proven from',
 'granted that',
 'supposing that',
 'for',
 'because',
 'nevertheless',
 'that is why',
 'here is why',
 'obviously',
 'implies that',
 'due to the reason that',
 'despite the fact that',
 'i n view of the fact that',
 'may be deduced from',
 'may be inferred from',
 'also',
 'thus ',
 'it can be derived from that',
 'this bears out the point that',
 'establishes that',
 'proves that',
 'this is proven from',
 'supports that',
 'in support of',
 'consider',
 'evidently',
 'inasmuch as',
 'on the hypothesis that',
 'ind icates that',
 'guarantees that',
 'on the basis of',
 'in light of the fact that'

#### True news

We compute the number of premises and conclusions for the True news

In [None]:
# Create (or reset) the 'premises_conclusions' column of the True news dataframe with None
# placeholders; it is filled with the marker counts further down.
df_True_text[['premises_conclusions']]=None

In [None]:
# Sanity check: 'premises_conclusions' should be present and still be empty.
df_True_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions
0,the head of a conservative republican faction ...,31.0,730.0,63.0,2.032258,10.0,9.0,12.0,['(ROOT\n (S\n (NP\n (NP\n (NP...,0.456962,
1,transgender people will be allowed for the fir...,22.0,614.0,38.0,1.727273,4.0,10.0,8.0,['(ROOT\n (S\n (S\n (NP (JJ transgend...,0.458206,
2,the special counsel investigation of links bet...,19.0,452.0,39.0,2.052632,5.0,4.0,10.0,['(ROOT\n (S\n (S\n (NP\n (NP ...,0.449653,
3,trump campaign adviser george papadopoulos tol...,16.0,374.0,30.0,1.875,4.0,6.0,6.0,['(ROOT\n (S\n (S\n (NP (NN trump) (N...,0.461039,
4,president donald trump called on the u.s. post...,45.0,827.0,77.0,1.711111,8.0,21.0,16.0,['(ROOT\n (S\n (NP\n (NP (NN presiden...,0.450035,


In [None]:
# Number of premise/conclusion markers in each True news article: for every marker, add the
# number of times it occurs in the text.
# NOTE(review): str.count matches substrings, so short markers are also counted inside longer
# words — confirm that this is the intended definition of an occurrence.
df_True_text['premises_conclusions'] = df_True_text['text'].apply(
    lambda article: sum(article.count(marker) for marker in premises_conclusions)
)

In [None]:
# Inspect the first rows to check the computed 'premises_conclusions' counts.
df_True_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions
0,the head of a conservative republican faction ...,31.0,730.0,63.0,2.032258,10.0,9.0,12.0,['(ROOT\n (S\n (NP\n (NP\n (NP...,0.456962,45
1,transgender people will be allowed for the fir...,22.0,614.0,38.0,1.727273,4.0,10.0,8.0,['(ROOT\n (S\n (S\n (NP (JJ transgend...,0.458206,23
2,the special counsel investigation of links bet...,19.0,452.0,39.0,2.052632,5.0,4.0,10.0,['(ROOT\n (S\n (S\n (NP\n (NP ...,0.449653,23
3,trump campaign adviser george papadopoulos tol...,16.0,374.0,30.0,1.875,4.0,6.0,6.0,['(ROOT\n (S\n (S\n (NP (NN trump) (N...,0.461039,28
4,president donald trump called on the u.s. post...,45.0,827.0,77.0,1.711111,8.0,21.0,16.0,['(ROOT\n (S\n (NP\n (NP (NN presiden...,0.450035,47


In [None]:
# Persist the True news dataframe with the premises/conclusions counts.
df_True_text.to_csv(
    path_or_buf=os.path.join(FEATURES_FOLDER, 'df_True_text2.csv'),
    index=False,
)

#### Fake news

We compute the number of premises and conclusions for the Fake news

In [None]:
# Create (or reset) the 'premises_conclusions' column of the Fake news dataframe with None
# placeholders; it is filled with the marker counts further down.
df_Fake_text[['premises_conclusions']]=None

In [None]:
# Sanity check: 'premises_conclusions' should be present and still be empty.
df_Fake_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions
0,donald trump just couldn t wish all americans ...,27.0,462.0,52.0,1.925926,7.0,10.0,10.0,['(ROOT\n (S\n (NP\n (NP (JJ donald) ...,0.422506,
1,house intelligence committee chairman devin nu...,10.0,308.0,12.0,1.2,0.0,8.0,2.0,['(ROOT\n (S\n (NP (NN house) (NN intellig...,0.455562,
2,"on friday, it was revealed that former milwauk...",25.0,544.0,41.0,1.64,3.0,12.0,10.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.44189,
3,"on christmas day, donald trump announced that ...",17.0,411.0,26.0,1.529412,2.0,10.0,5.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.470385,
4,pope francis used his annual christmas day mes...,19.0,420.0,33.0,1.736842,6.0,11.0,2.0,['(ROOT\n (S\n (NP (NN pope) (NNS francis)...,0.45624,


In [None]:
# Number of premise/conclusion markers in each Fake news article: for every marker, add the
# number of times it occurs in the text.
# NOTE(review): str.count matches substrings, so short markers are also counted inside longer
# words — confirm that this is the intended definition of an occurrence.
df_Fake_text['premises_conclusions'] = df_Fake_text['text'].apply(
    lambda article: sum(article.count(marker) for marker in premises_conclusions)
)

In [None]:
# Inspect the first rows to check the computed 'premises_conclusions' counts.
df_Fake_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions
0,donald trump just couldn t wish all americans ...,27.0,462.0,52.0,1.925926,7.0,10.0,10.0,['(ROOT\n (S\n (NP\n (NP (JJ donald) ...,0.422506,18
1,house intelligence committee chairman devin nu...,10.0,308.0,12.0,1.2,0.0,8.0,2.0,['(ROOT\n (S\n (NP (NN house) (NN intellig...,0.455562,21
2,"on friday, it was revealed that former milwauk...",25.0,544.0,41.0,1.64,3.0,12.0,10.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.44189,20
3,"on christmas day, donald trump announced that ...",17.0,411.0,26.0,1.529412,2.0,10.0,5.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.470385,22
4,pope francis used his annual christmas day mes...,19.0,420.0,33.0,1.736842,6.0,11.0,2.0,['(ROOT\n (S\n (NP (NN pope) (NNS francis)...,0.45624,25


In [None]:
# Persist the Fake news dataframe with the premises/conclusions counts.
df_Fake_text.to_csv(
    path_or_buf=os.path.join(FEATURES_FOLDER, 'df_Fake_text.csv'),
    index=False,
)

### Subjectivity and Polarity


Since we weren't able to compute the sentiments of the entire dataset using Stanford CoreNLP, we decided to use the sentiment method of TextBlob, which is much faster. This method computes the polarity of the news and its subjectivity. Polarity is a float within the range [-1.0, 1.0] and subjectivity is a float within the range [0.0, 1.0], where 0.0 is very objective and 1.0 is very subjective.
TextBlob uses a sentiment lexicon (consisting of predefined words) to assign scores to each word, which are then combined using a weighted average to give an overall sentence sentiment score.
We will later convert the polarity score returned by TextBlob to a fine-grained class label (an integer) by cutting the scores into equal-sized bins. We will compare the results of the two methods to see if they give the same predictions on average.

In [None]:
# Try the TextBlob sentiment on the first Fake news article before processing the whole dataset.
# `.sentiment` exposes the `polarity` and `subjectivity` values that are stored further down.
testimonial = TextBlob(df_Fake_text['text'].iloc[0])
testimonial.sentiment

Sentiment(polarity=0.08246258885147778, subjectivity=0.5924897119341561)

#### True news

We compute the subjectivity and polarity of the True news

In [None]:
# Preview the True news dataframe before the sentiment columns are added.
df_True_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions
0,the head of a conservative republican faction ...,31.0,730.0,63.0,2.032258,10.0,9.0,12.0,['(ROOT\n (S\n (NP\n (NP\n (NP...,0.456962,45
1,transgender people will be allowed for the fir...,22.0,614.0,38.0,1.727273,4.0,10.0,8.0,['(ROOT\n (S\n (S\n (NP (JJ transgend...,0.458206,23
2,the special counsel investigation of links bet...,19.0,452.0,39.0,2.052632,5.0,4.0,10.0,['(ROOT\n (S\n (S\n (NP\n (NP ...,0.449653,23
3,trump campaign adviser george papadopoulos tol...,16.0,374.0,30.0,1.875,4.0,6.0,6.0,['(ROOT\n (S\n (S\n (NP (NN trump) (N...,0.461039,28
4,president donald trump called on the u.s. post...,45.0,827.0,77.0,1.711111,8.0,21.0,16.0,['(ROOT\n (S\n (NP\n (NP (NN presiden...,0.450035,47


In [None]:
# Create (or reset) the 'subjectivity' and 'polarity' columns of the True news dataframe
# with None placeholders; they are filled from the TextBlob sentiment further down.
df_True_text[['subjectivity']]= None
df_True_text[['polarity']]= None

In [None]:
# Sanity check: 'subjectivity' and 'polarity' should be present and still be empty.
df_True_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions,subjectivity,polarity
0,the head of a conservative republican faction ...,31.0,730.0,63.0,2.032258,10.0,9.0,12.0,['(ROOT\n (S\n (NP\n (NP\n (NP...,0.456962,45,,
1,transgender people will be allowed for the fir...,22.0,614.0,38.0,1.727273,4.0,10.0,8.0,['(ROOT\n (S\n (S\n (NP (JJ transgend...,0.458206,23,,
2,the special counsel investigation of links bet...,19.0,452.0,39.0,2.052632,5.0,4.0,10.0,['(ROOT\n (S\n (S\n (NP\n (NP ...,0.449653,23,,
3,trump campaign adviser george papadopoulos tol...,16.0,374.0,30.0,1.875,4.0,6.0,6.0,['(ROOT\n (S\n (S\n (NP (NN trump) (N...,0.461039,28,,
4,president donald trump called on the u.s. post...,45.0,827.0,77.0,1.711111,8.0,21.0,16.0,['(ROOT\n (S\n (NP\n (NP (NN presiden...,0.450035,47,,


In [None]:
#for each news compute the TextBlob sentiment once and store its polarity and subjectivity
#whole-column assignment replaces the chained df[col].iloc[i] = value writes, which may
#target a temporary copy (SettingWithCopyWarning, no effect under pandas Copy-on-Write);
#it also gives both columns a float dtype instead of object
sentiments = [TextBlob(text).sentiment for text in df_True_text['text']]
df_True_text['polarity'] = [sentiment.polarity for sentiment in sentiments]
df_True_text['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

In [None]:
# Inspect the first rows to check the computed 'subjectivity' and 'polarity' values.
df_True_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions,subjectivity,polarity
0,the head of a conservative republican faction ...,31.0,730.0,63.0,2.032258,10.0,9.0,12.0,['(ROOT\n (S\n (NP\n (NP\n (NP...,0.456962,45,0.41025,0.0370833
1,transgender people will be allowed for the fir...,22.0,614.0,38.0,1.727273,4.0,10.0,8.0,['(ROOT\n (S\n (S\n (NP (JJ transgend...,0.458206,23,0.308401,0.0443537
2,the special counsel investigation of links bet...,19.0,452.0,39.0,2.052632,5.0,4.0,10.0,['(ROOT\n (S\n (S\n (NP\n (NP ...,0.449653,23,0.316798,0.11593
3,trump campaign adviser george papadopoulos tol...,16.0,374.0,30.0,1.875,4.0,6.0,6.0,['(ROOT\n (S\n (S\n (NP (NN trump) (N...,0.461039,28,0.306569,0.0359684
4,president donald trump called on the u.s. post...,45.0,827.0,77.0,1.711111,8.0,21.0,16.0,['(ROOT\n (S\n (NP\n (NP (NN presiden...,0.450035,47,0.398611,0.0343216


In [None]:
# Persist the True news dataframe with the subjectivity and polarity columns.
df_True_text.to_csv(
    path_or_buf=os.path.join(FEATURES_FOLDER, 'df_True_text2.csv'),
    index=False,
)

#### Fake news

We compute the subjectivity and polarity of the Fake news

In [None]:
# Create (or reset) the 'subjectivity' and 'polarity' columns of the Fake news dataframe
# with None placeholders; they are filled from the TextBlob sentiment further down.
df_Fake_text[['subjectivity']]= None
df_Fake_text[['polarity']]= None

In [None]:
# Sanity check: 'subjectivity' and 'polarity' should be present and still be empty.
df_Fake_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions,subjectivity,polarity
0,donald trump just couldn t wish all americans ...,27.0,462.0,52.0,1.925926,7.0,10.0,10.0,['(ROOT\n (S\n (NP\n (NP (JJ donald) ...,0.422506,18,,
1,house intelligence committee chairman devin nu...,10.0,308.0,12.0,1.2,0.0,8.0,2.0,['(ROOT\n (S\n (NP (NN house) (NN intellig...,0.455562,21,,
2,"on friday, it was revealed that former milwauk...",25.0,544.0,41.0,1.64,3.0,12.0,10.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.44189,20,,
3,"on christmas day, donald trump announced that ...",17.0,411.0,26.0,1.529412,2.0,10.0,5.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.470385,22,,
4,pope francis used his annual christmas day mes...,19.0,420.0,33.0,1.736842,6.0,11.0,2.0,['(ROOT\n (S\n (NP (NN pope) (NNS francis)...,0.45624,25,,


In [None]:
#for each news compute the TextBlob sentiment once and store its polarity and subjectivity
#whole-column assignment replaces the chained df[col].iloc[i] = value writes, which may
#target a temporary copy (SettingWithCopyWarning, no effect under pandas Copy-on-Write);
#it also gives both columns a float dtype instead of object
sentiments = [TextBlob(text).sentiment for text in df_Fake_text['text']]
df_Fake_text['polarity'] = [sentiment.polarity for sentiment in sentiments]
df_Fake_text['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

In [None]:
# Inspect the first rows to check the computed 'subjectivity' and 'polarity' values.
df_Fake_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions,subjectivity,polarity
0,donald trump just couldn t wish all americans ...,27.0,462.0,52.0,1.925926,7.0,10.0,10.0,['(ROOT\n (S\n (NP\n (NP (JJ donald) ...,0.422506,18,0.59249,0.0824626
1,house intelligence committee chairman devin nu...,10.0,308.0,12.0,1.2,0.0,8.0,2.0,['(ROOT\n (S\n (NP (NN house) (NN intellig...,0.455562,21,0.334098,-0.00500448
2,"on friday, it was revealed that former milwauk...",25.0,544.0,41.0,1.64,3.0,12.0,10.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.44189,20,0.542352,0.00967144
3,"on christmas day, donald trump announced that ...",17.0,411.0,26.0,1.529412,2.0,10.0,5.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.470385,22,0.372581,0.00913978
4,pope francis used his annual christmas day mes...,19.0,420.0,33.0,1.736842,6.0,11.0,2.0,['(ROOT\n (S\n (NP (NN pope) (NNS francis)...,0.45624,25,0.495222,-0.0117222


In [None]:
# Persist the Fake news dataframe with the subjectivity and polarity columns.
df_Fake_text.to_csv(
    path_or_buf=os.path.join(FEATURES_FOLDER, 'df_Fake_text.csv'),
    index=False,
)

### Expansion, contingency and comparison  

We compute the expansion, contingency and comparison features for the two datasets.


The discourse markers were collected from various websites on the internet to get the most representative sets. We also combined them with the extracted features from the diplomacy dataset.

In [None]:
#expansion markers
#two entries were misspelled ('conversly', 'nontheless') and therefore could never match a
#correctly spelled text; they are now 'conversely' and 'nonetheless'
expansion = {
    'generally', 'thereafter', 'perhaps', 'certainly', 'by', 'meantime', 'separately', 'surely', 'amazingly',
    'conversely', 'by then', 'typically', 'historically', 'collectively', 'initially', 'then', 'namely', 'unsurprisingly',
    'ironically', 'in turn', 'supposedly', 'evidently', 'originally', 'this', 'ultimately', 'because of that', 'nonetheless',
    'together', 'especially', 'still', 'realistically', 'theoretically', 'normally', 'technically', 'so', 'frankly',
    'simultaneously', 'presumably', 'clearly', 'yet', 'naturally', 'coincidentally', 'in the meantime', 'curiously', 'now',
    'admittedly', 'meaning', 'immediately', 'previously', 'currently', 'but', 'again', 'absolutely', 'and', 'or', 'probably',
    'well',
}

#diplomacy expansion markers
expansion_diplomacy = {
    'additionally', 'also', 'alternatively', 'although', 'as an alternative', 'as if', 'as though', 'as well',
    'besides', 'either or', 'else', 'except', 'finally', 'for example', 'for instance', 'further', 'furthermore',
    'however', 'in addition', 'in fact', 'in other words', 'in particular', 'in short', 'in the end', 'in turn',
    'indeed', 'instead', 'later', 'lest', 'likewise', 'meantime', 'meanwhile', 'moreover', 'much as', 'next',
    'nonetheless', 'nor', 'on the other hand', 'otherwise', 'overall', 'plus', 'rather', 'separately', 'similarly',
    'specifically', 'then', 'ultimately', 'unless', 'until', 'when', 'while', 'yet',
}

#combined expansion set (union of the two sources)
expansion = expansion | expansion_diplomacy

In [None]:
# Display the combined expansion marker set.
expansion

{'absolutely',
 'additionally',
 'admittedly',
 'again',
 'also',
 'alternatively',
 'although',
 'amazingly',
 'and',
 'as an alternative',
 'as if',
 'as though',
 'as well',
 'because of that',
 'besides',
 'but',
 'by',
 'by then',
 'certainly',
 'clearly',
 'coincidentally',
 'collectively',
 'conversly',
 'curiously',
 'currently',
 'either or',
 'else',
 'especially',
 'evidently',
 'except',
 'finally',
 'for example',
 'for instance',
 'frankly',
 'further',
 'furthermore',
 'generally',
 'historically',
 'however',
 'immediately',
 'in addition',
 'in fact',
 'in other words',
 'in particular',
 'in short',
 'in the end',
 'in the meantime',
 'in turn',
 'indeed',
 'initially',
 'instead',
 'ironically',
 'later',
 'lest',
 'likewise',
 'meaning',
 'meantime',
 'meanwhile',
 'moreover',
 'much as',
 'namely',
 'naturally',
 'next',
 'nonetheless',
 'nontheless',
 'nor',
 'normally',
 'now',
 'on the other hand',
 'or',
 'originally',
 'otherwise',
 'overall',
 'perhaps',
 'pl

In [None]:
#contingency set
#two entries were misspelled ('colllectively', 'nontheless') and therefore could never match a
#correctly spelled text; they are now 'collectively' and 'nonetheless'
contingency = {
    'by doing this', 'theoretically', 'realistically', 'so', 'clearly', 'because of that', 'surely',
    'in sum', 'frankly', 'perhaps', 'inevitably', 'obviously', 'certainly', 'immediately', 'in short', 'in turn',
    'increasingly', 'naturally', 'as a result', 'admittedly', 'moreover', 'evidently', 'namely', 'already', 'meaning',
    'presumably', 'in other words', 'in the meantime', 'now', 'by then', 'on the contrary', 'ultimately',
    'unfortunately', 'historically', 'undoubtedly', 'ironically', 'supposedly', 'yet', 'presently', 'nonetheless',
    'still', 'indeed', 'essentially', 'arguably', 'rather', 'and', 'personally', 'instead', 'in fact', 'altogether',
    'meantime', 'collectively', 'unsurprisingly', 'then', 'originally', 'significantly', 'for instance', 'currently',
    'separately', 'by contrast', 'initially', 'notably', 'meanwhile',
}

#diplomacy contingency set
contingency_diplomacy = {
    'accordingly', 'as a result', 'as long as', 'because', 'consequently', 'hence', 'if and when', 'if then',
    'in the end', 'in turn', 'indeed', 'insofar as', 'lest', 'now that', 'once', 'since', 'so that', 'then',
    'thereby', 'therefore', 'thus', 'unless', 'until', 'when',
}

#contingency combined set (union of the two sources)
contingency = contingency | contingency_diplomacy


In [None]:
# Display the combined contingency marker set.
contingency

{'accordingly',
 'admittedly',
 'already',
 'altogether',
 'and',
 'arguably',
 'as a result',
 'as long as',
 'because',
 'because of that',
 'by contrast',
 'by doing this',
 'by then',
 'certainly',
 'clearly',
 'colllectively',
 'consequently',
 'currently',
 'essentially',
 'evidently',
 'for instance',
 'frankly',
 'hence',
 'historically',
 'if and when',
 'if then',
 'immediately',
 'in fact',
 'in other words',
 'in short',
 'in sum',
 'in the end',
 'in the meantime',
 'in turn',
 'increasingly',
 'indeed',
 'inevitably',
 'initially',
 'insofar as',
 'instead',
 'ironically',
 'lest',
 'meaning',
 'meantime',
 'meanwhile',
 'moreover',
 'namely',
 'naturally',
 'nontheless',
 'notably',
 'now',
 'now that',
 'obviously',
 'on the contrary',
 'once',
 'originally',
 'perhaps',
 'personally',
 'presently',
 'presumably',
 'rather',
 'realistically',
 'separately',
 'significantly',
 'since',
 'so',
 'so that',
 'still',
 'supposedly',
 'surely',
 'then',
 'theoretically',
 'th

In [None]:
#comparison set (entries that were listed twice in the literal are written once; a set ignores
#duplicates anyway)
comparison = {
    'however', 'but', 'like', 'likewise', 'same as', 'as well as', 'also', 'too',
    'unlike', 'in contrast to', 'as opposed to', 'different from', 'whereas', 'both', 'comparatively', 'in the same way',
    'in addition', 'just as', 'most important', 'similarly', 'although', 'besides', 'compared with', 'conversely', 'differ',
    'even though', 'furthermore', 'instead', 'less than', 'more than', 'nevertheless', 'notwithstanding',
    'on the other hand', 'otherwise', 'rather than', 'regardless', 'though', 'unless', 'while', 'yet',
}

#comparison diplomacy set
#the name must be a single identifier: the previous spelling had spaces in the middle of the
#name, which is a SyntaxError
comparison_diplomacy = {
    'after', 'although', 'as if', 'as though', 'besides', 'conversely', 'earlier', 'however', 'in fact',
    'in the end', 'indeed', 'instead', 'meanwhile', 'much as', 'nevertheless', 'nonetheless', 'nor',
    'on the contrary', 'on the other hand', 'previously', 'rather', 'regardless', 'still', 'then', 'though',
    'when', 'whereas', 'while', 'yet',
}

#combined comparison set (union of the two sources)
comparison = comparison | comparison_diplomacy

In [None]:
# Display the combined comparison marker set.
comparison  

{'after',
 'also',
 'although',
 'as if',
 'as opposed to',
 'as though',
 'as well as',
 'besides',
 'both',
 'but',
 'comparatively',
 'compared with',
 'conversely',
 'differ',
 'different from',
 'earlier',
 'even though',
 'furthermore',
 'however',
 'in addition',
 'in contrast to',
 'in fact',
 'in the end',
 'in the same way',
 'indeed',
 'instead',
 'just as',
 'less than',
 'like',
 'likewise',
 'meanwhile',
 'more than',
 'most important',
 'much as',
 'nevertheless',
 'nonetheless',
 'nor',
 'notwithstanding',
 'on the contrary',
 'on the other hand',
 'otherwise',
 'previously',
 'rather',
 'rather than',
 'regardless',
 'same as',
 'similarly',
 'still',
 'then',
 'though',
 'too',
 'unless',
 'unlike',
 'when',
 'whereas',
 'while',
 'yet'}

#### True news

We compute the expansion, contingency and comparison features for the True news

In [None]:
# Create (or reset) the three discourse-marker columns of the True news dataframe with None
# placeholders; they are filled with the marker counts further down.
# NOTE(review): the first key is 'comparaison' followed by two spaces — confirm that readers
# of the saved CSV expect exactly this column name.
df_True_text[['comparaison  ', 'contingency' , 'expansion']]= None

In [None]:
# Sanity check: the three discourse-marker columns should be present and still be empty.
df_True_text.head()

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions,subjectivity,polarity,comparaison,contingency,expansion
0,the head of a conservative republican faction ...,31.0,730.0,63.0,2.032258,10.0,9.0,12.0,['(ROOT\n (S\n (NP\n (NP\n (NP...,0.456962,45,0.41025,0.0370833,,,
1,transgender people will be allowed for the fir...,22.0,614.0,38.0,1.727273,4.0,10.0,8.0,['(ROOT\n (S\n (S\n (NP (JJ transgend...,0.458206,23,0.308401,0.0443537,,,
2,the special counsel investigation of links bet...,19.0,452.0,39.0,2.052632,5.0,4.0,10.0,['(ROOT\n (S\n (S\n (NP\n (NP ...,0.449653,23,0.316798,0.11593,,,
3,trump campaign adviser george papadopoulos tol...,16.0,374.0,30.0,1.875,4.0,6.0,6.0,['(ROOT\n (S\n (S\n (NP (NN trump) (N...,0.461039,28,0.306569,0.0359684,,,
4,president donald trump called on the u.s. post...,45.0,827.0,77.0,1.711111,8.0,21.0,16.0,['(ROOT\n (S\n (NP\n (NP (NN presiden...,0.450035,47,0.398611,0.0343216,,,


In [None]:
# Number of comparison markers in each True news article: for every marker, add the number of
# times it occurs in the text.
# NOTE(review): str.count matches substrings, so short markers are also counted inside longer
# words — confirm that this is the intended definition of an occurrence.
# NOTE(review): the column key ends with two spaces, like the placeholder column created
# earlier — confirm that this exact name is intended.
df_True_text['comparaison  '] = df_True_text['text'].apply(
    lambda article: sum(article.count(marker) for marker in comparison)
)

In [None]:
# Number of contingency markers in each True news article: for every marker, add the number of
# times it occurs in the text.
# NOTE(review): str.count matches substrings, so short markers are also counted inside longer
# words — confirm that this is the intended definition of an occurrence.
df_True_text['contingency'] = df_True_text['text'].apply(
    lambda article: sum(article.count(marker) for marker in contingency)
)

In [None]:
# Number of expansion markers in each True news article: for every marker, add the number of
# times it occurs in the text.
# NOTE(review): str.count matches substrings, so short markers are also counted inside longer
# words — confirm that this is the intended definition of an occurrence.
df_True_text['expansion'] = df_True_text['text'].apply(
    lambda article: sum(article.count(marker) for marker in expansion)
)

In [None]:
# Write the true-news dataframe back to the features folder without the row
# index. This overwrites 'df_True_text2.csv', the file it was loaded from.
df_True_text.to_csv(
    path_or_buf=os.path.join(FEATURES_FOLDER, 'df_True_text2.csv'),
    index=False,
)

In [None]:
# Preview the first five rows of the true-news dataframe
df_True_text.head(n=5)

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions,subjectivity,polarity,comparaison,contingency,expansion
0,the head of a conservative republican faction ...,31.0,730.0,63.0,2.032258,10.0,9.0,12.0,['(ROOT\n (S\n (NP\n (NP\n (NP...,0.456962,45,0.41025,0.0370833,15,26,82
1,transgender people will be allowed for the fir...,22.0,614.0,38.0,1.727273,4.0,10.0,8.0,['(ROOT\n (S\n (S\n (NP (JJ transgend...,0.458206,23,0.308401,0.0443537,10,25,57
2,the special counsel investigation of links bet...,19.0,452.0,39.0,2.052632,5.0,4.0,10.0,['(ROOT\n (S\n (S\n (NP\n (NP ...,0.449653,23,0.316798,0.11593,9,20,49
3,trump campaign adviser george papadopoulos tol...,16.0,374.0,30.0,1.875,4.0,6.0,6.0,['(ROOT\n (S\n (S\n (NP (NN trump) (N...,0.461039,28,0.306569,0.0359684,8,22,51
4,president donald trump called on the u.s. post...,45.0,827.0,77.0,1.711111,8.0,21.0,16.0,['(ROOT\n (S\n (NP\n (NP (NN presiden...,0.450035,47,0.398611,0.0343216,20,34,85


#### Fake news

In [None]:
# Create the three marker-count columns as empty (None) placeholders on the
# fake-news dataframe; the counts themselves are computed in the next cells.
for marker_column in ['comparaison  ', 'contingency', 'expansion']:
    df_Fake_text[marker_column] = None

In [None]:
# Preview the first five rows of the fake-news dataframe
df_Fake_text.head(n=5)

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions,subjectivity,polarity,comparaison,contingency,expansion
0,donald trump just couldn t wish all americans ...,27.0,462.0,52.0,1.925926,7.0,10.0,10.0,['(ROOT\n (S\n (NP\n (NP (JJ donald) ...,0.422506,18,0.59249,0.0824626,,,
1,house intelligence committee chairman devin nu...,10.0,308.0,12.0,1.2,0.0,8.0,2.0,['(ROOT\n (S\n (NP (NN house) (NN intellig...,0.455562,21,0.334098,-0.00500448,,,
2,"on friday, it was revealed that former milwauk...",25.0,544.0,41.0,1.64,3.0,12.0,10.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.44189,20,0.542352,0.00967144,,,
3,"on christmas day, donald trump announced that ...",17.0,411.0,26.0,1.529412,2.0,10.0,5.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.470385,22,0.372581,0.00913978,,,
4,pope francis used his annual christmas day mes...,19.0,420.0,33.0,1.736842,6.0,11.0,2.0,['(ROOT\n (S\n (NP (NN pope) (NNS francis)...,0.45624,25,0.495222,-0.0117222,,,


In [None]:
# Number of comparison markers per fake news article: for every marker in
# `comparison`, count its occurrences in the article text and sum the counts.
# NOTE(review): assuming each text is a str, str.count matches substrings, so a
# marker is also counted when it appears inside a longer word -- confirm this
# is the intended behaviour.
df_Fake_text['comparaison  '] = df_Fake_text['text'].apply(
    lambda article: sum(article.count(marker) for marker in comparison)
)

In [None]:
# Number of contingency markers per fake news article: for every marker in
# `contingency`, count its occurrences in the article text and sum the counts.
# NOTE(review): assuming each text is a str, str.count matches substrings, so a
# marker is also counted when it appears inside a longer word -- confirm this
# is the intended behaviour.
df_Fake_text['contingency'] = df_Fake_text['text'].apply(
    lambda article: sum(article.count(marker) for marker in contingency)
)

In [None]:
# Number of expansion markers per fake news article: for every marker in
# `expansion`, count its occurrences in the article text and sum the counts.
# NOTE(review): assuming each text is a str, str.count matches substrings, so a
# marker is also counted when it appears inside a longer word -- confirm this
# is the intended behaviour.
df_Fake_text['expansion'] = df_Fake_text['text'].apply(
    lambda article: sum(article.count(marker) for marker in expansion)
)

In [None]:
# Write the fake-news dataframe back to the features folder without the row
# index. This overwrites 'df_Fake_text.csv', the file it was loaded from.
df_Fake_text.to_csv(
    path_or_buf=os.path.join(FEATURES_FOLDER, 'df_Fake_text.csv'),
    index=False,
)

In [None]:
# Preview the first five rows of the fake-news dataframe
df_Fake_text.head(n=5)

Unnamed: 0,text,numSentence,numWords,totSentiment,avgSentiment,positive_Sentiment,negative_Sentiment,neutral_Sentiment,parses,politeness,premises_conclusions,subjectivity,polarity,comparaison,contingency,expansion
0,donald trump just couldn t wish all americans ...,27.0,462.0,52.0,1.925926,7.0,10.0,10.0,['(ROOT\n (S\n (NP\n (NP (JJ donald) ...,0.422506,18,0.59249,0.0824626,8,27,50
1,house intelligence committee chairman devin nu...,10.0,308.0,12.0,1.2,0.0,8.0,2.0,['(ROOT\n (S\n (NP (NN house) (NN intellig...,0.455562,21,0.334098,-0.00500448,7,16,40
2,"on friday, it was revealed that former milwauk...",25.0,544.0,41.0,1.64,3.0,12.0,10.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.44189,20,0.542352,0.00967144,9,24,47
3,"on christmas day, donald trump announced that ...",17.0,411.0,26.0,1.529412,2.0,10.0,5.0,['(ROOT\n (S\n (PP (IN on)\n (NP (NNP...,0.470385,22,0.372581,0.00913978,13,18,48
4,pope francis used his annual christmas day mes...,19.0,420.0,33.0,1.736842,6.0,11.0,2.0,['(ROOT\n (S\n (NP (NN pope) (NNS francis)...,0.45624,25,0.495222,-0.0117222,4,26,48
