<a href="https://colab.research.google.com/github/daniel-molina23/fake-news-decision-tree/blob/daniel-develop/Model2_w_newDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import pandas as pd

# you can download this dataset from the github repo
df = pd.read_csv('polifact_mix_enumerated.csv')

poli_mix = pd.DataFrame(df)

poli_mix.shape

(1056, 6)

In [45]:
poli_mix.head(10)
# already used the poli_mix.dropna() on previous dataset, this is new dataset

Unnamed: 0,id,news_url,title,tweet_ids,internal_links,label
0,politifact14095,https://usawatchdog.com/clinton-foundation-lar...,Clinton Foundation Largest Unprosecuted Charit...,0,1444,1
1,politifact14085,usleader.net/index.php/2017/05/11/breaking-bil...,BREAKING: Bill O’Reilly In Critical Condition ...,0,1,1
2,politifact14187,http://rip.trendolizer.com/2017/06/breaking-mo...,BREAKING: Monica Lewinsky Found Dead In Roslin...,580,1,1
3,politifact13484,https://www.washingtonpost.com/graphics/politi...,Thinking about writing in a candidate on Elect...,1,234,1
4,politifact12120,http://politicaladarchive.org/ad/PolAd_BernieS...,Political TV Ad Archive » PolAd,0,71,0
5,politifact14506,https://yournewswire.com/tens-of-thousands-of-...,Tens of Thousands Of Scientists Declare Climat...,22,355,1
6,politifact13475,genius.com/discussions/219349-Donald-trump-pro...,Donald Trump Protester Speaks Out: “I Was Paid...,525,1,1
7,politifact5469,http://www.youtube.com/watch?v=TuYCknrJZ7A,White and Blue Fund ad,27,147,0
8,politifact14499,http://theeconomiccollapseblog.com/archives/hu...,Hurricane Irma: If There Was Such A Thing As A...,929,194,1
9,politifact95,http://findarticles.com/p/articles/mi_qn4155/i...,FindArticles.com,0,19,0


# Potential features...

*   com, co, gov, etc (domain parser)
*   https, http, or neither (https is far more secure!)

*   number of tweet id's per link
*   embedded links within news_url (already webscrapped)

In [46]:
# most common domains (45 total)
domain = [".com/", ".org/", ".edu/", ".gov/", ".uk/", ".net/", ".ca/", ".de/", ".jp/", ".fr/", ".au/", ".us/", ".ru/", ".ch/", ".it/", ".nl/", ".se/", ".no/", ".es/", ".mil/", ".ly/", ".tel/", ".kitchen/", ".email/", ".tech/", ".estate/", ".xyz/", ".codes/", ".bargains/", ".bid/", ".expert/", ".co/", ".name/", ".mobi/", ".asia/", ".biz/", ".arpa/", ".cat/", ".jobs/", ".info/", ".int/", ".pro/", ".aero/", ".travel/", ".coop/"]
d_encoding = [index for index in range(len(domain))] # encoded by index
print(len(domain) == len(d_encoding))

# Protocols for encryption
protocol = ["no_protocol", "http", "https"]
p_encoding = [0, 1, 2]

# Number of tweet id's per row


True


In [47]:
# data_domain method
def encodeDomain(link):
  num = len(domain) # none of the domains encoding, default
  if isinstance(link, float): # nan, therefore return
    return num
  for i in range(len(domain)):
    if domain[i] in link:
      num = i
  return num

# data_protocol method
def configProtocol(link):
  if isinstance(link, float): # nan, therefore return
    return 0
  elif link[:5] == "https":
    return 2
  elif link[:4] == "http":
    return 1
  else:
    return 0


####################

data_protocol = []
data_domain = []
for link in poli_mix["news_url"]:
  data_domain.append(encodeDomain(link)) # returns domain encoding
  data_protocol.append(configProtocol(link)) # returns either 'https', 'http', or 'neither'


label = poli_mix.label

# are all strings the same length????
print(len(data_domain) == len(data_protocol) == len(label))


True


In [48]:
# data_domain, all numbers
# d_encoding (indices of domain)
# domain (list of string domains)
d_freq = dict()
d_set = set(domain)
for i in data_domain:
  if i == len(domain):
    if "NULL" not in d_freq:
      d_freq["NULL"] = 1
    else:
      d_freq["NULL"] += 1
    continue # go to next iteration

  if domain[i] not in d_freq:
    d_freq[domain[i]] = 1
  else: # valid
    d_freq[domain[i]] += 1

display(d_freq)

{'.co/': 4,
 '.com/': 571,
 '.edu/': 4,
 '.gov/': 119,
 '.info/': 7,
 '.mil/': 1,
 '.net/': 17,
 '.org/': 217,
 '.ru/': 1,
 '.uk/': 6,
 '.us/': 12,
 '.xyz/': 1,
 'NULL': 96}

In [49]:
poli_mix.columns

Index(['id', 'news_url', 'title', 'tweet_ids', 'internal_links', 'label'], dtype='object')

In [50]:
clean_data = pd.DataFrame({
    "domain_type":data_domain,
    "protocol": data_protocol,
    "tweet_ids": poli_mix['tweet_ids'],
    "internal_links": poli_mix['internal_links'],
    "label": poli_mix.label
})

clean_data.head(10)

Unnamed: 0,domain_type,protocol,tweet_ids,internal_links,label
0,0,2,0,1444,1
1,5,0,0,1,1
2,0,1,580,1,1
3,0,2,1,234,1
4,1,1,0,71,0
5,0,2,22,355,1
6,0,0,525,1,1
7,0,1,27,147,0
8,0,1,929,194,1
9,0,1,0,19,0


In [51]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [52]:
features = ["domain_type", "protocol", "tweet_ids", "internal_links"]
X = clean_data[features]
y = clean_data.label

display(X.head())
display(y.head())

Unnamed: 0,domain_type,protocol,tweet_ids,internal_links
0,0,2,0,1444
1,5,0,0,1
2,0,1,580,1
3,0,2,1,234
4,1,1,0,71


0    1
1    1
2    1
3    1
4    0
Name: label, dtype: int64

In [53]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1) # 75% training and 25% test

In [54]:
clf = DecisionTreeClassifier()

# train decision tree
clf = clf.fit(X_train, y_train)

# predict
y_pred = clf.predict(X_test)

In [55]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7878787878787878


#Using another type of decision Tree classifier

In [56]:
clf = DecisionTreeClassifier(criterion="entropy", max_depth=9) # the greater the max_depth, can cause overfitting

# Train DT
clf = clf.fit(X_train,y_train)

#Predict response for test dataset
y_pred = clf.predict(X_test)


print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8106060606060606
