In [1]:
from collections import OrderedDict
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import graphviz

In [2]:
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer, one_hot

Using TensorFlow backend.


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.tree import export_text
from sklearn.tree import export_graphviz
from sklearn import metrics

In [4]:
df = pd.read_csv('aggregation.csv')
df.head()

Unnamed: 0,text,retweet_count,favorite_count,username,userdesc,verified,followers,created_at,has_urls,has_mentions,high_response
0,They are starting to get more and more despera...,24298,168648,GretaThunberg,17 year old climate and environmental activist...,True,4079169,2020-02-29 15:26:10,True,False,True
1,I do not believe we will defeat Donald Trump w...,18592,77895,BernieSanders,U.S. Senator from Vermont and candidate for Pr...,True,10951634,2020-03-02 20:30:56,False,False,True
2,Indigenous rights = Climate justice\n#Wetsuwet...,4609,21488,GretaThunberg,17 year old climate and environmental activist...,True,4086646,2020-02-08 13:36:48,True,False,True
3,Stop running away from your problem. Run into ...,2739,16317,pulte,The Philanthropist. Inventor of Twitter Philan...,True,2059165,2020-02-29 21:19:22,True,False,True
4,Support the Wet’suwet’en Nation and the pipeli...,2972,10035,GretaThunberg,17 year old climate and environmental activist...,True,4091979,2020-02-18 10:13:02,True,False,True


In [5]:
tweet_corpus = list(df['text'])

In [6]:
# Remove URL junk
url_pattern = re.compile("http[^\s]+", re.I)

for i in range(len(tweet_corpus)):
    tweet_corpus[i] = url_pattern.sub('', tweet_corpus[i])

In [7]:
non_ascii_pattern = re.compile("[^\u0000-\u2300]")

for i in range(len(tweet_corpus)):
    tweet_corpus[i] = non_ascii_pattern.sub('', tweet_corpus[i])

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweet_corpus)

In [9]:
tokenizer.get_config()

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': None,
 'document_count': 4219,
 'word_counts': '{"they": 247, "are": 552, "starting": 7, "to": 1577, "get": 78, "more": 116, "and": 1162, "desperate": 1, "this": 578, "shows": 14, "that": 450, "we\\u2019re": 16, "winning": 3, "i": 331, "do": 101, "not": 430, "believe": 27, "we": 306, "will": 189, "defeat": 1, "donald": 1, "trump": 4, "with": 518, "a": 928, "candidate": 1, "like": 77, "joe": 5, "biden": 2, "who": 171, "supported": 9, "the": 3286, "iraq": 1, "war": 5, "indigenous": 342, "rights": 108, "climate": 47, "justice": 19, "wetsuwetenstrong": 435, "keepitintheground": 2, "stop": 55, "running": 12, "away": 16, "from": 288, "your": 133, "problem": 12, "run": 8, "into": 45, "it": 328, "suck": 5, "really": 38, "but": 156, "when": 96, "you": 406, "pop": 2, "out": 181, "the\\u2026": 77, "support": 142, "wet\\u2019suwet\\u2019en": 190, "nation": 101,

In [10]:
token_matrix = tokenizer.texts_to_matrix(tweet_corpus, "tfidf")

In [11]:
token_matrix

array([[0.        , 0.        , 1.44524466, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.07772501, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 2.57176912, 0.        , ..., 0.        , 7.65468017,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        7.65468017],
       [0.        , 1.07772501, 1.44524466, ..., 0.        , 0.        ,
        0.        ]])

In [12]:
token_matrix.shape

(4219, 9898)

In [13]:
df2 = df[['has_urls', 'has_mentions', 'high_response']].copy()
df2.head()

Unnamed: 0,has_urls,has_mentions,high_response
0,True,False,True
1,False,False,True
2,True,False,True
3,True,False,True
4,True,False,True


In [14]:
df2.shape

(4219, 3)

In [15]:
token_df = pd.DataFrame(data=token_matrix)

In [16]:
token_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9888,9889,9890,9891,9892,9893,9894,9895,9896,9897
0,0.0,0.0,1.445245,0.0,1.633177,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.077725,0.0,0.0,0.0,0.0,0.0,1.814638,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.824747,0.0,0.0,1.633177,1.730126,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
df2 = pd.concat([token_df, df2], axis=1)

In [18]:
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9891,9892,9893,9894,9895,9896,9897,has_urls,has_mentions,high_response
0,0.0,0.0,1.445245,0.0,1.633177,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,True
1,0.0,1.077725,0.0,0.0,0.0,0.0,0.0,1.814638,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,True
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,True
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,True
4,0.0,1.824747,0.0,0.0,1.633177,1.730126,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,True


In [19]:
target = df2['high_response']
input_cols = list(df2.columns[:-1])
inputs = df2[input_cols]

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(inputs, target, stratify=target)

In [22]:
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [23]:
prediction = tree.predict(X_test)
print(metrics.classification_report(Y_test, prediction,
                                    zero_division=0))
print(metrics.confusion_matrix(Y_test, prediction,
                               labels=tree.classes_))

              precision    recall  f1-score   support

       False       0.97      0.98      0.98      1024
        True       0.16      0.10      0.12        31

    accuracy                           0.96      1055
   macro avg       0.57      0.54      0.55      1055
weighted avg       0.95      0.96      0.95      1055

[[1008   16]
 [  28    3]]


In [24]:
tokens = tokenizer.word_index

In [25]:
tokens = list(tokens.keys())

In [26]:
tokens[:5]

['the', 'to', 'of', 'and', 'in']

In [27]:
feature_names = tokens + [''] + list(df2.columns[-3:-1])
feature_names[-6:]

['mandatory…', 'contaminated', 'educates', '', 'has_urls', 'has_mentions']

In [28]:
# r = export_text(tree, feature_names=list(df2.columns[:-1]))
r = export_text(tree, feature_names=feature_names)
print(r)

|--- tires <= 3.48
|   |--- cost <= 3.48
|   |   |--- refugee <= 3.62
|   |   |   |--- 430pm… <= 3.83
|   |   |   |   |--- keepitintheground <= 3.62
|   |   |   |   |   |--- saudi <= 3.83
|   |   |   |   |   |   |--- temper <= 3.83
|   |   |   |   |   |   |   |--- grocery <= 3.83
|   |   |   |   |   |   |   |   |--- becaus… <= 3.83
|   |   |   |   |   |   |   |   |   |--- exper… <= 3.83
|   |   |   |   |   |   |   |   |   |   |--- carribean <= 3.83
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 79
|   |   |   |   |   |   |   |   |   |   |--- carribean >  3.83
|   |   |   |   |   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |   |   |   |   |--- exper… >  3.83
|   |   |   |   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |   |   |   |--- becaus… >  3.83
|   |   |   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |   |   |--- grocery >  3.83
|   |   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |   |---

In [29]:
g = export_graphviz(tree, out_file=None,
                   class_names=['False', 'True'],
                   feature_names=feature_names, # default: list(df2.columns[:-1]
                   filled=True)

In [30]:
graph = graphviz.Source(g)

In [31]:
graph.render("tree_no_follow_count") # Accurate but meaningless

'tree_no_follow_count.pdf'

In [33]:
tree_gini = DecisionTreeClassifier(min_samples_split=8,
                                   random_state=0)
tree_gini.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [37]:
tree_entropy = DecisionTreeClassifier(min_samples_split=8,
                                      criterion='entropy',
                                      random_state=0)
tree_entropy.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [39]:
for tree in [tree_gini, tree_entropy]:
    prediction = tree.predict(X_test)
    print(tree.get_params()['criterion'])
    print(metrics.classification_report(Y_test, prediction,
                                        zero_division=0))
    print(metrics.confusion_matrix(Y_test, prediction,
                                   labels=tree.classes_))
    print()

gini
              precision    recall  f1-score   support

       False       0.97      0.99      0.98      1024
        True       0.15      0.06      0.09        31

    accuracy                           0.96      1055
   macro avg       0.56      0.53      0.54      1055
weighted avg       0.95      0.96      0.95      1055

[[1013   11]
 [  29    2]]

entropy
              precision    recall  f1-score   support

       False       0.97      0.99      0.98      1024
        True       0.21      0.13      0.16        31

    accuracy                           0.96      1055
   macro avg       0.59      0.56      0.57      1055
weighted avg       0.95      0.96      0.96      1055

[[1009   15]
 [  27    4]]



In [40]:
r = export_text(tree_entropy, feature_names=feature_names)
print(r)

|--- tires <= 3.48
|   |--- a <= 0.83
|   |   |--- has_mentions <= 0.50
|   |   |   |--- refugee <= 3.62
|   |   |   |   |--- cost <= 3.48
|   |   |   |   |   |--- railroad <= 3.08
|   |   |   |   |   |   |--- latest <= 2.76
|   |   |   |   |   |   |   |--- coming <= 2.89
|   |   |   |   |   |   |   |   |--- one <= 1.84
|   |   |   |   |   |   |   |   |   |--- fellow <= 3.02
|   |   |   |   |   |   |   |   |   |   |--- huge <= 3.20
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 40
|   |   |   |   |   |   |   |   |   |   |--- huge >  3.20
|   |   |   |   |   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |   |   |   |   |--- fellow >  3.02
|   |   |   |   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |   |   |   |--- one >  1.84
|   |   |   |   |   |   |   |   |   |--- wetsuwetenstrong <= 1.20
|   |   |   |   |   |   |   |   |   |   |--- helped <= 3.48
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
| 

In [41]:
g = export_graphviz(tree, out_file=None,
                   class_names=['False', 'True'],
                   feature_names=feature_names, # default: list(df2.columns[:-1]
                   filled=True)
graph = graphviz.Source(g)
graph.render("tree_no_follow_min_8")

'tree_no_follow_min_8.pdf'