In [2]:
import pandas as pd
import seaborn as sns

import unicodedata
from langdetect import detect
import langid
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

import numpy as np
import nltk
from nltk import bigrams
import itertools
import networkx as nx

import matplotlib.pyplot as plt

In [6]:
#Class created to preprocess the text of the tweets and get the tokens

class TextProcessor():

    def __init__(self):
        pass

    def detect_language(self, text):
        return detect(text)

    def fast_detect_language(self, text):
        return langid.classify(text)[0]

    def remove_accents(self, text):
        # remove accents in text
        #if isinstance(text, unicode):
        text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore')
        return text
        #return text


    def remove_hashtags(self, text):
        regex = r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"  # hash-tags
        text = re.sub(regex, ' ', text)
        return text


    def remove_mentions(self, text):
        regex = r'@[^\s]+'  # remove @-mentions
        text = re.sub(regex, ' ', text)
        return text


    def remove_urls(self, text):
        regex = r'http\S+'  # remove url
        text = re.sub(regex, ' ', text.decode('utf-8'))
        return text


    def remove_punctuation(self, text):
        # r'[.]',#remove points
        regex = r'[%s]' % re.escape(string.punctuation)  # signsPattern
        text = re.sub(regex, ' ', text)
        return text


    def remove_special_issues(self, text):
        regex_issues = [r'haha*', r'hehe*', r'[0-5][7-9]*']
        for regex in regex_issues:
            text = re.sub(regex, ' ', text)
        return text


    def remove_stop_words(self, tokens):
        stop = stopwords.words('english')
        return [token for token in tokens if token not in stop]


    def normalize_vocals(self, text):
        re.sub(r'a*', 'a', text)
        re.sub(r'e*', 'e', text)
        re.sub(r'i*', 'i', text)
        re.sub(r'o*', 'o', text)
        re.sub(r'u*', 'u', text)
        return text


    def stemming(self, tokens):
        return self.stemmer.apply(tokens)


    def tokenization(self, text):
        # this function pre-process de text of a tweet return a list of tokens
        tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        text = text.lower()  # text to lower
        text = text.rstrip('\n')
        text = self.remove_accents(text)  # remove accents
        text = self.remove_urls(text)  # remove urls
        text = self.remove_punctuation(text)  # remove sign puntuation
        text = self.remove_special_issues(text)
        text = self.normalize_vocals(text)
        tokens = tknzr.tokenize(text)

        tokens = self.remove_stop_words(tokens)  # remove stop words

        tokens = [token for token in tokens if len(token) > 2]
        text = " ".join(tokens)

        return text

In [3]:
#Read the dataset
maga = pd.read_excel("the MAGA corpus.xlsx",index_col=0)

In [4]:
#Choose a theme
repvsdem = maga[maga["Theme"]=="Republicans vs Democrats"]

In [7]:
#Preprocess the text of the tweet
tp = TextProcessor()
repvsdem["processed_tweet"] = repvsdem["Tweet"].apply(lambda x: tp.tokenization(x).split())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
#Generate the chains with a co_ocurrence matrix
def generate_co_occurrence_matrix(corpus):
    vocab = set(corpus)
    vocab = list(vocab)
    vocab_index = {word: i for i, word in enumerate(vocab)}
 
    # Create bigrams from all words in corpus
    bi_grams = list(bigrams(corpus))
 
    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))
 
    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current][previous]
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))
 
    # Loop through the bigrams taking the current and previous word,
    # and the number of occurrences of the bigram.
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]
        pos_current = vocab_index[current]
        pos_previous = vocab_index[previous]
        co_occurrence_matrix[pos_current][pos_previous] = count
    co_occurrence_matrix = np.matrix(co_occurrence_matrix)
 
    # return the matrix and the index
    return co_occurrence_matrix, vocab_index
 
text_data = repvsdem["processed_tweet"].to_list()
 
# Create one list using many lists
data = list(itertools.chain.from_iterable(text_data))
matrix, vocab_index = generate_co_occurrence_matrix(data)
 
df = pd.DataFrame(matrix, index=vocab_index,
                             columns=vocab_index)


In [34]:
#Creation of  a graph to get the spanning trees that will generate chains for the visualization

source = []
target = []
weight = []
for i in range(len(idx)):
    weight.append(matrix[idx[i],idy[i]])
    source.append(list(vocab_index.keys())[idx[i]])   
    target.append(list(vocab_index.keys())[idy[i]])

In [35]:
edges = pd.DataFrame({'source':source,'target': target,'weight':weight})
G = nx.from_pandas_edgelist(edges, edge_attr=True)

In [67]:
# That's it. You can search any word here and will give you the chains of words
X = nx.dfs_successors(G,"realdonaldtrump")

In [68]:
#This as json will create the sunburst visualization
X

{'realdonaldtrump': ['without'],
 'without': ['hired'],
 'hired': ['hunt'],
 'hunt': ['plain'],
 'plain': ['shirts'],
 'shirts': ['disgusting'],
 'disgusting': ['separating'],
 'separating': ['stop'],
 'stop': ['wont'],
 'wont': ['wrong'],
 'wrong': ['doubt'],
 'doubt': ['wga'],
 'wga': ['maga'],
 'maga': ['amnesty',
  'retiredcdnrjb',
  'tkewalt',
  'thelastscout',
  'dtruth',
  'seinfeld',
  'cuck',
  'worldwide',
  'tagging',
  'therendernba',
  'reploubarletta',
  'hashtagginf',
  'vakruta',
  'realization',
  'carminezozzora',
  'amysuds',
  'julia',
  'trumpconspired',
  'justicedems',
  'veritasvirtusx',
  'emmakennedy',
  'cantaloupenews',
  'richdborges',
  'kayajones',
  'lasvegas',
  'joefreedomlove',
  'mcmickey',
  'karxnka',
  'thejordanrachel',
  'poldeek',
  'superracist',
  'kimdotcom',
  'bettierose',
  'trusttheprocess',
  'judirymer'],
 'amnesty': ['immediate'],
 'immediate': ['deserve'],
 'deserve': ['doesnt', 'cambridge'],
 'doesnt': ['putin'],
 'putin': ['roll', 