In [None]:
!pip install scattertext
!pip install wordcloud
!pip install networkx
!pip install textblob
!pip install spacy
!pip install nltk
!pip install sklearn
!pip install pyvis


In [None]:
import re 
import os
import ast
from random import sample
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from typing import Tuple,List,Dict
import json 
from ast import literal_eval

import wordcloud
from wordcloud import WordCloud, STOPWORDS
import scattertext as st


from textblob import TextBlob
import spacy
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import sklearn
from sklearn import preprocessing
import pyvis 
from pyvis.network import Network
import networkx as nx

In [None]:
# Fetch the lexicon resource used by nltk's VADER sentiment analyzer (imported above).
nltk.download('vader_lexicon')
# Column-width display option for DataFrames, set through the options attribute API.
pd.options.display.max_colwidth = 0

In [None]:
# Shell escape: list the contents of the current working directory.
!dir

# Get nodes and edges dataframes

In [None]:
# Project root on the mounted drive; every path below is derived from it so
# that relocating the project only requires changing this one line.
root = "./drive/MyDrive/OFFICE/School/ISYE6748/"

# Graph inputs (node and edge tables). `root` ends with "/", so os.path.join
# yields exactly the same strings the previous hard-coded literals held.
nodesPath = os.path.join(root, "data/graph/nodes_full_id.csv")
edgePath = os.path.join(root, "data/graph/edges_full_id.csv")

# Raw twitter export.
twitterRoot = os.path.join(root,"data/twitter-data")
pathToData = os.path.join(twitterRoot,"twitter_large_dataset.csv")

# Cleaned datasets.
cleanMediumDataPath = os.path.join(twitterRoot,"clean-data/clean_twitter_medium_dataset.csv")
cleanFullDataPath = os.path.join(twitterRoot,"clean-data/clean_twitter_large_dataset.txt")

# Sentiment-labelled dataset (the file name records the model that labelled it).
labeledFullDataPath = os.path.join(twitterRoot,"labeled-data")

labeledFilePath = os.path.join(labeledFullDataPath,"twitter-large-dataset-labeled-cnn-1-model-2-dropout.csv")

# build colors

In [None]:
# RGBA colour (four components in the 0-1 range) assigned to each sentiment label.
sentimentColorMap = dict(
    anger=(1, 0.498039215686275, 0, 1),
    fear=(0.894117647058824, 0.101960784313725, 0.109803921568627, 1.0),
    joy=(0.215686274509804, 0.494117647058824, 0.72156862745098, 1.0),
    love=(0.301960784313725, 0.686274509803922, 0.290196078431373, 1.0),
    sadness=(0.596078431372549, 0.305882352941176, 0.63921568627451, 1.0),
    surprise=(0.890196078431372, 0.501960784313725, 0.509803921568627, 1),
)

In [None]:
# `dfFull` is first assigned in the next cell, so evaluating it here raised a
# NameError on a fresh kernel (Restart & Run All). The stray preview was
# removed; inspect the frame with `dfFull.head()` once it has been loaded.

In [None]:
# Load the raw tweet table, keeping only the columns needed for the reply graph.
# Both id columns are cast through nullable Int64 and then to text; afterwards
# the text "<NA>" in reply_to_id is replaced by the int sentinel -1.
idColumns = ["id", "reply_to_id"]
dfFull = (
    pd.read_csv(pathToData, index_col=0)[["id", "reply_to_id", "screen_name"]]
    .astype(dict.fromkeys(idColumns, pd.Int64Dtype()))
    .astype(dict.fromkeys(idColumns, str))
    .assign(reply_to_id=lambda frame: frame["reply_to_id"].replace("<NA>", -1))
)

In [None]:
# Load the sentiment-labelled tweets and give `id` the same text form as
# dfFull["id"] (nullable Int64 first, then str) so the two frames can be joined on it.
df = (
    pd.read_csv(labeledFilePath, encoding="utf-8", index_col=None)
    .astype({"id": pd.Int64Dtype()})
    .astype({"id": str})
)

In [None]:
# Left join on "id": every row of dfFull is kept, with the labelled columns attached where an id matches.
df = pd.merge(dfFull, df, on="id", how="left")

In [None]:
# Attach each tweet's sentiment colour. `df` comes from a left merge, so tweets
# without a row in the labelled file carry a missing sentiment; indexing the
# dict directly raised KeyError on those rows. With .get() they receive None.
df["color"] = df["sentiment"].apply(lambda label: sentimentColorMap.get(label))

In [None]:
# Load the node table; the first CSV column becomes the index
# (a later cell copies that index into a "nodeId" column).
nodesDf = pd.read_csv(nodesPath, index_col= 0, encoding="utf-8")

### Update color RGBA to Hex

In [None]:
def rgbaTextToHex(rgbaText):
  """Parse a colour stored as Python-literal text and return its hex string, alpha included."""
  return matplotlib.colors.to_hex(literal_eval(rgbaText), keep_alpha=True)

nodesDf["color"] = nodesDf["color"].map(rgbaTextToHex)

In [None]:
# Two-column lookup table (nodeId, color) built from nodesDf: the index values
# become the "nodeId" column, then the frame's own index is reset to 0..n-1.
twitterScores = pd.DataFrame({"nodeId":nodesDf.index, "color":nodesDf.color}).reset_index(drop = True)

In [None]:
# Preview the first four rows of the nodeId/color table.
twitterScores.head(4)

### Stage the relational data for tree

In [None]:
# Rows that are replies, i.e. whose reply_to_id is not the -1 sentinel.
isReply = df["reply_to_id"] != -1
# Parallel lists: ids[i] is the replying tweet, replyToId[i] the tweet it replied to.
replyToId = df.loc[isReply, "reply_to_id"].to_list()
ids = df.loc[isReply, "id"].to_list()

In [None]:
# Original tweets: rows whose reply_to_id holds the -1 sentinel (they reply to nothing).
isOrigin = df["reply_to_id"] == -1
originIds = df.loc[isOrigin, "id"].to_list()
originIds

In [None]:
originNodes = np.array(originIds)
# Reply rows with no missing value in any column. sourceNodes and targetNodes
# are read from the same rows, so position i in both describes one reply:
# sourceNodes[i] replied to targetNodes[i].
replyRows = df[df["reply_to_id"] != -1].dropna()
sourceNodes = replyRows["id"].to_numpy()
targetNodes = replyRows["reply_to_id"].to_numpy()

In [None]:
# Sorted, de-duplicated array of every node id: origins, repliers and reply targets.
nodes = np.unique(np.concatenate((originNodes, sourceNodes, targetNodes), axis=None))

### add hex colors to nodes and save json

In [None]:
# Overwrite every colour in the sentiment map with its hex string (alpha kept), in place.
sentimentColorMap.update(
    (label, matplotlib.colors.to_hex(rgba, keep_alpha=True))
    for label, rgba in list(sentimentColorMap.items())
)

sentimentColorMap

In [None]:
# Colour key: one scatter point per sentiment colour, drawn in increasing size.
x_coordinates = [1,2,3,4,5,6]
y_coordinates = [0,0,0,-1,-1,-1]
size_map = [50,100,200,400,800,1200]
colors = list(sentimentColorMap.values())
color_map = colors[:len(x_coordinates)]
plt.figure(figsize=(10,10))
plt.scatter(x_coordinates, y_coordinates, s=size_map, c=color_map)

# Proxy artists that exist only to populate the legend. The marker size is set
# on the proxies themselves; the earlier post-hoc resize went through
# `lgnd.legendHandles` and the private `dot._legmarker`, neither of which
# exists in current matplotlib releases.
markers = [
    plt.Line2D([0,0],[0,0], color=color, marker='o', linestyle='', markersize=18)
    for color in sentimentColorMap.values()
]
lgnd = plt.legend(markers, sentimentColorMap.keys(), numpoints=1, fontsize="x-large")
plt.show()

# Build nodes

In [None]:
twitterScores["nodeId"] = twitterScores.nodeId.astype(int)
# Convert the node ids to int, skipping anything that is not an integer literal
# (presumably the "<NA>" text a missing id turns into after the str cast —
# confirm against the data). The previous `nodes[0:-1]` slice instead dropped
# whichever id happened to sort last, which is a real tweet when no placeholder
# is present, and it dropped one more id each time the cell was re-run.
nodes = [int(nodeId) for nodeId in nodes if str(nodeId).lstrip("-").isdigit()]

In [None]:
# Create a node id to a color map
%%time
nodeColorMap = {}
for id in nodes:
  
  result = twitterScores[twitterScores.nodeId == id]
  
  if len(result): 
    nodeColorMap[str(id)] = str(twitterScores[twitterScores.nodeId == int(id)].iloc[0,1])
  else:

    nodeColorMap[str(id)] = "#cecece"

In [None]:
# Save the nodeId -> colour map as JSON.
# The previous version dumped `colors` (the list of six palette colours)
# instead of `nodeColorMap`, so the saved file never contained the map.
# `json` is already imported in the notebook's import cell.
nodeColorsPath = "./drive/MyDrive/OFFICE/School/ISYE6748/data/graph/node_full_colors.json"
with open(nodeColorsPath, 'w') as fp:
    json.dump(nodeColorMap, fp)

# Build Relational Tree with a recursive function

In [None]:
def buildTree(node, depth):
  """Recursively attach every reply found below `node` to the global `nodeTree`.

  Reads the module-level arrays `sourceNodes` and `targetNodes` (position i in
  both describes one reply: sourceNodes[i] replied to targetNodes[i]) and the
  dict `nodeColorMap`; mutates the module-level dict `nodeTree` in place.

  Args:
    node: id of the tweet whose replies are to be attached.
    depth: depth value associated with `node`; stored in
      nodeTree["treeDepth"] when `node` has no replies and `depth` exceeds
      the value recorded so far.

  Returns:
    None. All results are accumulated in `nodeTree`.
  """
  repliesToNode = sourceNodes[np.where(targetNodes == node)]

  if len(repliesToNode) == 0:
    # Leaf: nobody replied to this tweet, so only the depth record can change.
    nodeTree["treeDepth"] = max(nodeTree["treeDepth"], depth)
    return

  for replyNode in repliesToNode:
    nodeTree["edges"].append((node, replyNode))
    nodeTree["nodes"].append(replyNode)
    nodeTree["color"].append(nodeColorMap[replyNode])
    nodeTree["edgeCount"] += 1
    nodeTree["nodeCount"] += 1

    # Recurse without `return`: returning here ended the loop after the first
    # reply, so every sibling reply (and its whole subtree) was skipped.
    buildTree(replyNode, depth + 1)

# Grow Relational Tree
1. Start at origin nodes, and try to build the tree as far as it goes

2. Look at which tweets replied to the origin tweet,

3. for each tweet id that replied to the origin, find tweets that replied to it

4. ... do that until no more replies are present in data

This is the process that I call Growing a Relational Tree

In [None]:
# Display the array of origin tweet ids (built from originIds in an earlier cell).
originNodes

In [None]:
# Grow one relational tree per origin tweet:
#   - attach every tweet that replied directly to the origin,
#   - hand each of those replies to buildTree(), which follows its replies,
#   - stop when the data holds no further replies.
relationalTrees = []

for node in originNodes:
  # buildTree() works on the module-level name `nodeTree`, so a fresh dict is
  # bound to that name for each origin before any recursion starts.
  nodeTree = dict(
      originNode=node,
      treeDepth=0,
      edgeCount=0,
      nodeCount=1,
      edges=[],
      nodes=[node],
      color=[nodeColorMap[node]],
  )

  # Tweets whose reply target is this origin.
  directReplies = sourceNodes[np.flatnonzero(targetNodes == node)]

  for replyNode in directReplies:
    nodeTree["edges"] += [(node, replyNode)]
    nodeTree["nodes"] += [replyNode]
    nodeTree["color"] += [nodeColorMap[replyNode]]
    nodeTree["edgeCount"] = nodeTree["edgeCount"] + 1
    nodeTree["nodeCount"] = nodeTree["nodeCount"] + 1

    # Direct replies are passed to buildTree() with depth 1 + 1.
    depth = 1
    buildTree(replyNode, depth + 1)

  relationalTrees.append(nodeTree)

len(relationalTrees)

# Save Relational Tree

In [None]:
# Write the list of relational trees to disk as JSON.
treePath ="./drive/MyDrive/OFFICE/School/ISYE6748/data/graph/trees_full.json"
with open(treePath, mode='w') as treeFile:
    json.dump(relationalTrees, treeFile)

In [None]:
# Read the relational trees back from the JSON file written above.
treePath ="./drive/MyDrive/OFFICE/School/ISYE6748/data/graph/trees_full.json"
with open(treePath, mode='r') as treeFile:
    relationalTrees = json.load(treeFile)

# Get branches that split to other branches
Stage for visualizing results

In [None]:
def isDeepTree(tree):
  """Return True for a tree with at least one edge and a recorded depth above 3."""
  return tree["edgeCount"] > 0 and tree["treeDepth"] > 3

depth4Trees = list(filter(isDeepTree, relationalTrees))
len(depth4Trees)

# Build Graph and Visualize Results

In [None]:
# pyvis network canvas, 800 x 800, created in notebook mode (notebook=True).
g = Network(height=800,width=800, notebook=True)
# NOTE(review): by its name this hides edges while the view is dragged — confirm in the pyvis docs.
g.toggle_hide_edges_on_drag(True)
# NOTE(review): presumably selects the Barnes-Hut physics layout with default parameters — confirm.
g.barnes_hut()

In [None]:
# Add every deep tree to the pyvis network.
# NOTE(review): ids are passed as int; if these are full-length tweet ids they
# may exceed what the browser-side JavaScript can represent exactly — confirm.
for tree in depth4Trees:
  # tree["nodes"] and tree["color"] are parallel lists; position 0 is the origin tweet.
  for position, (nodeId, color) in enumerate(zip(tree["nodes"], tree["color"])):
    if position == 0:
      # Origin tweet: larger and square. pyvis takes the shape through its
      # `shape` parameter; the former `node_shape="s"` is a networkx/matplotlib
      # spelling that pyvis does not interpret as a shape.
      g.add_node(int(nodeId), color=color, size=300, shape="square")
    else:
      g.add_node(int(nodeId), color=color, size=200)

  # Each edge is a (replied-to tweet, replying tweet) pair.
  for source, target in tree["edges"]:
    g.add_edge(int(source), int(target))

In [None]:
g.show_buttons(filter_=['physics'])

# IPython.display is the public import location (IPython.core.display is deprecated for this).
from IPython.display import display, HTML

g.show('network.html')
# HTML('network.html') treats its argument as markup and would render the
# literal text "network.html"; filename= makes IPython read the generated file.
display(HTML(filename='network.html'))