In [None]:
!pip install scattertext
!pip install wordcloud
!pip install networkx
!pip install textblob
!pip install spacy
!pip install nltk
!pip install sklearn
!pip install pyvis

In [None]:
import re 
import os
import ast
from random import sample
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from typing import Tuple,List,Dict

import wordcloud
from wordcloud import WordCloud, STOPWORDS
import scattertext as st


from textblob import TextBlob
import spacy
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import sklearn
from sklearn import preprocessing
import pyvis 
from pyvis.network import Network
import networkx as nx

In [None]:
# Download NLTK's "vader_lexicon" resource (network access on first run).
nltk.download("vader_lexicon")

# Column-width display option set to 0.
# NOTE(review): presumably meant to show full cell text instead of truncating
# it; confirm the behaviour on the installed pandas version.
pd.options.display.max_colwidth = 0

In [None]:
# Project root; every data path below is derived from it.
root = "./drive/MyDrive/OFFICE/School/ISYE6748/"

# Raw Twitter export.
twitterRoot = os.path.join(root, "data/twitter-data")
pathToData = os.path.join(twitterRoot, "twitter_large_dataset.csv")

# Cleaned variants of the dataset.
cleanMediumDataPath = os.path.join(
    twitterRoot,
    "clean-data/clean_twitter_medium_dataset.csv",
)
cleanFullDataPath = os.path.join(
    twitterRoot,
    "clean-data/clean_twitter_large_dataset.txt",
)

# Sentiment-labelled output; labeledFilePath is the file loaded into `df`.
labeledFullDataPath = os.path.join(twitterRoot, "labeled-data")
labeledFilePath = os.path.join(
    labeledFullDataPath,
    "twitter-large-dataset-labeled-cnn-1-model-2-dropout.csv",
)

In [None]:
# Sentiment label -> 4-component colour tuple (components in the 0..1 range).
# NOTE(review): 'surprise' is within 0.004 of 'fear' in every channel, so the
# two labels are visually indistinguishable; confirm this is intended.
sentimentColorMap = dict(
    anger=(1, 0.498039215686275, 0, 1),
    fear=(0.894117647058824, 0.101960784313725, 0.109803921568627, 1.0),
    joy=(0.215686274509804, 0.494117647058824, 0.72156862745098, 1.0),
    love=(0.301960784313725, 0.686274509803922, 0.290196078431373, 1.0),
    sadness=(0.596078431372549, 0.305882352941176, 0.63921568627451, 1.0),
    surprise=(0.890196078431372, 0.101960784313725, 0.109803921568627, 1),
)

In [None]:
# Load the raw export, keeping only the columns needed to link tweets to
# replies. Both id columns go through nullable Int64 and then to str, so a
# missing value ends up as the text "<NA>".
# NOTE(review): if reply_to_id contains missing values, read_csv will likely
# parse it as float64 before the Int64 cast, and ids above 2**53 would lose
# precision; confirm against the data.
dfFull = (
    pd.read_csv(pathToData, index_col=0)
    .loc[:, ["id", "reply_to_id", "screen_name"]]
    .astype({"id": "Int64", "reply_to_id": "Int64"})
    .astype({"id": str, "reply_to_id": str})
)

In [None]:
# Load the sentiment-labelled tweets and normalise `id` the same way as in
# dfFull (nullable Int64 -> str) so the two frames can be joined on it.
df = (
    pd.read_csv(labeledFilePath, index_col=None, encoding="utf-8")
    .astype({"id": "Int64"})
    .astype({"id": str})
)

In [None]:
# Attach reply_to_id and screen_name to each labelled tweet (left join on id).
# Note: `df` is overwritten, so re-running this cell merges a second time.
df = pd.merge(df, dfFull, on="id", how="left")

In [None]:
# Look up each row's colour tuple by its sentiment label; a label missing from
# sentimentColorMap raises KeyError.
df["color"] = df["sentiment"].map(lambda label: sentimentColorMap[label])

In [None]:
# Preview the first four rows of the merged frame.
df.head(n=4)

In [None]:
# Number of distinct tweet ids in the merged frame.
maxIndex = len(pd.unique(df["id"]))

In [None]:
# Tweets that are not replies carry the text "<NA>" in reply_to_id; replace it
# with the sentinel -1 so those rows can be filtered out of the edge list later.
df["reply_to_id"] = df["reply_to_id"].replace("<NA>", -1)

# One (tweet id, replied-to id) pair per row.
df["edges"] = tuple(zip(df["id"], df["reply_to_id"]))

# Count rows per distinct pair, most frequent first.
# The "id" column of dfEdges holds that count.
dfEdges = (
    df.groupby("edges")[["id"]]
    .count()
    .reset_index()
    .sort_values(by="id", ascending=False)
)

# Split each pair back into separate source / target columns.
edgePairs = dfEdges['edges'].tolist()
dfEdges[["source","target"]] = pd.DataFrame(edgePairs, index=dfEdges.index)


In [None]:
# Per tweet id: number of rows with a non-null screen_name.
nodeIdA = (
    df.groupby('id')[["screen_name"]]
    .count()
    .reset_index()
    .rename(columns={"screen_name": "count"})
)

# Same count per replied-to id, with the key renamed to "id" so both frames
# share one schema.
nodeIdB = (
    df.groupby('reply_to_id')[["screen_name"]]
    .count()
    .reset_index()
    .rename(columns={"screen_name": "count", "reply_to_id": "id"})
)

In [None]:
# Stack the per-id and per-reply-target counts into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# is the supported equivalent and yields the same rows with a fresh RangeIndex.
# Note: nodeIdA is overwritten, so re-running this cell stacks nodeIdB again.
nodeIdA = pd.concat([nodeIdA, nodeIdB], ignore_index=True)

# Assign Colors to nodes

In [None]:
# Collapse the stacked counts to one total per id, stringify the id (the -1
# sentinel becomes "-1"), then attach each tweet's colour via a left join.
nodes = (
    nodeIdA.groupby("id")[["count"]]
    .sum()
    .reset_index()
    .astype({"id": str})
    .merge(df[["id", "color"]], how="left", on="id")
)

# Ids with no matching row in df get a grey colour, stored as text.
nodes = nodes.assign(color=nodes["color"].fillna("(0.807, 0.807, 0.807,1)"))
nodes.head(5)

In [None]:
# Remove placeholder rows by value rather than by position.
# "-1" is the stringified "not a reply" sentinel and "<NA>" is the text form of
# a missing id. The previous positional slice `nodes[1:-1]` dropped whatever
# happened to be first and last (possibly a real node), and removed two more
# rows every time the cell was re-run; this filter is idempotent.
nodes = nodes[~nodes["id"].isin(["-1", "<NA>"])]
nodes

In [None]:
# Write the node table (id, count, color) to CSV.
# The path is derived from `root` like the other data paths instead of
# repeating the project directory; the resulting string is unchanged.
nodesPath = os.path.join(root, "data/graph/nodes_full_id.csv")
nodes.to_csv(nodesPath, header=True, index=False)

In [None]:
# Keep only real reply edges (target is not the -1 sentinel) and only the
# source / target columns, renumbering the rows from 0.
dfEdges = dfEdges.loc[dfEdges["target"] != -1, ["source", "target"]].reset_index(drop=True)

In [None]:
# Write the edge list (source, target) to CSV.
# The path is derived from `root` like the other data paths instead of
# repeating the project directory; the resulting string is unchanged.
edgePath = os.path.join(root, "data/graph/edges_full_id.csv")
dfEdges.to_csv(edgePath, header=True, index=False)

# Build Nodes and Edges for Relational Graph

In [None]:
# Create the pyvis network in notebook mode with an 800 x 800 canvas.
g = Network(notebook=True, height=800, width=800)

# Hide edges while the view is being dragged.
g.toggle_hide_edges_on_drag(True)

# Use the Barnes-Hut physics layout with its default parameters.
g.barnes_hut()

### Add nodes to the graph

In [None]:
# Register every node id with the network, then show how many nodes it holds.
for nodeId in nodes["id"]:
  g.add_node(nodeId)
len(g.nodes)

### Add edges to the graph

In [None]:
# Add an edge for every (source, target) pair whose endpoints are both known
# nodes; report pairs that cannot be added.
nodeIds = nodes.id.to_list()
# A set gives O(1) membership tests; the list made every edge check O(#nodes).
knownNodeIds = set(nodeIds)
for source, target in zip(dfEdges["source"], dfEdges["target"]):
  if source in knownNodeIds and target in knownNodeIds:
    g.add_edge(source, target)
  else:
    # The original printed an undefined name (`edges`), which raised NameError
    # on the first unmatched edge; print the offending pair instead.
    print("either target or source not in nodelist", (source, target))
len(g.edges)

In [None]:
# Expose the physics controls in the rendered page.
g.show_buttons(filter_=['physics'])

# IPython.display is the public import location; importing these names from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Write the graph to network.html, then embed that file's contents inline.
g.show('network.html')
# filename= makes it explicit that the argument is a path, not HTML markup.
display(HTML(filename='network.html'))