# Neo4j data loading setup
Using neo4j, we’ll load in twitter data to create a network showing the relationships between legislators’ tweets, other users they mention, and hashtags they mention. 

***

In [1]:
import pandas as pd
import numpy as np
import sys
import os

In [2]:
ROOT_DIR = "../../"
DATA_DIR = ROOT_DIR + "data/"

In [3]:
# Load utils
sys.path.insert(0, os.path.abspath(ROOT_DIR + "/src"))
from utils import *

### Import local password and NEO4J_DATA_DIR
Create a .py file containing the following declarations:
- password = <Your neo4j server password>
- NEO4J_DATA_DIR = <C:/Users/PC USERNAME/AppData/Local/Neo4j/Relate/Data/dbmss/SERVER ID/import/data/>

In [4]:
from pword import *

### Connect to local neo4j database

In [18]:
from neo4j import GraphDatabase
uri = 'neo4j://localhost:7687'
driver = GraphDatabase.driver(uri, auth=("neo4j", password))
session = driver.session()

***

## Data loading

In [6]:
legislators_fname = DATA_DIR + "legislators-current.csv"
full_tweets_fname = DATA_DIR + "tweets.json"
tweets_fname = DATA_DIR + "legis_tweets.csv"

### Load legislators

In [None]:
legis_df = pd.read_csv(legislators_fname)

### Create tweets subset for only legislators present in the dataset

In [None]:
legis_tweets = []
legislator_names = set(legis_df.twitter.unique())
loaded_rows = 0

for chunk in pd.read_json(full_tweets_fname, lines=True, chunksize=1000):
    loaded_rows += chunk.shape[0]
    print(f"Loaded {loaded_rows} rows", end='\r')
    for row in range(chunk.shape[0]):
        if chunk.iloc[row].screen_name in legislator_names:
            row = chunk.iloc[row].to_dict()
            # Remove all single and double quotes from text
            row["text"] = row["text"].replace('\'', '').replace("\"", "").replace('’', '').replace('\\', '/')
            # Add a hashtag key
            row["htags"] = row["entities"]["hashtags"]
            # Add a user_mentions key
            row["user_mentions"] = row["entities"]["user_mentions"]
            
            legis_tweets.append(row)

In [None]:
legis_tweets_df = pd.DataFrame(legis_tweets)

In [None]:
legis_tweets_df.to_csv(tweets_fname, index=False)

### Load tweets

In [20]:
def upload_chunk(fname):
    q = """
    WITH \"file:///""" + fname + """\" AS url
    LOAD CSV WITH HEADERS FROM url AS row
    
    WITH row WHERE NOT row.in_reply_to_screen_name IS NULL
    
    MERGE (p1:Person {twitter:row.screen_name})
    MERGE (p2:Person {twitter:row.in_reply_to_screen_name})
        
    CREATE (p1) - [r:REPLIED] -> (p2)
    
    SET r.create_date = row.created_at
    SET r.text = row.text
    
    RETURN r
    """
    out = session.run(q)
    
    return out.data()

In [21]:
chunk_num = 0
rows_uploaded = 0
# NEO4J_DATA_DIR is defined in pword.py, stored in the same directory as this notebook.
if not os.path.exists(NEO4J_DATA_DIR + "temp/"):
    os.mkdir(NEO4J_DATA_DIR + "temp/")

for chunk in pd.read_csv(tweets_fname, chunksize=10000):
    fname = "temp/" + f"tweets_chunk_{chunk_num}.csv"
    print(" "*50, end='\r')
    print("Saving temp file", end='\r')
    chunk.to_csv(NEO4J_DATA_DIR + fname, index=False)
    print(" "*50, end='\r')
    print(f"Uploading chunk {chunk_num} data", end='\r')
    upload_chunk(DATA_DIR + fname)
    print(" "*50, end='\r')
    print("Removing temp file", end='\r')
    os.remove(NEO4J_DATA_DIR + fname)
    
    chunk_num += 1
    rows_uploaded += chunk.shape[0]
    
print("Tweet upload complete")
print(f"{chunk_num + 1} chunks processed")
print(f"{rows_uploaded} rows uploaded")

Tweet upload complete                             
72 chunks processed
707034 rows uploaded


In [52]:
q = """
MATCH (p1:Person {twitter:'SenSherrodBrown'}) - [r:REPLIED] -> (p2:Person)
RETURN DISTINCT r.text
"""
out = session.run(q)

In [53]:
out = out.data()

In [54]:
len(out)

166

In [55]:
out

[{'r.text': 'As @clevelanddotcom reports: House plan cuts $12mil from Ohio schools each year - money schools need for speech therapy, wheelchairs &amp; more.'},
 {'r.text': '@SenKamalaHarris Were just getting warmed up - see you Sunday. #DefendTheLand'},
 {'r.text': '@facebook @ColumbusChamber By connecting Ohio entrepreneurs with Facebook experts, we can help Ohio businesses grow and continue to create jobs. -SB'},
 {'r.text': 'Passing the INTERDICT Act is one concrete step we can take to help stop fentanyl from destroying any more Ohio families. -SB'},
 {'r.text': '@clevelanddotcom Whatever your thoughts on the #ACA, we can all agree that punishing Ohio students is not the solution.'},
 {'r.text': '@UAW We must never forget the pain endured by these workers, &amp; the countless others who fought to ensure all workers have the right to unionize.'},
 {'r.text': 'It is our duty to honor them, on this day and every day, by supporting veterans, servicemembers &amp; families.'},
 {'r.text'