In [8]:
import pandas as pd
from py2neo import Graph, Node, Relationship
from tqdm import tqdm

from src.twitter_processing import TwitterProcessing
from src.text_processing import TextProcessing
from src.twitter2neo import Twitter2Neo

twitter_p = TwitterProcessing()
t2neo = Twitter2Neo()

Carregando dados

In [9]:
path = 'data/tweets_violencia_infantil.csv'
df = pd.read_csv(path)
df.shape

(100287, 9)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100287 entries, 0 to 100286
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   post_id        100287 non-null  int64 
 1   post_datetime  100287 non-null  object
 2   author         100286 non-null  object
 3   followers      100287 non-null  int64 
 4   sentiment      100287 non-null  object
 5   tweet          100287 non-null  object
 6   tweet_url      100287 non-null  object
 7   topic          100287 non-null  object
 8   screen_name    100287 non-null  object
dtypes: int64(2), object(7)
memory usage: 6.9+ MB


In [11]:
df.head()

Unnamed: 0,post_id,post_datetime,author,followers,sentiment,tweet,tweet_url,topic,screen_name
0,316990280,2021-03-17 17:19:34,Mc Tristeza🌙,1454,neutro,"RT @CrimesReais: Passadas algumas horas, o rec...",http://twitter.com/naah__moraes/status/1372281...,NÃO IDENTIFICADO,naah__moraes
1,317598323,2021-03-22 09:05:27,lua.🗡,582,neutro,achei que era pra somar os pontos vei tchau,http://twitter.com/feysant/statuses/1373969066...,VIOLENCIA INFANTIL,feysant
2,316991927,2021-03-17 17:36:43,vamBIro,330,neutro,"RT @angie_sai: Odiei essa notícia aqui.\n""Mora...",http://twitter.com/Aniahconda/status/137228579...,JOVENS,Aniahconda
3,315287986,2021-03-03 16:53:41,gimoli៹,172,neutro,mano o akutami é um doente do caralho. primeir...,http://twitter.com/jjkthange/statuses/13672015...,PEDOFILIA,jjkthange
4,315089002,2021-03-02 11:30:27,"Dra. Deusa em isolamento social, A Menina do L...",500,neutro,@nirgellep @Zan4rkZNK @avdkdvra Pedofilia é an...,http://twitter.com/FemeaLetrada/statuses/13667...,PEDOFILIA,FemeaLetrada


Gerando amostra

In [12]:
df = df.sample(10000)
df.shape

(10000, 9)

Processamento dos dados para extração de features

In [13]:
df['rt'] = df['tweet'].apply(lambda x: twitter_p.getRTScreenName(tweet_text=x))

Processando dados para ingestão

In [34]:
## Lista de usuários (screen_name e rt)
users = df['screen_name'].unique().tolist() + [u for u in df['rt'].unique().tolist() if u != '']
users = set(users)
pd.DataFrame(users, columns=['user']).to_csv('data/neo4j/users.csv', encoding='utf8', index=False)

## Lista de sentimentos
sentiments = df['sentiment'].unique().tolist()
pd.DataFrame(sentiments, columns=['sentiment']).to_csv('data/neo4j/sentiments.csv', encoding='utf8', index=False)

## Lista de tópicos
topics = df['topic'].unique().tolist()
pd.DataFrame(topics, columns=['topic']).to_csv('data/neo4j/topics.csv', encoding='utf8', index=False)

len(users), len(sentiments), len(topics)

(11002, 2, 10)

In [35]:
## Lista de Tweets
## Relacionamento: Tweet -> Sentimento
## Relacionamento: Tweet -> Tópico
## Relacionamento: User -> POST -> Tweet
## Relcionamento: User -> RETWEET -> Tweet

Ingestão Neo4j

In [7]:
uri = "bolt://54.172.32.171:7687"
user = "neo4j"
password = "death-swimmers-flash"

graph = Graph(uri, user=user, password=password)

Nós
- Tweet: id (post_id), date (post_date), text (tweet), url (tweet_url)
- User: username (screen_name ou rt)
- Sentiment: positivo, negativo, neutro
- Topic

Relacionamentos
- POST: User -> POST - Tweet
- RETWEET: User -> RETWEET -Tweet
- SENTIMENT: Tweet-> SENTIMENT - Sentiment
- TALK: Tweet -> TALK - Topic

Criando constraints

In [67]:
graph.run('CREATE CONSTRAINT ON (t:Tweet) ASSERT t.id IS UNIQUE')
graph.run('CREATE CONSTRAINT ON (u:User) ASSERT u.username IS UNIQUE')
graph.run('CREATE CONSTRAINT ON (s:Sentiment) ASSERT s.name IS UNIQUE')
graph.run('CREATE CONSTRAINT ON (t:Topic) ASSERT t.name is UNIQUE')

Processamento dos dados para ingestão

(10958, 2, 12)

In [69]:
#Criando Nós Usuários
for u in tqdm(users):
    user = Node('User', id=u, name=u)
    graph.create(user)

100%|██████████| 10958/10958 [1:37:48<00:00,  1.87it/s]


In [56]:
df[['screen_name', 'tweet', 'rt', 'sentiment', 'topic']].shape

(10000, 5)

In [29]:
for i,r in df.head(2).iterrows():
    try:
        tweet = Node('Tweet', id=r['post_id'], text=r['tweet']) #'post_datetime', 'tweet_url'
        graph.create(tweet)
    except Exception as ex:
        #Tratar erros
        pass

    


    
#a = Node("Person", name="Alice", age=33)
#b = Node("Person", name="Bob", age=44)
#c = Node("Person", name="Matheus", age=44)
#KNOWS = Relationship.type("KNOWS")
#graph.merge(KNOWS(c, a), "Person", "name")
#print(df.columns)
#df['topic'].value_counts()

In [15]:
df.head()

Unnamed: 0,post_id,post_datetime,author,followers,sentiment,tweet,tweet_url,topic,screen_name,rt
46214,311697042,2021-01-20 17:32:53,Telma Reis,204,neutro,RT @Cristia12114557: Biden começou muito bem.....,http://twitter.com/telminhareis/statuses/13519...,PEDOFILIA,telminhareis,@Cristia12114557
20158,315742989,2021-03-06 21:43:57,mad.,64,neutro,caras eu vo pra terapeuta pq meu tio me assedi...,http://twitter.com/95kthdior/statuses/13683617...,PEDOFILIA,95kthdior,
19666,313403940,2021-02-08 07:35:36,Fabiano Silva 🇧🇷🥛,109,neutro,RT @Maxcardosobr: Como editor do portal Terça ...,http://twitter.com/bihannotop/statuses/1358726...,PEDOFILIA,bihannotop,@Maxcardosobr
20281,317418308,2021-03-20 15:04:06,𝚖𝚘𝚘𝚗⁷,1089,neutro,RT @alinadurso: só se importam com mulheres se...,http://twitter.com/mjklight/statuses/137333455...,PEDOFILIA,mjklight,@alinadurso
12723,311749929,2021-01-21 14:04:20,Ltannus,698,neutro,RT @bernardokuster2: Twitter bloqueia conta da...,http://twitter.com/ltannus/statuses/1352301010...,GENERO INFANTIL,ltannus,@bernardokuster2
